Commit e91eba31 by Nathan Sidwell Committed by Tom de Vries

[nvptx] Fix calls to vector and worker routines

2018-04-20  Nathan Sidwell  <nathan@codesourcery.com>
	    Tom de Vries  <tom@codesourcery.com>

	PR target/85445
	* config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining):
	Emit insns for calls too.
	(nvptx_find_par): Always look for worker-level predecessor insn.
	(nvptx_propagate): Add is_call parm, return bool.  Copy frame for
	calls.
	(nvptx_vpropagate, nvptx_wpropagate): Adjust.
	(nvptx_process_pars): Propagate frames for calls.

	* testsuite/libgomp.oacc-c++/ref-1.C: New.

Co-Authored-By: Tom de Vries <tom@codesourcery.com>

From-SVN: r259523
parent e95dda95
2018-04-20 Nathan Sidwell <nathan@codesourcery.com>
Tom de Vries <tom@codesourcery.com>
PR target/85445
* config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining):
Emit insns for calls too.
(nvptx_find_par): Always look for worker-level predecessor insn.
(nvptx_propagate): Add is_call parm, return bool. Copy frame for
calls.
(nvptx_vpropagate, nvptx_wpropagate): Adjust.
(nvptx_process_pars): Propagate frames for calls.
2018-04-20 H.J. Lu <hongjiu.lu@intel.com> 2018-04-20 H.J. Lu <hongjiu.lu@intel.com>
PR target/85469 PR target/85469
......
...@@ -399,8 +399,7 @@ nvptx_emit_forking (unsigned mask, bool is_call) ...@@ -399,8 +399,7 @@ nvptx_emit_forking (unsigned mask, bool is_call)
it creates a block with a single successor before entering a it creates a block with a single successor before entering a
partitooned region. That is a good candidate for the end of partitooned region. That is a good candidate for the end of
an SESE region. */ an SESE region. */
if (!is_call) emit_insn (gen_nvptx_fork (op));
emit_insn (gen_nvptx_fork (op));
emit_insn (gen_nvptx_forked (op)); emit_insn (gen_nvptx_forked (op));
} }
} }
...@@ -419,8 +418,7 @@ nvptx_emit_joining (unsigned mask, bool is_call) ...@@ -419,8 +418,7 @@ nvptx_emit_joining (unsigned mask, bool is_call)
/* Emit joining for all non-call pars to ensure there's a single /* Emit joining for all non-call pars to ensure there's a single
predecessor for the block the join insn ends up in. This is predecessor for the block the join insn ends up in. This is
needed for skipping entire loops. */ needed for skipping entire loops. */
if (!is_call) emit_insn (gen_nvptx_joining (op));
emit_insn (gen_nvptx_joining (op));
emit_insn (gen_nvptx_join (op)); emit_insn (gen_nvptx_join (op));
} }
} }
...@@ -3086,8 +3084,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block) ...@@ -3086,8 +3084,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
par = new parallel (par, mask); par = new parallel (par, mask);
par->forked_block = block; par->forked_block = block;
par->forked_insn = end; par->forked_insn = end;
if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
par->fork_insn par->fork_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_fork); = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
} }
...@@ -3102,8 +3099,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block) ...@@ -3102,8 +3099,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
gcc_assert (par->mask == mask); gcc_assert (par->mask == mask);
par->join_block = block; par->join_block = block;
par->join_insn = end; par->join_insn = end;
if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
par->joining_insn par->joining_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_joining); = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
par = par->parent; par = par->parent;
...@@ -3782,29 +3778,34 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions) ...@@ -3782,29 +3778,34 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
#undef BB_SET_SESE #undef BB_SET_SESE
#undef BB_GET_SESE #undef BB_GET_SESE
/* Propagate live state at the start of a partitioned region. BLOCK /* Propagate live state at the start of a partitioned region. IS_CALL
provides the live register information, and might not contain indicates whether the propagation is for a (partitioned) call
INSN. Propagation is inserted just after INSN. RW indicates whether instruction. BLOCK provides the live register information, and
we are reading and/or writing state. This might not contain INSN. Propagation is inserted just after INSN. RW
indicates whether we are reading and/or writing state. This
separation is needed for worker-level proppagation where we separation is needed for worker-level proppagation where we
essentially do a spill & fill. FN is the underlying worker essentially do a spill & fill. FN is the underlying worker
function to generate the propagation instructions for single function to generate the propagation instructions for single
register. DATA is user data. register. DATA is user data.
We propagate the live register set and the entire frame. We could Returns true if we didn't emit any instructions.
do better by (a) propagating just the live set that is used within
the partitioned regions and (b) only propagating stack entries that We propagate the live register set for non-calls and the entire
are used. The latter might be quite hard to determine. */ frame for calls and non-calls. We could do better by (a)
propagating just the live set that is used within the partitioned
regions and (b) only propagating stack entries that are used. The
latter might be quite hard to determine. */
typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *); typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
static void static bool
nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw, nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
propagator_fn fn, void *data) propagate_mask rw, propagator_fn fn, void *data)
{ {
bitmap live = DF_LIVE_IN (block); bitmap live = DF_LIVE_IN (block);
bitmap_iterator iterator; bitmap_iterator iterator;
unsigned ix; unsigned ix;
bool empty = true;
/* Copy the frame array. */ /* Copy the frame array. */
HOST_WIDE_INT fs = get_frame_size (); HOST_WIDE_INT fs = get_frame_size ();
...@@ -3816,6 +3817,7 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw, ...@@ -3816,6 +3817,7 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
rtx pred = NULL_RTX; rtx pred = NULL_RTX;
rtx_code_label *label = NULL; rtx_code_label *label = NULL;
empty = false;
/* The frame size might not be DImode compatible, but the frame /* The frame size might not be DImode compatible, but the frame
array's declaration will be. So it's ok to round up here. */ array's declaration will be. So it's ok to round up here. */
fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode); fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
...@@ -3862,18 +3864,21 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw, ...@@ -3862,18 +3864,21 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
insn = emit_insn_after (cpy, insn); insn = emit_insn_after (cpy, insn);
} }
/* Copy live registers. */ if (!is_call)
EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator) /* Copy live registers. */
{ EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
rtx reg = regno_reg_rtx[ix]; {
rtx reg = regno_reg_rtx[ix];
if (REGNO (reg) >= FIRST_PSEUDO_REGISTER) if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
{ {
rtx bcast = fn (reg, rw, 0, data); rtx bcast = fn (reg, rw, 0, data);
insn = emit_insn_after (bcast, insn); insn = emit_insn_after (bcast, insn);
} empty = false;
} }
}
return empty;
} }
/* Worker for nvptx_vpropagate. */ /* Worker for nvptx_vpropagate. */
...@@ -3889,12 +3894,13 @@ vprop_gen (rtx reg, propagate_mask pm, ...@@ -3889,12 +3894,13 @@ vprop_gen (rtx reg, propagate_mask pm,
} }
/* Propagate state that is live at start of BLOCK across the vectors /* Propagate state that is live at start of BLOCK across the vectors
of a single warp. Propagation is inserted just after INSN. */ of a single warp. Propagation is inserted just after INSN.
IS_CALL and return as for nvptx_propagate. */
static void static bool
nvptx_vpropagate (basic_block block, rtx_insn *insn) nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
{ {
nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0); return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
} }
/* Worker for nvptx_wpropagate. */ /* Worker for nvptx_wpropagate. */
...@@ -3930,10 +3936,10 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_) ...@@ -3930,10 +3936,10 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
/* Spill or fill live state that is live at start of BLOCK. PRE_P /* Spill or fill live state that is live at start of BLOCK. PRE_P
indicates if this is just before partitioned mode (do spill), or indicates if this is just before partitioned mode (do spill), or
just after it starts (do fill). Sequence is inserted just after just after it starts (do fill). Sequence is inserted just after
INSN. */ INSN. IS_CALL and return as for nvptx_propagate. */
static void static bool
nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn) nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
{ {
wcast_data_t data; wcast_data_t data;
...@@ -3941,7 +3947,9 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn) ...@@ -3941,7 +3947,9 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
data.offset = 0; data.offset = 0;
data.ptr = NULL_RTX; data.ptr = NULL_RTX;
nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data); bool empty = nvptx_propagate (is_call, block, insn,
pre_p ? PM_read : PM_write, wprop_gen, &data);
gcc_assert (empty == !data.offset);
if (data.offset) if (data.offset)
{ {
/* Stuff was emitted, initialize the base pointer now. */ /* Stuff was emitted, initialize the base pointer now. */
...@@ -3951,6 +3959,7 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn) ...@@ -3951,6 +3959,7 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
if (worker_bcast_size < data.offset) if (worker_bcast_size < data.offset)
worker_bcast_size = data.offset; worker_bcast_size = data.offset;
} }
return empty;
} }
/* Emit a worker-level synchronization barrier. We use different /* Emit a worker-level synchronization barrier. We use different
...@@ -4311,18 +4320,23 @@ nvptx_process_pars (parallel *par) ...@@ -4311,18 +4320,23 @@ nvptx_process_pars (parallel *par)
inner_mask |= par->inner_mask; inner_mask |= par->inner_mask;
} }
if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
/* No propagation needed for a call. */;
else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
{ {
nvptx_wpropagate (false, par->forked_block, par->forked_insn); nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
nvptx_wpropagate (true, par->forked_block, par->fork_insn); bool empty = nvptx_wpropagate (true, is_call,
/* Insert begin and end synchronizations. */ par->forked_block, par->fork_insn);
emit_insn_before (nvptx_wsync (false), par->forked_insn);
emit_insn_before (nvptx_wsync (true), par->join_insn); if (!empty || !is_call)
{
/* Insert begin and end synchronizations. */
emit_insn_before (nvptx_wsync (false), par->forked_insn);
emit_insn_before (nvptx_wsync (true), par->join_insn);
}
} }
else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
nvptx_vpropagate (par->forked_block, par->forked_insn); nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
/* Now do siblings. */ /* Now do siblings. */
if (par->next) if (par->next)
......
2018-04-20 Nathan Sidwell <nathan@codesourcery.com>
Tom de Vries <tom@codesourcery.com>
PR target/85445
* testsuite/libgomp.oacc-c++/ref-1.C: New.
2018-04-19 Thomas Schwinge <thomas@codesourcery.com> 2018-04-19 Thomas Schwinge <thomas@codesourcery.com>
PR libgomp/85463 PR libgomp/85463
......
/* { dg-do run } */
#include <stdio.h>
#pragma acc routine vector
void __attribute__((noinline, noclone))
Vector (int *ptr, int n, const int &inc)
{
#pragma acc loop vector
for (unsigned ix = 0; ix < n; ix++)
ptr[ix] += inc;
}
#pragma acc routine worker
void __attribute__((noinline, noclone))
Worker (int *ptr, int m, int n, const int &inc)
{
#pragma acc loop worker
for (unsigned ix = 0; ix < m; ix++)
Vector(ptr + ix * n, n, inc);
}
int
main (void)
{
const int n = 32, m = 32;
int ary[m][n];
unsigned ix, iy;
for (ix = m; ix--;)
for (iy = n; iy--;)
ary[ix][iy] = (ix << 8) + iy;
#pragma acc parallel copy(ary)
{
Worker (&ary[0][0], m, n, 1 << 16);
}
int err = 0;
for (ix = m; ix--;)
for (iy = n; iy--;)
if (ary[ix][iy] != ((1 << 16) + (ix << 8) + iy))
{
printf ("ary[%u][%u] = %x expected %x\n",
ix, iy, ary[ix][iy], ((1 << 16) + (ix << 8) + iy));
err++;
}
if (err)
{
printf ("%d failed\n", err);
return 1;
}
#pragma acc parallel copy(ary)
{
Vector (&ary[0][0], m * n, (1 << 24) - (1 << 16));
}
for (ix = m; ix--;)
for (iy = n; iy--;)
if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy))
{
printf ("ary[%u][%u] = %x expected %x\n",
ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy));
err++;
}
if (err)
{
printf ("%d failed\n", err);
return 1;
}
return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment