Commit e91eba31 by Nathan Sidwell Committed by Tom de Vries

[nvptx] Fix calls to vector and worker routines

2018-04-20  Nathan Sidwell  <nathan@codesourcery.com>
	    Tom de Vries  <tom@codesourcery.com>

	PR target/85445
	* config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining):
	Emit insns for calls too.
	(nvptx_find_par): Always look for worker-level predecessor insn.
	(nvptx_propagate): Add is_call parm, return bool.  Copy frame for
	calls.
	(nvptx_vpropagate, nvptx_wpropagate): Adjust.
	(nvptx_process_pars): Propagate frames for calls.

	* testsuite/libgomp.oacc-c++/ref-1.C: New.

Co-Authored-By: Tom de Vries <tom@codesourcery.com>

From-SVN: r259523
parent e95dda95
2018-04-20 Nathan Sidwell <nathan@codesourcery.com>
Tom de Vries <tom@codesourcery.com>
PR target/85445
* config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining):
Emit insns for calls too.
(nvptx_find_par): Always look for worker-level predecessor insn.
(nvptx_propagate): Add is_call parm, return bool. Copy frame for
calls.
(nvptx_vpropagate, nvptx_wpropagate): Adjust.
(nvptx_process_pars): Propagate frames for calls.
2018-04-20 H.J. Lu <hongjiu.lu@intel.com>
PR target/85469
......
......@@ -399,8 +399,7 @@ nvptx_emit_forking (unsigned mask, bool is_call)
it creates a block with a single successor before entering a
partitooned region. That is a good candidate for the end of
an SESE region. */
if (!is_call)
emit_insn (gen_nvptx_fork (op));
emit_insn (gen_nvptx_fork (op));
emit_insn (gen_nvptx_forked (op));
}
}
......@@ -419,8 +418,7 @@ nvptx_emit_joining (unsigned mask, bool is_call)
/* Emit joining for all non-call pars to ensure there's a single
predecessor for the block the join insn ends up in. This is
needed for skipping entire loops. */
if (!is_call)
emit_insn (gen_nvptx_joining (op));
emit_insn (gen_nvptx_joining (op));
emit_insn (gen_nvptx_join (op));
}
}
......@@ -3086,8 +3084,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
par = new parallel (par, mask);
par->forked_block = block;
par->forked_insn = end;
if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
par->fork_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
}
......@@ -3102,8 +3099,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
gcc_assert (par->mask == mask);
par->join_block = block;
par->join_insn = end;
if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
par->joining_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
par = par->parent;
......@@ -3782,29 +3778,34 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
#undef BB_SET_SESE
#undef BB_GET_SESE
/* Propagate live state at the start of a partitioned region. BLOCK
provides the live register information, and might not contain
INSN. Propagation is inserted just after INSN. RW indicates whether
we are reading and/or writing state. This
/* Propagate live state at the start of a partitioned region. IS_CALL
indicates whether the propagation is for a (partitioned) call
instruction. BLOCK provides the live register information, and
might not contain INSN. Propagation is inserted just after INSN. RW
indicates whether we are reading and/or writing state. This
separation is needed for worker-level proppagation where we
essentially do a spill & fill. FN is the underlying worker
function to generate the propagation instructions for single
register. DATA is user data.
We propagate the live register set and the entire frame. We could
do better by (a) propagating just the live set that is used within
the partitioned regions and (b) only propagating stack entries that
are used. The latter might be quite hard to determine. */
Returns true if we didn't emit any instructions.
We propagate the live register set for non-calls and the entire
frame for calls and non-calls. We could do better by (a)
propagating just the live set that is used within the partitioned
regions and (b) only propagating stack entries that are used. The
latter might be quite hard to determine. */
typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
static void
nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
propagator_fn fn, void *data)
static bool
nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
propagate_mask rw, propagator_fn fn, void *data)
{
bitmap live = DF_LIVE_IN (block);
bitmap_iterator iterator;
unsigned ix;
bool empty = true;
/* Copy the frame array. */
HOST_WIDE_INT fs = get_frame_size ();
......@@ -3816,6 +3817,7 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
rtx pred = NULL_RTX;
rtx_code_label *label = NULL;
empty = false;
/* The frame size might not be DImode compatible, but the frame
array's declaration will be. So it's ok to round up here. */
fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
......@@ -3862,18 +3864,21 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
insn = emit_insn_after (cpy, insn);
}
/* Copy live registers. */
EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
{
rtx reg = regno_reg_rtx[ix];
if (!is_call)
/* Copy live registers. */
EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
{
rtx reg = regno_reg_rtx[ix];
if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
{
rtx bcast = fn (reg, rw, 0, data);
if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
{
rtx bcast = fn (reg, rw, 0, data);
insn = emit_insn_after (bcast, insn);
}
}
insn = emit_insn_after (bcast, insn);
empty = false;
}
}
return empty;
}
/* Worker for nvptx_vpropagate. */
......@@ -3889,12 +3894,13 @@ vprop_gen (rtx reg, propagate_mask pm,
}
/* Propagate state that is live at start of BLOCK across the vectors
of a single warp. Propagation is inserted just after INSN. */
of a single warp. Propagation is inserted just after INSN.
IS_CALL and return as for nvptx_propagate. */
static void
nvptx_vpropagate (basic_block block, rtx_insn *insn)
static bool
nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
{
nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
}
/* Worker for nvptx_wpropagate. */
......@@ -3930,10 +3936,10 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
/* Spill or fill live state that is live at start of BLOCK. PRE_P
indicates if this is just before partitioned mode (do spill), or
just after it starts (do fill). Sequence is inserted just after
INSN. */
INSN. IS_CALL and return as for nvptx_propagate. */
static void
nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
static bool
nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
{
wcast_data_t data;
......@@ -3941,7 +3947,9 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
data.offset = 0;
data.ptr = NULL_RTX;
nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
bool empty = nvptx_propagate (is_call, block, insn,
pre_p ? PM_read : PM_write, wprop_gen, &data);
gcc_assert (empty == !data.offset);
if (data.offset)
{
/* Stuff was emitted, initialize the base pointer now. */
......@@ -3951,6 +3959,7 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
if (worker_bcast_size < data.offset)
worker_bcast_size = data.offset;
}
return empty;
}
/* Emit a worker-level synchronization barrier. We use different
......@@ -4311,18 +4320,23 @@ nvptx_process_pars (parallel *par)
inner_mask |= par->inner_mask;
}
if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
/* No propagation needed for a call. */;
else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
{
nvptx_wpropagate (false, par->forked_block, par->forked_insn);
nvptx_wpropagate (true, par->forked_block, par->fork_insn);
/* Insert begin and end synchronizations. */
emit_insn_before (nvptx_wsync (false), par->forked_insn);
emit_insn_before (nvptx_wsync (true), par->join_insn);
nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
bool empty = nvptx_wpropagate (true, is_call,
par->forked_block, par->fork_insn);
if (!empty || !is_call)
{
/* Insert begin and end synchronizations. */
emit_insn_before (nvptx_wsync (false), par->forked_insn);
emit_insn_before (nvptx_wsync (true), par->join_insn);
}
}
else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
nvptx_vpropagate (par->forked_block, par->forked_insn);
nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
/* Now do siblings. */
if (par->next)
......
2018-04-20 Nathan Sidwell <nathan@codesourcery.com>
Tom de Vries <tom@codesourcery.com>
PR target/85445
* testsuite/libgomp.oacc-c++/ref-1.C: New.
2018-04-19 Thomas Schwinge <thomas@codesourcery.com>
PR libgomp/85463
......
/* { dg-do run } */
#include <stdio.h>
#pragma acc routine vector
void __attribute__((noinline, noclone))
Vector (int *ptr, int n, const int &inc)
{
#pragma acc loop vector
for (unsigned ix = 0; ix < n; ix++)
ptr[ix] += inc;
}
#pragma acc routine worker
void __attribute__((noinline, noclone))
Worker (int *ptr, int m, int n, const int &inc)
{
#pragma acc loop worker
for (unsigned ix = 0; ix < m; ix++)
Vector(ptr + ix * n, n, inc);
}
int
main (void)
{
const int n = 32, m = 32;
int ary[m][n];
unsigned ix, iy;
for (ix = m; ix--;)
for (iy = n; iy--;)
ary[ix][iy] = (ix << 8) + iy;
#pragma acc parallel copy(ary)
{
Worker (&ary[0][0], m, n, 1 << 16);
}
int err = 0;
for (ix = m; ix--;)
for (iy = n; iy--;)
if (ary[ix][iy] != ((1 << 16) + (ix << 8) + iy))
{
printf ("ary[%u][%u] = %x expected %x\n",
ix, iy, ary[ix][iy], ((1 << 16) + (ix << 8) + iy));
err++;
}
if (err)
{
printf ("%d failed\n", err);
return 1;
}
#pragma acc parallel copy(ary)
{
Vector (&ary[0][0], m * n, (1 << 24) - (1 << 16));
}
for (ix = m; ix--;)
for (iy = n; iy--;)
if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy))
{
printf ("ary[%u][%u] = %x expected %x\n",
ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy));
err++;
}
if (err)
{
printf ("%d failed\n", err);
return 1;
}
return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment