[nvptx] Fix calls to vector and worker routines

2018-04-20  Nathan Sidwell  <nathan@codesourcery.com>
	    Tom de Vries  <tom@codesourcery.com>

	PR target/85445
	* config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining):
	Emit insns for calls too.
	(nvptx_find_par): Always look for worker-level predecessor insn.
	(nvptx_propagate): Add is_call parm, return bool.  Copy frame for
	calls.
	(nvptx_vpropagate, nvptx_wpropagate): Adjust.
	(nvptx_process_pars): Propagate frames for calls.
	* testsuite/libgomp.oacc-c++/ref-1.C: New.

Co-Authored-By: Tom de Vries <tom@codesourcery.com>

From-SVN: r259523

[nvptx] Fix calls to vector and worker routines
2018-04-20  Nathan Sidwell  <nathan@codesourcery.com>
	    Tom de Vries  <tom@codesourcery.com>

	PR target/85445
	* config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining):
	Emit insns for calls too.
	(nvptx_find_par): Always look for worker-level predecessor insn.
	(nvptx_propagate): Add is_call parm, return bool.  Copy frame for
	calls.
	(nvptx_vpropagate, nvptx_wpropagate): Adjust.
	(nvptx_process_pars): Propagate frames for calls.
	* testsuite/libgomp.oacc-c++/ref-1.C: New.

Co-Authored-By: Tom de Vries <tom@codesourcery.com>

From-SVN: r259523
e91eba31 · Nathan Sidwell · Tom de Vries · e95dda95 · e91eba31 · e91eba31
Commit e91eba31 authored Apr 20, 2018 by Nathan Sidwell Committed by Tom de Vries Apr 20, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 156 additions and 46 deletions

gcc/ChangeLog
+12 -0

gcc/config/nvptx/nvptx.c
+60 -46

libgomp/ChangeLog
+6 -0

libgomp/testsuite/libgomp.oacc-c++/ref-1.C
+78 -0

No files found.
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
+2018-04-20  Nathan Sidwell  <nathan@codesourcery.com>
+	    Tom de Vries  <tom@codesourcery.com>
+
+	PR target/85445
+	* config/nvptx/nvptx.c (nvptx_emit_forking, nvptx_emit_joining):
+	Emit insns for calls too.
+	(nvptx_find_par): Always look for worker-level predecessor insn.
+	(nvptx_propagate): Add is_call parm, return bool.  Copy frame for
+	calls.
+	(nvptx_vpropagate, nvptx_wpropagate): Adjust.
+	(nvptx_process_pars): Propagate frames for calls.
+
 2018-04-20  H.J. Lu  <hongjiu.lu@intel.com>

 	PR target/85469

--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -399,8 +399,7 @@ nvptx_emit_forking (unsigned mask, bool is_call)
 	 it creates a block with a single successor before entering a
 	 partitioned region.  That is a good candidate for the end of
 	 an SESE region.  */
-      if (!is_call)
-	emit_insn (gen_nvptx_fork (op));
+      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
 }
@@ -419,8 +418,7 @@ nvptx_emit_joining (unsigned mask, bool is_call)
      /* Emit joining for all pars, calls included, to ensure there's a single
 	 predecessor for the block the join insn ends up in.  This is
 	 needed for skipping entire loops.  */
-      if (!is_call)
-	emit_insn (gen_nvptx_joining (op));
+      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
 }
@@ -3086,8 +3084,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
 	    par = new parallel (par, mask);
 	    par->forked_block = block;
 	    par->forked_insn = end;
-	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
 	      par->fork_insn
 		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
 	  }
@@ -3102,8 +3099,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
 	    gcc_assert (par->mask == mask);
 	    par->join_block = block;
 	    par->join_insn = end;
-	    if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-		&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
 	      par->joining_insn
 		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
 	    par = par->parent;
@@ -3782,29 +3778,34 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
 #undef BB_SET_SESE
 #undef BB_GET_SESE

-/* Propagate live state at the start of a partitioned region.  BLOCK
-   provides the live register information, and might not contain
-   INSN. Propagation is inserted just after INSN. RW indicates whether
-   we are reading and/or writing state.  This
+/* Propagate live state at the start of a partitioned region.  IS_CALL
+   indicates whether the propagation is for a (partitioned) call
+   instruction.  BLOCK provides the live register information, and
+   might not contain INSN. Propagation is inserted just after INSN. RW
+   indicates whether we are reading and/or writing state.  This
   separation is needed for worker-level propagation where we
   essentially do a spill & fill.  FN is the underlying worker
   function to generate the propagation instructions for single
   register.  DATA is user data.

-   We propagate the live register set and the entire frame.  We could
-   do better by (a) propagating just the live set that is used within
-   the partitioned regions and (b) only propagating stack entries that
-   are used.  The latter might be quite hard to determine.  */
+   Returns true if we didn't emit any instructions.
+
+   We propagate the live register set for non-calls and the entire
+   frame for calls and non-calls.  We could do better by (a)
+   propagating just the live set that is used within the partitioned
+   regions and (b) only propagating stack entries that are used.  The
+   latter might be quite hard to determine.  */

 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);

-static void
-nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
-		 propagator_fn fn, void *data)
+static bool
+nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
+		 propagate_mask rw, propagator_fn fn, void *data)
 {
  bitmap live = DF_LIVE_IN (block);
  bitmap_iterator iterator;
  unsigned ix;
+  bool empty = true;

  /* Copy the frame array.  */
  HOST_WIDE_INT fs = get_frame_size ();
@@ -3816,6 +3817,7 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
      rtx pred = NULL_RTX;
      rtx_code_label *label = NULL;

+      empty = false;
      /* The frame size might not be DImode compatible, but the frame
 	 array's declaration will be.  So it's ok to round up here.  */
      fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
@@ -3862,18 +3864,21 @@ nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
      insn = emit_insn_after (cpy, insn);
    }

-  /* Copy live registers.  */
-  EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
-    {
-      rtx reg = regno_reg_rtx[ix];
+  if (!is_call)
+    /* Copy live registers.  */
+    EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
+      {
+	rtx reg = regno_reg_rtx[ix];

-      if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
-	{
-	  rtx bcast = fn (reg, rw, 0, data);
+	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
+	  {
+	    rtx bcast = fn (reg, rw, 0, data);

-	  insn = emit_insn_after (bcast, insn);
-	}
-    }
+	    insn = emit_insn_after (bcast, insn);
+	    empty = false;
+	  }
+      }
+  return empty;
 }

 /* Worker for nvptx_vpropagate.  */
@@ -3889,12 +3894,13 @@ vprop_gen (rtx reg, propagate_mask pm,
 }

 /* Propagate state that is live at start of BLOCK across the vectors
-   of a single warp.  Propagation is inserted just after INSN.   */
+   of a single warp.  Propagation is inserted just after INSN.
+   IS_CALL and return as for nvptx_propagate.  */

-static void
-nvptx_vpropagate (basic_block block, rtx_insn *insn)
+static bool
+nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
 {
-  nvptx_propagate (block, insn, PM_read_write, vprop_gen, 0);
+  return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
 }

 /* Worker for nvptx_wpropagate.  */
@@ -3930,10 +3936,10 @@ wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
 /* Spill or fill live state that is live at start of BLOCK.  PRE_P
   indicates if this is just before partitioned mode (do spill), or
   just after it starts (do fill). Sequence is inserted just after
-   INSN.  */
+   INSN.  IS_CALL and return as for nvptx_propagate.  */

-static void
-nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
+static bool
+nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
 {
  wcast_data_t data;

@@ -3941,7 +3947,9 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
  data.offset = 0;
  data.ptr = NULL_RTX;

-  nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
+  bool empty = nvptx_propagate (is_call, block, insn,
+				pre_p ? PM_read : PM_write, wprop_gen, &data);
+  gcc_assert (empty == !data.offset);
  if (data.offset)
    {
      /* Stuff was emitted, initialize the base pointer now.  */
@@ -3951,6 +3959,7 @@ nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
      if (worker_bcast_size < data.offset)
 	worker_bcast_size = data.offset;
    }
+  return empty;
 }

 /* Emit a worker-level synchronization barrier.  We use different
@@ -4311,18 +4320,23 @@ nvptx_process_pars (parallel *par)
      inner_mask |= par->inner_mask;
    }

-  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
-    /* No propagation needed for a call.  */;
-  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+  bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
+
+  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
    {
-      nvptx_wpropagate (false, par->forked_block, par->forked_insn);
-      nvptx_wpropagate (true, par->forked_block, par->fork_insn);
-      /* Insert begin and end synchronizations.  */
-      emit_insn_before (nvptx_wsync (false), par->forked_insn);
-      emit_insn_before (nvptx_wsync (true), par->join_insn);
+      nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
+      bool empty = nvptx_wpropagate (true, is_call,
+				     par->forked_block, par->fork_insn);
+
+      if (!empty || !is_call)
+	{
+	  /* Insert begin and end synchronizations.  */
+	  emit_insn_before (nvptx_wsync (false), par->forked_insn);
+	  emit_insn_before (nvptx_wsync (true), par->join_insn);
+	}
    }
  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
-    nvptx_vpropagate (par->forked_block, par->forked_insn);
+    nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);

  /* Now do siblings.  */
  if (par->next)

--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
+2018-04-20  Nathan Sidwell  <nathan@codesourcery.com>
+	    Tom de Vries  <tom@codesourcery.com>
+
+	PR target/85445
+	* testsuite/libgomp.oacc-c++/ref-1.C: New.
+
 2018-04-19  Thomas Schwinge  <thomas@codesourcery.com>

 	PR libgomp/85463

--- a/libgomp/testsuite/libgomp.oacc-c++/ref-1.C
+++ b/libgomp/testsuite/libgomp.oacc-c++/ref-1.C
+/* { dg-do run } */
+
+#include <stdio.h>
+
+#pragma acc routine vector
+void __attribute__((noinline, noclone))
+Vector (int *ptr, int n, const int &inc)
+{
+#pragma acc loop vector
+  for (unsigned ix = 0; ix < n; ix++)
+    ptr[ix] += inc;
+}
+
+#pragma acc routine worker
+void __attribute__((noinline, noclone))
+Worker (int *ptr, int m, int n, const int &inc)
+{
+#pragma acc loop worker
+  for (unsigned ix = 0; ix < m; ix++)
+    Vector(ptr + ix * n, n, inc);
+}
+
+int
+main (void)
+{
+  const int n = 32, m = 32;
+
+  int ary[m][n];
+  unsigned ix,  iy;
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      ary[ix][iy] = (ix << 8) + iy;
+
+#pragma acc parallel copy(ary)
+  {
+    Worker (&ary[0][0], m, n, 1 << 16);
+  }
+
+  int err = 0;
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      if (ary[ix][iy] != ((1 << 16) + (ix << 8) + iy))
+	{
+	  printf ("ary[%u][%u] = %x expected %x\n",
+		  ix, iy, ary[ix][iy], ((1 << 16) + (ix << 8) + iy));
+	  err++;
+	}
+
+  if (err)
+    {
+      printf ("%d failed\n", err);
+      return 1;
+    }
+
+#pragma acc parallel copy(ary)
+  {
+    Vector (&ary[0][0], m * n, (1 << 24) - (1 << 16));
+  }
+
+  for (ix = m; ix--;)
+    for (iy = n; iy--;)
+      if (ary[ix][iy] != ((1 << 24) + (ix << 8) + iy))
+	{
+	  printf ("ary[%u][%u] = %x expected %x\n",
+		  ix, iy, ary[ix][iy], ((1 << 24) + (ix << 8) + iy));
+	  err++;
+	}
+
+  if (err)
+    {
+      printf ("%d failed\n", err);
+      return 1;
+    }
+
+  return 0;
+}