nvptx.c (global_lock_var): New.

gcc/
	* config/nvptx/nvptx.c (global_lock_var): New.
	(nvptx_global_lock_addr): New.
	(nvptx_lockless_update): Recomment and adjust for clarity.
	(nvptx_lockfull_update): New.
	(nvptx_reduction_update): New.
	(nvptx_goacc_reduction_fini): Call it.
libgcc/
	* config/nvptx/reduction.c: New.
	* config/nvptx/t-nvptx (LIB2ADD): Add it.
libgomp/
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add worker & gang cases.
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise.

From-SVN: r230545

nvptx.c (global_lock_var): New.
gcc/
	* config/nvptx/nvptx.c (global_lock_var): New.
	(nvptx_global_lock_addr): New.
	(nvptx_lockless_update): Recomment and adjust for clarity.
	(nvptx_lockfull_update): New.
	(nvptx_reduction_update): New.
	(nvptx_goacc_reduction_fini): Call it.
libgcc/
	* config/nvptx/reduction.c: New.
	* config/nvptx/t-nvptx (LIB2ADD): Add it.
libgomp/
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add worker & gang cases.
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise.

From-SVN: r230545
33f47f42 · Nathan Sidwell · Nathan Sidwell · d085c468 · 33f47f42 · 33f47f42
Commit 33f47f42 authored Nov 18, 2015 by Nathan Sidwell Committed by Nathan Sidwell Nov 18, 2015
8 changed files
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
+2015-11-18  Nathan Sidwell  <nathan@codesourcery.com>
+	* config/nvptx/nvptx.c (global_lock_var): New.
+	(nvptx_global_lock_addr): New.
+	(nvptx_lockless_update): Recomment and adjust for clarity.
+	(nvptx_lockfull_update): New.
+	(nvptx_reduction_update): New.
+	(nvptx_goacc_reduction_fini): Call it.
 2015-11-18  Bernd Schmidt  <bschmidt@redhat.com>
 	* regrename.h (struct du_head): Add target_data_1 and target_data_2
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
--- a/libgcc/ChangeLog
+++ b/libgcc/ChangeLog
+2015-11-18  Nathan Sidwell  <nathan@codesourcery.com>
+	* config/nvptx/reduction.c: New.
+	* config/nvptx/t-nvptx (LIB2ADD): Add it.
 2015-11-15  David Edelsohn  <dje.gcc@gmail.com>
 	* config/rs6000/on_exit.c: New file.

--- a/libgcc/config/nvptx/reduction.c
+++ b/libgcc/config/nvptx/reduction.c
+/* Oversized reductions lock variable
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Mentor Graphics.
+This file is part of GCC.
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+/* We use a global lock variable for reductions on objects larger than
+   64 bits.  Until and unless it is proven that lock contention between
+   different reductions is a problem, a single lock will suffice.  */
+unsigned volatile __reduction_lock = 0;
--- a/libgcc/config/nvptx/t-nvptx
+++ b/libgcc/config/nvptx/t-nvptx
 LIB2ADD=$(srcdir)/config/nvptx/malloc.asm \
 	$(srcdir)/config/nvptx/free.asm \
-	$(srcdir)/config/nvptx/realloc.c
+	$(srcdir)/config/nvptx/realloc.c \
+	$(srcdir)/config/nvptx/reduction.c
 LIB2ADDEH=
 LIB2FUNCS_EXCLUDE=__main

--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
+2015-11-18  Nathan Sidwell  <nathan@codesourcery.com>
+	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add
+	worker & gang cases.
+	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise.
 2015-11-17  Cesar Philippidis  <cesar@codesourcery.com>
 	* config/nvptx/priority_queue.c: New file.

--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c
@@ -14,28 +14,41 @@ int close_enough (double _Complex a, double _Complex b)
  return mag2_diff / mag2_a < (FRAC * FRAC);
 }
-int main (void)
-{
 #define N 100
-  double _Complex ary[N], sum, prod, tsum, tprod;
-  int ix;
-  sum = tsum = 0;
+static int __attribute__ ((noinline))
-  prod = tprod = 1;
+vector (double _Complex ary[N], double _Complex sum, double _Complex prod)
+{
-  for (ix = 0; ix < N;  ix++)
+  double _Complex tsum = 0, tprod = 1;
-    {
-      double frac = ix * (1.0 / 1024) + 1.0;
-      ary[ix] = frac + frac * 2.0i - 1.0i;
-      sum += ary[ix];
-      prod *= ary[ix];
-    }
-#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
+#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod)
  {
 #pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
-    for (ix = 0; ix < N; ix++)
+    for (int ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+  if (!close_enough (sum, tsum))
+    return 1;
+  if (!close_enough (prod, tprod))
+    return 1;
+  return 0;
+}
+static int __attribute__ ((noinline))
+worker (double _Complex ary[N], double _Complex sum, double _Complex prod)
+{
+  double _Complex tsum = 0, tprod = 1;
+#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod)
+  {
+#pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
+    for (int ix = 0; ix < N; ix++)
      {
 	tsum += ary[ix];
 	tprod *= ary[ix];
@@ -50,3 +63,52 @@ int main (void)
  return 0;
 }
+static int __attribute__ ((noinline))
+gang (double _Complex ary[N], double _Complex sum, double _Complex prod)
+{
+  double _Complex tsum = 0, tprod = 1;
+#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod)
+  {
+#pragma acc loop gang reduction(+:tsum) reduction (*:tprod)
+    for (int ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+  if (!close_enough (sum, tsum))
+    return 1;
+  if (!close_enough (prod, tprod))
+    return 1;
+  return 0;
+}
+int main (void)
+{
+  double _Complex ary[N], sum = 0, prod = 1;
+  for (int ix = 0; ix < N;  ix++)
+    {
+      double frac = ix * (1.0 / 1024) + 1.0;
+      ary[ix] = frac + frac * 2.0i - 1.0i;
+      sum += ary[ix];
+      prod *= ary[ix];
+    }
+  if (vector (ary, sum, prod))
+    return 1;
+  if (worker (ary, sum, prod))
+    return 1;
+  if (gang (ary, sum, prod))
+    return 1;
+  return 0;
+}
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c
@@ -14,28 +14,41 @@ int close_enough (float _Complex a, float _Complex b)
  return mag2_diff / mag2_a < (FRAC * FRAC);
 }
-int main (void)
-{
 #define N 100
-  float _Complex ary[N], sum, prod, tsum, tprod;
-  int ix;
-  sum = tsum = 0;
+static int __attribute__ ((noinline))
-  prod = tprod = 1;
+vector (float _Complex ary[N], float _Complex sum, float _Complex prod)
+{
-  for (ix = 0; ix < N;  ix++)
+  float _Complex tsum = 0, tprod = 1;
-    {
-      float frac = ix * (1.0f / 1024) + 1.0f;
-      ary[ix] = frac + frac * 2.0i - 1.0i;
-      sum += ary[ix];
-      prod *= ary[ix];
-    }
-#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
+#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod)
  {
 #pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
-    for (ix = 0; ix < N; ix++)
+    for (int ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+  if (!close_enough (sum, tsum))
+    return 1;
+  if (!close_enough (prod, tprod))
+    return 1;
+  return 0;
+}
+static int __attribute__ ((noinline))
+worker (float _Complex ary[N], float _Complex sum, float _Complex prod)
+{
+  float _Complex tsum = 0, tprod = 1;
+#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod)
+  {
+#pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
+    for (int ix = 0; ix < N; ix++)
      {
 	tsum += ary[ix];
 	tprod *= ary[ix];
@@ -50,3 +63,52 @@ int main (void)
  return 0;
 }
+static int __attribute__ ((noinline))
+gang (float _Complex ary[N], float _Complex sum, float _Complex prod)
+{
+  float _Complex tsum = 0, tprod = 1;
+#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod)
+  {
+#pragma acc loop gang reduction(+:tsum) reduction (*:tprod)
+    for (int ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+  if (!close_enough (sum, tsum))
+    return 1;
+  if (!close_enough (prod, tprod))
+    return 1;
+  return 0;
+}
+int main (void)
+{
+  float _Complex ary[N], sum = 0, prod = 1;
+  for (int ix = 0; ix < N;  ix++)
+    {
+      float frac = ix * (1.0f / 1024) + 1.0f;
+      ary[ix] = frac + frac * 2.0i - 1.0i;
+      sum += ary[ix];
+      prod *= ary[ix];
+    }
+  if (vector (ary, sum, prod))
+    return 1;
+  if (worker (ary, sum, prod))
+    return 1;
+  if (gang (ary, sum, prod))
+    return 1;
+  return 0;
+}