Commit 33f47f42 by Nathan Sidwell Committed by Nathan Sidwell

nvptx.c (global_lock_var): New.

	gcc/
	* config/nvptx/nvptx.c (global_lock_var): New.
	(nvptx_global_lock_addr): New.
	(nvptx_lockless_update): Recomment and adjust for clarity.
	(nvptx_lockfull_update): New.
	(nvptx_reduction_update): New.
	(nvptx_goacc_reduction_fini): Call it.

	libgcc/
	* config/nvptx/reduction.c: New.
	* config/nvptx/t-nvptx (LIB2ADD): Add it.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add
	worker & gang cases.
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise.

From-SVN: r230545
parent d085c468
2015-11-18 Nathan Sidwell <nathan@codesourcery.com>
* config/nvptx/nvptx.c (global_lock_var): New.
(nvptx_global_lock_addr): New.
(nvptx_lockless_update): Recomment and adjust for clarity.
(nvptx_lockfull_update): New.
(nvptx_reduction_update): New.
(nvptx_goacc_reduction_fini): Call it.
2015-11-18 Bernd Schmidt <bschmidt@redhat.com>
* regrename.h (struct du_head): Add target_data_1 and target_data_2
2015-11-18 Nathan Sidwell <nathan@codesourcery.com>
* config/nvptx/reduction.c: New.
* config/nvptx/t-nvptx (LIB2ADD): Add it.
2015-11-15 David Edelsohn <dje.gcc@gmail.com>
* config/rs6000/on_exit.c: New file.
......
/* Oversized reductions lock variable
Copyright (C) 2015 Free Software Foundation, Inc.
Contributed by Mentor Graphics.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* Global lock word used when performing reductions on objects wider
   than 64 bits.  One shared lock serves every such reduction; it
   should only be split up if contention between different reductions
   is shown to be a real problem.  Zero is the initial value.  */
volatile unsigned int __reduction_lock = 0;
LIB2ADD=$(srcdir)/config/nvptx/malloc.asm \
$(srcdir)/config/nvptx/free.asm \
$(srcdir)/config/nvptx/realloc.c
$(srcdir)/config/nvptx/realloc.c \
$(srcdir)/config/nvptx/reduction.c
LIB2ADDEH=
LIB2FUNCS_EXCLUDE=__main
......
2015-11-18 Nathan Sidwell <nathan@codesourcery.com>
* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Add
worker & gang cases.
* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Likewise.
2015-11-17 Cesar Philippidis <cesar@codesourcery.com>
* config/nvptx/priority_queue.c: New file.
......
......@@ -14,28 +14,41 @@ int close_enough (double _Complex a, double _Complex b)
return mag2_diff / mag2_a < (FRAC * FRAC);
}
int main (void)
{
#define N 100
double _Complex ary[N], sum, prod, tsum, tprod;
int ix;
sum = tsum = 0;
prod = tprod = 1;
for (ix = 0; ix < N; ix++)
{
double frac = ix * (1.0 / 1024) + 1.0;
ary[ix] = frac + frac * 2.0i - 1.0i;
sum += ary[ix];
prod *= ary[ix];
}
static int __attribute__ ((noinline))
vector (double _Complex ary[N], double _Complex sum, double _Complex prod)
{
double _Complex tsum = 0, tprod = 1;
#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod)
{
#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
for (ix = 0; ix < N; ix++)
for (int ix = 0; ix < N; ix++)
{
tsum += ary[ix];
tprod *= ary[ix];
}
}
if (!close_enough (sum, tsum))
return 1;
if (!close_enough (prod, tprod))
return 1;
return 0;
}
static int __attribute__ ((noinline))
worker (double _Complex ary[N], double _Complex sum, double _Complex prod)
{
double _Complex tsum = 0, tprod = 1;
#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod)
{
#pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
{
tsum += ary[ix];
tprod *= ary[ix];
......@@ -50,3 +63,52 @@ int main (void)
return 0;
}
/* Recompute the sum and product of ARY inside an OpenACC parallel
   region using a gang-partitioned loop with reductions, then compare
   against the expected SUM and PROD.  Returns 0 when both results are
   close enough, 1 otherwise.  */
static int __attribute__ ((noinline))
gang (double _Complex ary[N], double _Complex sum, double _Complex prod)
{
  double _Complex acc_sum = 0;
  double _Complex acc_prod = 1;

#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (acc_sum, acc_prod)
  {
#pragma acc loop gang reduction(+:acc_sum) reduction (*:acc_prod)
    for (int i = 0; i < N; i++)
      {
	acc_sum += ary[i];
	acc_prod *= ary[i];
      }
  }

  /* The sum is checked first; the product is only checked when the
     sum already matched.  */
  return !close_enough (sum, acc_sum) || !close_enough (prod, acc_prod);
}
/* Fill an N-element complex array, accumulate the reference sum and
   product sequentially, then run the vector, worker and gang variants
   in that order.  Exits with 1 as soon as one of them reports a
   mismatch, 0 if all pass.  */
int main (void)
{
  double _Complex ary[N];
  double _Complex ref_sum = 0;
  double _Complex ref_prod = 1;

  for (int i = 0; i < N; i++)
    {
      double frac = i * (1.0 / 1024) + 1.0;

      ary[i] = frac + frac * 2.0i - 1.0i;
      ref_sum += ary[i];
      ref_prod *= ary[i];
    }

  /* Short-circuit evaluation stops at the first failing variant.  */
  return (vector (ary, ref_sum, ref_prod)
	  || worker (ary, ref_sum, ref_prod)
	  || gang (ary, ref_sum, ref_prod));
}
......@@ -14,28 +14,41 @@ int close_enough (float _Complex a, float _Complex b)
return mag2_diff / mag2_a < (FRAC * FRAC);
}
int main (void)
{
#define N 100
float _Complex ary[N], sum, prod, tsum, tprod;
int ix;
sum = tsum = 0;
prod = tprod = 1;
for (ix = 0; ix < N; ix++)
{
float frac = ix * (1.0f / 1024) + 1.0f;
ary[ix] = frac + frac * 2.0i - 1.0i;
sum += ary[ix];
prod *= ary[ix];
}
static int __attribute__ ((noinline))
vector (float _Complex ary[N], float _Complex sum, float _Complex prod)
{
float _Complex tsum = 0, tprod = 1;
#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod)
{
#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
for (ix = 0; ix < N; ix++)
for (int ix = 0; ix < N; ix++)
{
tsum += ary[ix];
tprod *= ary[ix];
}
}
if (!close_enough (sum, tsum))
return 1;
if (!close_enough (prod, tprod))
return 1;
return 0;
}
static int __attribute__ ((noinline))
worker (float _Complex ary[N], float _Complex sum, float _Complex prod)
{
float _Complex tsum = 0, tprod = 1;
#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod)
{
#pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
for (int ix = 0; ix < N; ix++)
{
tsum += ary[ix];
tprod *= ary[ix];
......@@ -50,3 +63,52 @@ int main (void)
return 0;
}
/* Recompute the sum and product of ARY inside an OpenACC parallel
   region using a gang-partitioned loop with reductions, then compare
   against the expected SUM and PROD.  Returns 0 when both results are
   close enough, 1 otherwise.  */
static int __attribute__ ((noinline))
gang (float _Complex ary[N], float _Complex sum, float _Complex prod)
{
  float _Complex acc_sum = 0;
  float _Complex acc_prod = 1;

#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (acc_sum, acc_prod)
  {
#pragma acc loop gang reduction(+:acc_sum) reduction (*:acc_prod)
    for (int i = 0; i < N; i++)
      {
	acc_sum += ary[i];
	acc_prod *= ary[i];
      }
  }

  /* The sum is checked first; the product is only checked when the
     sum already matched.  */
  return !close_enough (sum, acc_sum) || !close_enough (prod, acc_prod);
}
/* Fill an N-element complex array, accumulate the reference sum and
   product sequentially, then run the vector, worker and gang variants
   in that order.  Exits with 1 as soon as one of them reports a
   mismatch, 0 if all pass.  */
int main (void)
{
  float _Complex ary[N];
  float _Complex ref_sum = 0;
  float _Complex ref_prod = 1;

  for (int i = 0; i < N; i++)
    {
      float frac = i * (1.0f / 1024) + 1.0f;

      ary[i] = frac + frac * 2.0i - 1.0i;
      ref_sum += ary[i];
      ref_prod *= ary[i];
    }

  /* Short-circuit evaluation stops at the first failing variant.  */
  return (vector (ary, ref_sum, ref_prod)
	  || worker (ary, ref_sum, ref_prod)
	  || gang (ary, ref_sum, ref_prod));
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment