Commit 09e0ad62 by Andrew Stubbs

Update OpenACC tests for amdgcn

2020-01-20  Andrew Stubbs  <ams@codesourcery.com>

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Skip test on gcn.
	* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c (main):
	Adjust test dimensions for amdgcn.
	* testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c (main): Adjust
	gang/worker/vector expectations dynamically.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
	(main): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c (main): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c (main): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c (main): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c (main): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c (main): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-v-1.c (main): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c (main): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c (main): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
	(acc_gang): Recognise acc_device_radeon.
	(acc_worker): Likewise.
	(acc_vector): Likewise.
	(main): Set expectations for amdgcn.
	* testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
	(main): Adjust gang/worker/vector expectations dynamically.
	* testsuite/libgomp.oacc-c-c++-common/routine-v-1.c (main): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-w-1.c (main): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c (main): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Set expectations
	for amdgcn.
parent 3a434597
2020-01-20 Andrew Stubbs <ams@codesourcery.com>
* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Skip test on gcn.
* testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c (main):
Adjust test dimensions for amdgcn.
* testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c (main): Adjust
gang/worker/vector expectations dynamically.
* testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
(main): Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-v-1.c (main): Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-v-2.c (main): Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-w-1.c (main): Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-w-2.c (main): Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c (main): Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-v-1.c (main): Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-w-1.c (main): Likewise.
* testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c (main): Likewise.
* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
(acc_gang): Recognise acc_device_radeon.
(acc_worker): Likewise.
(acc_vector): Likewise.
(main): Set expectations for amdgcn.
* testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
(main): Adjust gang/worker/vector expectations dynamically.
* testsuite/libgomp.oacc-c-c++-common/routine-v-1.c (main): Likewise.
* testsuite/libgomp.oacc-c-c++-common/routine-w-1.c (main): Likewise.
* testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c (main): Likewise.
* testsuite/libgomp.oacc-c-c++-common/routine-wv-2.c: Set expectations
for amdgcn.
2020-01-17 Andrew Stubbs <ams@codesourcery.com> 2020-01-17 Andrew Stubbs <ams@codesourcery.com>
* config/accel/openacc.f90 (openacc_kinds): Rename acc_device_gcn to * config/accel/openacc.f90 (openacc_kinds): Rename acc_device_gcn to
......
/* AMD GCN does not use 32-lane vectors.
{ dg-skip-if "unsuitable dimensions" { openacc_amdgcn_accel_selected } { "*" } { "" } } */
/* { dg-additional-options "-fopenacc-dim=32" } */ /* { dg-additional-options "-fopenacc-dim=32" } */
#include <stdio.h> #include <stdio.h>
......
...@@ -128,5 +128,14 @@ int test_1 (int gp, int wp, int vp) ...@@ -128,5 +128,14 @@ int test_1 (int gp, int wp, int vp)
int main () int main ()
{ {
#ifdef ACC_DEVICE_TYPE_gcn
/* AMD GCN uses the autovectorizer for the vector dimension: the use
of a function call in vector-partitioned code in this test is not
currently supported. */
/* AMD GCN does not currently support multiple workers. This should be
set to 16 when that changes. */
return test_1 (16, 1, 1);
#else
return test_1 (16, 16, 32); return test_1 (16, 16, 32);
#endif
} }
...@@ -9,11 +9,13 @@ int main () ...@@ -9,11 +9,13 @@ int main ()
int ix; int ix;
int exit = 0; int exit = 0;
int ondev = 0; int ondev = 0;
int gangsize, workersize, vectorsize;
for (ix = 0; ix < N;ix++) for (ix = 0; ix < N;ix++)
ary[ix] = -1; ary[ix] = -1;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev) #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
copy(ary) copy(ondev) copyout(gangsize, workersize, vectorsize)
{ {
#pragma acc loop gang worker vector #pragma acc loop gang worker vector
for (unsigned ix = 0; ix < N; ix++) for (unsigned ix = 0; ix < N; ix++)
...@@ -32,6 +34,10 @@ int main () ...@@ -32,6 +34,10 @@ int main ()
else else
ary[ix] = ix; ary[ix] = ix;
} }
gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -39,11 +45,12 @@ int main () ...@@ -39,11 +45,12 @@ int main ()
int expected = ix; int expected = ix;
if(ondev) if(ondev)
{ {
int chunk_size = (N + 32*32*32 - 1) / (32*32*32); int chunk_size = (N + gangsize * workersize * vectorsize - 1)
/ (gangsize * workersize * vectorsize);
int g = ix / (chunk_size * 32 * 32); int g = ix / (chunk_size * workersize * vectorsize);
int w = ix / 32 % 32; int w = (ix / vectorsize) % workersize;
int v = ix % 32; int v = ix % vectorsize;
expected = (g << 16) | (w << 8) | v; expected = (g << 16) | (w << 8) | v;
} }
......
...@@ -8,8 +8,10 @@ int main () ...@@ -8,8 +8,10 @@ int main ()
int ix; int ix;
int ondev = 0; int ondev = 0;
int t = 0, h = 0; int t = 0, h = 0;
int gangsize, workersize, vectorsize;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ondev)
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
copy(ondev) copyout(gangsize, workersize, vectorsize)
{ {
#pragma acc loop gang worker vector reduction(+:t) #pragma acc loop gang worker vector reduction(+:t)
for (unsigned ix = 0; ix < N; ix++) for (unsigned ix = 0; ix < N; ix++)
...@@ -28,18 +30,22 @@ int main () ...@@ -28,18 +30,22 @@ int main ()
} }
t += val; t += val;
} }
gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
{ {
int val = ix; int val = ix;
if(ondev) if (ondev)
{ {
int chunk_size = (N + 32*32*32 - 1) / (32*32*32); int chunk_size = (N + gangsize * workersize * vectorsize - 1)
/ (gangsize * workersize * vectorsize);
int g = ix / (chunk_size * 32 * 32); int g = ix / (chunk_size * vectorsize * workersize);
int w = ix / 32 % 32; int w = ix / vectorsize % workersize;
int v = ix % 32; int v = ix % vectorsize;
val = (g << 16) | (w << 8) | v; val = (g << 16) | (w << 8) | v;
} }
......
...@@ -9,8 +9,9 @@ int main () ...@@ -9,8 +9,9 @@ int main ()
int ix; int ix;
int ondev = 0; int ondev = 0;
int t = 0, h = 0; int t = 0, h = 0;
int vectorsize;
#pragma acc parallel vector_length(32) copy(ondev) #pragma acc parallel vector_length(32) copy(ondev) copyout(vectorsize)
{ {
#pragma acc loop vector reduction (+:t) #pragma acc loop vector reduction (+:t)
for (unsigned ix = 0; ix < N; ix++) for (unsigned ix = 0; ix < N; ix++)
...@@ -29,6 +30,7 @@ int main () ...@@ -29,6 +30,7 @@ int main ()
} }
t += val; t += val;
} }
vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -38,7 +40,7 @@ int main () ...@@ -38,7 +40,7 @@ int main ()
{ {
int g = 0; int g = 0;
int w = 0; int w = 0;
int v = ix % 32; int v = ix % vectorsize;
val = (g << 16) | (w << 8) | v; val = (g << 16) | (w << 8) | v;
} }
......
...@@ -9,8 +9,9 @@ int main () ...@@ -9,8 +9,9 @@ int main ()
int ix; int ix;
int ondev = 0; int ondev = 0;
int q = 0, h = 0; int q = 0, h = 0;
int vectorsize;
#pragma acc parallel vector_length(32) copy(q) copy(ondev) #pragma acc parallel vector_length(32) copy(q) copy(ondev) copyout(vectorsize)
{ {
int t = q; int t = q;
...@@ -32,6 +33,7 @@ int main () ...@@ -32,6 +33,7 @@ int main ()
t += val; t += val;
} }
q = t; q = t;
vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -41,7 +43,7 @@ int main () ...@@ -41,7 +43,7 @@ int main ()
{ {
int g = 0; int g = 0;
int w = 0; int w = 0;
int v = ix % 32; int v = ix % vectorsize;
val = (g << 16) | (w << 8) | v; val = (g << 16) | (w << 8) | v;
} }
......
...@@ -8,8 +8,10 @@ int main () ...@@ -8,8 +8,10 @@ int main ()
int ix; int ix;
int ondev = 0; int ondev = 0;
int t = 0, h = 0; int t = 0, h = 0;
int workersize;
#pragma acc parallel num_workers(32) vector_length(32) copy(ondev) #pragma acc parallel num_workers(32) vector_length(32) copy(ondev) \
copyout(workersize)
{ {
#pragma acc loop worker reduction(+:t) #pragma acc loop worker reduction(+:t)
for (unsigned ix = 0; ix < N; ix++) for (unsigned ix = 0; ix < N; ix++)
...@@ -28,6 +30,7 @@ int main () ...@@ -28,6 +30,7 @@ int main ()
} }
t += val; t += val;
} }
workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -36,7 +39,7 @@ int main () ...@@ -36,7 +39,7 @@ int main ()
if(ondev) if(ondev)
{ {
int g = 0; int g = 0;
int w = ix % 32; int w = ix % workersize;
int v = 0; int v = 0;
val = (g << 16) | (w << 8) | v; val = (g << 16) | (w << 8) | v;
......
...@@ -8,8 +8,10 @@ int main () ...@@ -8,8 +8,10 @@ int main ()
int ix; int ix;
int ondev = 0; int ondev = 0;
int q = 0, h = 0; int q = 0, h = 0;
int workersize;
#pragma acc parallel num_workers(32) vector_length(32) copy(q) copy(ondev) #pragma acc parallel num_workers(32) vector_length(32) copy(q) copy(ondev) \
copyout(workersize)
{ {
int t = q; int t = q;
...@@ -31,6 +33,7 @@ int main () ...@@ -31,6 +33,7 @@ int main ()
t += val; t += val;
} }
q = t; q = t;
workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -39,7 +42,7 @@ int main () ...@@ -39,7 +42,7 @@ int main ()
if(ondev) if(ondev)
{ {
int g = 0; int g = 0;
int w = ix % 32; int w = ix % workersize;
int v = 0; int v = 0;
val = (g << 16) | (w << 8) | v; val = (g << 16) | (w << 8) | v;
......
...@@ -8,8 +8,10 @@ int main () ...@@ -8,8 +8,10 @@ int main ()
int ix; int ix;
int ondev = 0; int ondev = 0;
int t = 0, h = 0; int t = 0, h = 0;
int workersize, vectorsize;
#pragma acc parallel num_workers(32) vector_length(32) copy(ondev) #pragma acc parallel num_workers(32) vector_length(32) copy(ondev) \
copyout(workersize, vectorsize)
{ {
#pragma acc loop worker vector reduction (+:t) #pragma acc loop worker vector reduction (+:t)
for (unsigned ix = 0; ix < N; ix++) for (unsigned ix = 0; ix < N; ix++)
...@@ -28,6 +30,8 @@ int main () ...@@ -28,6 +30,8 @@ int main ()
} }
t += val; t += val;
} }
workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -36,8 +40,8 @@ int main () ...@@ -36,8 +40,8 @@ int main ()
if(ondev) if(ondev)
{ {
int g = 0; int g = 0;
int w = (ix / 32) % 32; int w = (ix / vectorsize) % workersize;
int v = ix % 32; int v = ix % vectorsize;
val = (g << 16) | (w << 8) | v; val = (g << 16) | (w << 8) | v;
} }
......
...@@ -9,11 +9,13 @@ int main () ...@@ -9,11 +9,13 @@ int main ()
int ix; int ix;
int exit = 0; int exit = 0;
int ondev = 0; int ondev = 0;
int vectorsize;
for (ix = 0; ix < N;ix++) for (ix = 0; ix < N;ix++)
ary[ix] = -1; ary[ix] = -1;
#pragma acc parallel vector_length(32) copy(ary) copy(ondev) #pragma acc parallel vector_length(32) copy(ary) copy(ondev) \
copyout(vectorsize)
{ {
#pragma acc loop vector #pragma acc loop vector
for (unsigned ix = 0; ix < N; ix++) for (unsigned ix = 0; ix < N; ix++)
...@@ -31,6 +33,7 @@ int main () ...@@ -31,6 +33,7 @@ int main ()
else else
ary[ix] = ix; ary[ix] = ix;
} }
vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -40,7 +43,7 @@ int main () ...@@ -40,7 +43,7 @@ int main ()
{ {
int g = 0; int g = 0;
int w = 0; int w = 0;
int v = ix % 32; int v = ix % vectorsize;
expected = (g << 16) | (w << 8) | v; expected = (g << 16) | (w << 8) | v;
} }
......
...@@ -9,11 +9,13 @@ int main () ...@@ -9,11 +9,13 @@ int main ()
int ix; int ix;
int exit = 0; int exit = 0;
int ondev = 0; int ondev = 0;
int workersize;
for (ix = 0; ix < N;ix++) for (ix = 0; ix < N;ix++)
ary[ix] = -1; ary[ix] = -1;
#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) #pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
copyout(workersize)
{ {
#pragma acc loop worker #pragma acc loop worker
for (unsigned ix = 0; ix < N; ix++) for (unsigned ix = 0; ix < N; ix++)
...@@ -31,6 +33,7 @@ int main () ...@@ -31,6 +33,7 @@ int main ()
else else
ary[ix] = ix; ary[ix] = ix;
} }
workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -39,7 +42,7 @@ int main () ...@@ -39,7 +42,7 @@ int main ()
if(ondev) if(ondev)
{ {
int g = 0; int g = 0;
int w = ix % 32; int w = ix % workersize;
int v = 0; int v = 0;
expected = (g << 16) | (w << 8) | v; expected = (g << 16) | (w << 8) | v;
......
...@@ -9,11 +9,13 @@ int main () ...@@ -9,11 +9,13 @@ int main ()
int ix; int ix;
int exit = 0; int exit = 0;
int ondev = 0; int ondev = 0;
int workersize, vectorsize;
for (ix = 0; ix < N;ix++) for (ix = 0; ix < N;ix++)
ary[ix] = -1; ary[ix] = -1;
#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) #pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
copyout(workersize, vectorsize)
{ {
#pragma acc loop worker vector #pragma acc loop worker vector
for (unsigned ix = 0; ix < N; ix++) for (unsigned ix = 0; ix < N; ix++)
...@@ -31,6 +33,8 @@ int main () ...@@ -31,6 +33,8 @@ int main ()
else else
ary[ix] = ix; ary[ix] = ix;
} }
workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -39,8 +43,8 @@ int main () ...@@ -39,8 +43,8 @@ int main ()
if(ondev) if(ondev)
{ {
int g = 0; int g = 0;
int w = (ix / 32) % 32; int w = (ix / vectorsize) % workersize;
int v = ix % 32; int v = ix % vectorsize;
expected = (g << 16) | (w << 8) | v; expected = (g << 16) | (w << 8) | v;
} }
......
...@@ -14,7 +14,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_gang () ...@@ -14,7 +14,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
{ {
if (acc_on_device ((int) acc_device_host)) if (acc_on_device ((int) acc_device_host))
return 0; return 0;
else if (acc_on_device ((int) acc_device_nvidia)) else if (acc_on_device ((int) acc_device_nvidia)
|| acc_on_device ((int) acc_device_radeon))
return __builtin_goacc_parlevel_id (GOMP_DIM_GANG); return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
else else
__builtin_abort (); __builtin_abort ();
...@@ -25,7 +26,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_worker () ...@@ -25,7 +26,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
{ {
if (acc_on_device ((int) acc_device_host)) if (acc_on_device ((int) acc_device_host))
return 0; return 0;
else if (acc_on_device ((int) acc_device_nvidia)) else if (acc_on_device ((int) acc_device_nvidia)
|| acc_on_device ((int) acc_device_radeon))
return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER); return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
else else
__builtin_abort (); __builtin_abort ();
...@@ -36,7 +38,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_vector () ...@@ -36,7 +38,8 @@ static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
{ {
if (acc_on_device ((int) acc_device_host)) if (acc_on_device ((int) acc_device_host))
return 0; return 0;
else if (acc_on_device ((int) acc_device_nvidia)) else if (acc_on_device ((int) acc_device_nvidia)
|| acc_on_device ((int) acc_device_radeon))
return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR); return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
else else
__builtin_abort (); __builtin_abort ();
...@@ -282,6 +285,12 @@ int main () ...@@ -282,6 +285,12 @@ int main ()
/* The GCC nvptx back end enforces num_workers (32). */ /* The GCC nvptx back end enforces num_workers (32). */
workers_actual = 32; workers_actual = 32;
} }
else if (acc_on_device (acc_device_radeon))
{
/* The GCC GCN back end is limited to num_workers (16).
Temporarily set this to 1 until multiple workers are permitted. */
workers_actual = 1; // 16;
}
else else
__builtin_abort (); __builtin_abort ();
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
...@@ -328,6 +337,11 @@ int main () ...@@ -328,6 +337,11 @@ int main ()
/* We're actually executing with num_workers (32). */ /* We're actually executing with num_workers (32). */
/* workers_actual = 32; */ /* workers_actual = 32; */
} }
else if (acc_on_device (acc_device_radeon))
{
/* The GCC GCN back end is limited to num_workers (16). */
workers_actual = 16;
}
else else
__builtin_abort (); __builtin_abort ();
#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) #pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
...@@ -367,6 +381,11 @@ int main () ...@@ -367,6 +381,11 @@ int main ()
/* The GCC nvptx back end enforces vector_length (32). */ /* The GCC nvptx back end enforces vector_length (32). */
vectors_actual = 1024; vectors_actual = 1024;
} }
else if (acc_on_device (acc_device_radeon))
{
/* The GCC GCN back end enforces vector_length (1): autovectorize. */
vectors_actual = 1;
}
else else
__builtin_abort (); __builtin_abort ();
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
...@@ -407,6 +426,13 @@ int main () ...@@ -407,6 +426,13 @@ int main ()
/* The GCC nvptx back end enforces vector_length (32). */ /* The GCC nvptx back end enforces vector_length (32). */
vectors_actual = 32; vectors_actual = 32;
} }
else if (acc_on_device (acc_device_radeon))
{
/* Because of the way vectors are implemented for GCN, a vector loop
containing a seq routine call will not vectorize calls to that
routine. Hence, we'll only get one "vector". */
vectors_actual = 1;
}
else else
__builtin_abort (); __builtin_abort ();
#pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) #pragma acc loop vector reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
...@@ -433,6 +459,9 @@ int main () ...@@ -433,6 +459,9 @@ int main ()
in the following case. So, limit ourselves here. */ in the following case. So, limit ourselves here. */
if (acc_get_device_type () == acc_device_nvidia) if (acc_get_device_type () == acc_device_nvidia)
gangs = 3; gangs = 3;
/* Similar appears to be true for GCN. */
if (acc_get_device_type () == acc_device_radeon)
gangs = 3;
int gangs_actual = gangs; int gangs_actual = gangs;
#define WORKERS 3 #define WORKERS 3
int workers_actual = WORKERS; int workers_actual = WORKERS;
...@@ -459,6 +488,13 @@ int main () ...@@ -459,6 +488,13 @@ int main ()
/* The GCC nvptx back end enforces vector_length (32). */ /* The GCC nvptx back end enforces vector_length (32). */
vectors_actual = 32; vectors_actual = 32;
} }
else if (acc_on_device (acc_device_radeon))
{
/* Temporary setting, until multiple workers are permitted. */
workers_actual = 1;
/* See above comments about GCN vectors_actual. */
vectors_actual = 1;
}
else else
__builtin_abort (); __builtin_abort ();
#pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) #pragma acc loop gang reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max)
......
...@@ -30,14 +30,18 @@ int main () ...@@ -30,14 +30,18 @@ int main ()
int ix; int ix;
int exit = 0; int exit = 0;
int ondev = 0; int ondev = 0;
int gangsize, workersize, vectorsize;
for (ix = 0; ix < N;ix++) for (ix = 0; ix < N;ix++)
ary[ix] = -1; ary[ix] = -1;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev) #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev) copyout(gangsize, workersize, vectorsize)
{ {
ondev = acc_on_device (acc_device_not_host); ondev = acc_on_device (acc_device_not_host);
gang (ary); gang (ary);
gangsize = __builtin_goacc_parlevel_size (GOMP_DIM_GANG);
workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -45,11 +49,12 @@ int main () ...@@ -45,11 +49,12 @@ int main ()
int expected = ix; int expected = ix;
if(ondev) if(ondev)
{ {
int chunk_size = (N + 32*32*32 - 1) / (32*32*32); int chunk_size = (N + gangsize * workersize * vectorsize - 1)
/ (gangsize * workersize * vectorsize);
int g = ix / (chunk_size * 32 * 32); int g = ix / (chunk_size * vectorsize * workersize);
int w = ix / 32 % 32; int w = (ix / vectorsize) % workersize;
int v = ix % 32; int v = ix % vectorsize;
expected = (g << 16) | (w << 8) | v; expected = (g << 16) | (w << 8) | v;
} }
......
...@@ -30,14 +30,17 @@ int main () ...@@ -30,14 +30,17 @@ int main ()
int ix; int ix;
int exit = 0; int exit = 0;
int ondev = 0; int ondev = 0;
int vectorsize;
for (ix = 0; ix < N;ix++) for (ix = 0; ix < N;ix++)
ary[ix] = -1; ary[ix] = -1;
#pragma acc parallel vector_length(32) copy(ary) copy(ondev) #pragma acc parallel vector_length(32) copy(ary) copy(ondev) \
copyout(vectorsize)
{ {
ondev = acc_on_device (acc_device_not_host); ondev = acc_on_device (acc_device_not_host);
vector (ary); vector (ary);
vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -47,7 +50,7 @@ int main () ...@@ -47,7 +50,7 @@ int main ()
{ {
int g = 0; int g = 0;
int w = 0; int w = 0;
int v = ix % 32; int v = ix % vectorsize;
expected = (g << 16) | (w << 8) | v; expected = (g << 16) | (w << 8) | v;
} }
......
...@@ -30,14 +30,17 @@ int main () ...@@ -30,14 +30,17 @@ int main ()
int ix; int ix;
int exit = 0; int exit = 0;
int ondev = 0; int ondev = 0;
int workersize;
for (ix = 0; ix < N;ix++) for (ix = 0; ix < N;ix++)
ary[ix] = -1; ary[ix] = -1;
#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) #pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
copyout(workersize)
{ {
ondev = acc_on_device (acc_device_not_host); ondev = acc_on_device (acc_device_not_host);
worker (ary); worker (ary);
workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -46,7 +49,7 @@ int main () ...@@ -46,7 +49,7 @@ int main ()
if(ondev) if(ondev)
{ {
int g = 0; int g = 0;
int w = ix % 32; int w = ix % workersize;
int v = 0; int v = 0;
expected = (g << 16) | (w << 8) | v; expected = (g << 16) | (w << 8) | v;
......
...@@ -30,14 +30,18 @@ int main () ...@@ -30,14 +30,18 @@ int main ()
int ix; int ix;
int exit = 0; int exit = 0;
int ondev = 0; int ondev = 0;
int workersize, vectorsize;
for (ix = 0; ix < N;ix++) for (ix = 0; ix < N;ix++)
ary[ix] = -1; ary[ix] = -1;
#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) #pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) \
copyout(workersize, vectorsize)
{ {
ondev = acc_on_device (acc_device_not_host); ondev = acc_on_device (acc_device_not_host);
worker (ary); worker (ary);
workersize = __builtin_goacc_parlevel_size (GOMP_DIM_WORKER);
vectorsize = __builtin_goacc_parlevel_size (GOMP_DIM_VECTOR);
} }
for (ix = 0; ix < N; ix++) for (ix = 0; ix < N; ix++)
...@@ -46,8 +50,8 @@ int main () ...@@ -46,8 +50,8 @@ int main ()
if(ondev) if(ondev)
{ {
int g = 0; int g = 0;
int w = (ix / 32) % 32; int w = (ix / vectorsize) % workersize;
int v = ix % 32; int v = ix % vectorsize;
expected = (g << 16) | (w << 8) | v; expected = (g << 16) | (w << 8) | v;
} }
......
...@@ -2,8 +2,13 @@ ...@@ -2,8 +2,13 @@
#include <openacc.h> #include <openacc.h>
#include <gomp-constants.h> #include <gomp-constants.h>
#ifdef ACC_DEVICE_TYPE_gcn
#define NUM_WORKERS 16
#define NUM_VECTORS 1
#else
#define NUM_WORKERS 16 #define NUM_WORKERS 16
#define NUM_VECTORS 32 #define NUM_VECTORS 32
#endif
#define WIDTH 64 #define WIDTH 64
#define HEIGHT 32 #define HEIGHT 32
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment