Commit 18c247cc by Thomas Schwinge Committed by Thomas Schwinge

[PR88370] acc_get_cuda_stream/acc_set_cuda_stream: acc_async_sync, acc_async_noval

Per my reading of the OpenACC specification (and as supported by secondary
documentation, such as code examples, or presentations), it's valid to call
"acc_get_cuda_stream"/"acc_set_cuda_stream" also with "acc_async_sync",
"acc_async_noval" arguments, not just with the nonnegative values as currently
implemented.

	libgomp/
	PR libgomp/88370
	* libgomp.texi (acc_get_current_cuda_context, acc_get_cuda_stream)
	(acc_set_cuda_stream): Clarify.
	* oacc-cuda.c (acc_get_cuda_stream, acc_set_cuda_stream): Use
	"async_valid_p".
	* plugin/plugin-nvptx.c (nvptx_set_cuda_stream): Refuse "async ==
	acc_async_sync".
	* testsuite/libgomp.oacc-c-c++-common/acc_set_cuda_stream-1.c: New file.
	* testsuite/libgomp.oacc-c-c++-common/async_queue-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-84.c: Update.
	* testsuite/libgomp.oacc-c-c++-common/lib-85.c: Likewise.

From-SVN: r267147
parent 5d390fd3
2018-12-14 Thomas Schwinge <thomas@codesourcery.com>
PR libgomp/88370
* libgomp.texi (acc_get_current_cuda_context, acc_get_cuda_stream)
(acc_set_cuda_stream): Clarify.
* oacc-cuda.c (acc_get_cuda_stream, acc_set_cuda_stream): Use
"async_valid_p".
* plugin/plugin-nvptx.c (nvptx_set_cuda_stream): Refuse "async ==
acc_async_sync".
* testsuite/libgomp.oacc-c-c++-common/acc_set_cuda_stream-1.c: New file.
* testsuite/libgomp.oacc-c-c++-common/async_queue-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/lib-84.c: Update.
* testsuite/libgomp.oacc-c-c++-common/lib-85.c: Likewise.
2018-12-14 Tom de Vries <tdevries@suse.de> 2018-12-14 Tom de Vries <tdevries@suse.de>
* testsuite/libgomp.c-c++-common/function-not-offloaded-aux.c: New test. * testsuite/libgomp.c-c++-common/function-not-offloaded-aux.c: New test.
......
...@@ -2768,7 +2768,7 @@ as used by the CUDA Runtime or Driver API's. ...@@ -2768,7 +2768,7 @@ as used by the CUDA Runtime or Driver API's.
@item @emph{C/C++}: @item @emph{C/C++}:
@multitable @columnfractions .20 .80 @multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{acc_get_current_cuda_context(void);} @item @emph{Prototype}: @tab @code{void *acc_get_current_cuda_context(void);}
@end multitable @end multitable
@item @emph{Reference}: @item @emph{Reference}:
...@@ -2782,12 +2782,12 @@ A.2.1.2. ...@@ -2782,12 +2782,12 @@ A.2.1.2.
@section @code{acc_get_cuda_stream} -- Get CUDA stream handle. @section @code{acc_get_cuda_stream} -- Get CUDA stream handle.
@table @asis @table @asis
@item @emph{Description} @item @emph{Description}
This function returns the CUDA stream handle. This handle is the same This function returns the CUDA stream handle for the queue @var{async}.
as used by the CUDA Runtime or Driver API's. This handle is the same as used by the CUDA Runtime or Driver API's.
@item @emph{C/C++}: @item @emph{C/C++}:
@multitable @columnfractions .20 .80 @multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{acc_get_cuda_stream(void);} @item @emph{Prototype}: @tab @code{void *acc_get_cuda_stream(int async);}
@end multitable @end multitable
@item @emph{Reference}: @item @emph{Reference}:
...@@ -2802,11 +2802,16 @@ A.2.1.3. ...@@ -2802,11 +2802,16 @@ A.2.1.3.
@table @asis @table @asis
@item @emph{Description} @item @emph{Description}
This function associates the stream handle specified by @var{stream} with This function associates the stream handle specified by @var{stream} with
the asynchronous value specified by @var{async}. the queue @var{async}.
This cannot be used to change the stream handle associated with
@code{acc_async_sync}.
The return value is not specified.
@item @emph{C/C++}: @item @emph{C/C++}:
@multitable @columnfractions .20 .80 @multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{acc_set_cuda_stream(int async void *stream);} @item @emph{Prototype}: @tab @code{int acc_set_cuda_stream(int async, void *stream);}
@end multitable @end multitable
@item @emph{Reference}: @item @emph{Reference}:
......
...@@ -58,7 +58,7 @@ acc_get_cuda_stream (int async) ...@@ -58,7 +58,7 @@ acc_get_cuda_stream (int async)
{ {
struct goacc_thread *thr = goacc_thread (); struct goacc_thread *thr = goacc_thread ();
if (!async_valid_stream_id_p (async)) if (!async_valid_p (async))
return NULL; return NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func) if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
...@@ -72,7 +72,7 @@ acc_set_cuda_stream (int async, void *stream) ...@@ -72,7 +72,7 @@ acc_set_cuda_stream (int async, void *stream)
{ {
struct goacc_thread *thr; struct goacc_thread *thr;
if (!async_valid_stream_id_p (async) || stream == NULL) if (!async_valid_p (async) || stream == NULL)
return 0; return 0;
goacc_lazy_initialize (); goacc_lazy_initialize ();
......
...@@ -1753,8 +1753,14 @@ nvptx_set_cuda_stream (int async, void *stream) ...@@ -1753,8 +1753,14 @@ nvptx_set_cuda_stream (int async, void *stream)
pthread_t self = pthread_self (); pthread_t self = pthread_self ();
struct nvptx_thread *nvthd = nvptx_thread (); struct nvptx_thread *nvthd = nvptx_thread ();
if (async < 0) /* Due to the "null_stream" usage for "acc_async_sync", this cannot be used
GOMP_PLUGIN_fatal ("bad async %d", async); to change the stream handle associated with "acc_async_sync". */
if (async == acc_async_sync)
{
GOMP_PLUGIN_debug (0, "Refusing request to set CUDA stream associated"
" with \"acc_async_sync\"\n");
return 0;
}
pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
......
/* Verify expected nvptx plugin behavior for "acc_set_cuda_stream" for
"acc_async_sync". */
/* { dg-do run { target openacc_nvidia_accel_selected } } */
/* { dg-set-target-env-var GOMP_DEBUG "1" } */
#undef NDEBUG
#include <assert.h>
#include <openacc.h>
int main(void)
{
int async = 42;
/* Initialize. */
#pragma acc parallel async(acc_async_sync)
;
#pragma acc parallel async(async)
;
#pragma acc wait
void *cuda_stream_sync = acc_get_cuda_stream (acc_async_sync);
assert (cuda_stream_sync == NULL);
void *cuda_stream_async = acc_get_cuda_stream (async);
assert (cuda_stream_async != NULL);
int ret = acc_set_cuda_stream (acc_async_sync, cuda_stream_async);
assert (ret == 0);
void *cuda_stream_sync_ = acc_get_cuda_stream (acc_async_sync);
assert (cuda_stream_sync_ == cuda_stream_sync);
void *cuda_stream_async_ = acc_get_cuda_stream (async);
assert (cuda_stream_async_ == cuda_stream_async);
#pragma acc parallel async(acc_async_sync)
;
#pragma acc parallel async(async)
;
#pragma acc wait
return 0;
}
/* { dg-output "Refusing request to set CUDA stream associated with \"acc_async_sync\"" } */
/* Test mapping of async values to specific underlying queues. */
#undef NDEBUG
#include <assert.h>
#include <openacc.h>
/* This is implemented in terms of the "acc_get_cuda_stream" interface. */
struct
{
int async;
void *cuda_stream;
} queues[] = { { acc_async_sync, NULL },
{ acc_async_noval, NULL },
{ 0, NULL },
{ 1, NULL },
{ 2, NULL },
{ 36, NULL },
{ 1982, NULL } };
const size_t queues_n = sizeof queues / sizeof queues[0];
int main(void)
{
/* Explicitly initialize: it's not clear whether the following OpenACC
runtime library calls implicitly initialize;
<https://github.com/OpenACC/openacc-spec/issues/102>. */
acc_device_t d;
#if defined ACC_DEVICE_TYPE_nvidia
d = acc_device_nvidia;
#elif defined ACC_DEVICE_TYPE_host
d = acc_device_host;
#else
# error Not ported to this ACC_DEVICE_TYPE
#endif
acc_init (d);
for (size_t i = 0; i < queues_n; ++i)
{
/* Before actually being used, there are all NULL. */
queues[i].cuda_stream = acc_get_cuda_stream (queues[i].async);
assert (queues[i].cuda_stream == NULL);
}
for (size_t i = 0; i < queues_n; ++i)
{
/* Use the queue to initialize it. */
#pragma acc parallel async(queues[i].async)
;
#pragma acc wait
/* Verify CUDA stream used. */
queues[i].cuda_stream = acc_get_cuda_stream (queues[i].async);
#if defined ACC_DEVICE_TYPE_nvidia
/* "acc_async_sync" maps to the NULL CUDA default stream. */
if (queues[i].async == acc_async_sync)
assert (queues[i].cuda_stream == NULL);
else
assert (queues[i].cuda_stream != NULL);
#elif defined ACC_DEVICE_TYPE_host
/* For "acc_device_host" there are no CUDA streams. */
assert (queues[i].cuda_stream == NULL);
#else
# error Not ported to this ACC_DEVICE_TYPE
#endif
}
/* Verify same results. */
for (size_t i = 0; i < queues_n; ++i)
{
void *cuda_stream;
cuda_stream = acc_get_cuda_stream (queues[i].async);
assert (cuda_stream == queues[i].cuda_stream);
#pragma acc parallel async(queues[i].async)
;
#pragma acc wait
cuda_stream = acc_get_cuda_stream (queues[i].async);
assert (cuda_stream == queues[i].cuda_stream);
}
/* Verify individual underlying queues are all different. */
for (size_t i = 0; i < queues_n; ++i)
{
if (queues[i].cuda_stream == NULL)
continue;
for (size_t j = i + 1; j < queues_n; ++j)
{
if (queues[j].cuda_stream == NULL)
continue;
assert (queues[j].cuda_stream != queues[i].cuda_stream);
}
}
return 0;
}
...@@ -7,6 +7,14 @@ ...@@ -7,6 +7,14 @@
#include <openacc.h> #include <openacc.h>
#include <cuda.h> #include <cuda.h>
#if !defined __cplusplus
# undef static_assert
# define static_assert _Static_assert
#endif
static_assert (acc_async_sync == -2, "acc_async_sync?");
static_assert (acc_async_noval == -1, "acc_async_noval?");
int int
main (int argc, char **argv) main (int argc, char **argv)
{ {
...@@ -20,9 +28,11 @@ main (int argc, char **argv) ...@@ -20,9 +28,11 @@ main (int argc, char **argv)
(void) acc_get_device_num (acc_device_nvidia); (void) acc_get_device_num (acc_device_nvidia);
streams = (CUstream *) malloc (N * sizeof (void *)); streams = (CUstream *) malloc ((2 + N) * sizeof (void *));
streams += 2;
/* "streams[i]" is valid for i in [acc_async_sync..N). */
for (i = 0; i < N; i++) for (i = acc_async_sync; i < N; i++)
{ {
streams[i] = (CUstream) acc_get_cuda_stream (i); streams[i] = (CUstream) acc_get_cuda_stream (i);
if (streams[i] != NULL) if (streams[i] != NULL)
...@@ -35,11 +45,20 @@ main (int argc, char **argv) ...@@ -35,11 +45,20 @@ main (int argc, char **argv)
abort (); abort ();
} }
if (!acc_set_cuda_stream (i, streams[i])) int ret = acc_set_cuda_stream (i, streams[i]);
abort (); if (i == acc_async_sync)
{
if (ret == 1)
abort ();
}
else
{
if (ret != 1)
abort ();
}
} }
for (i = 0; i < N; i++) for (i = acc_async_sync; i < N; i++)
{ {
int j; int j;
int cnt; int cnt;
...@@ -48,7 +67,7 @@ main (int argc, char **argv) ...@@ -48,7 +67,7 @@ main (int argc, char **argv)
s = streams[i]; s = streams[i];
for (j = 0; j < N; j++) for (j = acc_async_sync; j < N; j++)
{ {
if (s == streams[j]) if (s == streams[j])
cnt++; cnt++;
......
...@@ -7,6 +7,14 @@ ...@@ -7,6 +7,14 @@
#include <stdio.h> #include <stdio.h>
#include <cuda.h> #include <cuda.h>
#if !defined __cplusplus
# undef static_assert
# define static_assert _Static_assert
#endif
static_assert (acc_async_sync == -2, "acc_async_sync?");
static_assert (acc_async_noval == -1, "acc_async_noval?");
int int
main (int argc, char **argv) main (int argc, char **argv)
{ {
...@@ -20,9 +28,11 @@ main (int argc, char **argv) ...@@ -20,9 +28,11 @@ main (int argc, char **argv)
(void) acc_get_device_num (acc_device_nvidia); (void) acc_get_device_num (acc_device_nvidia);
streams = (CUstream *) malloc (N * sizeof (void *)); streams = (CUstream *) malloc ((2 + N) * sizeof (void *));
streams += 2;
/* "streams[i]" is valid for i in [acc_async_sync..N). */
for (i = 0; i < N; i++) for (i = acc_async_sync; i < N; i++)
{ {
streams[i] = (CUstream) acc_get_cuda_stream (i); streams[i] = (CUstream) acc_get_cuda_stream (i);
if (streams[i] != NULL) if (streams[i] != NULL)
...@@ -35,8 +45,17 @@ main (int argc, char **argv) ...@@ -35,8 +45,17 @@ main (int argc, char **argv)
abort (); abort ();
} }
if (!acc_set_cuda_stream (i, streams[i])) int ret = acc_set_cuda_stream (i, streams[i]);
abort (); if (i == acc_async_sync)
{
if (ret == 1)
abort ();
}
else
{
if (ret != 1)
abort ();
}
} }
s = NULL; s = NULL;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment