Commit a1c022d1 by Kwok Cheung Yeung

libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof

The version of nvprof in CUDA 9.0 causes a hang when used to profile an
OpenACC program.  This is because it calls acc_get_device_type from
a callback called during device initialization, which then attempts
to acquire acc_device_lock while it is already taken, resulting in
deadlock.  This works around the issue by returning acc_device_none
from acc_get_device_type without attempting to acquire the lock when
initialization has not completed yet.

2020-07-14  Tom de Vries  <tom@codesourcery.com>
	    Cesar Philippidis  <cesar@codesourcery.com>
	    Thomas Schwinge  <thomas@codesourcery.com>
	    Kwok Cheung Yeung  <kcy@codesourcery.com>

	libgomp/
	* oacc-init.c (acc_init_state_lock, acc_init_state, acc_init_thread):
	New variables.
	(acc_init_1): Set acc_init_thread to pthread_self ().  Set
	acc_init_state to initializing at the start, and to initialized at the
	end.
	(self_initializing_p): New function.
	(acc_get_device_type): Return acc_device_none if called by thread that
	is currently executing acc_init_1.
	* libgomp.texi (acc_get_device_type): Update documentation.
	(Implementation Status and Implementation-Defined Behavior): Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c: New.

(cherry picked from commit b52643ab9004ba8ecea06a399885fe1e04183eda)
parent 74d4c8bd
...@@ -1967,6 +1967,12 @@ in @var{devicetype}, to use when executing a parallel or kernels region. ...@@ -1967,6 +1967,12 @@ in @var{devicetype}, to use when executing a parallel or kernels region.
This function returns what device type will be used when executing a This function returns what device type will be used when executing a
parallel or kernels region. parallel or kernels region.
This function returns @code{acc_device_none} if
@code{acc_get_device_type} is called from
@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
callbacks of the OpenACC Profiling Interface (@ref{OpenACC Profiling
Interface}), that is, if the device is currently being initialized.
@item @emph{C/C++}: @item @emph{C/C++}:
@multitable @columnfractions .20 .80 @multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{acc_device_t acc_get_device_type(void);} @item @emph{Prototype}: @tab @code{acc_device_t acc_get_device_type(void);}
...@@ -3382,6 +3388,11 @@ every event that has been registered. ...@@ -3382,6 +3388,11 @@ every event that has been registered.
We're not yet accounting for the fact that @cite{OpenACC events may We're not yet accounting for the fact that @cite{OpenACC events may
occur during event processing}. occur during event processing}.
We just handle one case specially, as required by CUDA 9.0
@command{nvprof}, that @code{acc_get_device_type}
(@ref{acc_get_device_type}) may be called from
@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
callbacks.
We're not yet implementing initialization via a We're not yet implementing initialization via a
@code{acc_register_library} function that is either statically linked @code{acc_register_library} function that is either statically linked
......
...@@ -40,6 +40,11 @@ ...@@ -40,6 +40,11 @@
static gomp_mutex_t acc_device_lock; static gomp_mutex_t acc_device_lock;
/* Lock taken around every access to 'acc_init_state' and 'acc_init_thread'
   in 'acc_init_1' and 'self_initializing_p'.  */
static gomp_mutex_t acc_init_state_lock;
/* Progress of 'acc_init_1': it stores 'initializing' on entry and
   'initialized' right before its final 'return base_dev'.  */
static enum { uninitialized, initializing, initialized } acc_init_state
= uninitialized;
/* Thread that most recently entered 'acc_init_1'; 'self_initializing_p'
   compares it against 'pthread_self ()'.  */
static pthread_t acc_init_thread;
/* A cached version of the dispatcher for the global "current" accelerator type, /* A cached version of the dispatcher for the global "current" accelerator type,
e.g. used as the default when creating new host threads. This is the e.g. used as the default when creating new host threads. This is the
device-type equivalent of goacc_device_num (which specifies which device to device-type equivalent of goacc_device_num (which specifies which device to
...@@ -228,6 +233,11 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs) ...@@ -228,6 +233,11 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
static struct gomp_device_descr * static struct gomp_device_descr *
acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit) acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
{ {
gomp_mutex_lock (&acc_init_state_lock);
acc_init_state = initializing;
acc_init_thread = pthread_self ();
gomp_mutex_unlock (&acc_init_state_lock);
bool check_not_nested_p; bool check_not_nested_p;
if (implicit) if (implicit)
{ {
...@@ -317,6 +327,14 @@ acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit) ...@@ -317,6 +327,14 @@ acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
&api_info); &api_info);
} }
/* We're setting 'initialized' *after* 'goacc_profiling_dispatch', so that a
nested 'acc_get_device_type' called from a profiling callback still sees
'initializing', so that we don't deadlock when it then again tries to lock
'goacc_prof_lock'. See also the discussion in 'acc_get_device_type'. */
gomp_mutex_lock (&acc_init_state_lock);
acc_init_state = initialized;
gomp_mutex_unlock (&acc_init_state_lock);
return base_dev; return base_dev;
} }
...@@ -643,6 +661,17 @@ acc_set_device_type (acc_device_t d) ...@@ -643,6 +661,17 @@ acc_set_device_type (acc_device_t d)
ialias (acc_set_device_type) ialias (acc_set_device_type)
/* Return true if the calling thread is the one recorded in
   'acc_init_thread' while 'acc_init_state' is still 'initializing', that is,
   if the caller is running inside 'acc_init_1'.  Both variables are read
   with 'acc_init_state_lock' held.  */

static bool
self_initializing_p (void)
{
  bool in_own_init = false;

  gomp_mutex_lock (&acc_init_state_lock);
  if (acc_init_state == initializing)
    in_own_init = pthread_equal (acc_init_thread, pthread_self ()) != 0;
  gomp_mutex_unlock (&acc_init_state_lock);

  return in_own_init;
}
acc_device_t acc_device_t
acc_get_device_type (void) acc_get_device_type (void)
{ {
...@@ -652,6 +681,15 @@ acc_get_device_type (void) ...@@ -652,6 +681,15 @@ acc_get_device_type (void)
if (thr && thr->base_dev) if (thr && thr->base_dev)
res = acc_device_type (thr->base_dev->type); res = acc_device_type (thr->base_dev->type);
else if (self_initializing_p ())
/* The Cuda libaccinj64.so version 9.0+ calls acc_get_device_type during the
acc_ev_device_init_start event callback, which is dispatched during
acc_init_1. Trying to lock acc_device_lock during such a call (as we do
in the else clause below), will result in deadlock, since the lock has
already been taken by the acc_init_1 caller. We work around this problem
by using the acc_get_device_type property "If the device type has not yet
been selected, the value acc_device_none may be returned". */
;
else else
{ {
acc_prof_info prof_info; acc_prof_info prof_info;
......
/* { dg-do run } */
/* { dg-timeout 10 } */
/* Test the calling of 'acc_get_device_type' from within
'cb_device_init_start' and 'cb_device_init_end' callbacks. This occurs
when the CUDA 9.0 'nvprof' tool is used, and previously deadlocked. */
#include <assert.h>
#include <stdbool.h>
#include <acc_prof.h>
/* Profiling-interface entry points, filled in by 'acc_register_library'.  */
static acc_prof_reg reg;
static acc_prof_reg unreg;
static acc_prof_lookup_func lookup;
/* Remember the profiling-interface entry points handed in by the caller so
   that the rest of the test can register callbacks through them.  */

void
acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_,
                      acc_prof_lookup_func lookup_)
{
  lookup = lookup_;
  unreg = unreg_;
  reg = reg_;
}
/* Flags stating which callback is allowed to fire next; each callback
   asserts its own flag and then clears it.  */
static bool expect_cb_device_init_start;
static bool expect_cb_device_init_end;
/* Callback registered for 'acc_ev_device_init_start'.  Checks that it was
   expected, that 'acc_get_device_type' called from inside the callback
   returns 'acc_device_none', and then arms the matching "end" callback.  */

static void
cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info,
                      acc_api_info *api_info)
{
  assert (expect_cb_device_init_start);
  expect_cb_device_init_start = false;

  /* This nested call is the one that used to deadlock.  */
  acc_device_t dev_type = acc_get_device_type ();
  assert (dev_type == acc_device_none);

  expect_cb_device_init_end = true;
}
/* Callback registered for 'acc_ev_device_init_end'.  Checks that it was
   expected (the "start" callback ran first) and that 'acc_get_device_type'
   called from inside the callback still returns 'acc_device_none'.  */

static void
cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info,
                    acc_api_info *api_info)
{
  assert (expect_cb_device_init_end);
  expect_cb_device_init_end = false;

  acc_device_t dev_type = acc_get_device_type ();
  assert (dev_type == acc_device_none);
}
int main(void)
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
expect_cb_device_init_start = true;
expect_cb_device_init_end = false;
acc_init (acc_device_host);
assert (!expect_cb_device_init_start);
assert (!expect_cb_device_init_end);
{
acc_device_t acc_device_type;
acc_device_type = acc_get_device_type ();
assert (acc_device_type == acc_device_host);
}
acc_shutdown (acc_device_host);
expect_cb_device_init_start = true;
expect_cb_device_init_end = false;
acc_init (acc_device_default);
assert (!expect_cb_device_init_start);
assert (!expect_cb_device_init_end);
{
acc_device_t acc_device_type;
acc_device_type = acc_get_device_type ();
assert (acc_device_type != acc_device_none);
}
acc_shutdown (acc_device_default);
return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment