Commit 5fae049d by Thomas Schwinge Committed by Thomas Schwinge

OpenACC Profiling Interface (incomplete)

	libgomp/
	* acc_prof.h: New file.
	* oacc-profiling.c: Likewise.
	* Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES):
	Add these, respectively.
	* Makefile.in: Regenerate.
	* env.c (initialize_env): Call goacc_profiling_initialize.
	* oacc-plugin.c (GOMP_PLUGIN_goacc_thread)
	(GOMP_PLUGIN_goacc_profiling_dispatch): New functions.
	* oacc-plugin.h (GOMP_PLUGIN_goacc_thread)
	(GOMP_PLUGIN_goacc_profiling_dispatch): Declare.
	* libgomp.map (OACC_2.5.1): Add acc_prof_lookup,
	acc_prof_register, acc_prof_unregister, and acc_register_library.
	(GOMP_PLUGIN_1.3): Add GOMP_PLUGIN_goacc_profiling_dispatch, and
	GOMP_PLUGIN_goacc_thread.
	* oacc-int.h (struct goacc_thread): Add prof_info, api_info,
	prof_callbacks_enabled members.
	(goacc_prof_enabled, goacc_profiling_initialize)
	(_goacc_profiling_dispatch_p, _goacc_profiling_setup_p)
	(goacc_profiling_dispatch): Declare.
	(GOACC_PROF_ENABLED, GOACC_PROFILING_DISPATCH_P)
	(GOACC_PROFILING_SETUP_P): Define.
	* oacc-async.c (acc_async_test, acc_async_test_all, acc_wait)
	(acc_wait_async, acc_wait_all, acc_wait_all_async): Update for
	OpenACC Profiling Interface.
	* oacc-cuda.c (acc_get_current_cuda_device)
	(acc_get_current_cuda_context, acc_get_cuda_stream)
	(acc_set_cuda_stream): Likewise.
	* oacc-init.c (acc_init_1, goacc_attach_host_thread_to_device)
	(acc_init, acc_set_device_type, acc_get_device_type)
	(acc_get_device_num, goacc_lazy_initialize): Likewise.
	* oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device)
	(acc_deviceptr, acc_hostptr, acc_is_present, acc_map_data)
	(acc_unmap_data, present_create_copy, delete_copyout)
	(update_dev_host): Likewise.
	* oacc-parallel.c (GOACC_parallel_keyed, GOACC_data_start)
	(GOACC_data_end, GOACC_enter_exit_data, GOACC_update, GOACC_wait):
	Likewise.
	* plugin/plugin-nvptx.c (nvptx_exec, nvptx_alloc, nvptx_free)
	(GOMP_OFFLOAD_openacc_exec, GOMP_OFFLOAD_openacc_async_exec):
	Likewise.
	* libgomp.texi: Update.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New
	file.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
	Likewise.

From-SVN: r271346
parent b48f44bf
2019-05-17 Thomas Schwinge <thomas@codesourcery.com>
* acc_prof.h: New file.
* oacc-profiling.c: Likewise.
* Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES):
Add these, respectively.
* Makefile.in: Regenerate.
* env.c (initialize_env): Call goacc_profiling_initialize.
* oacc-plugin.c (GOMP_PLUGIN_goacc_thread)
(GOMP_PLUGIN_goacc_profiling_dispatch): New functions.
* oacc-plugin.h (GOMP_PLUGIN_goacc_thread)
(GOMP_PLUGIN_goacc_profiling_dispatch): Declare.
* libgomp.map (OACC_2.5.1): Add acc_prof_lookup,
acc_prof_register, acc_prof_unregister, and acc_register_library.
(GOMP_PLUGIN_1.3): Add GOMP_PLUGIN_goacc_profiling_dispatch, and
GOMP_PLUGIN_goacc_thread.
* oacc-int.h (struct goacc_thread): Add prof_info, api_info,
prof_callbacks_enabled members.
(goacc_prof_enabled, goacc_profiling_initialize)
(_goacc_profiling_dispatch_p, _goacc_profiling_setup_p)
(goacc_profiling_dispatch): Declare.
(GOACC_PROF_ENABLED, GOACC_PROFILING_DISPATCH_P)
(GOACC_PROFILING_SETUP_P): Define.
* oacc-async.c (acc_async_test, acc_async_test_all, acc_wait)
(acc_wait_async, acc_wait_all, acc_wait_all_async): Update for
OpenACC Profiling Interface.
* oacc-cuda.c (acc_get_current_cuda_device)
(acc_get_current_cuda_context, acc_get_cuda_stream)
(acc_set_cuda_stream): Likewise.
* oacc-init.c (acc_init_1, goacc_attach_host_thread_to_device)
(acc_init, acc_set_device_type, acc_get_device_type)
(acc_get_device_num, goacc_lazy_initialize): Likewise.
* oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device)
(acc_deviceptr, acc_hostptr, acc_is_present, acc_map_data)
(acc_unmap_data, present_create_copy, delete_copyout)
(update_dev_host): Likewise.
* oacc-parallel.c (GOACC_parallel_keyed, GOACC_data_start)
(GOACC_data_end, GOACC_enter_exit_data, GOACC_update, GOACC_wait):
Likewise.
* plugin/plugin-nvptx.c (nvptx_exec, nvptx_alloc, nvptx_free)
(GOMP_OFFLOAD_openacc_exec, GOMP_OFFLOAD_openacc_async_exec):
Likewise.
* libgomp.texi: Update.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New
file.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c:
Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
Likewise.
2019-05-13 Chung-Lin Tang <cltang@codesourcery.com>
* libgomp-plugin.h (struct goacc_asyncqueue): Declare.
......
......@@ -65,7 +65,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
affinity-fmt.c teams.c
affinity-fmt.c teams.c oacc-profiling.c
include $(top_srcdir)/plugin/Makefrag.am
......@@ -74,7 +74,7 @@ libgomp_la_SOURCES += openacc.f90
endif
nodist_noinst_HEADERS = libgomp_f.h
nodist_libsubinclude_HEADERS = omp.h openacc.h
nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
if USE_FORTRAN
nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
......
......@@ -217,7 +217,7 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \
oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \
teams.lo $(am__objects_1)
teams.lo oacc-profiling.lo $(am__objects_1)
libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
......@@ -551,7 +551,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
affinity.c target.c splay-tree.c libgomp-plugin.c \
oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
affinity-fmt.c teams.c $(am__append_3)
affinity-fmt.c teams.c oacc-profiling.c $(am__append_3)
# Nvidia PTX OpenACC plugin.
@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
......@@ -575,7 +575,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
@PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
@PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
nodist_noinst_HEADERS = libgomp_f.h
nodist_libsubinclude_HEADERS = omp.h openacc.h
nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
@USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
@USE_FORTRAN_TRUE@ openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
......@@ -753,6 +753,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-profiling.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@
......
/* OpenACC Profiling Interface
Copyright (C) 2019 Free Software Foundation, Inc.
Contributed by Mentor, a Siemens Business.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
#ifndef _ACC_PROF_H
#define _ACC_PROF_H 1
/* The OpenACC specification doesn't say so explicitly, but as its Profiling
Interface explicitly makes use of, for example, <openacc.h>'s
'acc_device_t', we supposedly are to '#include' that file here. */
#include <openacc.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Events. */
typedef enum acc_event_t
{
acc_ev_none = 0,
acc_ev_device_init_start,
acc_ev_device_init_end,
acc_ev_device_shutdown_start,
acc_ev_device_shutdown_end,
acc_ev_runtime_shutdown,
acc_ev_create,
acc_ev_delete,
acc_ev_alloc,
acc_ev_free,
acc_ev_enter_data_start,
acc_ev_enter_data_end,
acc_ev_exit_data_start,
acc_ev_exit_data_end,
acc_ev_update_start,
acc_ev_update_end,
acc_ev_compute_construct_start,
acc_ev_compute_construct_end,
acc_ev_enqueue_launch_start,
acc_ev_enqueue_launch_end,
acc_ev_enqueue_upload_start,
acc_ev_enqueue_upload_end,
acc_ev_enqueue_download_start,
acc_ev_enqueue_download_end,
acc_ev_wait_start,
acc_ev_wait_end,
acc_ev_last
} acc_event_t;
/* Callbacks Signature. */
/* "The datatype 'ssize_t' means a signed 32-bit integer for a 32-bit binary
and a 64-bit integer for a 64-bit binary". */
typedef signed long int _acc_prof_ssize_t;
/* "The datatype 'size_t' means an unsigned 32-bit integer for a 32-bit binary
and a 64-bit integer for a 64-bit binary". */
typedef unsigned long int _acc_prof_size_t;
/* "The datatype 'int' means a 32-bit integer for both 32-bit and 64-bit
binaries". */
typedef int _acc_prof_int_t;
/* Internal helpers: a struct's 'valid_bytes' may be less than its 'sizeof'. */
#define _ACC_PROF_VALID_BYTES_STRUCT(_struct, _lastfield, _valid_bytes_lastfield) \
offsetof (_struct, _lastfield) + (_valid_bytes_lastfield)
#if 0 /* Untested. */
#define _ACC_PROF_VALID_BYTES_TYPE_N(_type, _n, _valid_bytes_type) \
((_n - 1) * sizeof (_type) + (_valid_bytes_type))
#endif
#define _ACC_PROF_VALID_BYTES_BASICTYPE(_basictype) \
(sizeof (_basictype))
typedef struct acc_prof_info
{
acc_event_t event_type;
_acc_prof_int_t valid_bytes;
_acc_prof_int_t version;
acc_device_t device_type;
_acc_prof_int_t device_number;
_acc_prof_int_t thread_id;
_acc_prof_ssize_t async;
_acc_prof_ssize_t async_queue;
const char *src_file;
const char *func_name;
_acc_prof_int_t line_no, end_line_no;
_acc_prof_int_t func_line_no, func_end_line_no;
#define _ACC_PROF_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_prof_info, func_end_line_no, \
_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_int_t))
} acc_prof_info;
/* We implement the OpenACC 2.6 Profiling Interface. */
#define _ACC_PROF_INFO_VERSION 201711
typedef enum acc_construct_t
{
acc_construct_parallel = 0,
acc_construct_kernels,
acc_construct_loop,
acc_construct_data,
acc_construct_enter_data,
acc_construct_exit_data,
acc_construct_host_data,
acc_construct_atomic,
acc_construct_declare,
acc_construct_init,
acc_construct_shutdown,
acc_construct_set,
acc_construct_update,
acc_construct_routine,
acc_construct_wait,
acc_construct_runtime_api,
acc_construct_serial
} acc_construct_t;
typedef struct acc_data_event_info
{
acc_event_t event_type;
_acc_prof_int_t valid_bytes;
acc_construct_t parent_construct;
_acc_prof_int_t implicit;
void *tool_info;
const char *var_name;
_acc_prof_size_t bytes;
const void *host_ptr;
const void *device_ptr;
#define _ACC_DATA_EVENT_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_data_event_info, device_ptr, \
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
} acc_data_event_info;
typedef struct acc_launch_event_info
{
acc_event_t event_type;
_acc_prof_int_t valid_bytes;
acc_construct_t parent_construct;
_acc_prof_int_t implicit;
void *tool_info;
const char *kernel_name;
_acc_prof_size_t num_gangs, num_workers, vector_length;
#define _ACC_LAUNCH_EVENT_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_launch_event_info, vector_length, \
_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_size_t))
} acc_launch_event_info;
typedef struct acc_other_event_info
{
acc_event_t event_type;
_acc_prof_int_t valid_bytes;
acc_construct_t parent_construct;
_acc_prof_int_t implicit;
void *tool_info;
#define _ACC_OTHER_EVENT_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_other_event_info, tool_info, \
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
} acc_other_event_info;
typedef union acc_event_info
{
acc_event_t event_type;
acc_data_event_info data_event;
acc_launch_event_info launch_event;
acc_other_event_info other_event;
} acc_event_info;
typedef enum acc_device_api
{
acc_device_api_none = 0,
acc_device_api_cuda,
acc_device_api_opencl,
acc_device_api_coi,
acc_device_api_other
} acc_device_api;
typedef struct acc_api_info
{
acc_device_api device_api;
_acc_prof_int_t valid_bytes;
acc_device_t device_type;
_acc_prof_int_t vendor;
const void *device_handle;
const void *context_handle;
const void *async_handle;
#define _ACC_API_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_api_info, async_handle, \
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
} acc_api_info;
/* Don't tag 'acc_prof_callback' as '__GOACC_NOTHROW': these functions are
provided by user code, and must be expected to do anything. */
typedef void (*acc_prof_callback) (acc_prof_info *, acc_event_info *,
acc_api_info *);
/* Loading the Library. */
typedef enum acc_register_t
{
acc_reg = 0,
acc_toggle = 1,
acc_toggle_per_thread = 2
} acc_register_t;
typedef void (*acc_prof_reg) (acc_event_t, acc_prof_callback, acc_register_t);
extern void acc_prof_register (acc_event_t, acc_prof_callback,
acc_register_t) __GOACC_NOTHROW;
extern void acc_prof_unregister (acc_event_t, acc_prof_callback,
acc_register_t) __GOACC_NOTHROW;
typedef void (*acc_query_fn) ();
typedef acc_query_fn (*acc_prof_lookup_func) (const char *);
extern acc_query_fn acc_prof_lookup (const char *) __GOACC_NOTHROW;
/* Don't tag 'acc_register_library' as '__GOACC_NOTHROW': this function can be
overridden by user code, and must be expected to do anything. */
extern void acc_register_library (acc_prof_reg, acc_prof_reg,
acc_prof_lookup_func);
#ifdef __cplusplus
}
#endif
#endif /* _ACC_PROF_H */
......@@ -1425,5 +1425,7 @@ initialize_env (void)
parse_gomp_openacc_dim ();
goacc_runtime_initialize ();
goacc_profiling_initialize ();
}
#endif /* LIBGOMP_OFFLOADED_ONLY */
......@@ -476,6 +476,14 @@ OACC_2.5 {
acc_update_self_async_array_h_;
} OACC_2.0.1;
OACC_2.5.1 {
global:
acc_prof_lookup;
acc_prof_register;
acc_prof_unregister;
acc_register_library;
} OACC_2.5;
GOACC_2.0 {
global:
GOACC_data_end;
......@@ -515,3 +523,9 @@ GOMP_PLUGIN_1.2 {
global:
GOMP_PLUGIN_acc_default_dim;
} GOMP_PLUGIN_1.1;
GOMP_PLUGIN_1.3 {
global:
GOMP_PLUGIN_goacc_profiling_dispatch;
GOMP_PLUGIN_goacc_thread;
} GOMP_PLUGIN_1.2;
......@@ -43,17 +43,6 @@ get_goacc_thread (void)
return thr;
}
static struct gomp_device_descr *
get_goacc_thread_device (void)
{
struct goacc_thread *thr = goacc_thread ();
if (!thr || !thr->dev)
gomp_fatal ("no device active");
return thr->dev;
}
static int
validate_async_val (int async)
{
......@@ -76,7 +65,10 @@ validate_async_val (int async)
/* Return the asyncqueue to be used for OpenACC async-argument ASYNC. This
might return NULL if no asyncqueue is to be used. Otherwise, if CREATE,
create the asyncqueue if it doesn't exist yet. */
create the asyncqueue if it doesn't exist yet.
Unless CREATE, this will not generate any OpenACC Profiling Interface
events. */
attribute_hidden struct goacc_asyncqueue *
lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
......@@ -152,8 +144,25 @@ acc_async_test (int async)
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
if (!aq)
return 1;
else
return thr->dev->openacc.async.test_func (aq);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
int res = thr->dev->openacc.async.test_func (aq);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
return res;
}
int
......@@ -161,6 +170,10 @@ acc_async_test_all (void)
{
struct goacc_thread *thr = get_goacc_thread ();
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
int ret = 1;
gomp_mutex_lock (&thr->dev->openacc.async.lock);
for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
......@@ -170,6 +183,13 @@ acc_async_test_all (void)
break;
}
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
return ret;
}
......@@ -179,8 +199,26 @@ acc_wait (int async)
struct goacc_thread *thr = get_goacc_thread ();
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
if (aq && !thr->dev->openacc.async.synchronize_func (aq))
if (!aq)
return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
if (!thr->dev->openacc.async.synchronize_func (aq))
gomp_fatal ("wait on %d failed", async);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
/* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. */
......@@ -205,10 +243,19 @@ acc_wait_async (int async1, int async2)
if (!aq1)
return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async2;
prof_info.async_queue = prof_info.async;
}
goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2);
/* An async queue is always synchronized with itself. */
if (aq1 == aq2)
return;
goto out_prof;
if (aq2)
{
......@@ -222,18 +269,35 @@ acc_wait_async (int async1, int async2)
if (!thr->dev->openacc.async.synchronize_func (aq1))
gomp_fatal ("wait on %d failed", async1);
}
out_prof:
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
void
acc_wait_all (void)
{
struct gomp_device_descr *dev = get_goacc_thread_device ();
struct goacc_thread *thr = goacc_thread ();
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
bool ret = true;
gomp_mutex_lock (&dev->openacc.async.lock);
for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next)
ret &= dev->openacc.async.synchronize_func (l->aq);
gomp_mutex_unlock (&dev->openacc.async.lock);
gomp_mutex_lock (&thr->dev->openacc.async.lock);
for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
ret &= thr->dev->openacc.async.synchronize_func (l->aq);
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
if (!ret)
gomp_fatal ("wait all failed");
......@@ -255,6 +319,15 @@ acc_wait_all_async (int async)
{
struct goacc_thread *thr = get_goacc_thread ();
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
bool ret = true;
......@@ -270,6 +343,12 @@ acc_wait_all_async (int async)
}
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
if (!ret)
gomp_fatal ("wait all async(%d) failed", async);
}
......
......@@ -37,10 +37,23 @@ acc_get_current_cuda_device (void)
{
struct goacc_thread *thr = goacc_thread ();
void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func)
return thr->dev->openacc.cuda.get_current_device_func ();
{
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
ret = thr->dev->openacc.cuda.get_current_device_func ();
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
return NULL;
return ret;
}
void *
......@@ -48,10 +61,23 @@ acc_get_current_cuda_context (void)
{
struct goacc_thread *thr = goacc_thread ();
void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func)
return thr->dev->openacc.cuda.get_current_context_func ();
return NULL;
{
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
ret = thr->dev->openacc.cuda.get_current_context_func ();
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
return ret;
}
void *
......@@ -62,14 +88,32 @@ acc_get_cuda_stream (int async)
if (!async_valid_p (async))
return NULL;
void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
{
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
if (aq)
return thr->dev->openacc.cuda.get_stream_func (aq);
if (!aq)
return ret;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
ret = thr->dev->openacc.cuda.get_stream_func (aq);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
return NULL;
return ret;
}
int
......@@ -87,6 +131,15 @@ acc_set_cuda_stream (int async, void *stream)
int ret = -1;
if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
{
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
goacc_aq aq = get_goacc_asyncqueue (async);
/* Due to not using an asyncqueue for "acc_async_sync", this cannot be
used to change the CUDA stream associated with "acc_async_sync". */
......@@ -95,11 +148,19 @@ acc_set_cuda_stream (int async, void *stream)
assert (async == acc_async_sync);
gomp_debug (0, "Refusing request to set CUDA stream associated"
" with \"acc_async_sync\"\n");
return 0;
ret = 0;
goto out_prof;
}
gomp_mutex_lock (&thr->dev->openacc.async.lock);
ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
gomp_mutex_unlock (&thr->dev->openacc.async.lock);
out_prof:
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
return ret;
......
......@@ -210,8 +210,67 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
held before calling this function. */
static struct gomp_device_descr *
acc_init_1 (acc_device_t d)
acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
{
bool check_not_nested_p;
if (implicit)
{
/* In the implicit case, there should (TODO: must?) already be something
have been set up for an outer construct. */
check_not_nested_p = false;
}
else
{
check_not_nested_p = true;
/* TODO: should we set 'thr->prof_info' etc. in this case ('acc_init')?
The problem is, that we don't have 'thr' yet? (So,
'check_not_nested_p = true' also is pointless actually.) */
}
bool profiling_p = GOACC_PROFILING_DISPATCH_P (check_not_nested_p);
acc_prof_info prof_info;
if (profiling_p)
{
prof_info.event_type = acc_ev_device_init_start;
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
prof_info.version = _ACC_PROF_INFO_VERSION;
prof_info.device_type = d;
prof_info.device_number = goacc_device_num;
prof_info.thread_id = -1;
prof_info.async = acc_async_sync;
prof_info.async_queue = prof_info.async;
prof_info.src_file = NULL;
prof_info.func_name = NULL;
prof_info.line_no = -1;
prof_info.end_line_no = -1;
prof_info.func_line_no = -1;
prof_info.func_end_line_no = -1;
}
acc_event_info device_init_event_info;
if (profiling_p)
{
device_init_event_info.other_event.event_type = prof_info.event_type;
device_init_event_info.other_event.valid_bytes
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
device_init_event_info.other_event.parent_construct = parent_construct;
device_init_event_info.other_event.implicit = implicit;
device_init_event_info.other_event.tool_info = NULL;
}
acc_api_info api_info;
if (profiling_p)
{
api_info.device_api = acc_device_api_none;
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
api_info.device_type = prof_info.device_type;
api_info.vendor = -1;
api_info.device_handle = NULL;
api_info.context_handle = NULL;
api_info.async_handle = NULL;
}
if (profiling_p)
goacc_profiling_dispatch (&prof_info, &device_init_event_info, &api_info);
struct gomp_device_descr *base_dev, *acc_dev;
int ndevs;
......@@ -234,6 +293,14 @@ acc_init_1 (acc_device_t d)
gomp_init_device (acc_dev);
gomp_mutex_unlock (&acc_dev->lock);
if (profiling_p)
{
prof_info.event_type = acc_ev_device_init_end;
device_init_event_info.other_event.event_type = prof_info.event_type;
goacc_profiling_dispatch (&prof_info, &device_init_event_info,
&api_info);
}
return base_dev;
}
......@@ -423,7 +490,11 @@ goacc_attach_host_thread_to_device (int ord)
thr->dev = acc_dev = &base_dev[ord];
thr->saved_bound_dev = NULL;
thr->mapped_data = NULL;
thr->prof_info = NULL;
thr->api_info = NULL;
/* Initially, all callbacks for all events are enabled. */
thr->prof_callbacks_enabled = true;
thr->target_tls
= acc_dev->openacc.create_thread_data_func (ord);
}
......@@ -437,9 +508,7 @@ acc_init (acc_device_t d)
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
cached_base_dev = acc_init_1 (d);
cached_base_dev = acc_init_1 (d, acc_construct_runtime_api, 0);
gomp_mutex_unlock (&acc_device_lock);
goacc_attach_host_thread_to_device (-1);
......@@ -498,6 +567,12 @@ acc_set_device_type (acc_device_t d)
struct gomp_device_descr *base_dev, *acc_dev;
struct goacc_thread *thr = goacc_thread ();
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
prof_info.device_type = d;
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
......@@ -522,6 +597,12 @@ acc_set_device_type (acc_device_t d)
}
goacc_attach_host_thread_to_device (-1);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
ialias (acc_set_device_type)
......@@ -537,12 +618,22 @@ acc_get_device_type (void)
res = acc_device_type (thr->base_dev->type);
else
{
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
dev = resolve_device (acc_device_default, true);
gomp_mutex_unlock (&acc_device_lock);
res = acc_device_type (dev->type);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
assert (res != acc_device_default
......@@ -562,12 +653,24 @@ acc_get_device_num (acc_device_t d)
if (d >= _ACC_device_hwm)
gomp_fatal ("unknown device type %u", (unsigned) d);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
prof_info.device_type = d;
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
dev = resolve_device (d, true);
gomp_mutex_unlock (&acc_device_lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
if (thr && thr->base_dev == dev && thr->dev)
return thr->dev->target_id;
......@@ -689,8 +792,13 @@ goacc_lazy_initialize (void)
if (thr && thr->dev)
return;
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
if (!cached_base_dev)
acc_init (acc_device_default);
else
goacc_attach_host_thread_to_device (-1);
cached_base_dev = acc_init_1 (acc_device_default,
acc_construct_parallel, 1);
gomp_mutex_unlock (&acc_device_lock);
goacc_attach_host_thread_to_device (-1);
}
......@@ -40,6 +40,7 @@
#include "openacc.h"
#include "config.h"
#include "acc_prof.h"
#include <stddef.h>
#include <stdbool.h>
#include <stdarg.h>
......@@ -68,6 +69,12 @@ struct goacc_thread
strictly push/pop semantics according to lexical scope. */
struct target_mem_desc *mapped_data;
/* Data of the OpenACC Profiling Interface. */
acc_prof_info *prof_info;
acc_api_info *api_info;
/* Per-thread toggle of OpenACC Profiling Interface callbacks. */
bool prof_callbacks_enabled;
/* These structures form a list: this is the next thread in that list. */
struct goacc_thread *next;
......@@ -128,6 +135,28 @@ async_synchronous_p (int async)
return async == acc_async_sync;
}
extern bool goacc_prof_enabled;
/* Tune for the (very common) case that profiling is not enabled. */
#define GOACC_PROF_ENABLED \
(__builtin_expect (__atomic_load_n (&goacc_prof_enabled, \
MEMMODEL_ACQUIRE) == true, false))
void goacc_profiling_initialize (void);
bool _goacc_profiling_dispatch_p (bool);
/* Tune for the (very common) case that profiling is not enabled. */
#define GOACC_PROFILING_DISPATCH_P(...) \
(GOACC_PROF_ENABLED \
&& _goacc_profiling_dispatch_p (__VA_ARGS__))
bool _goacc_profiling_setup_p (struct goacc_thread *,
acc_prof_info *, acc_api_info *);
/* Tune for the (very common) case that profiling is not enabled. */
#define GOACC_PROFILING_SETUP_P(...) \
(GOACC_PROFILING_DISPATCH_P (false) \
&& _goacc_profiling_setup_p (__VA_ARGS__))
void goacc_profiling_dispatch (acc_prof_info *, acc_event_info *,
acc_api_info *);
#ifdef HAVE_ATTRIBUTE_VISIBILITY
# pragma GCC visibility pop
#endif
......
......@@ -108,7 +108,19 @@ acc_malloc (size_t s)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return malloc (s);
return thr->dev->alloc_func (thr->dev->target_id, s);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
void *res = thr->dev->alloc_func (thr->dev->target_id, s);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
return res;
}
/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
......@@ -131,6 +143,10 @@ acc_free (void *d)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return free (d);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
gomp_mutex_lock (&acc_dev->lock);
/* We don't have to call lazy open here, as the ptr value must have
......@@ -151,6 +167,12 @@ acc_free (void *d)
if (!acc_dev->free_func (acc_dev->target_id, d))
gomp_fatal ("error in freeing device memory in %s", __FUNCTION__);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
static void
......@@ -172,11 +194,26 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
return;
}
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
goacc_aq aq = get_goacc_asyncqueue (async);
if (from)
gomp_copy_dev2host (thr->dev, aq, h, d, s);
else
gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
void
......@@ -221,6 +258,9 @@ acc_deviceptr (void *h)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h;
/* In the following, no OpenACC Profiling Interface events can possibly be
generated. */
gomp_mutex_lock (&dev->lock);
n = lookup_host (dev, h, 1);
......@@ -258,6 +298,9 @@ acc_hostptr (void *d)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return d;
/* In the following, no OpenACC Profiling Interface events can possibly be
generated. */
gomp_mutex_lock (&acc_dev->lock);
n = lookup_dev (acc_dev->openacc.data_environ, d, 1);
......@@ -295,6 +338,9 @@ acc_is_present (void *h, size_t s)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h != NULL;
/* In the following, no OpenACC Profiling Interface events can possibly be
generated. */
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
......@@ -339,6 +385,10 @@ acc_map_data (void *h, void *d, size_t s)
gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map",
(void *)h, (int)s, (void *)d, (int)s);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
gomp_mutex_lock (&acc_dev->lock);
if (lookup_host (acc_dev, h, s))
......@@ -360,6 +410,12 @@ acc_map_data (void *h, void *d, size_t s)
tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
&kinds, true, GOMP_MAP_VARS_OPENACC);
tgt->list[0].key->refcount = REFCOUNT_INFINITY;
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
gomp_mutex_lock (&acc_dev->lock);
......@@ -380,6 +436,10 @@ acc_unmap_data (void *h)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
size_t host_size;
gomp_mutex_lock (&acc_dev->lock);
......@@ -433,6 +493,12 @@ acc_unmap_data (void *h)
gomp_mutex_unlock (&acc_dev->lock);
gomp_unmap_vars (t, true);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
#define FLAG_PRESENT (1 << 0)
......@@ -456,6 +522,15 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
......@@ -518,6 +593,12 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
gomp_mutex_unlock (&acc_dev->lock);
}
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
return d;
}
......@@ -599,6 +680,15 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
......@@ -672,6 +762,12 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
}
gomp_mutex_unlock (&acc_dev->lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
void
......@@ -737,6 +833,15 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s);
......@@ -758,6 +863,12 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
gomp_copy_dev2host (acc_dev, aq, h, d, s);
gomp_mutex_unlock (&acc_dev->lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
void
......
......@@ -29,6 +29,7 @@
#include "libgomp.h"
#include "oacc-plugin.h"
#include "oacc-int.h"
#include "acc_prof.h"
/* This plugin function is now obsolete. */
void
......@@ -38,6 +39,14 @@ GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)),
gomp_fatal ("invalid plugin function");
}
/* Return the TLS data for the current thread. */
struct goacc_thread *
GOMP_PLUGIN_goacc_thread (void)
{
return goacc_thread ();
}
/* Return the target-specific part of the TLS data for the current thread. */
void *
......@@ -57,3 +66,11 @@ GOMP_PLUGIN_acc_default_dim (unsigned int i)
}
return goacc_default_dims[i];
}
void
GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *prof_info,
acc_event_info *event_info,
acc_api_info *api_info)
{
goacc_profiling_dispatch (prof_info, event_info, api_info);
}
......@@ -27,8 +27,15 @@
#ifndef OACC_PLUGIN_H
#define OACC_PLUGIN_H 1
#include "oacc-int.h"
#include "acc_prof.h"
extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
extern struct goacc_thread *GOMP_PLUGIN_goacc_thread (void);
extern void *GOMP_PLUGIN_acc_thread (void);
extern int GOMP_PLUGIN_acc_default_dim (unsigned int);
extern void GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *,
acc_event_info *,
acc_api_info *);
#endif
/* Test dispatch of events to callbacks. */
#undef NDEBUG
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <acc_prof.h>
/* Use explicit 'copyin' clauses, to work around "'firstprivate'
optimizations", which will cause the value at the point of call to be used
(*before* any potential modifications done in callbacks), as opposed to its
address being taken, which then later gets dereferenced (*after* any
modifications done in callbacks). */
#define COPYIN(...) copyin(__VA_ARGS__)
/* See the 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' reference in
'libgomp.texi'. */
#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
volatile // TODO PR90488
static int state = -1;
#define STATE_OP(state, op) \
do \
{ \
typeof (state) state_o = (state); \
(void) state_o; \
(state)op; \
DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
} \
while (0)
static acc_device_t acc_device_type;
static int acc_device_num;
static int num_gangs, num_workers, vector_length;
static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (acc_device_type != acc_device_host);
assert (state == 0);
STATE_OP (state, = 1);
assert (prof_info->event_type == acc_ev_enqueue_launch_start);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async_sync);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->launch_event.event_type == prof_info->event_type);
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
assert (event_info->launch_event.parent_construct == acc_construct_parallel);
assert (event_info->launch_event.implicit == 1);
assert (event_info->launch_event.tool_info == NULL);
assert (event_info->launch_event.kernel_name != NULL);
{
const char *s = strstr (event_info->launch_event.kernel_name, "main");
assert (s != NULL);
s = strstr (s, "omp_fn");
assert (s != NULL);
}
if (num_gangs < 1)
assert (event_info->launch_event.num_gangs >= 1);
else
{
#ifdef __OPTIMIZE__
assert (event_info->launch_event.num_gangs == num_gangs);
#else
/* No parallelized OpenACC 'kernels' constructs. Unparallelized OpenACC
'kernels' constructs must get launched as 1 x 1 x 1 GPU kernels. */
assert (event_info->launch_event.num_gangs == 1);
#endif
}
if (num_workers < 1)
assert (event_info->launch_event.num_workers >= 1);
else
{
#ifdef __OPTIMIZE__
assert (event_info->launch_event.num_workers == num_workers);
#else
/* See 'num_gangs' above. */
assert (event_info->launch_event.num_workers == 1);
#endif
}
if (vector_length < 1)
assert (event_info->launch_event.vector_length >= 1);
else if (acc_device_type == acc_device_nvidia) /* ... is special. */
assert (event_info->launch_event.vector_length == 32);
else
{
#ifdef __OPTIMIZE__
assert (event_info->launch_event.vector_length == vector_length);
#else
/* See 'num_gangs' above. */
assert (event_info->launch_event.vector_length == 1);
#endif
}
if (acc_device_type == acc_device_host)
assert (api_info->device_api == acc_device_api_none);
else
assert (api_info->device_api == acc_device_api_cuda);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
}
static acc_prof_reg reg;
static acc_prof_reg unreg;
static acc_prof_lookup_func lookup;
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
{
DEBUG_printf ("%s\n", __FUNCTION__);
reg = reg_;
unreg = unreg_;
lookup = lookup_;
}
int main()
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
STATE_OP (state, = 0);
reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
assert (state == 0);
acc_device_type = acc_get_device_type ();
acc_device_num = acc_get_device_num (acc_device_type);
assert (state == 0);
/* Parallelism dimensions: compiler/runtime decides. */
STATE_OP (state, = 0);
num_gangs = num_workers = vector_length = 0;
{
#define N 100
int x[N];
#pragma acc kernels
{
for (int i = 0; i < N; ++i)
x[i] = i * i;
}
if (acc_device_type == acc_device_host)
assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */
else
assert (state == 1);
for (int i = 0; i < N; ++i)
if (x[i] != i * i)
__builtin_abort ();
#undef N
}
/* Parallelism dimensions: literal. */
STATE_OP (state, = 0);
num_gangs = 30;
num_workers = 3;
vector_length = 5;
{
#define N 100
int x[N];
#pragma acc kernels \
num_gangs (30) num_workers (3) vector_length (5)
/* { dg-prune-output "using vector_length \\(32\\), ignoring 5" } */
{
for (int i = 0; i < N; ++i)
x[i] = i * i;
}
if (acc_device_type == acc_device_host)
assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */
else
assert (state == 1);
for (int i = 0; i < N; ++i)
if (x[i] != i * i)
__builtin_abort ();
#undef N
}
/* Parallelism dimensions: variable. */
STATE_OP (state, = 0);
num_gangs = 22;
num_workers = 5;
vector_length = 7;
{
#define N 100
int x[N];
#pragma acc kernels \
num_gangs (num_gangs) num_workers (num_workers) vector_length (vector_length)
/* { dg-prune-output "using vector_length \\(32\\), ignoring runtime setting" } */
{
for (int i = 0; i < N; ++i)
x[i] = i * i;
}
if (acc_device_type == acc_device_host)
assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */
else
assert (state == 1);
for (int i = 0; i < N; ++i)
if (x[i] != i * i)
__builtin_abort ();
#undef N
}
return 0;
}
/* Test the 'valid_bytes' magic. */
#undef NDEBUG
#include <assert.h>
#include <acc_prof.h>
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
static int ev_count_data;
static void cb_data_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (event_info->data_event.valid_bytes == _ACC_DATA_EVENT_INFO_VALID_BYTES);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
++ev_count_data;
}
static int ev_count_launch;
static void cb_launch_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
++ev_count_launch;
}
static int ev_count_other;
static void cb_other_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
++ev_count_other;
}
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
{
DEBUG_printf ("%s\n", __FUNCTION__);
reg_ (acc_ev_device_init_start, cb_other_event, acc_reg);
reg_ (acc_ev_device_init_end, cb_other_event, acc_reg);
reg_ (acc_ev_device_shutdown_start, cb_other_event, acc_reg);
reg_ (acc_ev_device_shutdown_end, cb_other_event, acc_reg);
reg_ (acc_ev_runtime_shutdown, cb_other_event, acc_reg);
reg_ (acc_ev_create, cb_data_event, acc_reg);
reg_ (acc_ev_delete, cb_data_event, acc_reg);
reg_ (acc_ev_alloc, cb_data_event, acc_reg);
reg_ (acc_ev_free, cb_data_event, acc_reg);
reg_ (acc_ev_enter_data_start, cb_other_event, acc_reg);
reg_ (acc_ev_enter_data_end, cb_other_event, acc_reg);
reg_ (acc_ev_exit_data_start, cb_other_event, acc_reg);
reg_ (acc_ev_exit_data_end, cb_other_event, acc_reg);
reg_ (acc_ev_update_start, cb_other_event, acc_reg);
reg_ (acc_ev_update_end, cb_other_event, acc_reg);
reg_ (acc_ev_compute_construct_start, cb_other_event, acc_reg);
reg_ (acc_ev_compute_construct_end, cb_other_event, acc_reg);
reg_ (acc_ev_enqueue_launch_start, cb_launch_event, acc_reg);
reg_ (acc_ev_enqueue_launch_end, cb_launch_event, acc_reg);
reg_ (acc_ev_enqueue_upload_start, cb_data_event, acc_reg);
reg_ (acc_ev_enqueue_upload_end, cb_data_event, acc_reg);
reg_ (acc_ev_enqueue_download_start, cb_data_event, acc_reg);
reg_ (acc_ev_enqueue_download_end, cb_data_event, acc_reg);
reg_ (acc_ev_wait_start, cb_other_event, acc_reg);
reg_ (acc_ev_wait_end, cb_other_event, acc_reg);
}
/* Basic struct. */
typedef struct A
{
int a;
int b;
#define VALID_BYTES_A \
_ACC_PROF_VALID_BYTES_STRUCT (A, b, \
_ACC_PROF_VALID_BYTES_BASICTYPE (int))
} A;
/* Add a 'char' field. */
typedef struct B
{
int a;
int b;
char c;
#define VALID_BYTES_B \
_ACC_PROF_VALID_BYTES_STRUCT (B, c, \
_ACC_PROF_VALID_BYTES_BASICTYPE (char))
} B;
/* Add another 'char' field. */
typedef struct C
{
int a;
int b;
char c, d;
#define VALID_BYTES_C \
_ACC_PROF_VALID_BYTES_STRUCT (C, d, \
_ACC_PROF_VALID_BYTES_BASICTYPE (char))
} C;
/* Add two 'void *' fields. */
typedef struct D
{
int a;
int b;
char c, d;
void *e;
void *f;
#define VALID_BYTES_D \
_ACC_PROF_VALID_BYTES_STRUCT (D, f, \
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
} D;
/* Add another three 'char' fields. */
typedef struct E
{
int a;
int b;
char c, d;
void *e;
void *f;
char g, h, i;
#define VALID_BYTES_E \
_ACC_PROF_VALID_BYTES_STRUCT (E, i, \
_ACC_PROF_VALID_BYTES_BASICTYPE (char))
} E;
int main()
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
A A1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof A1, VALID_BYTES_A);
assert (VALID_BYTES_A <= sizeof A1);
DEBUG_printf ("&A1=%p, &A1.b=%p\n", &A1, &A1.b);
assert (((char *) &A1) + VALID_BYTES_A == (char *) (&A1.b + 1));
B B1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof B1, VALID_BYTES_B);
assert (VALID_BYTES_B <= sizeof B1);
DEBUG_printf ("&B1=%p, &B1.c=%p\n", &B1, &B1.c);
assert (((char *) &B1) + VALID_BYTES_B == (char *) (&B1.c + 1));
assert (VALID_BYTES_B == VALID_BYTES_A + 1 * sizeof (char));
C C1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof C1, VALID_BYTES_C);
assert (VALID_BYTES_C <= sizeof C1);
DEBUG_printf ("&C1=%p, &C1.d=%p\n", &C1, &C1.d);
assert (((char *) &C1) + VALID_BYTES_C == (char *) (&C1.d + 1));
assert (VALID_BYTES_C == VALID_BYTES_B + 1 * sizeof (char));
D D1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof D1, VALID_BYTES_D);
assert (VALID_BYTES_D <= sizeof D1);
DEBUG_printf ("&D1=%p, &D1.f=%p\n", &D1, &D1.f);
assert (((char *) &D1) + VALID_BYTES_D == (char *) (&D1.f + 1));
assert (VALID_BYTES_D > VALID_BYTES_C);
E E1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof E1, VALID_BYTES_E);
assert (VALID_BYTES_E <= sizeof E1);
DEBUG_printf ("&E1=%p, &E1.i=%p\n", &E1, &E1.i);
assert (((char *) &E1) + VALID_BYTES_E == (char *) (&E1.i + 1));
assert (VALID_BYTES_E == VALID_BYTES_D + 3 * sizeof (char));
ev_count_data = 0;
ev_count_launch = 0;
ev_count_other = 0;
/* Trigger tests done in 'cb_*' functions. */
int host;
#pragma acc parallel copyout (host)
{
asm volatile ("" : : : "memory"); // TODO PR90488
host = acc_on_device (acc_device_host);
}
DEBUG_printf ("ev_count_data = %d\n", ev_count_data);
if (host)
assert (ev_count_data == 0);
else
{
/* We don't know exactly how many data events to expect, but we at least
expect some. */
assert (ev_count_data > 0);
}
DEBUG_printf ("ev_count_launch = %d\n", ev_count_launch);
if (host)
assert (ev_count_data == 0);
else
{
/* We expect two launch events, 'acc_ev_enqueue_launch_start',
'acc_ev_enqueue_launch_end'. */
assert (ev_count_launch == 2);
}
DEBUG_printf ("ev_count_other = %d\n", ev_count_other);
/* We don't know exactly how many other events to expect, but we at least
expect 'acc_ev_device_init_start', 'acc_ev_device_init_end',
'acc_ev_compute_construct_start', 'acc_ev_compute_construct_end'. */
assert (ev_count_other >= 4);
return 0;
}
/* Test the 'version' field of 'acc_prof_info'. */
#undef NDEBUG
#include <assert.h>
#include <acc_prof.h>
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
static int ev_count;
static void cb_any_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
assert (prof_info->version == 201711);
++ev_count;
}
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
{
DEBUG_printf ("%s\n", __FUNCTION__);
reg_ (acc_ev_device_init_start, cb_any_event, acc_reg);
reg_ (acc_ev_device_init_end, cb_any_event, acc_reg);
reg_ (acc_ev_device_shutdown_start, cb_any_event, acc_reg);
reg_ (acc_ev_device_shutdown_end, cb_any_event, acc_reg);
reg_ (acc_ev_runtime_shutdown, cb_any_event, acc_reg);
reg_ (acc_ev_create, cb_any_event, acc_reg);
reg_ (acc_ev_delete, cb_any_event, acc_reg);
reg_ (acc_ev_alloc, cb_any_event, acc_reg);
reg_ (acc_ev_free, cb_any_event, acc_reg);
reg_ (acc_ev_enter_data_start, cb_any_event, acc_reg);
reg_ (acc_ev_enter_data_end, cb_any_event, acc_reg);
reg_ (acc_ev_exit_data_start, cb_any_event, acc_reg);
reg_ (acc_ev_exit_data_end, cb_any_event, acc_reg);
reg_ (acc_ev_update_start, cb_any_event, acc_reg);
reg_ (acc_ev_update_end, cb_any_event, acc_reg);
reg_ (acc_ev_compute_construct_start, cb_any_event, acc_reg);
reg_ (acc_ev_compute_construct_end, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_launch_start, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_launch_end, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_upload_start, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_upload_end, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_download_start, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_download_end, cb_any_event, acc_reg);
reg_ (acc_ev_wait_start, cb_any_event, acc_reg);
reg_ (acc_ev_wait_end, cb_any_event, acc_reg);
}
int main()
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
ev_count = 0;
/* Trigger tests done in 'cb_*' functions. */
#pragma acc parallel
{
asm volatile ("" : : : "memory"); // TODO PR90488
}
DEBUG_printf ("ev_count = %d\n", ev_count);
/* We don't know exactly how many events to expect, but we at least expect
'acc_ev_device_init_start', 'acc_ev_device_init_end',
'acc_ev_compute_construct_start', 'acc_ev_compute_construct_end'. */
assert (ev_count >= 4);
return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment