Commit 5fae049d by Thomas Schwinge Committed by Thomas Schwinge

OpenACC Profiling Interface (incomplete)

	libgomp/
	* acc_prof.h: New file.
	* oacc-profiling.c: Likewise.
	* Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES):
	Add these, respectively.
	* Makefile.in: Regenerate.
	* env.c (initialize_env): Call goacc_profiling_initialize.
	* oacc-plugin.c (GOMP_PLUGIN_goacc_thread)
	(GOMP_PLUGIN_goacc_profiling_dispatch): New functions.
	* oacc-plugin.h (GOMP_PLUGIN_goacc_thread)
	(GOMP_PLUGIN_goacc_profiling_dispatch): Declare.
	* libgomp.map (OACC_2.5.1): Add acc_prof_lookup,
	acc_prof_register, acc_prof_unregister, and acc_register_library.
	(GOMP_PLUGIN_1.3): Add GOMP_PLUGIN_goacc_profiling_dispatch, and
	GOMP_PLUGIN_goacc_thread.
	* oacc-int.h (struct goacc_thread): Add prof_info, api_info,
	prof_callbacks_enabled members.
	(goacc_prof_enabled, goacc_profiling_initialize)
	(_goacc_profiling_dispatch_p, _goacc_profiling_setup_p)
	(goacc_profiling_dispatch): Declare.
	(GOACC_PROF_ENABLED, GOACC_PROFILING_DISPATCH_P)
	(GOACC_PROFILING_SETUP_P): Define.
	* oacc-async.c (acc_async_test, acc_async_test_all, acc_wait)
	(acc_wait_async, acc_wait_all, acc_wait_all_async): Update for
	OpenACC Profiling Interface.
	* oacc-cuda.c (acc_get_current_cuda_device)
	(acc_get_current_cuda_context, acc_get_cuda_stream)
	(acc_set_cuda_stream): Likewise.
	* oacc-init.c (acc_init_1, goacc_attach_host_thread_to_device)
	(acc_init, acc_set_device_type, acc_get_device_type)
	(acc_get_device_num, goacc_lazy_initialize): Likewise.
	* oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device)
	(acc_deviceptr, acc_hostptr, acc_is_present, acc_map_data)
	(acc_unmap_data, present_create_copy, delete_copyout)
	(update_dev_host): Likewise.
	* oacc-parallel.c (GOACC_parallel_keyed, GOACC_data_start)
	(GOACC_data_end, GOACC_enter_exit_data, GOACC_update, GOACC_wait):
	Likewise.
	* plugin/plugin-nvptx.c (nvptx_exec, nvptx_alloc, nvptx_free)
	(GOMP_OFFLOAD_openacc_exec, GOMP_OFFLOAD_openacc_async_exec):
	Likewise.
	* libgomp.texi: Update.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New
	file.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
	Likewise.
	* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
	Likewise.

From-SVN: r271346
parent b48f44bf
2019-05-17 Thomas Schwinge <thomas@codesourcery.com>
* acc_prof.h: New file.
* oacc-profiling.c: Likewise.
* Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES):
Add these, respectively.
* Makefile.in: Regenerate.
* env.c (initialize_env): Call goacc_profiling_initialize.
* oacc-plugin.c (GOMP_PLUGIN_goacc_thread)
(GOMP_PLUGIN_goacc_profiling_dispatch): New functions.
* oacc-plugin.h (GOMP_PLUGIN_goacc_thread)
(GOMP_PLUGIN_goacc_profiling_dispatch): Declare.
* libgomp.map (OACC_2.5.1): Add acc_prof_lookup,
acc_prof_register, acc_prof_unregister, and acc_register_library.
(GOMP_PLUGIN_1.3): Add GOMP_PLUGIN_goacc_profiling_dispatch, and
GOMP_PLUGIN_goacc_thread.
* oacc-int.h (struct goacc_thread): Add prof_info, api_info,
prof_callbacks_enabled members.
(goacc_prof_enabled, goacc_profiling_initialize)
(_goacc_profiling_dispatch_p, _goacc_profiling_setup_p)
(goacc_profiling_dispatch): Declare.
(GOACC_PROF_ENABLED, GOACC_PROFILING_DISPATCH_P)
(GOACC_PROFILING_SETUP_P): Define.
* oacc-async.c (acc_async_test, acc_async_test_all, acc_wait)
(acc_wait_async, acc_wait_all, acc_wait_all_async): Update for
OpenACC Profiling Interface.
* oacc-cuda.c (acc_get_current_cuda_device)
(acc_get_current_cuda_context, acc_get_cuda_stream)
(acc_set_cuda_stream): Likewise.
* oacc-init.c (acc_init_1, goacc_attach_host_thread_to_device)
(acc_init, acc_set_device_type, acc_get_device_type)
(acc_get_device_num, goacc_lazy_initialize): Likewise.
* oacc-mem.c (acc_malloc, acc_free, memcpy_tofrom_device)
(acc_deviceptr, acc_hostptr, acc_is_present, acc_map_data)
(acc_unmap_data, present_create_copy, delete_copyout)
(update_dev_host): Likewise.
* oacc-parallel.c (GOACC_parallel_keyed, GOACC_data_start)
(GOACC_data_end, GOACC_enter_exit_data, GOACC_update, GOACC_wait):
Likewise.
* plugin/plugin-nvptx.c (nvptx_exec, nvptx_alloc, nvptx_free)
(GOMP_OFFLOAD_openacc_exec, GOMP_OFFLOAD_openacc_async_exec):
Likewise.
* libgomp.texi: Update.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New
file.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c:
Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
Likewise.
2019-05-13 Chung-Lin Tang <cltang@codesourcery.com> 2019-05-13 Chung-Lin Tang <cltang@codesourcery.com>
* libgomp-plugin.h (struct goacc_asyncqueue): Declare. * libgomp-plugin.h (struct goacc_asyncqueue): Declare.
......
...@@ -65,7 +65,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \ ...@@ -65,7 +65,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \ proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \ splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \ oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
affinity-fmt.c teams.c affinity-fmt.c teams.c oacc-profiling.c
include $(top_srcdir)/plugin/Makefrag.am include $(top_srcdir)/plugin/Makefrag.am
...@@ -74,7 +74,7 @@ libgomp_la_SOURCES += openacc.f90 ...@@ -74,7 +74,7 @@ libgomp_la_SOURCES += openacc.f90
endif endif
nodist_noinst_HEADERS = libgomp_f.h nodist_noinst_HEADERS = libgomp_f.h
nodist_libsubinclude_HEADERS = omp.h openacc.h nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
if USE_FORTRAN if USE_FORTRAN
nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \ nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
......
...@@ -217,7 +217,7 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \ ...@@ -217,7 +217,7 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \ target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \
oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \ oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \ oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \
teams.lo $(am__objects_1) teams.lo oacc-profiling.lo $(am__objects_1)
libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
AM_V_P = $(am__v_P_@AM_V@) AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
...@@ -551,7 +551,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \ ...@@ -551,7 +551,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
affinity.c target.c splay-tree.c libgomp-plugin.c \ affinity.c target.c splay-tree.c libgomp-plugin.c \
oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \ oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \ oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
affinity-fmt.c teams.c $(am__append_3) affinity-fmt.c teams.c oacc-profiling.c $(am__append_3)
# Nvidia PTX OpenACC plugin. # Nvidia PTX OpenACC plugin.
@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION) @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
...@@ -575,7 +575,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \ ...@@ -575,7 +575,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
@PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS) @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
@PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
nodist_noinst_HEADERS = libgomp_f.h nodist_noinst_HEADERS = libgomp_f.h
nodist_libsubinclude_HEADERS = omp.h openacc.h nodist_libsubinclude_HEADERS = omp.h openacc.h acc_prof.h
@USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \ @USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
@USE_FORTRAN_TRUE@ openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod @USE_FORTRAN_TRUE@ openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
...@@ -753,6 +753,7 @@ distclean-compile: ...@@ -753,6 +753,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-profiling.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@
......
/* OpenACC Profiling Interface
Copyright (C) 2019 Free Software Foundation, Inc.
Contributed by Mentor, a Siemens Business.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
#ifndef _ACC_PROF_H
#define _ACC_PROF_H 1
/* The OpenACC specification doesn't say so explicitly, but as its Profiling
Interface explicitly makes use of, for example, <openacc.h>'s
'acc_device_t', we supposedly are to '#include' that file here. */
#include <openacc.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Events. */
typedef enum acc_event_t
{
acc_ev_none = 0,
acc_ev_device_init_start,
acc_ev_device_init_end,
acc_ev_device_shutdown_start,
acc_ev_device_shutdown_end,
acc_ev_runtime_shutdown,
acc_ev_create,
acc_ev_delete,
acc_ev_alloc,
acc_ev_free,
acc_ev_enter_data_start,
acc_ev_enter_data_end,
acc_ev_exit_data_start,
acc_ev_exit_data_end,
acc_ev_update_start,
acc_ev_update_end,
acc_ev_compute_construct_start,
acc_ev_compute_construct_end,
acc_ev_enqueue_launch_start,
acc_ev_enqueue_launch_end,
acc_ev_enqueue_upload_start,
acc_ev_enqueue_upload_end,
acc_ev_enqueue_download_start,
acc_ev_enqueue_download_end,
acc_ev_wait_start,
acc_ev_wait_end,
acc_ev_last
} acc_event_t;
/* Callbacks Signature. */
/* "The datatype 'ssize_t' means a signed 32-bit integer for a 32-bit binary
and a 64-bit integer for a 64-bit binary". */
typedef signed long int _acc_prof_ssize_t;
/* "The datatype 'size_t' means an unsigned 32-bit integer for a 32-bit binary
and a 64-bit integer for a 64-bit binary". */
typedef unsigned long int _acc_prof_size_t;
/* "The datatype 'int' means a 32-bit integer for both 32-bit and 64-bit
binaries". */
typedef int _acc_prof_int_t;
/* Internal helpers: a struct's 'valid_bytes' may be less than its 'sizeof'. */
#define _ACC_PROF_VALID_BYTES_STRUCT(_struct, _lastfield, _valid_bytes_lastfield) \
offsetof (_struct, _lastfield) + (_valid_bytes_lastfield)
#if 0 /* Untested. */
#define _ACC_PROF_VALID_BYTES_TYPE_N(_type, _n, _valid_bytes_type) \
((_n - 1) * sizeof (_type) + (_valid_bytes_type))
#endif
#define _ACC_PROF_VALID_BYTES_BASICTYPE(_basictype) \
(sizeof (_basictype))
typedef struct acc_prof_info
{
acc_event_t event_type;
_acc_prof_int_t valid_bytes;
_acc_prof_int_t version;
acc_device_t device_type;
_acc_prof_int_t device_number;
_acc_prof_int_t thread_id;
_acc_prof_ssize_t async;
_acc_prof_ssize_t async_queue;
const char *src_file;
const char *func_name;
_acc_prof_int_t line_no, end_line_no;
_acc_prof_int_t func_line_no, func_end_line_no;
#define _ACC_PROF_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_prof_info, func_end_line_no, \
_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_int_t))
} acc_prof_info;
/* We implement the OpenACC 2.6 Profiling Interface. */
#define _ACC_PROF_INFO_VERSION 201711
typedef enum acc_construct_t
{
acc_construct_parallel = 0,
acc_construct_kernels,
acc_construct_loop,
acc_construct_data,
acc_construct_enter_data,
acc_construct_exit_data,
acc_construct_host_data,
acc_construct_atomic,
acc_construct_declare,
acc_construct_init,
acc_construct_shutdown,
acc_construct_set,
acc_construct_update,
acc_construct_routine,
acc_construct_wait,
acc_construct_runtime_api,
acc_construct_serial
} acc_construct_t;
typedef struct acc_data_event_info
{
acc_event_t event_type;
_acc_prof_int_t valid_bytes;
acc_construct_t parent_construct;
_acc_prof_int_t implicit;
void *tool_info;
const char *var_name;
_acc_prof_size_t bytes;
const void *host_ptr;
const void *device_ptr;
#define _ACC_DATA_EVENT_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_data_event_info, device_ptr, \
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
} acc_data_event_info;
typedef struct acc_launch_event_info
{
acc_event_t event_type;
_acc_prof_int_t valid_bytes;
acc_construct_t parent_construct;
_acc_prof_int_t implicit;
void *tool_info;
const char *kernel_name;
_acc_prof_size_t num_gangs, num_workers, vector_length;
#define _ACC_LAUNCH_EVENT_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_launch_event_info, vector_length, \
_ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_size_t))
} acc_launch_event_info;
typedef struct acc_other_event_info
{
acc_event_t event_type;
_acc_prof_int_t valid_bytes;
acc_construct_t parent_construct;
_acc_prof_int_t implicit;
void *tool_info;
#define _ACC_OTHER_EVENT_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_other_event_info, tool_info, \
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
} acc_other_event_info;
typedef union acc_event_info
{
acc_event_t event_type;
acc_data_event_info data_event;
acc_launch_event_info launch_event;
acc_other_event_info other_event;
} acc_event_info;
typedef enum acc_device_api
{
acc_device_api_none = 0,
acc_device_api_cuda,
acc_device_api_opencl,
acc_device_api_coi,
acc_device_api_other
} acc_device_api;
typedef struct acc_api_info
{
acc_device_api device_api;
_acc_prof_int_t valid_bytes;
acc_device_t device_type;
_acc_prof_int_t vendor;
const void *device_handle;
const void *context_handle;
const void *async_handle;
#define _ACC_API_INFO_VALID_BYTES \
_ACC_PROF_VALID_BYTES_STRUCT (acc_api_info, async_handle, \
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
} acc_api_info;
/* Don't tag 'acc_prof_callback' as '__GOACC_NOTHROW': these functions are
provided by user code, and must be expected to do anything. */
typedef void (*acc_prof_callback) (acc_prof_info *, acc_event_info *,
acc_api_info *);
/* Loading the Library. */
typedef enum acc_register_t
{
acc_reg = 0,
acc_toggle = 1,
acc_toggle_per_thread = 2
} acc_register_t;
typedef void (*acc_prof_reg) (acc_event_t, acc_prof_callback, acc_register_t);
extern void acc_prof_register (acc_event_t, acc_prof_callback,
acc_register_t) __GOACC_NOTHROW;
extern void acc_prof_unregister (acc_event_t, acc_prof_callback,
acc_register_t) __GOACC_NOTHROW;
typedef void (*acc_query_fn) ();
typedef acc_query_fn (*acc_prof_lookup_func) (const char *);
extern acc_query_fn acc_prof_lookup (const char *) __GOACC_NOTHROW;
/* Don't tag 'acc_register_library' as '__GOACC_NOTHROW': this function can be
overridden by user code, and must be expected to do anything. */
extern void acc_register_library (acc_prof_reg, acc_prof_reg,
acc_prof_lookup_func);
#ifdef __cplusplus
}
#endif
#endif /* _ACC_PROF_H */
...@@ -1425,5 +1425,7 @@ initialize_env (void) ...@@ -1425,5 +1425,7 @@ initialize_env (void)
parse_gomp_openacc_dim (); parse_gomp_openacc_dim ();
goacc_runtime_initialize (); goacc_runtime_initialize ();
goacc_profiling_initialize ();
} }
#endif /* LIBGOMP_OFFLOADED_ONLY */ #endif /* LIBGOMP_OFFLOADED_ONLY */
...@@ -476,6 +476,14 @@ OACC_2.5 { ...@@ -476,6 +476,14 @@ OACC_2.5 {
acc_update_self_async_array_h_; acc_update_self_async_array_h_;
} OACC_2.0.1; } OACC_2.0.1;
OACC_2.5.1 {
global:
acc_prof_lookup;
acc_prof_register;
acc_prof_unregister;
acc_register_library;
} OACC_2.5;
GOACC_2.0 { GOACC_2.0 {
global: global:
GOACC_data_end; GOACC_data_end;
...@@ -515,3 +523,9 @@ GOMP_PLUGIN_1.2 { ...@@ -515,3 +523,9 @@ GOMP_PLUGIN_1.2 {
global: global:
GOMP_PLUGIN_acc_default_dim; GOMP_PLUGIN_acc_default_dim;
} GOMP_PLUGIN_1.1; } GOMP_PLUGIN_1.1;
GOMP_PLUGIN_1.3 {
global:
GOMP_PLUGIN_goacc_profiling_dispatch;
GOMP_PLUGIN_goacc_thread;
} GOMP_PLUGIN_1.2;
...@@ -43,17 +43,6 @@ get_goacc_thread (void) ...@@ -43,17 +43,6 @@ get_goacc_thread (void)
return thr; return thr;
} }
static struct gomp_device_descr *
get_goacc_thread_device (void)
{
struct goacc_thread *thr = goacc_thread ();
if (!thr || !thr->dev)
gomp_fatal ("no device active");
return thr->dev;
}
static int static int
validate_async_val (int async) validate_async_val (int async)
{ {
...@@ -76,7 +65,10 @@ validate_async_val (int async) ...@@ -76,7 +65,10 @@ validate_async_val (int async)
/* Return the asyncqueue to be used for OpenACC async-argument ASYNC. This /* Return the asyncqueue to be used for OpenACC async-argument ASYNC. This
might return NULL if no asyncqueue is to be used. Otherwise, if CREATE, might return NULL if no asyncqueue is to be used. Otherwise, if CREATE,
create the asyncqueue if it doesn't exist yet. */ create the asyncqueue if it doesn't exist yet.
Unless CREATE, this will not generate any OpenACC Profiling Interface
events. */
attribute_hidden struct goacc_asyncqueue * attribute_hidden struct goacc_asyncqueue *
lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async) lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
...@@ -152,8 +144,25 @@ acc_async_test (int async) ...@@ -152,8 +144,25 @@ acc_async_test (int async)
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async); goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
if (!aq) if (!aq)
return 1; return 1;
else
return thr->dev->openacc.async.test_func (aq); acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
int res = thr->dev->openacc.async.test_func (aq);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
return res;
} }
int int
...@@ -161,6 +170,10 @@ acc_async_test_all (void) ...@@ -161,6 +170,10 @@ acc_async_test_all (void)
{ {
struct goacc_thread *thr = get_goacc_thread (); struct goacc_thread *thr = get_goacc_thread ();
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
int ret = 1; int ret = 1;
gomp_mutex_lock (&thr->dev->openacc.async.lock); gomp_mutex_lock (&thr->dev->openacc.async.lock);
for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next) for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
...@@ -170,6 +183,13 @@ acc_async_test_all (void) ...@@ -170,6 +183,13 @@ acc_async_test_all (void)
break; break;
} }
gomp_mutex_unlock (&thr->dev->openacc.async.lock); gomp_mutex_unlock (&thr->dev->openacc.async.lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
return ret; return ret;
} }
...@@ -179,8 +199,26 @@ acc_wait (int async) ...@@ -179,8 +199,26 @@ acc_wait (int async)
struct goacc_thread *thr = get_goacc_thread (); struct goacc_thread *thr = get_goacc_thread ();
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async); goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
if (aq && !thr->dev->openacc.async.synchronize_func (aq)) if (!aq)
return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
if (!thr->dev->openacc.async.synchronize_func (aq))
gomp_fatal ("wait on %d failed", async); gomp_fatal ("wait on %d failed", async);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
} }
/* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. */ /* acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. */
...@@ -205,10 +243,19 @@ acc_wait_async (int async1, int async2) ...@@ -205,10 +243,19 @@ acc_wait_async (int async1, int async2)
if (!aq1) if (!aq1)
return; return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async2;
prof_info.async_queue = prof_info.async;
}
goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2); goacc_aq aq2 = lookup_goacc_asyncqueue (thr, true, async2);
/* An async queue is always synchronized with itself. */ /* An async queue is always synchronized with itself. */
if (aq1 == aq2) if (aq1 == aq2)
return; goto out_prof;
if (aq2) if (aq2)
{ {
...@@ -222,18 +269,35 @@ acc_wait_async (int async1, int async2) ...@@ -222,18 +269,35 @@ acc_wait_async (int async1, int async2)
if (!thr->dev->openacc.async.synchronize_func (aq1)) if (!thr->dev->openacc.async.synchronize_func (aq1))
gomp_fatal ("wait on %d failed", async1); gomp_fatal ("wait on %d failed", async1);
} }
out_prof:
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
} }
void void
acc_wait_all (void) acc_wait_all (void)
{ {
struct gomp_device_descr *dev = get_goacc_thread_device (); struct goacc_thread *thr = goacc_thread ();
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
bool ret = true; bool ret = true;
gomp_mutex_lock (&dev->openacc.async.lock); gomp_mutex_lock (&thr->dev->openacc.async.lock);
for (goacc_aq_list l = dev->openacc.async.active; l; l = l->next) for (goacc_aq_list l = thr->dev->openacc.async.active; l; l = l->next)
ret &= dev->openacc.async.synchronize_func (l->aq); ret &= thr->dev->openacc.async.synchronize_func (l->aq);
gomp_mutex_unlock (&dev->openacc.async.lock); gomp_mutex_unlock (&thr->dev->openacc.async.lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
if (!ret) if (!ret)
gomp_fatal ("wait all failed"); gomp_fatal ("wait all failed");
...@@ -255,6 +319,15 @@ acc_wait_all_async (int async) ...@@ -255,6 +319,15 @@ acc_wait_all_async (int async)
{ {
struct goacc_thread *thr = get_goacc_thread (); struct goacc_thread *thr = get_goacc_thread ();
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async); goacc_aq waiting_queue = lookup_goacc_asyncqueue (thr, true, async);
bool ret = true; bool ret = true;
...@@ -270,6 +343,12 @@ acc_wait_all_async (int async) ...@@ -270,6 +343,12 @@ acc_wait_all_async (int async)
} }
gomp_mutex_unlock (&thr->dev->openacc.async.lock); gomp_mutex_unlock (&thr->dev->openacc.async.lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
if (!ret) if (!ret)
gomp_fatal ("wait all async(%d) failed", async); gomp_fatal ("wait all async(%d) failed", async);
} }
......
...@@ -37,10 +37,23 @@ acc_get_current_cuda_device (void) ...@@ -37,10 +37,23 @@ acc_get_current_cuda_device (void)
{ {
struct goacc_thread *thr = goacc_thread (); struct goacc_thread *thr = goacc_thread ();
void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func) if (thr && thr->dev && thr->dev->openacc.cuda.get_current_device_func)
return thr->dev->openacc.cuda.get_current_device_func (); {
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
return NULL; ret = thr->dev->openacc.cuda.get_current_device_func ();
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
return ret;
} }
void * void *
...@@ -48,10 +61,23 @@ acc_get_current_cuda_context (void) ...@@ -48,10 +61,23 @@ acc_get_current_cuda_context (void)
{ {
struct goacc_thread *thr = goacc_thread (); struct goacc_thread *thr = goacc_thread ();
void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func) if (thr && thr->dev && thr->dev->openacc.cuda.get_current_context_func)
return thr->dev->openacc.cuda.get_current_context_func (); {
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
return NULL; ret = thr->dev->openacc.cuda.get_current_context_func ();
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
return ret;
} }
void * void *
...@@ -62,14 +88,32 @@ acc_get_cuda_stream (int async) ...@@ -62,14 +88,32 @@ acc_get_cuda_stream (int async)
if (!async_valid_p (async)) if (!async_valid_p (async))
return NULL; return NULL;
void *ret = NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func) if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
{ {
goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async); goacc_aq aq = lookup_goacc_asyncqueue (thr, false, async);
if (aq) if (!aq)
return thr->dev->openacc.cuda.get_stream_func (aq); return ret;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
} }
return NULL; ret = thr->dev->openacc.cuda.get_stream_func (aq);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
}
return ret;
} }
int int
...@@ -87,6 +131,15 @@ acc_set_cuda_stream (int async, void *stream) ...@@ -87,6 +131,15 @@ acc_set_cuda_stream (int async, void *stream)
int ret = -1; int ret = -1;
if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func) if (thr && thr->dev && thr->dev->openacc.cuda.set_stream_func)
{ {
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
goacc_aq aq = get_goacc_asyncqueue (async); goacc_aq aq = get_goacc_asyncqueue (async);
/* Due to not using an asyncqueue for "acc_async_sync", this cannot be /* Due to not using an asyncqueue for "acc_async_sync", this cannot be
used to change the CUDA stream associated with "acc_async_sync". */ used to change the CUDA stream associated with "acc_async_sync". */
...@@ -95,11 +148,19 @@ acc_set_cuda_stream (int async, void *stream) ...@@ -95,11 +148,19 @@ acc_set_cuda_stream (int async, void *stream)
assert (async == acc_async_sync); assert (async == acc_async_sync);
gomp_debug (0, "Refusing request to set CUDA stream associated" gomp_debug (0, "Refusing request to set CUDA stream associated"
" with \"acc_async_sync\"\n"); " with \"acc_async_sync\"\n");
return 0; ret = 0;
goto out_prof;
} }
gomp_mutex_lock (&thr->dev->openacc.async.lock); gomp_mutex_lock (&thr->dev->openacc.async.lock);
ret = thr->dev->openacc.cuda.set_stream_func (aq, stream); ret = thr->dev->openacc.cuda.set_stream_func (aq, stream);
gomp_mutex_unlock (&thr->dev->openacc.async.lock); gomp_mutex_unlock (&thr->dev->openacc.async.lock);
out_prof:
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
} }
return ret; return ret;
......
...@@ -210,8 +210,67 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs) ...@@ -210,8 +210,67 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
held before calling this function. */ held before calling this function. */
static struct gomp_device_descr * static struct gomp_device_descr *
acc_init_1 (acc_device_t d) acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
{ {
bool check_not_nested_p;
if (implicit)
{
/* In the implicit case, there should (TODO: must?) already be something
have been set up for an outer construct. */
check_not_nested_p = false;
}
else
{
check_not_nested_p = true;
/* TODO: should we set 'thr->prof_info' etc. in this case ('acc_init')?
The problem is, that we don't have 'thr' yet? (So,
'check_not_nested_p = true' also is pointless actually.) */
}
bool profiling_p = GOACC_PROFILING_DISPATCH_P (check_not_nested_p);
acc_prof_info prof_info;
if (profiling_p)
{
prof_info.event_type = acc_ev_device_init_start;
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
prof_info.version = _ACC_PROF_INFO_VERSION;
prof_info.device_type = d;
prof_info.device_number = goacc_device_num;
prof_info.thread_id = -1;
prof_info.async = acc_async_sync;
prof_info.async_queue = prof_info.async;
prof_info.src_file = NULL;
prof_info.func_name = NULL;
prof_info.line_no = -1;
prof_info.end_line_no = -1;
prof_info.func_line_no = -1;
prof_info.func_end_line_no = -1;
}
acc_event_info device_init_event_info;
if (profiling_p)
{
device_init_event_info.other_event.event_type = prof_info.event_type;
device_init_event_info.other_event.valid_bytes
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
device_init_event_info.other_event.parent_construct = parent_construct;
device_init_event_info.other_event.implicit = implicit;
device_init_event_info.other_event.tool_info = NULL;
}
acc_api_info api_info;
if (profiling_p)
{
api_info.device_api = acc_device_api_none;
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
api_info.device_type = prof_info.device_type;
api_info.vendor = -1;
api_info.device_handle = NULL;
api_info.context_handle = NULL;
api_info.async_handle = NULL;
}
if (profiling_p)
goacc_profiling_dispatch (&prof_info, &device_init_event_info, &api_info);
struct gomp_device_descr *base_dev, *acc_dev; struct gomp_device_descr *base_dev, *acc_dev;
int ndevs; int ndevs;
...@@ -234,6 +293,14 @@ acc_init_1 (acc_device_t d) ...@@ -234,6 +293,14 @@ acc_init_1 (acc_device_t d)
gomp_init_device (acc_dev); gomp_init_device (acc_dev);
gomp_mutex_unlock (&acc_dev->lock); gomp_mutex_unlock (&acc_dev->lock);
if (profiling_p)
{
prof_info.event_type = acc_ev_device_init_end;
device_init_event_info.other_event.event_type = prof_info.event_type;
goacc_profiling_dispatch (&prof_info, &device_init_event_info,
&api_info);
}
return base_dev; return base_dev;
} }
...@@ -423,6 +490,10 @@ goacc_attach_host_thread_to_device (int ord) ...@@ -423,6 +490,10 @@ goacc_attach_host_thread_to_device (int ord)
thr->dev = acc_dev = &base_dev[ord]; thr->dev = acc_dev = &base_dev[ord];
thr->saved_bound_dev = NULL; thr->saved_bound_dev = NULL;
thr->mapped_data = NULL; thr->mapped_data = NULL;
thr->prof_info = NULL;
thr->api_info = NULL;
/* Initially, all callbacks for all events are enabled. */
thr->prof_callbacks_enabled = true;
thr->target_tls thr->target_tls
= acc_dev->openacc.create_thread_data_func (ord); = acc_dev->openacc.create_thread_data_func (ord);
...@@ -437,9 +508,7 @@ acc_init (acc_device_t d) ...@@ -437,9 +508,7 @@ acc_init (acc_device_t d)
gomp_init_targets_once (); gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock); gomp_mutex_lock (&acc_device_lock);
cached_base_dev = acc_init_1 (d, acc_construct_runtime_api, 0);
cached_base_dev = acc_init_1 (d);
gomp_mutex_unlock (&acc_device_lock); gomp_mutex_unlock (&acc_device_lock);
goacc_attach_host_thread_to_device (-1); goacc_attach_host_thread_to_device (-1);
...@@ -498,6 +567,12 @@ acc_set_device_type (acc_device_t d) ...@@ -498,6 +567,12 @@ acc_set_device_type (acc_device_t d)
struct gomp_device_descr *base_dev, *acc_dev; struct gomp_device_descr *base_dev, *acc_dev;
struct goacc_thread *thr = goacc_thread (); struct goacc_thread *thr = goacc_thread ();
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
prof_info.device_type = d;
gomp_init_targets_once (); gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock); gomp_mutex_lock (&acc_device_lock);
...@@ -522,6 +597,12 @@ acc_set_device_type (acc_device_t d) ...@@ -522,6 +597,12 @@ acc_set_device_type (acc_device_t d)
} }
goacc_attach_host_thread_to_device (-1); goacc_attach_host_thread_to_device (-1);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
} }
ialias (acc_set_device_type) ialias (acc_set_device_type)
...@@ -537,12 +618,22 @@ acc_get_device_type (void) ...@@ -537,12 +618,22 @@ acc_get_device_type (void)
res = acc_device_type (thr->base_dev->type); res = acc_device_type (thr->base_dev->type);
else else
{ {
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
gomp_init_targets_once (); gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock); gomp_mutex_lock (&acc_device_lock);
dev = resolve_device (acc_device_default, true); dev = resolve_device (acc_device_default, true);
gomp_mutex_unlock (&acc_device_lock); gomp_mutex_unlock (&acc_device_lock);
res = acc_device_type (dev->type); res = acc_device_type (dev->type);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
} }
assert (res != acc_device_default assert (res != acc_device_default
...@@ -562,12 +653,24 @@ acc_get_device_num (acc_device_t d) ...@@ -562,12 +653,24 @@ acc_get_device_num (acc_device_t d)
if (d >= _ACC_device_hwm) if (d >= _ACC_device_hwm)
gomp_fatal ("unknown device type %u", (unsigned) d); gomp_fatal ("unknown device type %u", (unsigned) d);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
prof_info.device_type = d;
gomp_init_targets_once (); gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock); gomp_mutex_lock (&acc_device_lock);
dev = resolve_device (d, true); dev = resolve_device (d, true);
gomp_mutex_unlock (&acc_device_lock); gomp_mutex_unlock (&acc_device_lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
if (thr && thr->base_dev == dev && thr->dev) if (thr && thr->base_dev == dev && thr->dev)
return thr->dev->target_id; return thr->dev->target_id;
...@@ -689,8 +792,13 @@ goacc_lazy_initialize (void) ...@@ -689,8 +792,13 @@ goacc_lazy_initialize (void)
if (thr && thr->dev) if (thr && thr->dev)
return; return;
gomp_init_targets_once ();
gomp_mutex_lock (&acc_device_lock);
if (!cached_base_dev) if (!cached_base_dev)
acc_init (acc_device_default); cached_base_dev = acc_init_1 (acc_device_default,
else acc_construct_parallel, 1);
gomp_mutex_unlock (&acc_device_lock);
goacc_attach_host_thread_to_device (-1); goacc_attach_host_thread_to_device (-1);
} }
...@@ -40,6 +40,7 @@ ...@@ -40,6 +40,7 @@
#include "openacc.h" #include "openacc.h"
#include "config.h" #include "config.h"
#include "acc_prof.h"
#include <stddef.h> #include <stddef.h>
#include <stdbool.h> #include <stdbool.h>
#include <stdarg.h> #include <stdarg.h>
...@@ -68,6 +69,12 @@ struct goacc_thread ...@@ -68,6 +69,12 @@ struct goacc_thread
strictly push/pop semantics according to lexical scope. */ strictly push/pop semantics according to lexical scope. */
struct target_mem_desc *mapped_data; struct target_mem_desc *mapped_data;
/* Data of the OpenACC Profiling Interface. */
acc_prof_info *prof_info;
acc_api_info *api_info;
/* Per-thread toggle of OpenACC Profiling Interface callbacks. */
bool prof_callbacks_enabled;
/* These structures form a list: this is the next thread in that list. */ /* These structures form a list: this is the next thread in that list. */
struct goacc_thread *next; struct goacc_thread *next;
...@@ -128,6 +135,28 @@ async_synchronous_p (int async) ...@@ -128,6 +135,28 @@ async_synchronous_p (int async)
return async == acc_async_sync; return async == acc_async_sync;
} }
extern bool goacc_prof_enabled;
/* Tune for the (very common) case that profiling is not enabled. */
#define GOACC_PROF_ENABLED \
(__builtin_expect (__atomic_load_n (&goacc_prof_enabled, \
MEMMODEL_ACQUIRE) == true, false))
void goacc_profiling_initialize (void);
bool _goacc_profiling_dispatch_p (bool);
/* Tune for the (very common) case that profiling is not enabled. */
#define GOACC_PROFILING_DISPATCH_P(...) \
(GOACC_PROF_ENABLED \
&& _goacc_profiling_dispatch_p (__VA_ARGS__))
bool _goacc_profiling_setup_p (struct goacc_thread *,
acc_prof_info *, acc_api_info *);
/* Tune for the (very common) case that profiling is not enabled. */
#define GOACC_PROFILING_SETUP_P(...) \
(GOACC_PROFILING_DISPATCH_P (false) \
&& _goacc_profiling_setup_p (__VA_ARGS__))
void goacc_profiling_dispatch (acc_prof_info *, acc_event_info *,
acc_api_info *);
#ifdef HAVE_ATTRIBUTE_VISIBILITY #ifdef HAVE_ATTRIBUTE_VISIBILITY
# pragma GCC visibility pop # pragma GCC visibility pop
#endif #endif
......
...@@ -108,7 +108,19 @@ acc_malloc (size_t s) ...@@ -108,7 +108,19 @@ acc_malloc (size_t s)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return malloc (s); return malloc (s);
return thr->dev->alloc_func (thr->dev->target_id, s); acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
void *res = thr->dev->alloc_func (thr->dev->target_id, s);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
return res;
} }
/* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event /* OpenACC 2.0a (3.2.16) doesn't specify what to do in the event
...@@ -131,6 +143,10 @@ acc_free (void *d) ...@@ -131,6 +143,10 @@ acc_free (void *d)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return free (d); return free (d);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
gomp_mutex_lock (&acc_dev->lock); gomp_mutex_lock (&acc_dev->lock);
/* We don't have to call lazy open here, as the ptr value must have /* We don't have to call lazy open here, as the ptr value must have
...@@ -151,6 +167,12 @@ acc_free (void *d) ...@@ -151,6 +167,12 @@ acc_free (void *d)
if (!acc_dev->free_func (acc_dev->target_id, d)) if (!acc_dev->free_func (acc_dev->target_id, d))
gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); gomp_fatal ("error in freeing device memory in %s", __FUNCTION__);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
} }
static void static void
...@@ -172,11 +194,26 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, ...@@ -172,11 +194,26 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
return; return;
} }
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
goacc_aq aq = get_goacc_asyncqueue (async); goacc_aq aq = get_goacc_asyncqueue (async);
if (from) if (from)
gomp_copy_dev2host (thr->dev, aq, h, d, s); gomp_copy_dev2host (thr->dev, aq, h, d, s);
else else
gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL); gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
} }
void void
...@@ -221,6 +258,9 @@ acc_deviceptr (void *h) ...@@ -221,6 +258,9 @@ acc_deviceptr (void *h)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h; return h;
/* In the following, no OpenACC Profiling Interface events can possibly be
generated. */
gomp_mutex_lock (&dev->lock); gomp_mutex_lock (&dev->lock);
n = lookup_host (dev, h, 1); n = lookup_host (dev, h, 1);
...@@ -258,6 +298,9 @@ acc_hostptr (void *d) ...@@ -258,6 +298,9 @@ acc_hostptr (void *d)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return d; return d;
/* In the following, no OpenACC Profiling Interface events can possibly be
generated. */
gomp_mutex_lock (&acc_dev->lock); gomp_mutex_lock (&acc_dev->lock);
n = lookup_dev (acc_dev->openacc.data_environ, d, 1); n = lookup_dev (acc_dev->openacc.data_environ, d, 1);
...@@ -295,6 +338,9 @@ acc_is_present (void *h, size_t s) ...@@ -295,6 +338,9 @@ acc_is_present (void *h, size_t s)
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h != NULL; return h != NULL;
/* In the following, no OpenACC Profiling Interface events can possibly be
generated. */
gomp_mutex_lock (&acc_dev->lock); gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s); n = lookup_host (acc_dev, h, s);
...@@ -339,6 +385,10 @@ acc_map_data (void *h, void *d, size_t s) ...@@ -339,6 +385,10 @@ acc_map_data (void *h, void *d, size_t s)
gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map", gomp_fatal ("[%p,+%d]->[%p,+%d] is a bad map",
(void *)h, (int)s, (void *)d, (int)s); (void *)h, (int)s, (void *)d, (int)s);
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
gomp_mutex_lock (&acc_dev->lock); gomp_mutex_lock (&acc_dev->lock);
if (lookup_host (acc_dev, h, s)) if (lookup_host (acc_dev, h, s))
...@@ -360,6 +410,12 @@ acc_map_data (void *h, void *d, size_t s) ...@@ -360,6 +410,12 @@ acc_map_data (void *h, void *d, size_t s)
tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes, tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
&kinds, true, GOMP_MAP_VARS_OPENACC); &kinds, true, GOMP_MAP_VARS_OPENACC);
tgt->list[0].key->refcount = REFCOUNT_INFINITY; tgt->list[0].key->refcount = REFCOUNT_INFINITY;
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
} }
gomp_mutex_lock (&acc_dev->lock); gomp_mutex_lock (&acc_dev->lock);
...@@ -380,6 +436,10 @@ acc_unmap_data (void *h) ...@@ -380,6 +436,10 @@ acc_unmap_data (void *h)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return; return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
size_t host_size; size_t host_size;
gomp_mutex_lock (&acc_dev->lock); gomp_mutex_lock (&acc_dev->lock);
...@@ -433,6 +493,12 @@ acc_unmap_data (void *h) ...@@ -433,6 +493,12 @@ acc_unmap_data (void *h)
gomp_mutex_unlock (&acc_dev->lock); gomp_mutex_unlock (&acc_dev->lock);
gomp_unmap_vars (t, true); gomp_unmap_vars (t, true);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
} }
#define FLAG_PRESENT (1 << 0) #define FLAG_PRESENT (1 << 0)
...@@ -456,6 +522,15 @@ present_create_copy (unsigned f, void *h, size_t s, int async) ...@@ -456,6 +522,15 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return h; return h;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
gomp_mutex_lock (&acc_dev->lock); gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s); n = lookup_host (acc_dev, h, s);
...@@ -518,6 +593,12 @@ present_create_copy (unsigned f, void *h, size_t s, int async) ...@@ -518,6 +593,12 @@ present_create_copy (unsigned f, void *h, size_t s, int async)
gomp_mutex_unlock (&acc_dev->lock); gomp_mutex_unlock (&acc_dev->lock);
} }
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
return d; return d;
} }
...@@ -599,6 +680,15 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname) ...@@ -599,6 +680,15 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return; return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
gomp_mutex_lock (&acc_dev->lock); gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s); n = lookup_host (acc_dev, h, s);
...@@ -672,6 +762,12 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname) ...@@ -672,6 +762,12 @@ delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
} }
gomp_mutex_unlock (&acc_dev->lock); gomp_mutex_unlock (&acc_dev->lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
} }
void void
...@@ -737,6 +833,15 @@ update_dev_host (int is_dev, void *h, size_t s, int async) ...@@ -737,6 +833,15 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) if (acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
return; return;
acc_prof_info prof_info;
acc_api_info api_info;
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
if (profiling_p)
{
prof_info.async = async;
prof_info.async_queue = prof_info.async;
}
gomp_mutex_lock (&acc_dev->lock); gomp_mutex_lock (&acc_dev->lock);
n = lookup_host (acc_dev, h, s); n = lookup_host (acc_dev, h, s);
...@@ -758,6 +863,12 @@ update_dev_host (int is_dev, void *h, size_t s, int async) ...@@ -758,6 +863,12 @@ update_dev_host (int is_dev, void *h, size_t s, int async)
gomp_copy_dev2host (acc_dev, aq, h, d, s); gomp_copy_dev2host (acc_dev, aq, h, d, s);
gomp_mutex_unlock (&acc_dev->lock); gomp_mutex_unlock (&acc_dev->lock);
if (profiling_p)
{
thr->prof_info = NULL;
thr->api_info = NULL;
}
} }
void void
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include "libgomp.h" #include "libgomp.h"
#include "oacc-plugin.h" #include "oacc-plugin.h"
#include "oacc-int.h" #include "oacc-int.h"
#include "acc_prof.h"
/* This plugin function is now obsolete. */ /* This plugin function is now obsolete. */
void void
...@@ -38,6 +39,14 @@ GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)), ...@@ -38,6 +39,14 @@ GOMP_PLUGIN_async_unmap_vars (void *ptr __attribute__((unused)),
gomp_fatal ("invalid plugin function"); gomp_fatal ("invalid plugin function");
} }
/* Return the TLS data for the current thread. */
struct goacc_thread *
GOMP_PLUGIN_goacc_thread (void)
{
return goacc_thread ();
}
/* Return the target-specific part of the TLS data for the current thread. */ /* Return the target-specific part of the TLS data for the current thread. */
void * void *
...@@ -57,3 +66,11 @@ GOMP_PLUGIN_acc_default_dim (unsigned int i) ...@@ -57,3 +66,11 @@ GOMP_PLUGIN_acc_default_dim (unsigned int i)
} }
return goacc_default_dims[i]; return goacc_default_dims[i];
} }
void
GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *prof_info,
acc_event_info *event_info,
acc_api_info *api_info)
{
goacc_profiling_dispatch (prof_info, event_info, api_info);
}
...@@ -27,8 +27,15 @@ ...@@ -27,8 +27,15 @@
#ifndef OACC_PLUGIN_H #ifndef OACC_PLUGIN_H
#define OACC_PLUGIN_H 1 #define OACC_PLUGIN_H 1
#include "oacc-int.h"
#include "acc_prof.h"
extern void GOMP_PLUGIN_async_unmap_vars (void *, int); extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
extern struct goacc_thread *GOMP_PLUGIN_goacc_thread (void);
extern void *GOMP_PLUGIN_acc_thread (void); extern void *GOMP_PLUGIN_acc_thread (void);
extern int GOMP_PLUGIN_acc_default_dim (unsigned int); extern int GOMP_PLUGIN_acc_default_dim (unsigned int);
extern void GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *,
acc_event_info *,
acc_api_info *);
#endif #endif
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
#include "libgomp-plugin.h" #include "libgomp-plugin.h"
#include "oacc-plugin.h" #include "oacc-plugin.h"
#include "gomp-constants.h" #include "gomp-constants.h"
#include "oacc-int.h"
#include <pthread.h> #include <pthread.h>
#include <cuda.h> #include <cuda.h>
...@@ -904,27 +905,122 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, ...@@ -904,27 +905,122 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
// num_gangs nctaid.x // num_gangs nctaid.x
// num_workers ntid.y // num_workers ntid.y
// vector length ntid.x // vector length ntid.x
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
acc_prof_info *prof_info = thr->prof_info;
acc_event_info enqueue_launch_event_info;
acc_api_info *api_info = thr->api_info;
bool profiling_p = __builtin_expect (prof_info != NULL, false);
if (profiling_p)
{
prof_info->event_type = acc_ev_enqueue_launch_start;
enqueue_launch_event_info.launch_event.event_type
= prof_info->event_type;
enqueue_launch_event_info.launch_event.valid_bytes
= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
enqueue_launch_event_info.launch_event.parent_construct
= acc_construct_parallel;
enqueue_launch_event_info.launch_event.implicit = 1;
enqueue_launch_event_info.launch_event.tool_info = NULL;
enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
enqueue_launch_event_info.launch_event.num_gangs
= dims[GOMP_DIM_GANG];
enqueue_launch_event_info.launch_event.num_workers
= dims[GOMP_DIM_WORKER];
enqueue_launch_event_info.launch_event.vector_length
= dims[GOMP_DIM_VECTOR];
api_info->device_api = acc_device_api_cuda;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
api_info);
}
kargs[0] = &dp; kargs[0] = &dp;
CUDA_CALL_ASSERT (cuLaunchKernel, function, CUDA_CALL_ASSERT (cuLaunchKernel, function,
dims[GOMP_DIM_GANG], 1, 1, dims[GOMP_DIM_GANG], 1, 1,
dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
0, stream, kargs, 0); 0, stream, kargs, 0);
if (profiling_p)
{
prof_info->event_type = acc_ev_enqueue_launch_end;
enqueue_launch_event_info.launch_event.event_type
= prof_info->event_type;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
api_info);
}
GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__, GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
targ_fn->launch->fn); targ_fn->launch->fn);
} }
void * openacc_get_current_cuda_context (void); void * openacc_get_current_cuda_context (void);
static void
goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
{
acc_prof_info *prof_info = thr->prof_info;
acc_event_info data_event_info;
acc_api_info *api_info = thr->api_info;
prof_info->event_type = acc_ev_alloc;
data_event_info.data_event.event_type = prof_info->event_type;
data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
data_event_info.data_event.parent_construct = acc_construct_parallel;
data_event_info.data_event.implicit = 1;
data_event_info.data_event.tool_info = NULL;
data_event_info.data_event.var_name = NULL;
data_event_info.data_event.bytes = s;
data_event_info.data_event.host_ptr = NULL;
data_event_info.data_event.device_ptr = dp;
api_info->device_api = acc_device_api_cuda;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}
static void * static void *
nvptx_alloc (size_t s) nvptx_alloc (size_t s)
{ {
CUdeviceptr d; CUdeviceptr d;
CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s); CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
bool profiling_p
= __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
if (profiling_p)
goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
return (void *) d; return (void *) d;
} }
static void
goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
{
acc_prof_info *prof_info = thr->prof_info;
acc_event_info data_event_info;
acc_api_info *api_info = thr->api_info;
prof_info->event_type = acc_ev_free;
data_event_info.data_event.event_type = prof_info->event_type;
data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
data_event_info.data_event.parent_construct = acc_construct_parallel;
data_event_info.data_event.implicit = 1;
data_event_info.data_event.tool_info = NULL;
data_event_info.data_event.var_name = NULL;
data_event_info.data_event.bytes = -1;
data_event_info.data_event.host_ptr = NULL;
data_event_info.data_event.device_ptr = p;
api_info->device_api = acc_device_api_cuda;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
}
static bool static bool
nvptx_free (void *p, struct ptx_device *ptx_dev) nvptx_free (void *p, struct ptx_device *ptx_dev)
{ {
...@@ -952,6 +1048,12 @@ nvptx_free (void *p, struct ptx_device *ptx_dev) ...@@ -952,6 +1048,12 @@ nvptx_free (void *p, struct ptx_device *ptx_dev)
} }
CUDA_CALL (cuMemFree, (CUdeviceptr) p); CUDA_CALL (cuMemFree, (CUdeviceptr) p);
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
bool profiling_p
= __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
if (profiling_p)
goacc_profiling_acc_ev_free (thr, p);
return true; return true;
} }
...@@ -1250,22 +1352,61 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum, ...@@ -1250,22 +1352,61 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
{ {
GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
acc_prof_info *prof_info = thr->prof_info;
acc_event_info data_event_info;
acc_api_info *api_info = thr->api_info;
bool profiling_p = __builtin_expect (prof_info != NULL, false);
void **hp = NULL; void **hp = NULL;
CUdeviceptr dp = 0; CUdeviceptr dp = 0;
if (mapnum > 0) if (mapnum > 0)
{ {
hp = alloca (mapnum * sizeof (void *)); size_t s = mapnum * sizeof (void *);
hp = alloca (s);
for (int i = 0; i < mapnum; i++) for (int i = 0; i < mapnum; i++)
hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]); hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *)); CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
if (profiling_p)
goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
} }
/* Copy the (device) pointers to arguments to the device (dp and hp might in /* Copy the (device) pointers to arguments to the device (dp and hp might in
fact have the same value on a unified-memory system). */ fact have the same value on a unified-memory system). */
if (mapnum > 0) if (mapnum > 0)
{
if (profiling_p)
{
prof_info->event_type = acc_ev_enqueue_upload_start;
data_event_info.data_event.event_type = prof_info->event_type;
data_event_info.data_event.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES;
data_event_info.data_event.parent_construct
= acc_construct_parallel;
data_event_info.data_event.implicit = 1; /* Always implicit. */
data_event_info.data_event.tool_info = NULL;
data_event_info.data_event.var_name = NULL;
data_event_info.data_event.bytes = mapnum * sizeof (void *);
data_event_info.data_event.host_ptr = hp;
data_event_info.data_event.device_ptr = (const void *) dp;
api_info->device_api = acc_device_api_cuda;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
api_info);
}
CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp, CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
mapnum * sizeof (void *)); mapnum * sizeof (void *));
if (profiling_p)
{
prof_info->event_type = acc_ev_enqueue_upload_end;
data_event_info.data_event.event_type = prof_info->event_type;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
api_info);
}
}
nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
dp, NULL); dp, NULL);
...@@ -1277,7 +1418,10 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum, ...@@ -1277,7 +1418,10 @@ GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
maybe_abort_msg); maybe_abort_msg);
else if (r != CUDA_SUCCESS) else if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
CUDA_CALL_ASSERT (cuMemFree, dp); CUDA_CALL_ASSERT (cuMemFree, dp);
if (profiling_p)
goacc_profiling_acc_ev_free (thr, (void *) dp);
} }
static void static void
...@@ -1296,23 +1440,54 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum, ...@@ -1296,23 +1440,54 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
{ {
GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
acc_prof_info *prof_info = thr->prof_info;
acc_event_info data_event_info;
acc_api_info *api_info = thr->api_info;
bool profiling_p = __builtin_expect (prof_info != NULL, false);
void **hp = NULL; void **hp = NULL;
CUdeviceptr dp = 0; CUdeviceptr dp = 0;
void **block = NULL; void **block = NULL;
if (mapnum > 0) if (mapnum > 0)
{ {
block = (void **) GOMP_PLUGIN_malloc ((mapnum + 2) * sizeof (void *)); size_t s = mapnum * sizeof (void *);
block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
hp = block + 2; hp = block + 2;
for (int i = 0; i < mapnum; i++) for (int i = 0; i < mapnum; i++)
hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]); hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
CUDA_CALL_ASSERT (cuMemAlloc, &dp, mapnum * sizeof (void *)); CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
if (profiling_p)
goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
} }
/* Copy the (device) pointers to arguments to the device (dp and hp might in /* Copy the (device) pointers to arguments to the device (dp and hp might in
fact have the same value on a unified-memory system). */ fact have the same value on a unified-memory system). */
if (mapnum > 0) if (mapnum > 0)
{ {
if (profiling_p)
{
prof_info->event_type = acc_ev_enqueue_upload_start;
data_event_info.data_event.event_type = prof_info->event_type;
data_event_info.data_event.valid_bytes
= _ACC_DATA_EVENT_INFO_VALID_BYTES;
data_event_info.data_event.parent_construct
= acc_construct_parallel;
data_event_info.data_event.implicit = 1; /* Always implicit. */
data_event_info.data_event.tool_info = NULL;
data_event_info.data_event.var_name = NULL;
data_event_info.data_event.bytes = mapnum * sizeof (void *);
data_event_info.data_event.host_ptr = hp;
data_event_info.data_event.device_ptr = (const void *) dp;
api_info->device_api = acc_device_api_cuda;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
api_info);
}
CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp, CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
mapnum * sizeof (void *), aq->cuda_stream); mapnum * sizeof (void *), aq->cuda_stream);
block[0] = (void *) dp; block[0] = (void *) dp;
...@@ -1320,7 +1495,16 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum, ...@@ -1320,7 +1495,16 @@ GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
struct nvptx_thread *nvthd = struct nvptx_thread *nvthd =
(struct nvptx_thread *) GOMP_PLUGIN_acc_thread (); (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
block[1] = (void *) nvthd->ptx_dev; block[1] = (void *) nvthd->ptx_dev;
if (profiling_p)
{
prof_info->event_type = acc_ev_enqueue_upload_end;
data_event_info.data_event.event_type = prof_info->event_type;
GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
api_info);
} }
}
nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
dp, aq->cuda_stream); dp, aq->cuda_stream);
......
/* Test dispatch of events to callbacks. */
#undef NDEBUG
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <acc_prof.h>
/* Use explicit 'copyin' clauses, to work around "'firstprivate'
optimizations", which will cause the value at the point of call to be used
(*before* any potential modifications done in callbacks), as opposed to its
address being taken, which then later gets dereferenced (*after* any
modifications done in callbacks). */
#define COPYIN(...) copyin(__VA_ARGS__)
/* See the 'DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT' reference in
'libgomp.texi'. */
#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
volatile // TODO PR90488
static int state = -1;
#define STATE_OP(state, op) \
do \
{ \
typeof (state) state_o = (state); \
(void) state_o; \
(state)op; \
DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
} \
while (0)
static acc_device_t acc_device_type;
static int acc_device_num;
static int num_gangs, num_workers, vector_length;
static void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s\n", __FUNCTION__);
assert (acc_device_type != acc_device_host);
assert (state == 0);
STATE_OP (state, = 1);
assert (prof_info->event_type == acc_ev_enqueue_launch_start);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (prof_info->version == _ACC_PROF_INFO_VERSION);
assert (prof_info->device_type == acc_device_type);
assert (prof_info->device_number == acc_device_num);
assert (prof_info->thread_id == -1);
assert (prof_info->async == acc_async_sync);
assert (prof_info->async_queue == prof_info->async);
assert (prof_info->src_file == NULL);
assert (prof_info->func_name == NULL);
assert (prof_info->line_no == -1);
assert (prof_info->end_line_no == -1);
assert (prof_info->func_line_no == -1);
assert (prof_info->func_end_line_no == -1);
assert (event_info->launch_event.event_type == prof_info->event_type);
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
assert (event_info->launch_event.parent_construct == acc_construct_parallel);
assert (event_info->launch_event.implicit == 1);
assert (event_info->launch_event.tool_info == NULL);
assert (event_info->launch_event.kernel_name != NULL);
{
const char *s = strstr (event_info->launch_event.kernel_name, "main");
assert (s != NULL);
s = strstr (s, "omp_fn");
assert (s != NULL);
}
if (num_gangs < 1)
assert (event_info->launch_event.num_gangs >= 1);
else
{
#ifdef __OPTIMIZE__
assert (event_info->launch_event.num_gangs == num_gangs);
#else
/* No parallelized OpenACC 'kernels' constructs. Unparallelized OpenACC
'kernels' constructs must get launched as 1 x 1 x 1 GPU kernels. */
assert (event_info->launch_event.num_gangs == 1);
#endif
}
if (num_workers < 1)
assert (event_info->launch_event.num_workers >= 1);
else
{
#ifdef __OPTIMIZE__
assert (event_info->launch_event.num_workers == num_workers);
#else
/* See 'num_gangs' above. */
assert (event_info->launch_event.num_workers == 1);
#endif
}
if (vector_length < 1)
assert (event_info->launch_event.vector_length >= 1);
else if (acc_device_type == acc_device_nvidia) /* ... is special. */
assert (event_info->launch_event.vector_length == 32);
else
{
#ifdef __OPTIMIZE__
assert (event_info->launch_event.vector_length == vector_length);
#else
/* See 'num_gangs' above. */
assert (event_info->launch_event.vector_length == 1);
#endif
}
if (acc_device_type == acc_device_host)
assert (api_info->device_api == acc_device_api_none);
else
assert (api_info->device_api == acc_device_api_cuda);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
assert (api_info->device_type == prof_info->device_type);
assert (api_info->vendor == -1);
assert (api_info->device_handle == NULL);
assert (api_info->context_handle == NULL);
assert (api_info->async_handle == NULL);
}
static acc_prof_reg reg;
static acc_prof_reg unreg;
static acc_prof_lookup_func lookup;
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
{
DEBUG_printf ("%s\n", __FUNCTION__);
reg = reg_;
unreg = unreg_;
lookup = lookup_;
}
int main()
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
STATE_OP (state, = 0);
reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
assert (state == 0);
acc_device_type = acc_get_device_type ();
acc_device_num = acc_get_device_num (acc_device_type);
assert (state == 0);
/* Parallelism dimensions: compiler/runtime decides. */
STATE_OP (state, = 0);
num_gangs = num_workers = vector_length = 0;
{
#define N 100
int x[N];
#pragma acc kernels
{
for (int i = 0; i < N; ++i)
x[i] = i * i;
}
if (acc_device_type == acc_device_host)
assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */
else
assert (state == 1);
for (int i = 0; i < N; ++i)
if (x[i] != i * i)
__builtin_abort ();
#undef N
}
/* Parallelism dimensions: literal. */
STATE_OP (state, = 0);
num_gangs = 30;
num_workers = 3;
vector_length = 5;
{
#define N 100
int x[N];
#pragma acc kernels \
num_gangs (30) num_workers (3) vector_length (5)
/* { dg-prune-output "using vector_length \\(32\\), ignoring 5" } */
{
for (int i = 0; i < N; ++i)
x[i] = i * i;
}
if (acc_device_type == acc_device_host)
assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */
else
assert (state == 1);
for (int i = 0; i < N; ++i)
if (x[i] != i * i)
__builtin_abort ();
#undef N
}
/* Parallelism dimensions: variable. */
STATE_OP (state, = 0);
num_gangs = 22;
num_workers = 5;
vector_length = 7;
{
#define N 100
int x[N];
#pragma acc kernels \
num_gangs (num_gangs) num_workers (num_workers) vector_length (vector_length)
/* { dg-prune-output "using vector_length \\(32\\), ignoring runtime setting" } */
{
for (int i = 0; i < N; ++i)
x[i] = i * i;
}
if (acc_device_type == acc_device_host)
assert (state == 0); /* No 'acc_ev_enqueue_launch_start'. */
else
assert (state == 1);
for (int i = 0; i < N; ++i)
if (x[i] != i * i)
__builtin_abort ();
#undef N
}
return 0;
}
/* Test the 'valid_bytes' magic. */
#undef NDEBUG
#include <assert.h>
#include <acc_prof.h>
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
static int ev_count_data;
static void cb_data_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (event_info->data_event.valid_bytes == _ACC_DATA_EVENT_INFO_VALID_BYTES);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
++ev_count_data;
}
static int ev_count_launch;
static void cb_launch_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
++ev_count_launch;
}
static int ev_count_other;
static void cb_other_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES);
assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
++ev_count_other;
}
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
{
DEBUG_printf ("%s\n", __FUNCTION__);
reg_ (acc_ev_device_init_start, cb_other_event, acc_reg);
reg_ (acc_ev_device_init_end, cb_other_event, acc_reg);
reg_ (acc_ev_device_shutdown_start, cb_other_event, acc_reg);
reg_ (acc_ev_device_shutdown_end, cb_other_event, acc_reg);
reg_ (acc_ev_runtime_shutdown, cb_other_event, acc_reg);
reg_ (acc_ev_create, cb_data_event, acc_reg);
reg_ (acc_ev_delete, cb_data_event, acc_reg);
reg_ (acc_ev_alloc, cb_data_event, acc_reg);
reg_ (acc_ev_free, cb_data_event, acc_reg);
reg_ (acc_ev_enter_data_start, cb_other_event, acc_reg);
reg_ (acc_ev_enter_data_end, cb_other_event, acc_reg);
reg_ (acc_ev_exit_data_start, cb_other_event, acc_reg);
reg_ (acc_ev_exit_data_end, cb_other_event, acc_reg);
reg_ (acc_ev_update_start, cb_other_event, acc_reg);
reg_ (acc_ev_update_end, cb_other_event, acc_reg);
reg_ (acc_ev_compute_construct_start, cb_other_event, acc_reg);
reg_ (acc_ev_compute_construct_end, cb_other_event, acc_reg);
reg_ (acc_ev_enqueue_launch_start, cb_launch_event, acc_reg);
reg_ (acc_ev_enqueue_launch_end, cb_launch_event, acc_reg);
reg_ (acc_ev_enqueue_upload_start, cb_data_event, acc_reg);
reg_ (acc_ev_enqueue_upload_end, cb_data_event, acc_reg);
reg_ (acc_ev_enqueue_download_start, cb_data_event, acc_reg);
reg_ (acc_ev_enqueue_download_end, cb_data_event, acc_reg);
reg_ (acc_ev_wait_start, cb_other_event, acc_reg);
reg_ (acc_ev_wait_end, cb_other_event, acc_reg);
}
/* Basic struct. */
typedef struct A
{
int a;
int b;
#define VALID_BYTES_A \
_ACC_PROF_VALID_BYTES_STRUCT (A, b, \
_ACC_PROF_VALID_BYTES_BASICTYPE (int))
} A;
/* Add a 'char' field. */
typedef struct B
{
int a;
int b;
char c;
#define VALID_BYTES_B \
_ACC_PROF_VALID_BYTES_STRUCT (B, c, \
_ACC_PROF_VALID_BYTES_BASICTYPE (char))
} B;
/* Add another 'char' field. */
typedef struct C
{
int a;
int b;
char c, d;
#define VALID_BYTES_C \
_ACC_PROF_VALID_BYTES_STRUCT (C, d, \
_ACC_PROF_VALID_BYTES_BASICTYPE (char))
} C;
/* Add two 'void *' fields. */
typedef struct D
{
int a;
int b;
char c, d;
void *e;
void *f;
#define VALID_BYTES_D \
_ACC_PROF_VALID_BYTES_STRUCT (D, f, \
_ACC_PROF_VALID_BYTES_BASICTYPE (void *))
} D;
/* Add another three 'char' fields. */
typedef struct E
{
int a;
int b;
char c, d;
void *e;
void *f;
char g, h, i;
#define VALID_BYTES_E \
_ACC_PROF_VALID_BYTES_STRUCT (E, i, \
_ACC_PROF_VALID_BYTES_BASICTYPE (char))
} E;
int main()
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
A A1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof A1, VALID_BYTES_A);
assert (VALID_BYTES_A <= sizeof A1);
DEBUG_printf ("&A1=%p, &A1.b=%p\n", &A1, &A1.b);
assert (((char *) &A1) + VALID_BYTES_A == (char *) (&A1.b + 1));
B B1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof B1, VALID_BYTES_B);
assert (VALID_BYTES_B <= sizeof B1);
DEBUG_printf ("&B1=%p, &B1.c=%p\n", &B1, &B1.c);
assert (((char *) &B1) + VALID_BYTES_B == (char *) (&B1.c + 1));
assert (VALID_BYTES_B == VALID_BYTES_A + 1 * sizeof (char));
C C1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof C1, VALID_BYTES_C);
assert (VALID_BYTES_C <= sizeof C1);
DEBUG_printf ("&C1=%p, &C1.d=%p\n", &C1, &C1.d);
assert (((char *) &C1) + VALID_BYTES_C == (char *) (&C1.d + 1));
assert (VALID_BYTES_C == VALID_BYTES_B + 1 * sizeof (char));
D D1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof D1, VALID_BYTES_D);
assert (VALID_BYTES_D <= sizeof D1);
DEBUG_printf ("&D1=%p, &D1.f=%p\n", &D1, &D1.f);
assert (((char *) &D1) + VALID_BYTES_D == (char *) (&D1.f + 1));
assert (VALID_BYTES_D > VALID_BYTES_C);
E E1;
DEBUG_printf ("s=%zd, vb=%zd\n", sizeof E1, VALID_BYTES_E);
assert (VALID_BYTES_E <= sizeof E1);
DEBUG_printf ("&E1=%p, &E1.i=%p\n", &E1, &E1.i);
assert (((char *) &E1) + VALID_BYTES_E == (char *) (&E1.i + 1));
assert (VALID_BYTES_E == VALID_BYTES_D + 3 * sizeof (char));
ev_count_data = 0;
ev_count_launch = 0;
ev_count_other = 0;
/* Trigger tests done in 'cb_*' functions. */
int host;
#pragma acc parallel copyout (host)
{
asm volatile ("" : : : "memory"); // TODO PR90488
host = acc_on_device (acc_device_host);
}
DEBUG_printf ("ev_count_data = %d\n", ev_count_data);
if (host)
assert (ev_count_data == 0);
else
{
/* We don't know exactly how many data events to expect, but we at least
expect some. */
assert (ev_count_data > 0);
}
DEBUG_printf ("ev_count_launch = %d\n", ev_count_launch);
if (host)
assert (ev_count_data == 0);
else
{
/* We expect two launch events, 'acc_ev_enqueue_launch_start',
'acc_ev_enqueue_launch_end'. */
assert (ev_count_launch == 2);
}
DEBUG_printf ("ev_count_other = %d\n", ev_count_other);
/* We don't know exactly how many other events to expect, but we at least
expect 'acc_ev_device_init_start', 'acc_ev_device_init_end',
'acc_ev_compute_construct_start', 'acc_ev_compute_construct_end'. */
assert (ev_count_other >= 4);
return 0;
}
/* Test the 'version' field of 'acc_prof_info'. */
#undef NDEBUG
#include <assert.h>
#include <acc_prof.h>
#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
static int ev_count;
static void cb_any_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
{
DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
assert (prof_info->version == 201711);
++ev_count;
}
void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
{
DEBUG_printf ("%s\n", __FUNCTION__);
reg_ (acc_ev_device_init_start, cb_any_event, acc_reg);
reg_ (acc_ev_device_init_end, cb_any_event, acc_reg);
reg_ (acc_ev_device_shutdown_start, cb_any_event, acc_reg);
reg_ (acc_ev_device_shutdown_end, cb_any_event, acc_reg);
reg_ (acc_ev_runtime_shutdown, cb_any_event, acc_reg);
reg_ (acc_ev_create, cb_any_event, acc_reg);
reg_ (acc_ev_delete, cb_any_event, acc_reg);
reg_ (acc_ev_alloc, cb_any_event, acc_reg);
reg_ (acc_ev_free, cb_any_event, acc_reg);
reg_ (acc_ev_enter_data_start, cb_any_event, acc_reg);
reg_ (acc_ev_enter_data_end, cb_any_event, acc_reg);
reg_ (acc_ev_exit_data_start, cb_any_event, acc_reg);
reg_ (acc_ev_exit_data_end, cb_any_event, acc_reg);
reg_ (acc_ev_update_start, cb_any_event, acc_reg);
reg_ (acc_ev_update_end, cb_any_event, acc_reg);
reg_ (acc_ev_compute_construct_start, cb_any_event, acc_reg);
reg_ (acc_ev_compute_construct_end, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_launch_start, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_launch_end, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_upload_start, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_upload_end, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_download_start, cb_any_event, acc_reg);
reg_ (acc_ev_enqueue_download_end, cb_any_event, acc_reg);
reg_ (acc_ev_wait_start, cb_any_event, acc_reg);
reg_ (acc_ev_wait_end, cb_any_event, acc_reg);
}
int main()
{
acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
ev_count = 0;
/* Trigger tests done in 'cb_*' functions. */
#pragma acc parallel
{
asm volatile ("" : : : "memory"); // TODO PR90488
}
DEBUG_printf ("ev_count = %d\n", ev_count);
/* We don't know exactly how many events to expect, but we at least expect
'acc_ev_device_init_start', 'acc_ev_device_init_end',
'acc_ev_compute_construct_start', 'acc_ev_compute_construct_end'. */
assert (ev_count >= 4);
return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment