Commit bd9b3d3d by Cesar Philippidis, committed by Tom de Vries

[nvptx] Use CUDA driver API to select default runtime launch geometry

Starting with version 6.5, the CUDA driver API offers a set of runtime functions
to calculate several occupancy-related measures, as a replacement for the
occupancy calculator spreadsheet.
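For reference, a minimal sketch of how this entry point is called directly
through the driver API; the function and variable names are illustrative,
error handling is omitted, and a CUfunction obtained via cuModuleGetFunction
is assumed:

#include <stdio.h>
#include <cuda.h>

/* Ask the driver for the block size that maximizes occupancy of KERNEL,
   assuming no per-block dynamic shared memory and no block-size limit.  */
static void
query_default_geometry (CUfunction kernel)
{
  int min_grid_size, block_size;
  cuOccupancyMaxPotentialBlockSize (&min_grid_size, &block_size, kernel,
                                    NULL, 0, 0);
  printf ("suggested grid = %d, block = %d\n", min_grid_size, block_size);
}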

This patch adds a heuristic for default runtime launch geometry, based on the
new runtime function cuOccupancyMaxPotentialBlockSize.
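In outline, the heuristic (see the nvptx_exec change below) turns the driver's
suggested grid and block sizes into default gang/worker/vector dimensions
roughly as in this simplified sketch; the helper and its parameters are
illustrative, not the actual plugin code:

/* GRIDS and BLOCKS are the values returned by
   cuOccupancyMaxPotentialBlockSize; WARP_SIZE is the device warp size.  */
static void
pick_default_dims (int grids, int blocks, int warp_size,
                   int *gangs, int *workers, int *vectors)
{
  /* One warp per vector.  */
  *vectors = warp_size;
  /* The factor 2 oversubscribes the multiprocessors so threads do not
     idle when there is little work per gang.  */
  *gangs = 2 * grids * (blocks / warp_size);
  /* Fill a block with workers.  */
  *workers = blocks / *vectors;
}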

Built on x86_64 with nvptx accelerator and ran the libgomp testsuite.

2018-08-13  Cesar Philippidis  <cesar@codesourcery.com>
	    Tom de Vries  <tdevries@suse.de>

	PR target/85590
	* plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
	(cuOccupancyMaxPotentialBlockSize): Declare.
	* plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New
	CUDA_ONE_CALL_MAYBE_NULL.
	* plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define
	CUoccupancyB2DSize and declare
	cuOccupancyMaxPotentialBlockSize.
	(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
	default num_gangs and num_workers when the driver supports it.

Co-Authored-By: Tom de Vries <tdevries@suse.de>

From-SVN: r263505
--- libgomp/plugin/cuda-lib.def
+++ libgomp/plugin/cuda-lib.def
@@ -41,6 +41,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
 CUDA_ONE_CALL (cuModuleLoad)
 CUDA_ONE_CALL (cuModuleLoadData)
 CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
 CUDA_ONE_CALL (cuStreamCreate)
 CUDA_ONE_CALL (cuStreamDestroy)
 CUDA_ONE_CALL (cuStreamQuery)
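Because the symbol may be missing from older CUDA driver libraries, it is
registered with CUDA_ONE_CALL_MAYBE_NULL rather than CUDA_ONE_CALL, and
nvptx_exec below only uses it after a CUDA_CALL_EXISTS check. As a rough,
hypothetical illustration of that optional-symbol pattern (not the plugin's
actual dispatch code):

#include <stddef.h>
#include <dlfcn.h>

typedef int (*occupancy_fn) (int *, int *, void *, void *, size_t, int);

/* Resolve the optional entry point at runtime; returns NULL when the
   installed libcuda predates CUDA 6.5.  */
static occupancy_fn
lookup_occupancy (void *libcuda)
{
  return (occupancy_fn) dlsym (libcuda, "cuOccupancyMaxPotentialBlockSize");
}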
--- libgomp/plugin/cuda/cuda.h
+++ libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;
 
 typedef enum {
@@ -170,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
 CUresult cuModuleLoad (CUmodule *, const char *);
 CUresult cuModuleLoadData (CUmodule *, const void *);
 CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+					  CUoccupancyB2DSize, size_t, int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
--- libgomp/plugin/plugin-nvptx.c
+++ libgomp/plugin/plugin-nvptx.c
@@ -61,9 +61,12 @@ CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
 			const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
 #else
+typedef size_t (*CUoccupancyB2DSize)(int);
 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
 			   const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+					  CUoccupancyB2DSize, size_t, int);
 #endif
 
 #define DO_PRAGMA(x) _Pragma (#x)
@@ -1200,21 +1203,77 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
       {
 	bool default_dim_p[GOMP_DIM_MAX];
 	for (i = 0; i != GOMP_DIM_MAX; i++)
+	  default_dim_p[i] = !dims[i];
+
+	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
 	  {
-	    default_dim_p[i] = !dims[i];
-	    if (default_dim_p[i])
-	      dims[i] = nvthd->ptx_dev->default_dims[i];
+	    for (i = 0; i != GOMP_DIM_MAX; i++)
+	      if (default_dim_p[i])
+		dims[i] = nvthd->ptx_dev->default_dims[i];
+
+	    if (default_dim_p[GOMP_DIM_VECTOR])
+	      dims[GOMP_DIM_VECTOR]
+		= MIN (dims[GOMP_DIM_VECTOR],
+		       (targ_fn->max_threads_per_block / warp_size
+			* warp_size));
+
+	    if (default_dim_p[GOMP_DIM_WORKER])
+	      dims[GOMP_DIM_WORKER]
+		= MIN (dims[GOMP_DIM_WORKER],
+		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
 	  }
+	else
+	  {
+	    /* Handle the case that the compiler allows the runtime to choose
+	       the vector-length conservatively, by ignoring
+	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
+	       it.  */
+	    int vectors = 0;
+	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
+	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
+	       exceed targ_fn->max_threads_per_block.  */
+	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
+	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
+	    int grids, blocks;
+
+	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+			      &blocks, function, NULL, 0,
+			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+			       "grid = %d, block = %d\n", grids, blocks);
+
+	    /* Keep the num_gangs proportional to the block size.  In
+	       the case where a block size is limited by shared-memory
+	       or the register file capacity, the runtime will not
+	       excessively over-assign gangs to the multiprocessor
+	       units if their state is going to be swapped out even
+	       more than necessary.  The constant factor 2 is there to
+	       prevent threads from idling when there is insufficient
+	       work for them.  */
+	    if (gangs == 0)
+	      gangs = 2 * grids * (blocks / warp_size);
+
+	    if (vectors == 0)
+	      vectors = warp_size;
+
+	    if (workers == 0)
+	      {
+		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
+				      ? vectors
+				      : dims[GOMP_DIM_VECTOR]);
+		workers = blocks / actual_vectors;
+	      }
 
-	if (default_dim_p[GOMP_DIM_VECTOR])
-	  dims[GOMP_DIM_VECTOR]
-	    = MIN (dims[GOMP_DIM_VECTOR],
-		   (targ_fn->max_threads_per_block / warp_size * warp_size));
-
-	if (default_dim_p[GOMP_DIM_WORKER])
-	  dims[GOMP_DIM_WORKER]
-	    = MIN (dims[GOMP_DIM_WORKER],
-		   targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+	    for (i = 0; i != GOMP_DIM_MAX; i++)
+	      if (default_dim_p[i])
+		switch (i)
+		  {
+		  case GOMP_DIM_GANG: dims[i] = gangs; break;
+		  case GOMP_DIM_WORKER: dims[i] = workers; break;
+		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+		  default: GOMP_PLUGIN_fatal ("invalid dim");
+		  }
+	  }
       }
     }
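To make the new default path concrete with made-up numbers: if
cuOccupancyMaxPotentialBlockSize suggests grids = 20 and blocks = 1024 on a
device with warp_size = 32, and the user left all dimensions unset, the code
above picks vectors = 32 (one warp), workers = 1024 / 32 = 32, and
gangs = 2 * 20 * (1024 / 32) = 1280.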