Commit 0c6c2f5f by Cesar Philippidis Committed by Tom de Vries

[libgomp, nvptx] Move device property sampling from nvptx_exec to nvptx_open

Move sampling of device properties from nvptx_exec to nvptx_open, and assume
the sampling always succeeds.  This simplifies the default dimension
initialization code in nvptx_open.

2018-07-26  Cesar Philippidis  <cesar@codesourcery.com>
	    Tom de Vries  <tdevries@suse.de>

	* plugin/plugin-nvptx.c (struct ptx_device): Add warp_size,
	max_threads_per_block and max_threads_per_multiprocessor fields.
	(nvptx_open_device): Initialize new fields.
	(nvptx_exec): Use num_sms, and new fields.

Co-Authored-By: Tom de Vries <tdevries@suse.de>

From-SVN: r262996
parent 328aa787
2018-07-26 Cesar Philippidis <cesar@codesourcery.com>
Tom de Vries <tdevries@suse.de>
* plugin/plugin-nvptx.c (struct ptx_device): Add warp_size,
max_threads_per_block and max_threads_per_multiprocessor fields.
(nvptx_open_device): Initialize new fields.
(nvptx_exec): Use num_sms, and new fields.
2018-07-26  Tom de Vries  <tdevries@suse.de>
	* testsuite/libgomp.oacc-fortran/lib-12.f90: Move acc_async_test calls
......
@@ -414,6 +414,9 @@ struct ptx_device
   int num_sms;
   int regs_per_block;
   int regs_per_sm;
+  int warp_size;
+  int max_threads_per_block;
+  int max_threads_per_multiprocessor;
   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
@@ -800,6 +803,15 @@ nvptx_open_device (int n)
       GOMP_PLUGIN_error ("Only warp size 32 is supported");
       return NULL;
     }
+  ptx_dev->warp_size = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
+		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+  ptx_dev->max_threads_per_block = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
+		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
+  ptx_dev->max_threads_per_multiprocessor = pi;
+
   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
			  CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
@@ -1150,33 +1162,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
       for (int i = 0; i < GOMP_DIM_MAX; ++i)
	 default_dims[i] = GOMP_PLUGIN_acc_default_dim (i);

-      int warp_size, block_size, dev_size, cpu_size;
-      CUdevice dev = nvptx_thread()->ptx_dev->dev;
-      /* 32 is the default for known hardware.  */
-      int gang = 0, worker = 32, vector = 32;
-      CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
-
-      cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
-      cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
-      cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
-      cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
-
-      if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
-			     dev) == CUDA_SUCCESS
-	  && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
-				dev) == CUDA_SUCCESS
-	  && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
-				dev) == CUDA_SUCCESS
-	  && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
-				dev) == CUDA_SUCCESS)
-	{
-	  GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
-			     " dev_size=%d, cpu_size=%d\n",
-			     warp_size, block_size, dev_size, cpu_size);
-	  gang = (cpu_size / block_size) * dev_size;
-	  worker = block_size / warp_size;
-	  vector = warp_size;
-	}
+      int gang, worker, vector;
+      {
+	int warp_size = nvthd->ptx_dev->warp_size;
+	int block_size = nvthd->ptx_dev->max_threads_per_block;
+	int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
+	int dev_size = nvthd->ptx_dev->num_sms;
+
+	GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
+			   " dev_size=%d, cpu_size=%d\n",
+			   warp_size, block_size, dev_size, cpu_size);
+
+	gang = (cpu_size / block_size) * dev_size;
+	worker = block_size / warp_size;
+	vector = warp_size;
+      }

       /* There is no upper bound on the gang size.  The best size
	  matches the hardware configuration.  Logical gangs are
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment