Commit 5a28e272 by Kwok Cheung Yeung

[amdgcn] Scale number of threads/workers with VGPR usage

2020-01-31  Kwok Cheung Yeung  <kcy@codesourcery.com>

	gcc/
	* config/gcn/mkoffload.c (process_asm): Add sgpr_count and vgpr_count
	to definition of hsa_kernel_description.  Parse assembly to find SGPR
	and VGPR count of kernel and store in hsa_kernel_description.

	libgomp/
	* plugin/plugin-gcn.c (struct hsa_kernel_description): Add sgpr_count
	and vgpr_count fields.
	(struct kernel_info): Add a field for a hsa_kernel_description.
	(run_kernel): Reduce the number of threads/workers if the requested
	number would require too many VGPRs.
	(init_basic_kernel_info): Initialize description field with
	the hsa_kernel_description entry for the kernel.
parent 6a97d9ea
2020-01-31 Kwok Cheung Yeung <kcy@codesourcery.com>
* config/gcn/mkoffload.c (process_asm): Add sgpr_count and vgpr_count
to definition of hsa_kernel_description. Parse assembly to find SGPR
and VGPR count of kernel and store in hsa_kernel_description.
2020-01-31 Tamar Christina <tamar.christina@arm.com> 2020-01-31 Tamar Christina <tamar.christina@arm.com>
PR rtl-optimization/91838 PR rtl-optimization/91838
......
...@@ -211,12 +211,13 @@ access_check (const char *name, int mode) ...@@ -211,12 +211,13 @@ access_check (const char *name, int mode)
static void static void
process_asm (FILE *in, FILE *out, FILE *cfile) process_asm (FILE *in, FILE *out, FILE *cfile)
{ {
int fn_count = 0, var_count = 0, dims_count = 0; int fn_count = 0, var_count = 0, dims_count = 0, regcount_count = 0;
struct obstack fns_os, vars_os, varsizes_os, dims_os; struct obstack fns_os, vars_os, varsizes_os, dims_os, regcounts_os;
obstack_init (&fns_os); obstack_init (&fns_os);
obstack_init (&vars_os); obstack_init (&vars_os);
obstack_init (&varsizes_os); obstack_init (&varsizes_os);
obstack_init (&dims_os); obstack_init (&dims_os);
obstack_init (&regcounts_os);
struct oaccdims struct oaccdims
{ {
...@@ -224,13 +225,20 @@ process_asm (FILE *in, FILE *out, FILE *cfile) ...@@ -224,13 +225,20 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
char *name; char *name;
} dim; } dim;
/* Register usage parsed from the assembly for one kernel.  The single
   scratch instance declared here is filled in while scanning the input and
   copied into regcounts_os when the end of the kernel code block is seen.  */
struct regcount
{
/* Value parsed from a "wavefront_sgpr_count = N" line; reset to -1 when a
   kernel code block starts and after each record is stored.  */
int sgpr_count;
/* Value parsed from a "workitem_vgpr_count = N" line; reset to -1 when a
   kernel code block starts and after each record is stored.  */
int vgpr_count;
/* Name read from the ".amdgpu_hsa_kernel" directive.  Allocated by the %ms
   conversion of sscanf and freed after the kernel table has been written.  */
char *kernel_name;
} regcount;
/* Always add _init_array and _fini_array as kernels. */ /* Always add _init_array and _fini_array as kernels. */
obstack_ptr_grow (&fns_os, xstrdup ("_init_array")); obstack_ptr_grow (&fns_os, xstrdup ("_init_array"));
obstack_ptr_grow (&fns_os, xstrdup ("_fini_array")); obstack_ptr_grow (&fns_os, xstrdup ("_fini_array"));
fn_count += 2; fn_count += 2;
char buf[1000]; char buf[1000];
enum { IN_CODE, IN_VARS, IN_FUNCS } state = IN_CODE; enum { IN_CODE, IN_AMD_KERNEL_CODE_T, IN_VARS, IN_FUNCS } state = IN_CODE;
while (fgets (buf, sizeof (buf), in)) while (fgets (buf, sizeof (buf), in))
{ {
switch (state) switch (state)
...@@ -243,6 +251,22 @@ process_asm (FILE *in, FILE *out, FILE *cfile) ...@@ -243,6 +251,22 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
obstack_grow (&dims_os, &dim, sizeof (dim)); obstack_grow (&dims_os, &dim, sizeof (dim));
dims_count++; dims_count++;
} }
else if (sscanf (buf, " .amdgpu_hsa_kernel %ms\n",
&regcount.kernel_name) == 1)
break;
break;
}
case IN_AMD_KERNEL_CODE_T:
{
gcc_assert (regcount.kernel_name);
if (sscanf (buf, " wavefront_sgpr_count = %d\n",
&regcount.sgpr_count) == 1)
break;
else if (sscanf (buf, " workitem_vgpr_count = %d\n",
&regcount.vgpr_count) == 1)
break;
break; break;
} }
case IN_VARS: case IN_VARS:
...@@ -282,19 +306,36 @@ process_asm (FILE *in, FILE *out, FILE *cfile) ...@@ -282,19 +306,36 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
state = IN_VARS; state = IN_VARS;
else if (sscanf (buf, " .section .gnu.offload_funcs%c", &dummy) > 0) else if (sscanf (buf, " .section .gnu.offload_funcs%c", &dummy) > 0)
state = IN_FUNCS; state = IN_FUNCS;
else if (sscanf (buf, " .amd_kernel_code_%c", &dummy) > 0)
{
state = IN_AMD_KERNEL_CODE_T;
regcount.sgpr_count = regcount.vgpr_count = -1;
}
else if (sscanf (buf, " .section %c", &dummy) > 0 else if (sscanf (buf, " .section %c", &dummy) > 0
|| sscanf (buf, " .text%c", &dummy) > 0 || sscanf (buf, " .text%c", &dummy) > 0
|| sscanf (buf, " .bss%c", &dummy) > 0 || sscanf (buf, " .bss%c", &dummy) > 0
|| sscanf (buf, " .data%c", &dummy) > 0 || sscanf (buf, " .data%c", &dummy) > 0
|| sscanf (buf, " .ident %c", &dummy) > 0) || sscanf (buf, " .ident %c", &dummy) > 0)
state = IN_CODE; state = IN_CODE;
else if (sscanf (buf, " .end_amd_kernel_code_%c", &dummy) > 0)
{
state = IN_CODE;
gcc_assert (regcount.kernel_name != NULL
&& regcount.sgpr_count >= 0
&& regcount.vgpr_count >= 0);
obstack_grow (&regcounts_os, &regcount, sizeof (regcount));
regcount_count++;
regcount.kernel_name = NULL;
regcount.sgpr_count = regcount.vgpr_count = -1;
}
if (state == IN_CODE) if (state == IN_CODE || state == IN_AMD_KERNEL_CODE_T)
fputs (buf, out); fputs (buf, out);
} }
char **fns = XOBFINISH (&fns_os, char **); char **fns = XOBFINISH (&fns_os, char **);
struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *); struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *);
struct regcount *regcounts = XOBFINISH (&regcounts_os, struct regcount *);
fprintf (cfile, "#include <stdlib.h>\n"); fprintf (cfile, "#include <stdlib.h>\n");
fprintf (cfile, "#include <stdbool.h>\n\n"); fprintf (cfile, "#include <stdbool.h>\n\n");
...@@ -322,6 +363,8 @@ process_asm (FILE *in, FILE *out, FILE *cfile) ...@@ -322,6 +363,8 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
fprintf (cfile, "static const struct hsa_kernel_description {\n" fprintf (cfile, "static const struct hsa_kernel_description {\n"
" const char *name;\n" " const char *name;\n"
" int oacc_dims[3];\n" " int oacc_dims[3];\n"
" int sgpr_count;\n"
" int vgpr_count;\n"
"} gcn_kernels[] = {\n "); "} gcn_kernels[] = {\n ");
dim.d[0] = dim.d[1] = dim.d[2] = 0; dim.d[0] = dim.d[1] = dim.d[2] = 0;
const char *comma; const char *comma;
...@@ -329,15 +372,24 @@ process_asm (FILE *in, FILE *out, FILE *cfile) ...@@ -329,15 +372,24 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
{ {
/* Find if we recorded dimensions for this function. */ /* Find if we recorded dimensions for this function. */
int *d = dim.d; /* Previously zeroed. */ int *d = dim.d; /* Previously zeroed. */
int sgpr_count = 0;
int vgpr_count = 0;
for (int j = 0; j < dims_count; j++) for (int j = 0; j < dims_count; j++)
if (strcmp (fns[i], dims[j].name) == 0) if (strcmp (fns[i], dims[j].name) == 0)
{ {
d = dims[j].d; d = dims[j].d;
break; break;
} }
for (int j = 0; j < regcount_count; j++)
if (strcmp (fns[i], regcounts[j].kernel_name) == 0)
{
sgpr_count = regcounts[j].sgpr_count;
vgpr_count = regcounts[j].vgpr_count;
break;
}
fprintf (cfile, "%s{\"%s\", {%d, %d, %d}}", comma, fprintf (cfile, "%s{\"%s\", {%d, %d, %d}, %d, %d}", comma,
fns[i], d[0], d[1], d[2]); fns[i], d[0], d[1], d[2], sgpr_count, vgpr_count);
free (fns[i]); free (fns[i]);
} }
...@@ -346,7 +398,10 @@ process_asm (FILE *in, FILE *out, FILE *cfile) ...@@ -346,7 +398,10 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
obstack_free (&fns_os, NULL); obstack_free (&fns_os, NULL);
for (i = 0; i < dims_count; i++) for (i = 0; i < dims_count; i++)
free (dims[i].name); free (dims[i].name);
for (i = 0; i < regcount_count; i++)
free (regcounts[i].kernel_name);
obstack_free (&dims_os, NULL); obstack_free (&dims_os, NULL);
obstack_free (&regcounts_os, NULL);
} }
/* Embed an object file into a C source file. */ /* Embed an object file into a C source file. */
......
2020-01-31 Kwok Cheung Yeung <kcy@codesourcery.com>
* plugin/plugin-gcn.c (struct hsa_kernel_description): Add sgpr_count
and vgpr_count fields.
(struct kernel_info): Add a field for a hsa_kernel_description.
(run_kernel): Reduce the number of threads/workers if the requested
number would require too many VGPRs.
(init_basic_kernel_info): Initialize description field with
the hsa_kernel_description entry for the kernel.
2020-01-29 Tobias Burnus <tobias@codesourcery.com> 2020-01-29 Tobias Burnus <tobias@codesourcery.com>
PR bootstrap/93409 PR bootstrap/93409
......
...@@ -371,6 +371,8 @@ struct hsa_kernel_description ...@@ -371,6 +371,8 @@ struct hsa_kernel_description
{ {
const char *name; const char *name;
int oacc_dims[3]; /* Only present for GCN kernels. */ int oacc_dims[3]; /* Only present for GCN kernels. */
int sgpr_count;
int vpgr_count;
}; };
/* Mkoffload uses this structure to describe an offload variable. */ /* Mkoffload uses this structure to describe an offload variable. */
...@@ -478,6 +480,8 @@ struct kernel_info ...@@ -478,6 +480,8 @@ struct kernel_info
struct agent_info *agent; struct agent_info *agent;
/* The specific module where the kernel takes place. */ /* The specific module where the kernel takes place. */
struct module_info *module; struct module_info *module;
/* Information provided by mkoffload associated with the kernel. */
struct hsa_kernel_description *description;
/* Mutex enforcing that at most one thread ever initializes a kernel for /* Mutex enforcing that at most one thread ever initializes a kernel for
use. A thread should have locked agent->module_rwlock for reading before use. A thread should have locked agent->module_rwlock for reading before
acquiring it. */ acquiring it. */
...@@ -2102,6 +2106,24 @@ run_kernel (struct kernel_info *kernel, void *vars, ...@@ -2102,6 +2106,24 @@ run_kernel (struct kernel_info *kernel, void *vars,
struct GOMP_kernel_launch_attributes *kla, struct GOMP_kernel_launch_attributes *kla,
struct goacc_asyncqueue *aq, bool module_locked) struct goacc_asyncqueue *aq, bool module_locked)
{ {
GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
kernel->description->vpgr_count);
/* Reduce the number of threads/workers if there are insufficient
VGPRs available to run the kernels together. */
if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
{
int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
int max_threads = (256 / granulated_vgprs) * 4;
if (kla->gdims[2] > max_threads)
{
GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
" per team/gang - reducing to %d threads/workers.\n",
kla->gdims[2], max_threads);
kla->gdims[2] = max_threads;
}
}
GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id, GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
(aq ? aq->id : 0)); (aq ? aq->id : 0));
GCN_DEBUG ("GCN launch attribs: gdims:["); GCN_DEBUG ("GCN launch attribs: gdims:[");
...@@ -2303,6 +2325,7 @@ init_basic_kernel_info (struct kernel_info *kernel, ...@@ -2303,6 +2325,7 @@ init_basic_kernel_info (struct kernel_info *kernel,
kernel->agent = agent; kernel->agent = agent;
kernel->module = module; kernel->module = module;
kernel->name = d->name; kernel->name = d->name;
kernel->description = d;
if (pthread_mutex_init (&kernel->init_mutex, NULL)) if (pthread_mutex_init (&kernel->init_mutex, NULL))
{ {
GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex"); GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment