Commit 0024c320 by Tom de Vries Committed by Tom de Vries

[nvptx] Add support for a per-worker broadcast buffer and barrier

Add support for a per-worker broadcast buffer and barrier, to be used for
OpenACC vector_length values larger than the warp size.

2019-01-07  Tom de Vries  <tdevries@suse.de>

	* config/nvptx/nvptx.c (oacc_bcast_partition): Declare.
	(nvptx_option_override): Init oacc_bcast_partition.
	(nvptx_init_oacc_workers): New function.
	(nvptx_declare_function_name): Call nvptx_init_oacc_workers.
	(nvptx_needs_shared_bcast): New function.
	(nvptx_find_par): Generalize to enable vectors to use shared-memory
	to propagate state.
	(nvptx_shared_propagate): Initialize vector bcast partition and
	synchronization state.
	(nvptx_single): Generalize to enable vectors to use shared-memory
	to propagate state.
	(nvptx_process_pars): Likewise.
	(nvptx_set_current_function): Initialize oacc_bcast_partition.
	* config/nvptx/nvptx.h (struct machine_function): Add
	bcast_partition and sync_bar members.

From-SVN: r267629
parent ccc0d492
2019-01-07 Tom de Vries <tdevries@suse.de> 2019-01-07 Tom de Vries <tdevries@suse.de>
* config/nvptx/nvptx.c (oacc_bcast_partition): Declare.
(nvptx_option_override): Init oacc_bcast_partition.
(nvptx_init_oacc_workers): New function.
(nvptx_declare_function_name): Call nvptx_init_oacc_workers.
(nvptx_needs_shared_bcast): New function.
(nvptx_find_par): Generalize to enable vectors to use shared-memory
to propagate state.
(nvptx_shared_propagate): Initialize vector bcast partition and
synchronization state.
(nvptx_single): Generalize to enable vectors to use shared-memory
to propagate state.
(nvptx_process_pars): Likewise.
(nvptx_set_current_function): Initialize oacc_bcast_partition.
* config/nvptx/nvptx.h (struct machine_function): Add
bcast_partition and sync_bar members.
2019-01-07 Tom de Vries <tdevries@suse.de>
* config/nvptx/nvptx.c (nvptx_welformed_vector_length_p) * config/nvptx/nvptx.c (nvptx_welformed_vector_length_p)
(nvptx_apply_dim_limits): New function. (nvptx_apply_dim_limits): New function.
(nvptx_goacc_validate_dims_1): Allow PTX_MAX_VECTOR_LENGTH larger than (nvptx_goacc_validate_dims_1): Allow PTX_MAX_VECTOR_LENGTH larger than
......
...@@ -140,6 +140,7 @@ static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab; ...@@ -140,6 +140,7 @@ static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
memory. It'd be nice if PTX supported common blocks, because then memory. It'd be nice if PTX supported common blocks, because then
this could be shared across TUs (taking the largest size). */ this could be shared across TUs (taking the largest size). */
static unsigned oacc_bcast_size; static unsigned oacc_bcast_size;
static unsigned oacc_bcast_partition;
static unsigned oacc_bcast_align; static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym; static GTY(()) rtx oacc_bcast_sym;
...@@ -158,6 +159,8 @@ static bool need_softstack_decl; ...@@ -158,6 +159,8 @@ static bool need_softstack_decl;
/* True if any function references __nvptx_uni. */ /* True if any function references __nvptx_uni. */
static bool need_unisimt_decl; static bool need_unisimt_decl;
static int nvptx_mach_max_workers ();
/* Allocate a new, cleared machine_function structure. */ /* Allocate a new, cleared machine_function structure. */
static struct machine_function * static struct machine_function *
...@@ -217,6 +220,7 @@ nvptx_option_override (void) ...@@ -217,6 +220,7 @@ nvptx_option_override (void)
oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast"); oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED); SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
oacc_bcast_partition = 0;
worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red"); worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED); SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
...@@ -1105,6 +1109,40 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name) ...@@ -1105,6 +1109,40 @@ nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
fprintf (file, "\t}\n"); fprintf (file, "\t}\n");
} }
/* Emit code to initialize OpenACC worker broadcast and synchronization
registers. */
static void
nvptx_init_oacc_workers (FILE *file)
{
fprintf (file, "\t{\n");
fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
if (cfun->machine->bcast_partition)
{
fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
fprintf (file, "\t\t.reg.u64\t%%y64;\n");
}
fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
if (cfun->machine->bcast_partition)
{
fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
"// vector broadcast offset\n",
REGNO (cfun->machine->bcast_partition),
oacc_bcast_partition);
}
/* Verify oacc_bcast_size. */
gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
<= oacc_bcast_size);
if (cfun->machine->sync_bar)
fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
"// vector synchronization barrier\n",
REGNO (cfun->machine->sync_bar));
fprintf (file, "\t}\n");
}
/* Emit code to initialize predicate and master lane index registers for /* Emit code to initialize predicate and master lane index registers for
-muniform-simt code generation variant. */ -muniform-simt code generation variant. */
...@@ -1331,6 +1369,8 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) ...@@ -1331,6 +1369,8 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
if (cfun->machine->unisimt_predicate if (cfun->machine->unisimt_predicate
|| (cfun->machine->has_simtreg && !crtl->is_leaf)) || (cfun->machine->has_simtreg && !crtl->is_leaf))
nvptx_init_unisimt_predicate (file); nvptx_init_unisimt_predicate (file);
if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
nvptx_init_oacc_workers (file);
} }
/* Output code for switching uniform-simt state. ENTERING indicates whether /* Output code for switching uniform-simt state. ENTERING indicates whether
...@@ -3072,6 +3112,19 @@ nvptx_split_blocks (bb_insn_map_t *map) ...@@ -3072,6 +3112,19 @@ nvptx_split_blocks (bb_insn_map_t *map)
} }
} }
/* Return true if MASK contains parallelism that has to broadcast
   through shared memory: either the worker dimension is present, or
   the vector dimension is present and the vector length differs from
   PTX_WARP_SIZE.  */

static bool
nvptx_needs_shared_bcast (unsigned mask)
{
  const bool has_worker = (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) != 0;
  bool non_warp_vector = false;

  /* Only query the vector length when the vector dimension is set.  */
  if (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
    non_warp_vector = nvptx_mach_vector_length () != PTX_WARP_SIZE;

  return has_worker || non_warp_vector;
}
/* BLOCK is a basic block containing a head or tail instruction. /* BLOCK is a basic block containing a head or tail instruction.
Locate the associated prehead or pretail instruction, which must be Locate the associated prehead or pretail instruction, which must be
in the single predecessor block. */ in the single predecessor block. */
...@@ -3147,7 +3200,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block) ...@@ -3147,7 +3200,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
par = new parallel (par, mask); par = new parallel (par, mask);
par->forked_block = block; par->forked_block = block;
par->forked_insn = end; par->forked_insn = end;
if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) if (nvptx_needs_shared_bcast (mask))
par->fork_insn par->fork_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_fork); = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
} }
...@@ -3162,7 +3215,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block) ...@@ -3162,7 +3215,7 @@ nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
gcc_assert (par->mask == mask); gcc_assert (par->mask == mask);
par->join_block = block; par->join_block = block;
par->join_insn = end; par->join_insn = end;
if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) if (nvptx_needs_shared_bcast (mask))
par->joining_insn par->joining_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_joining); = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
par = par->parent; par = par->parent;
...@@ -4019,22 +4072,45 @@ nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block, ...@@ -4019,22 +4072,45 @@ nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
gcc_assert (empty == !data.offset); gcc_assert (empty == !data.offset);
if (data.offset) if (data.offset)
{ {
rtx bcast_sym = oacc_bcast_sym;
/* Stuff was emitted, initialize the base pointer now. */ /* Stuff was emitted, initialize the base pointer now. */
rtx init = gen_rtx_SET (data.base, oacc_bcast_sym); if (vector && nvptx_mach_max_workers () > 1)
{
if (!cfun->machine->bcast_partition)
{
/* It would be nice to place this register in
DATA_AREA_SHARED. */
cfun->machine->bcast_partition = gen_reg_rtx (DImode);
}
if (!cfun->machine->sync_bar)
cfun->machine->sync_bar = gen_reg_rtx (SImode);
bcast_sym = cfun->machine->bcast_partition;
}
rtx init = gen_rtx_SET (data.base, bcast_sym);
emit_insn_after (init, insn); emit_insn_after (init, insn);
oacc_bcast_size = MAX (oacc_bcast_size, data.offset); unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
? nvptx_mach_max_workers () + 1
: 1);
oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
} }
return empty; return empty;
} }
/* Emit a worker-level synchronization barrier. We use different /* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
markers for before and after synchronizations. */ which is an integer or a register. THREADS is the number of threads
controlled by the barrier. */
static rtx static rtx
nvptx_cta_sync (bool after) nvptx_cta_sync (rtx lock, int threads)
{ {
return gen_nvptx_barsync (GEN_INT (after), GEN_INT (0)); return gen_nvptx_barsync (lock, GEN_INT (threads));
} }
#if WORKAROUND_PTXJIT_BUG #if WORKAROUND_PTXJIT_BUG
...@@ -4327,7 +4403,8 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) ...@@ -4327,7 +4403,8 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
{ {
rtx pvar = XEXP (XEXP (cond_branch, 0), 0); rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
&& nvptx_mach_vector_length () == PTX_WARP_SIZE)
{ {
/* Vector mode only, do a shuffle. */ /* Vector mode only, do a shuffle. */
#if WORKAROUND_PTXJIT_BUG #if WORKAROUND_PTXJIT_BUG
...@@ -4394,25 +4471,50 @@ nvptx_single (unsigned mask, basic_block from, basic_block to) ...@@ -4394,25 +4471,50 @@ nvptx_single (unsigned mask, basic_block from, basic_block to)
/* Includes worker mode, do spill & fill. By construction /* Includes worker mode, do spill & fill. By construction
we should never have worker mode only. */ we should never have worker mode only. */
broadcast_data_t data; broadcast_data_t data;
unsigned size = GET_MODE_SIZE (SImode);
bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
rtx barrier = GEN_INT (0);
int threads = 0;
data.base = oacc_bcast_sym; data.base = oacc_bcast_sym;
data.ptr = 0; data.ptr = 0;
oacc_bcast_size = MAX (oacc_bcast_size, GET_MODE_SIZE (SImode)); bool use_partitioning_p = (vector && !worker
&& nvptx_mach_max_workers () > 1
&& cfun->machine->bcast_partition);
if (use_partitioning_p)
{
data.base = cfun->machine->bcast_partition;
barrier = cfun->machine->sync_bar;
threads = nvptx_mach_vector_length ();
}
gcc_assert (data.base != NULL);
gcc_assert (barrier);
unsigned int psize = ROUND_UP (size, oacc_bcast_align);
unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
? nvptx_mach_max_workers () + 1
: 1);
oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
data.offset = 0; data.offset = 0;
emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data, emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
false), vector),
before); before);
/* Barrier so other workers can see the write. */ /* Barrier so other workers can see the write. */
emit_insn_before (nvptx_cta_sync (false), tail); emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
data.offset = 0; data.offset = 0;
emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data, emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
false), tail); vector),
tail);
/* This barrier is needed to avoid worker zero clobbering /* This barrier is needed to avoid worker zero clobbering
the broadcast buffer before all the other workers have the broadcast buffer before all the other workers have
had a chance to read this instance of it. */ had a chance to read this instance of it. */
emit_insn_before (nvptx_cta_sync (false), tail); emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
} }
extract_insn (tail); extract_insn (tail);
...@@ -4526,20 +4628,32 @@ nvptx_process_pars (parallel *par) ...@@ -4526,20 +4628,32 @@ nvptx_process_pars (parallel *par)
} }
bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0; bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
&& nvptx_mach_vector_length () > PTX_WARP_SIZE);
if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) if (worker || large_vector)
{ {
nvptx_shared_propagate (false, is_call, par->forked_block, nvptx_shared_propagate (false, is_call, par->forked_block,
par->forked_insn, false); par->forked_insn, !worker);
bool empty = nvptx_shared_propagate (true, is_call, bool empty = nvptx_shared_propagate (true, is_call,
par->forked_block, par->fork_insn, par->forked_block, par->fork_insn,
false); !worker);
rtx barrier = GEN_INT (0);
int threads = 0;
if (!worker && cfun->machine->sync_bar)
{
barrier = cfun->machine->sync_bar;
threads = nvptx_mach_vector_length ();
}
if (!empty || !is_call) if (!empty || !is_call)
{ {
/* Insert begin and end synchronizations. */ /* Insert begin and end synchronizations. */
emit_insn_before (nvptx_cta_sync (false), par->forked_insn); emit_insn_before (nvptx_cta_sync (barrier, threads),
emit_insn_before (nvptx_cta_sync (false), par->join_insn); par->forked_insn);
emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
} }
} }
else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
...@@ -6169,6 +6283,7 @@ nvptx_set_current_function (tree fndecl) ...@@ -6169,6 +6283,7 @@ nvptx_set_current_function (tree fndecl)
return; return;
nvptx_previous_fndecl = fndecl; nvptx_previous_fndecl = fndecl;
oacc_bcast_partition = 0;
} }
#undef TARGET_OPTION_OVERRIDE #undef TARGET_OPTION_OVERRIDE
......
...@@ -221,6 +221,10 @@ struct GTY(()) machine_function ...@@ -221,6 +221,10 @@ struct GTY(()) machine_function
int axis_dim[2]; /* Maximum number of threads on each axis, dim[0] is int axis_dim[2]; /* Maximum number of threads on each axis, dim[0] is
vector_length, dim[1] is num_workers. */ vector_length, dim[1] is num_workers. */
bool axis_dim_init_p; bool axis_dim_init_p;
rtx bcast_partition; /* Register containing the size of each
vector's partition of shared memory used to
broadcast state. */
rtx sync_bar; /* Synchronization barrier ID for vectors. */
rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */ rtx unisimt_master; /* 'Master lane index' for -muniform-simt. */
rtx unisimt_predicate; /* Predicate for -muniform-simt. */ rtx unisimt_predicate; /* Predicate for -muniform-simt. */
rtx unisimt_location; /* Mask location for -muniform-simt. */ rtx unisimt_location; /* Mask location for -muniform-simt. */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment