Commit d88cd9c4 by Nathan Sidwell Committed by Nathan Sidwell

nvptx.h (struct machine_function): Add axis_predicate.

	* config/nvptx/nvptx.h (struct machine_function): Add
	axis_predicate.
	* config/nvptx/nvptx-protos.h (nvptx_expand_oacc_fork,
	nvptx_expand_oacc_join): Declare.
	* config/nvptx/nvptx.md (UNSPEC_NTID, UNSPEC_TID): Delete.
	(UNSPEC_DIM_SIZE, UNSPEC_SHARED_DATA, UNSPEC_BIT_CONV,
	UNSPEC_SHUFFLE, UNSPEC_BR_UNIFIED): New.
	(UNSPECV_BARSYNC, UNSPECV_DIM_POS, UNSPECV_FORK, UNSPECV_FORKED,
	UNSPECV_JOINING, UNSPECV_JOIN): New.
	(BITS, BITD): New mode iterators.
	(br_true_uni, br_false_uni): New.
	(*oacc_ntid_insn, oacc_ntid, *oacc_tid_insn, oacc_tid): Delete.
	(oacc_dim_size, oacc_dim_pos): New.
	(nvptx_fork, nvptx_forked, nvptx_joining, nvptx_join): New.
	(oacc_fork, oacc_join): New.
	(nvptx_shuffle<mode>, unpack<mode>si2, packsi<mode>2): New.
	(worker_load<mode>, worker_store<mode>): New.
	(nvptx_barsync): New.
	* config/nvptx/nvptx.c: Include gimple.h & dumpfile.h.
	(SHUFFLE_UP, SHUFFLE_DOWN, SHUFFLE_BFLY, SHUFFLE_IDX): Define.
	(worker_bcast_size, worker_bcast_align, worker_bcast_name,
	worker_bcast_sym): New.
	(nvptx_option_override): Initialize worker broadcast buffer.
	(nvptx_emit_forking, nvptx_emit_joining): New.
	(nvptx_init_axis_predicate): New.
	(nvptx_declare_function_name): Init axis predicates.
	(nvptx_expand_call): Add fork/join markers around routine call.
	(nvptx_expand_oacc_fork, nvptx_expand_oacc_join): New.
	(nvptx_gen_unpack, nvptx_gen_pack, nvptx_gen_shuffle): New.
	(nvptx_gen_vcast): New.
	(struct wcast_data_t): New.
	(enum propagate_mask): New.
	(nvptx_gen_wcast): New.
	(nvptx_print_operand): Add 'S' case.
	(struct parallel): New.
	(parallel::parallel, parallel::~parallel): New.
	(bb_insn_map_t, insn_bb_t, insn_bb_vec_t): New typedefs.
	(nvptx_split_blocks, nvptx_discover_pre, nvptx_dump_pars,
	nvptx_find_par, nvptx_discover_pars): New.
	(nvptx_propagate): New.
	(vprop_gen, nvptx_vpropagate): New.
	(wprop_gen, nvptx_wpropagate): New.
	(nvptx_wsync): New.
	(nvptx_single, nvptx_skip_par): New.
	(nvptx_process_pars, nvptx_neuter_pars): New.
	(nvptx_reorg): Split blocks, generate parallel structure, apply
	neutering.
	(nvptx_cannot_copy_insn_p): New.
	(nvptx_file_end): Emit worker broadcast decl.
	(nvptx_goacc_fork_join): New.
	(TARGET_CANNOT_COPY_INSN_P): Override.
	(TARGET_GOACC_FORK_JOIN): Override.

From-SVN: r229486
parent 1e355e1d
2015-10-28 Nathan Sidwell <nathan@codesourcery.com>
* config/nvptx/nvptx.h (struct machine_function): Add
axis_predicate.
* config/nvptx/nvptx-protos.h (nvptx_expand_oacc_fork,
nvptx_expand_oacc_join): Declare.
* config/nvptx/nvptx.md (UNSPEC_NTID, UNSPEC_TID): Delete.
(UNSPEC_DIM_SIZE, UNSPEC_SHARED_DATA, UNSPEC_BIT_CONV,
UNSPEC_SHUFFLE, UNSPEC_BR_UNIFIED): New.
(UNSPECV_BARSYNC, UNSPECV_DIM_POS, UNSPECV_FORK, UNSPECV_FORKED,
UNSPECV_JOINING, UNSPECV_JOIN): New.
(BITS, BITD): New mode iterators.
(br_true_uni, br_false_uni): New.
(*oacc_ntid_insn, oacc_ntid, *oacc_tid_insn, oacc_tid): Delete.
(oacc_dim_size, oacc_dim_pos): New.
(nvptx_fork, nvptx_forked, nvptx_joining, nvptx_join): New.
(oacc_fork, oacc_join): New.
(nvptx_shuffle<mode>, unpack<mode>si2, packsi<mode>2): New.
(worker_load<mode>, worker_store<mode>): New.
(nvptx_barsync): New.
* config/nvptx/nvptx.c: Include gimple.h & dumpfile.h.
(SHUFFLE_UP, SHUFFLE_DOWN, SHUFFLE_BFLY, SHUFFLE_IDX): Define.
(worker_bcast_size, worker_bcast_align, worker_bcast_name,
worker_bcast_sym): New.
(nvptx_option_override): Initialize worker broadcast buffer.
(nvptx_emit_forking, nvptx_emit_joining): New.
(nvptx_init_axis_predicate): New.
(nvptx_declare_function_name): Init axis predicates.
(nvptx_expand_call): Add fork/join markers around routine call.
(nvptx_expand_oacc_fork, nvptx_expand_oacc_join): New.
(nvptx_gen_unpack, nvptx_gen_pack, nvptx_gen_shuffle): New.
(nvptx_gen_vcast): New.
(struct wcast_data_t): New.
(enum propagate_mask): New.
(nvptx_gen_wcast): New.
(nvptx_print_operand): Add 'S' case.
(struct parallel): New.
(parallel::parallel, parallel::~parallel): New.
(bb_insn_map_t, insn_bb_t, insn_bb_vec_t): New typedefs.
(nvptx_split_blocks, nvptx_discover_pre, nvptx_dump_pars,
nvptx_find_par, nvptx_discover_pars): New.
(nvptx_propagate): New.
(vprop_gen, nvptx_vpropagate): New.
(wprop_gen, nvptx_wpropagate): New.
(nvptx_wsync): New.
(nvptx_single, nvptx_skip_par): New.
(nvptx_process_pars, nvptx_neuter_pars): New.
(nvptx_reorg): Split blocks, generate parallel structure, apply
neutering.
(nvptx_cannot_copy_insn_p): New.
(nvptx_file_end): Emit worker broadcast decl.
(nvptx_goacc_fork_join): New.
(TARGET_CANNOT_COPY_INSN_P): Override.
(TARGET_GOACC_FORK_JOIN): Override.
2015-10-28 Richard Biener <rguenther@suse.de> 2015-10-28 Richard Biener <rguenther@suse.de>
* fold-const.c (negate_expr_p): Adjust the division case to * fold-const.c (negate_expr_p): Adjust the division case to
...@@ -32,6 +32,8 @@ extern void nvptx_register_pragmas (void); ...@@ -32,6 +32,8 @@ extern void nvptx_register_pragmas (void);
extern const char *nvptx_section_for_decl (const_tree); extern const char *nvptx_section_for_decl (const_tree);
#ifdef RTX_CODE #ifdef RTX_CODE
extern void nvptx_expand_oacc_fork (unsigned);
extern void nvptx_expand_oacc_join (unsigned);
extern void nvptx_expand_call (rtx, rtx); extern void nvptx_expand_call (rtx, rtx);
extern rtx nvptx_expand_compare (rtx); extern rtx nvptx_expand_compare (rtx);
extern const char *nvptx_ptx_type_from_mode (machine_mode, bool); extern const char *nvptx_ptx_type_from_mode (machine_mode, bool);
......
...@@ -51,14 +51,21 @@ ...@@ -51,14 +51,21 @@
#include "langhooks.h" #include "langhooks.h"
#include "dbxout.h" #include "dbxout.h"
#include "cfgrtl.h" #include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h" #include "stor-layout.h"
#include "builtins.h" #include "builtins.h"
#include "omp-low.h" #include "omp-low.h"
#include "gomp-constants.h" #include "gomp-constants.h"
#include "dumpfile.h"
/* This file should be included last. */ /* This file should be included last. */
#include "target-def.h" #include "target-def.h"
/* Shuffle kinds.  The value is passed to the shuffle insn as a
   CONST_INT operand and is used by the 'S' case of
   nvptx_print_operand to index its suffix table, so the numbering
   must match that table: 0 "up", 1 "down", 2 "bfly", 3 "idx".  */
#define SHUFFLE_UP 0
#define SHUFFLE_DOWN 1
#define SHUFFLE_BFLY 2
#define SHUFFLE_IDX 3
/* Record the function decls we've written, and the libfuncs and function /* Record the function decls we've written, and the libfuncs and function
decls corresponding to them. */ decls corresponding to them. */
static std::stringstream func_decls; static std::stringstream func_decls;
...@@ -81,6 +88,16 @@ struct tree_hasher : ggc_cache_ptr_hash<tree_node> ...@@ -81,6 +88,16 @@ struct tree_hasher : ggc_cache_ptr_hash<tree_node>
static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab; static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab; static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
/* Size of buffer needed to broadcast across workers. This is used
for both worker-neutering and worker broadcasting. It is shared
by all functions emitted. The buffer is placed in shared memory.
It'd be nice if PTX supported common blocks, because then this
could be shared across TUs (taking the largest size). */
/* Largest buffer offset reached by any worker propagation so far;
   raised in nvptx_wpropagate.  */
static unsigned worker_bcast_size;
/* Largest alignment, in bytes, required of the buffer.  Starts at the
   alignment of SImode (set in nvptx_option_override) and is raised
   whenever a more strictly aligned mode is spilled.  */
static unsigned worker_bcast_align;
/* Assembler name of the buffer.  */
#define worker_bcast_name "__worker_bcast"
/* SYMBOL_REF for worker_bcast_name, created in
   nvptx_option_override.  */
static GTY(()) rtx worker_bcast_sym;
/* Allocate a new, cleared machine_function structure. */ /* Allocate a new, cleared machine_function structure. */
static struct machine_function * static struct machine_function *
...@@ -108,6 +125,9 @@ nvptx_option_override (void) ...@@ -108,6 +125,9 @@ nvptx_option_override (void)
needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17); needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
declared_libfuncs_htab declared_libfuncs_htab
= hash_table<declared_libfunc_hasher>::create_ggc (17); = hash_table<declared_libfunc_hasher>::create_ggc (17);
worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, worker_bcast_name);
worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
} }
/* Return the mode to be used when declaring a ptx object for OBJ. /* Return the mode to be used when declaring a ptx object for OBJ.
...@@ -194,6 +214,47 @@ nvptx_split_reg_p (machine_mode mode) ...@@ -194,6 +214,47 @@ nvptx_split_reg_p (machine_mode mode)
return false; return false;
} }
/* Emit forking instructions for MASK.  Only the worker and vector
   bits of MASK are considered; when neither is set nothing is
   emitted.  IS_CALL is folded into the insn operand as bit
   GOMP_DIM_MAX, and suppresses the leading fork insn.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  const unsigned partitioned = (GOMP_DIM_MASK (GOMP_DIM_WORKER)
				| GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  unsigned axes = mask & partitioned;

  if (!axes)
    return;

  rtx op = GEN_INT (axes | (is_call << GOMP_DIM_MAX));

  /* Emit fork at all levels.  This helps form SESE regions, as it
     creates a block with a single successor before entering a
     partitioned region.  That is a good candidate for the end of an
     SESE region.  */
  if (!is_call)
    emit_insn (gen_nvptx_fork (op));

  emit_insn (gen_nvptx_forked (op));
}
/* Emit joining instructions for MASK.  Only the worker and vector
   bits of MASK are considered; when neither is set nothing is
   emitted.  IS_CALL is folded into the insn operand as bit
   GOMP_DIM_MAX, and suppresses the leading joining insn.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  const unsigned partitioned = (GOMP_DIM_MASK (GOMP_DIM_WORKER)
				| GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  unsigned axes = mask & partitioned;

  if (!axes)
    return;

  rtx op = GEN_INT (axes | (is_call << GOMP_DIM_MAX));

  /* Emit joining for all non-call pars to ensure there's a single
     predecessor for the block the join insn ends up in.  This is
     needed for skipping entire loops.  */
  if (!is_call)
    emit_insn (gen_nvptx_joining (op));

  emit_insn (gen_nvptx_join (op));
}
#define PASS_IN_REG_P(MODE, TYPE) \ #define PASS_IN_REG_P(MODE, TYPE) \
((GET_MODE_CLASS (MODE) == MODE_INT \ ((GET_MODE_CLASS (MODE) == MODE_INT \
|| GET_MODE_CLASS (MODE) == MODE_FLOAT \ || GET_MODE_CLASS (MODE) == MODE_FLOAT \
...@@ -500,6 +561,19 @@ nvptx_record_needed_fndecl (tree decl) ...@@ -500,6 +561,19 @@ nvptx_record_needed_fndecl (tree decl)
*slot = decl; *slot = decl;
} }
/* Write to FILE a braced PTX scope that initializes predicate
   register %rREGNO: a scratch .u32 register named after the axis is
   loaded from %tid.NAME and compared against zero, so the predicate
   is true when we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  /* Open a scope so the scratch register declaration stays local.  */
  fputs ("\t{\n", file);

  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);

  fputs ("\t}\n", file);
}
/* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
function, including local var decls and copies from the arguments to function, including local var decls and copies from the arguments to
local regs. */ local regs. */
...@@ -623,6 +697,14 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) ...@@ -623,6 +697,14 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
if (stdarg_p (fntype)) if (stdarg_p (fntype))
fprintf (file, "\tld.param.u%d %%argp, [%%in_argp];\n", fprintf (file, "\tld.param.u%d %%argp, [%%in_argp];\n",
GET_MODE_BITSIZE (Pmode)); GET_MODE_BITSIZE (Pmode));
/* Emit axis predicates. */
if (cfun->machine->axis_predicate[0])
nvptx_init_axis_predicate (file,
REGNO (cfun->machine->axis_predicate[0]), "y");
if (cfun->machine->axis_predicate[1])
nvptx_init_axis_predicate (file,
REGNO (cfun->machine->axis_predicate[1]), "x");
} }
/* Output a return instruction. Also copy the return value to its outgoing /* Output a return instruction. Also copy the return value to its outgoing
...@@ -779,6 +861,7 @@ nvptx_expand_call (rtx retval, rtx address) ...@@ -779,6 +861,7 @@ nvptx_expand_call (rtx retval, rtx address)
bool external_decl = false; bool external_decl = false;
rtx varargs = NULL_RTX; rtx varargs = NULL_RTX;
tree decl_type = NULL_TREE; tree decl_type = NULL_TREE;
unsigned parallel = 0;
for (t = cfun->machine->call_args; t; t = XEXP (t, 1)) for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
nargs++; nargs++;
...@@ -799,6 +882,22 @@ nvptx_expand_call (rtx retval, rtx address) ...@@ -799,6 +882,22 @@ nvptx_expand_call (rtx retval, rtx address)
cfun->machine->has_call_with_sc = true; cfun->machine->has_call_with_sc = true;
if (DECL_EXTERNAL (decl)) if (DECL_EXTERNAL (decl))
external_decl = true; external_decl = true;
tree attr = get_oacc_fn_attrib (decl);
if (attr)
{
tree dims = TREE_VALUE (attr);
parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
{
if (TREE_PURPOSE (dims)
&& !integer_zerop (TREE_PURPOSE (dims)))
break;
/* Not on this axis. */
parallel ^= GOMP_DIM_MASK (ix);
dims = TREE_CHAIN (dims);
}
}
} }
} }
...@@ -860,7 +959,11 @@ nvptx_expand_call (rtx retval, rtx address) ...@@ -860,7 +959,11 @@ nvptx_expand_call (rtx retval, rtx address)
write_func_decl_from_insn (func_decls, retval, pat, callee); write_func_decl_from_insn (func_decls, retval, pat, callee);
} }
} }
nvptx_emit_forking (parallel, true);
emit_call_insn (pat); emit_call_insn (pat);
nvptx_emit_joining (parallel, true);
if (tmp_retval != retval) if (tmp_retval != retval)
emit_move_insn (retval, tmp_retval); emit_move_insn (retval, tmp_retval);
} }
...@@ -1069,6 +1172,214 @@ nvptx_expand_compare (rtx compare) ...@@ -1069,6 +1172,214 @@ nvptx_expand_compare (rtx compare)
return gen_rtx_NE (BImode, pred, const0_rtx); return gen_rtx_NE (BImode, pred, const0_rtx);
} }
/* Expand the oacc fork primitive for axis MODE into the ptx-required
   unspecs.  This is never a routine call, so the full fork/forked
   pair is requested.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  unsigned mask = GOMP_DIM_MASK (mode);

  nvptx_emit_forking (mask, false);
}
/* Expand the oacc join primitive for axis MODE into the ptx-required
   unspecs.  This is never a routine call, so the full joining/join
   pair is requested.  */

void
nvptx_expand_oacc_join (unsigned mode)
{
  unsigned mask = GOMP_DIM_MASK (mode);

  nvptx_emit_joining (mask, false);
}
/* Generate instruction(s) to unpack the 64 bit object SRC into the
   two 32 bit objects DST0 and DST1.  SRC must be DImode or DFmode;
   any other mode is a compiler bug.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  switch (GET_MODE (src))
    {
    case DImode:
      return gen_unpackdisi2 (dst0, dst1, src);

    case DFmode:
      return gen_unpackdfsi2 (dst0, dst1, src);

    default:
      gcc_unreachable ();
    }
}
/* Generate instruction(s) to pack the two 32 bit objects SRC0 and
   SRC1 into the 64 bit object DST.  DST must be DImode or DFmode;
   any other mode is a compiler bug.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  switch (GET_MODE (dst))
    {
    case DImode:
      return gen_packsidi2 (dst, src0, src1);

    case DFmode:
      return gen_packsidf2 (dst, src0, src1);

    default:
      gcc_unreachable ();
    }
}
/* Generate an instruction or sequence that shuffles SRC into DST.
   IDX is the shuffle's index operand and KIND one of the SHUFFLE_*
   values.  SImode and SFmode map directly onto a shuffle insn.
   64 bit modes are unpacked into two SImode halves which are shuffled
   separately and packed again.  BImode is widened to an SImode 0/1
   value, shuffled, and compared against zero.  */

static rtx
nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, unsigned kind)
{
  machine_mode mode = GET_MODE (dst);

  if (mode == SImode)
    return gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));

  if (mode == SFmode)
    return gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));

  if (mode == DImode || mode == DFmode)
    {
      rtx half0 = gen_reg_rtx (SImode);
      rtx half1 = gen_reg_rtx (SImode);

      start_sequence ();
      emit_insn (nvptx_gen_unpack (half0, half1, src));
      emit_insn (nvptx_gen_shuffle (half0, half0, idx, kind));
      emit_insn (nvptx_gen_shuffle (half1, half1, idx, kind));
      emit_insn (nvptx_gen_pack (dst, half0, half1));
      rtx seq = get_insns ();
      end_sequence ();

      return seq;
    }

  if (mode == BImode)
    {
      rtx wide = gen_reg_rtx (SImode);

      start_sequence ();
      emit_insn (gen_sel_truesi (wide, src, GEN_INT (1), const0_rtx));
      emit_insn (nvptx_gen_shuffle (wide, wide, idx, kind));
      emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, wide, const0_rtx)));
      rtx seq = get_insns ();
      end_sequence ();

      return seq;
    }

  gcc_unreachable ();
}
/* Generate an instruction or sequence to broadcast register REG
   across the vectors of a single warp.  This is a SHUFFLE_IDX
   shuffle of REG onto itself with an index operand of zero.  */

static rtx
nvptx_gen_vcast (rtx reg)
{
  rtx index = const0_rtx;

  return nvptx_gen_shuffle (reg, reg, index, SHUFFLE_IDX);
}
/* Structure used when generating a worker-level spill or fill.  It is
   handed to the propagation callback as its opaque DATA argument.  */
struct wcast_data_t
{
/* Register holding base addr of buffer.  */
rtx base;
/* Iteration var, if needed.  Non-null only while a loop is being
   generated: wprop_gen sets it on PM_loop_begin and clears it on
   PM_loop_end.  */
rtx ptr;
/* Offset into worker buffer.  Aligned and advanced as each value is
   spilled or filled.  */
unsigned offset;
};
/* Direction of the spill/fill and looping setup/teardown indicator.
   The values are distinct bits so they can be tested with '&'.  */
enum propagate_mask
{
/* Read the register's current value (nvptx_gen_wcast stores it to
   the buffer).  */
PM_read = 1 << 0,
/* Write a new value to the register (nvptx_gen_wcast loads it from
   the buffer).  */
PM_write = 1 << 1,
/* Passed to the propagator callback just before a copy loop is
   emitted, so it can set up any iteration state.  */
PM_loop_begin = 1 << 2,
/* Passed to the propagator callback just after a copy loop is
   emitted, so it can tear down iteration state.  */
PM_loop_end = 1 << 3,
/* Both directions at once.  */
PM_read_write = PM_read | PM_write
};
/* Generate instruction(s) to spill or fill register REG to/from the
   worker broadcast array.  PM indicates what is to be done, REP
   how many loop iterations will be executed (0 for not a loop).
   DATA carries the buffer base register, the optional loop pointer
   and the running buffer offset, which is advanced here.  Returns
   the generated insn or insn sequence.  */
static rtx
nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
{
rtx res;
machine_mode mode = GET_MODE (reg);
switch (mode)
{
case BImode:
{
/* Predicates are transferred as an SImode 0/1 value: widen before
   the spill, and convert back with a compare after the fill.  */
rtx tmp = gen_reg_rtx (SImode);
start_sequence ();
if (pm & PM_read)
emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
if (pm & PM_write)
emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
res = get_insns ();
end_sequence ();
}
break;
default:
{
/* Inside a loop the address is the iteration pointer; otherwise it
   is the buffer base plus an aligned offset.  */
rtx addr = data->ptr;
if (!addr)
{
unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
/* Track the strictest alignment the buffer must satisfy.  */
if (align > worker_bcast_align)
worker_bcast_align = align;
/* Round the offset up to a multiple of ALIGN.  */
data->offset = (data->offset + align - 1) & ~(align - 1);
addr = data->base;
if (data->offset)
addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
}
addr = gen_rtx_MEM (mode, addr);
/* Wrap the MEM so it is recognized as a worker buffer access.  */
addr = gen_rtx_UNSPEC (mode, gen_rtvec (1, addr), UNSPEC_SHARED_DATA);
/* PM_read stores REG to the buffer, PM_write loads REG from it.
   Exactly one direction must be requested here.  */
if (pm == PM_read)
res = gen_rtx_SET (addr, reg);
else if (pm == PM_write)
res = gen_rtx_SET (reg, addr);
else
gcc_unreachable ();
if (data->ptr)
{
/* We're using a ptr, increment it. */
start_sequence ();
emit_insn (res);
emit_insn (gen_adddi3 (data->ptr, data->ptr,
GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
res = get_insns ();
end_sequence ();
}
else
/* Not looping: exactly one element is transferred.  */
rep = 1;
/* Reserve buffer space for every element transferred.  */
data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
}
break;
}
return res;
}
/* When loading an operand ORIG_OP, verify whether an address space /* When loading an operand ORIG_OP, verify whether an address space
conversion to generic is required, and if so, perform it. Also conversion to generic is required, and if so, perform it. Also
check for SYMBOL_REFs for function decls and call check for SYMBOL_REFs for function decls and call
...@@ -1660,6 +1971,7 @@ nvptx_print_operand_address (FILE *file, rtx addr) ...@@ -1660,6 +1971,7 @@ nvptx_print_operand_address (FILE *file, rtx addr)
c -- print an opcode suffix for a comparison operator, including a type code c -- print an opcode suffix for a comparison operator, including a type code
d -- print a CONST_INT as a vector dimension (x, y, or z) d -- print a CONST_INT as a vector dimension (x, y, or z)
f -- print a full reg even for something that must always be split f -- print a full reg even for something that must always be split
S -- print a shuffle kind specified by CONST_INT
t -- print a type opcode suffix, promoting QImode to 32 bits t -- print a type opcode suffix, promoting QImode to 32 bits
T -- print a type size in bits T -- print a type size in bits
u -- print a type opcode suffix without promotions. */ u -- print a type opcode suffix without promotions. */
...@@ -1723,6 +2035,15 @@ nvptx_print_operand (FILE *file, rtx x, int code) ...@@ -1723,6 +2035,15 @@ nvptx_print_operand (FILE *file, rtx x, int code)
fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false)); fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, false));
break; break;
case 'S':
{
unsigned kind = UINTVAL (x);
static const char *const kinds[] =
{"up", "down", "bfly", "idx"};
fprintf (file, ".%s", kinds[kind]);
}
break;
case 'T': case 'T':
fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x))); fprintf (file, "%d", GET_MODE_BITSIZE (GET_MODE (x)));
break; break;
...@@ -1973,10 +2294,747 @@ nvptx_reorg_subreg (void) ...@@ -1973,10 +2294,747 @@ nvptx_reorg_subreg (void)
} }
} }
/* Loop structure of the function. The entire function is described as
a NULL loop. We should be able to extend this to represent
superblocks.  The parallels form a tree: each node points to its
parent, its first child and its next sibling.  */
struct parallel
{
/* Parent parallel. */
parallel *parent;
/* Next sibling parallel. */
parallel *next;
/* First child parallel. */
parallel *inner;
/* Partitioning mask of the parallel. */
unsigned mask;
/* Partitioning used within inner parallels. */
unsigned inner_mask;
/* Location of parallel forked and join. The forked is the first
block in the parallel and the join is the first block after
the partition.  The fork and joining insns are located in the
single predecessor blocks of the forked and join blocks.  */
basic_block forked_block;
basic_block join_block;
rtx_insn *forked_insn;
rtx_insn *join_insn;
rtx_insn *fork_insn;
rtx_insn *joining_insn;
/* Basic blocks in this parallel, but not in child parallels. The
FORKED and JOINING blocks are in the partition. The FORK and JOIN
blocks are not. */
auto_vec<basic_block> blocks;
public:
/* Create a parallel and link it as the first child of PARENT, if
   PARENT is non-null.  */
parallel (parallel *parent, unsigned mode);
/* Delete the child and sibling parallels too.  */
~parallel ();
};
/* Constructor links the new parallel into its parent's chain of
   children.  The new parallel becomes the first child of PARENT_,
   with the previous first child as its next sibling.  All block and
   insn locations start out null.  */

parallel::parallel (parallel *parent_, unsigned mask_)
  : parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0),
    forked_block (0), join_block (0),
    forked_insn (0), join_insn (0),
    fork_insn (0), joining_insn (0)
{
  if (!parent)
    return;

  next = parent->inner;
  parent->inner = this;
}
/* Destructor also destroys the children and following siblings of
   this parallel: deleting INNER and NEXT runs this destructor on
   them in turn.  Deleting a null pointer does nothing, which ends
   the recursion.  */
parallel::~parallel ()
{
delete inner;
delete next;
}
/* Map of basic blocks to insns.  Used to record, for a block that
   starts with an insn of interest, what that insn is.  */
typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
/* A tuple of an insn of interest and the BB in which it resides. */
typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
/* Worklist of such tuples.  */
typedef auto_vec<insn_bb_t> insn_bb_vec_t;
/* Split basic blocks such that each forked and join unspec is at
the start of its basic block. Thus afterwards each block will
have a single partitioning mode. We also do the same for return
insns, as they are executed by every thread.  Populate MAP with
each block that starts with such an insn, mapped to that insn.
We also clear the BB visited flag, which is
used when finding partitions.  Nothing is returned.  */
static void
nvptx_split_blocks (bb_insn_map_t *map)
{
insn_bb_vec_t worklist;
basic_block block;
rtx_insn *insn;
/* Locate all the reorg instructions of interest. */
FOR_ALL_BB_FN (block, cfun)
{
/* Set once a real insn has been seen in this block, meaning any
   later insn of interest is not at the block start.  */
bool seen_insn = false;
/* Clear visited flag, for use by parallel locator */
block->flags &= ~BB_VISITED;
FOR_BB_INSNS (block, insn)
{
if (!INSN_P (insn))
continue;
switch (recog_memoized (insn))
{
default:
/* An ordinary insn: just remember that the block is no longer
   empty.  */
seen_insn = true;
continue;
case CODE_FOR_nvptx_forked:
case CODE_FOR_nvptx_join:
break;
case CODE_FOR_return:
/* We also need to split just before return insns, as
that insn needs executing by all threads, but the
block it is in probably does not. */
break;
}
if (seen_insn)
/* We've found an instruction that must be at the start of
a block, but isn't. Add it to the worklist. */
worklist.safe_push (insn_bb_t (insn, block));
else
/* It was already the first instruction. Just add it to
the map. */
map->get_or_insert (block) = insn;
seen_insn = true;
}
}
/* Split blocks on the worklist. */
unsigned ix;
insn_bb_t *elt;
/* The original block of the previous worklist entry.  While
   successive entries come from the same original block, BLOCK
   tracks the most recently created tail, which is where the next
   insn now lives.  */
basic_block remap = 0;
for (ix = 0; worklist.iterate (ix, &elt); ix++)
{
if (remap != elt->second)
{
block = elt->second;
remap = block;
}
/* Split block before insn. The insn is in the new block */
edge e = split_block (block, PREV_INSN (elt->first));
block = e->dest;
map->get_or_insert (block) = elt->first;
}
}
/* BLOCK is a basic block containing a head or tail instruction.
   Locate the associated prehead or pretail instruction, which must
   be the last real insn of BLOCK's single predecessor block and must
   be recognized as insn code EXPECTED.  Both requirements are
   asserted.  */

static rtx_insn *
nvptx_discover_pre (basic_block block, int expected)
{
  gcc_assert (block->preds->length () == 1);

  basic_block pred_bb = (*block->preds)[0]->src;
  rtx_insn *scan = BB_END (pred_bb);

  /* Walk backwards over notes and the like; running off the head of
     the predecessor means it contains no real insn.  */
  while (!INSN_P (scan))
    {
      gcc_assert (scan != BB_HEAD (pred_bb));
      scan = PREV_INSN (scan);
    }

  gcc_assert (recog_memoized (scan) == expected);

  return scan;
}
/* Dump PAR, its inner parallels and its following siblings to
   dump_file.  DEPTH is the nesting level printed for PAR and its
   siblings; children are printed at DEPTH + 1.  Each parallel is
   followed by its children and then by its next sibling.  A null PAR
   prints nothing.  */

static void
nvptx_dump_pars (parallel *par, unsigned depth)
{
  /* Siblings are walked iteratively; only nesting recurses.  */
  for (; par; par = par->next)
    {
      /* The mask is unsigned, so print it with %u.  */
      fprintf (dump_file, "%u: mask %u head=%d, tail=%d\n",
	       depth, par->mask,
	       par->forked_block ? par->forked_block->index : -1,
	       par->join_block ? par->join_block->index : -1);

      fprintf (dump_file, " blocks:");

      basic_block block;
      for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
	fprintf (dump_file, " %d", block->index);
      fprintf (dump_file, "\n");

      if (par->inner)
	nvptx_dump_pars (par->inner, depth + 1);
    }
}
/* If BLOCK contains a fork/join marker, process it to create or
terminate a loop structure. Add this block to the current loop,
and then walk successor blocks.  MAP tells us which blocks start
with a marker or return insn.  PAR is the parallel active on entry
to BLOCK, or null for the initial call.  Returns the parallel that
is current after processing BLOCK; for the initial call that is the
newly created outermost parallel.  Blocks already visited are
skipped.  */
static parallel *
nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
{
if (block->flags & BB_VISITED)
return par;
block->flags |= BB_VISITED;
if (rtx_insn **endp = map->get (block))
{
rtx_insn *end = *endp;
/* This is a block head or tail, or return instruction. */
switch (recog_memoized (end))
{
case CODE_FOR_return:
/* Return instructions are in their own block, and we
don't need to do anything more. */
return par;
case CODE_FOR_nvptx_forked:
/* Loop head, create a new inner loop and add it into
our parent's child list. */
{
unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
gcc_assert (mask);
par = new parallel (par, mask);
par->forked_block = block;
par->forked_insn = end;
/* Only a worker-partitioned region that is not a routine call
   (bit GOMP_DIM_MAX clear) has its fork insn looked up.  */
if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
par->fork_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
}
break;
case CODE_FOR_nvptx_join:
/* A loop tail. Finish the current loop and return to
parent. */
{
unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
/* The join must carry the same mask as the forked that opened the
   current parallel.  */
gcc_assert (par->mask == mask);
par->join_block = block;
par->join_insn = end;
/* As for forked: only look up the joining insn for non-call
   worker-partitioned regions.  */
if (!(mask & GOMP_DIM_MASK (GOMP_DIM_MAX))
&& (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
par->joining_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
par = par->parent;
}
break;
default:
gcc_unreachable ();
}
}
/* Note that a forked block lands in the new parallel's list, while
   a join block lands in the parent's, as PAR was switched above.  */
if (par)
/* Add this block onto the current loop's list of blocks. */
par->blocks.safe_push (block);
else
/* This must be the entry block. Create a NULL parallel. */
par = new parallel (0, 0);
/* Walk successor blocks.  The parallels returned by the recursive
   calls are not used; each successor starts from this block's PAR.  */
edge e;
edge_iterator ei;
FOR_EACH_EDGE (e, ei, block->succs)
nvptx_find_par (map, par, e->dest);
return par;
}
/* DFS walk the CFG looking for fork & join markers, constructing
   loop structures as we go.  MAP is a mapping of basic blocks to
   head & tail markers, discovered when splitting blocks; this speeds
   up the discovery.  We rely on the BB visited flag having been
   cleared when splitting blocks.  Returns the outermost parallel,
   which is also dumped when a dump file is active.  */

static parallel *
nvptx_discover_pars (bb_insn_map_t *map)
{
  /* Mark exit blocks as visited, so the walk stops there.  */
  basic_block exit_bb = EXIT_BLOCK_PTR_FOR_FN (cfun);
  exit_bb->flags |= BB_VISITED;

  /* And entry block as not, so the walk can start there.  */
  basic_block entry_bb = ENTRY_BLOCK_PTR_FOR_FN (cfun);
  entry_bb->flags &= ~BB_VISITED;

  parallel *root = nvptx_find_par (map, 0, entry_bb);

  if (!dump_file)
    return root;

  fprintf (dump_file, "\nLoops\n");
  nvptx_dump_pars (root, 0);
  fprintf (dump_file, "\n");

  return root;
}
/* Propagate live state at the start of a partitioned region. BLOCK
provides the live register information, and might not contain
INSN. Propagation is inserted just after INSN. RW indicates whether
we are reading and/or writing state. This
separation is needed for worker-level propagation where we
essentially do a spill & fill. FN is the underlying worker
function to generate the propagation instructions for a single
register. DATA is user data.
We propagate the live register set and the entire frame. We could
do better by (a) propagating just the live set that is used within
the partitioned regions and (b) only propagating stack entries that
are used. The latter might be quite hard to determine. */
/* Callback type: given a register, the operation, a loop iteration
   count and the user data, return the insn(s) to emit.  The result
   of a PM_loop_begin or PM_loop_end call may be null.  */
typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
static void
nvptx_propagate (basic_block block, rtx_insn *insn, propagate_mask rw,
propagator_fn fn, void *data)
{
bitmap live = DF_LIVE_IN (block);
bitmap_iterator iterator;
unsigned ix;
/* Copy the frame array. */
HOST_WIDE_INT fs = get_frame_size ();
if (fs)
{
/* TMP holds one DImode frame element at a time, PTR walks the
   frame.  IDX, PRED and LABEL are only needed when looping.  */
rtx tmp = gen_reg_rtx (DImode);
rtx idx = NULL_RTX;
rtx ptr = gen_reg_rtx (Pmode);
rtx pred = NULL_RTX;
rtx_code_label *label = NULL;
/* The frame is copied in DImode units, so its size must be a
   multiple of that; FS becomes the element count.  */
gcc_assert (!(fs & (GET_MODE_SIZE (DImode) - 1)));
fs /= GET_MODE_SIZE (DImode);
/* Detect single iteration loop. */
if (fs == 1)
fs = 0;
start_sequence ();
emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
if (fs)
{
/* Loop prologue: IDX counts down from FS, and LABEL marks the top
   of the loop body.  */
idx = gen_reg_rtx (SImode);
pred = gen_reg_rtx (BImode);
label = gen_label_rtx ();
emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
/* Allow worker function to initialize anything needed. */
rtx init = fn (tmp, PM_loop_begin, fs, data);
if (init)
emit_insn (init);
emit_label (label);
LABEL_NUSES (label)++;
emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
}
/* Body: load the element if reading, propagate it, store it back
   if writing.  */
if (rw & PM_read)
emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
emit_insn (fn (tmp, rw, fs, data));
if (rw & PM_write)
emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
if (fs)
{
/* Loop epilogue: advance PTR and branch back while IDX is
   nonzero, then let the worker function clean up.  */
emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
emit_insn (gen_br_true_uni (pred, label));
rtx fini = fn (tmp, PM_loop_end, fs, data);
if (fini)
emit_insn (fini);
emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
}
/* Mark the temporaries as dead after the copy.  */
emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
rtx cpy = get_insns ();
end_sequence ();
/* Later insertions go after the frame copy.  */
insn = emit_insn_after (cpy, insn);
}
/* Copy live registers. */
EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
{
rtx reg = regno_reg_rtx[ix];
/* Only pseudo registers are propagated.  */
if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
{
rtx bcast = fn (reg, rw, 0, data);
insn = emit_insn_after (bcast, insn);
}
}
}
/* Worker for nvptx_vpropagate.  Any read or write request for REG
   yields a vector broadcast of it; the loop begin/end notifications
   need no code and yield nothing.  */

static rtx
vprop_gen (rtx reg, propagate_mask pm,
	   unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
{
  if (pm & PM_read_write)
    return nvptx_gen_vcast (reg);

  return NULL_RTX;
}
/* Propagate state that is live at start of BLOCK across the vectors
   of a single warp.  Propagation is inserted just after INSN.  The
   state is both read and written in one step, and no user data is
   needed.  */

static void
nvptx_vpropagate (basic_block block, rtx_insn *insn)
{
  propagate_mask both_ways = PM_read_write;

  nvptx_propagate (block, insn, both_ways, vprop_gen, NULL);
}
/* Worker for nvptx_wpropagate.  DATA_ is the wcast_data_t being
   threaded through the propagation.  PM_loop_begin sets up an
   iteration pointer into the buffer, PM_loop_end discards it, and
   anything else spills or fills REG via nvptx_gen_wcast.  */

static rtx
wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
{
  wcast_data_t *wcast = (wcast_data_t *) data_;

  if (pm & PM_loop_begin)
    {
      /* Starting a loop, initialize pointer.  */
      unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
      unsigned low_bits = align - 1;

      if (worker_bcast_align < align)
	worker_bcast_align = align;

      /* Round the offset up to the required alignment.  */
      wcast->offset = (wcast->offset + low_bits) & ~low_bits;
      wcast->ptr = gen_reg_rtx (Pmode);

      return gen_adddi3 (wcast->ptr, wcast->base, GEN_INT (wcast->offset));
    }

  if (pm & PM_loop_end)
    {
      /* The pointer is dead after the loop; forget it.  */
      rtx dead_ptr = wcast->ptr;

      wcast->ptr = NULL_RTX;
      return gen_rtx_CLOBBER (GET_MODE (dead_ptr), dead_ptr);
    }

  return nvptx_gen_wcast (reg, pm, rep, wcast);
}
/* Spill or fill live state that is live at start of BLOCK. PRE_P
indicates if this is just before partitioned mode (do spill), or
just after it starts (do fill). Sequence is inserted just after
INSN. */
static void
nvptx_wpropagate (bool pre_p, basic_block block, rtx_insn *insn)
{
wcast_data_t data;
data.base = gen_reg_rtx (Pmode);
data.offset = 0;
data.ptr = NULL_RTX;
nvptx_propagate (block, insn, pre_p ? PM_read : PM_write, wprop_gen, &data);
if (data.offset)
{
/* Stuff was emitted, initialize the base pointer now. */
rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
emit_insn_after (init, insn);
if (worker_bcast_size < data.offset)
worker_bcast_size = data.offset;
}
}
/* Generate a worker-level synchronization barrier.  AFTER is passed
   to the barrier insn as its operand, so that the before and after
   synchronizations use different markers.  */

static rtx
nvptx_wsync (bool after)
{
  rtx marker = GEN_INT (after);

  return gen_nvptx_barsync (marker);
}
/* Single neutering according to MASK. FROM is the incoming block and
TO is the outgoing block. These may be the same block. Insert at
start of FROM:
if (tid.<axis>) goto end.
and insert before ending branch of TO (if there is such an insn):
end:
<possibly-broadcast-cond>
<branch>
We currently only use different FROM and TO when skipping an entire
loop. We could do more if we detected superblocks. */
static void
nvptx_single (unsigned mask, basic_block from, basic_block to)
{
rtx_insn *head = BB_HEAD (from);
rtx_insn *tail = BB_END (to);
unsigned skip_mask = mask;
/* Find first insn of from block */
while (head != BB_END (from) && !INSN_P (head))
head = NEXT_INSN (head);
/* Find last insn of to block */
rtx_insn *limit = from == to ? head : BB_HEAD (to);
while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
tail = PREV_INSN (tail);
/* Detect if tail is a branch. */
rtx tail_branch = NULL_RTX;
rtx cond_branch = NULL_RTX;
if (tail && INSN_P (tail))
{
tail_branch = PATTERN (tail);
if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
tail_branch = NULL_RTX;
else
{
cond_branch = SET_SRC (tail_branch);
if (GET_CODE (cond_branch) != IF_THEN_ELSE)
cond_branch = NULL_RTX;
}
}
if (tail == head)
{
/* If this is empty, do nothing. */
if (!head || !INSN_P (head))
return;
/* If this is a dummy insn, do nothing. */
switch (recog_memoized (head))
{
default:
break;
case CODE_FOR_nvptx_fork:
case CODE_FOR_nvptx_forked:
case CODE_FOR_nvptx_joining:
case CODE_FOR_nvptx_join:
return;
}
if (cond_branch)
{
/* If we're only doing vector single, there's no need to
emit skip code because we'll not insert anything. */
if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
skip_mask = 0;
}
else if (tail_branch)
/* Block with only unconditional branch. Nothing to do. */
return;
}
/* Insert the vector test inside the worker test. */
unsigned mode;
rtx_insn *before = tail;
for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
if (GOMP_DIM_MASK (mode) & skip_mask)
{
rtx_code_label *label = gen_label_rtx ();
rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
if (!pred)
{
pred = gen_reg_rtx (BImode);
cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
}
rtx br;
if (mode == GOMP_DIM_VECTOR)
br = gen_br_true (pred, label);
else
br = gen_br_true_uni (pred, label);
emit_insn_before (br, head);
LABEL_NUSES (label)++;
if (tail_branch)
before = emit_label_before (label, before);
else
emit_label_after (label, tail);
}
/* Now deal with propagating the branch condition. */
if (cond_branch)
{
rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
{
/* Vector mode only, do a shuffle. */
emit_insn_before (nvptx_gen_vcast (pvar), tail);
}
else
{
/* Includes worker mode, do spill & fill. By construction
we should never have worker mode only. */
wcast_data_t data;
data.base = worker_bcast_sym;
data.ptr = 0;
if (worker_bcast_size < GET_MODE_SIZE (SImode))
worker_bcast_size = GET_MODE_SIZE (SImode);
data.offset = 0;
emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
before);
/* Barrier so other workers can see the write. */
emit_insn_before (nvptx_wsync (false), tail);
data.offset = 0;
emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
/* This barrier is needed to avoid worker zero clobbering
the broadcast buffer before all the other workers have
had a chance to read this instance of it. */
emit_insn_before (nvptx_wsync (true), tail);
}
extract_insn (tail);
rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
UNSPEC_BR_UNIFIED);
validate_change (tail, recog_data.operand_loc[0], unsp, false);
}
}
/* PAR is a parallel that is being skipped in its entirety according to
MASK. Treat this as skipping a superblock starting at forked
and ending at joining. */
static void
nvptx_skip_par (unsigned mask, parallel *par)
{
basic_block tail = par->join_block;
gcc_assert (tail->preds->length () == 1);
basic_block pre_tail = (*tail->preds)[0]->src;
gcc_assert (pre_tail->succs->length () == 1);
nvptx_single (mask, par->forked_block, pre_tail);
}
/* Process PAR, the parallels nested inside it and its following
   siblings, doing everything except the neutering.  The result is
   the union of the partitioning masks of all parallels visited, and
   each visited parallel with an inner parallel gets its inner_mask
   field filled in.  */
static unsigned
nvptx_process_pars (parallel *par)
{
  unsigned used = par->mask;

  /* Nested parallels are handled before PAR itself.  */
  if (par->inner)
    {
      par->inner_mask = nvptx_process_pars (par->inner);
      used |= par->inner_mask;
    }

  /* A parallel carrying the GOMP_DIM_MAX bit is a call and needs no
     propagation.  */
  const bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;

  if (!is_call)
    {
      if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
        {
          nvptx_wpropagate (false, par->forked_block, par->forked_insn);
          nvptx_wpropagate (true, par->forked_block, par->fork_insn);

          /* Bracket the region with begin and end synchronizations.  */
          emit_insn_after (nvptx_wsync (false), par->forked_insn);
          emit_insn_before (nvptx_wsync (true), par->joining_insn);
        }
      else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
        nvptx_vpropagate (par->forked_block, par->forked_insn);
    }

  /* Finally walk the sibling chain.  */
  if (par->next)
    used |= nvptx_process_pars (par->next);

  return used;
}
/* Neuter parallel PAR, recursing depth-first into its inner parallels
   and then along its siblings.  MODES is the partitioning of the
   execution; OUTER is the partitioning of the parallels that contain
   PAR.  */
static void
nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
{
  const unsigned axes = (GOMP_DIM_MASK (GOMP_DIM_WORKER)
                         | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  unsigned me = par->mask & axes;
  unsigned partitioned = outer | me;
  unsigned to_neuter = 0;
  unsigned to_skip = 0;

  if (par->inner)
    nvptx_neuter_pars (par->inner, modes, partitioned);

  for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
    {
      unsigned bit = GOMP_DIM_MASK (mode);

      /* Mode is partitioned: no neutering.  */
      if (partitioned & bit)
        continue;

      /* Mode is not used: nothing to do.  */
      if (!(modes & bit))
        continue;

      if ((par->inner_mask & bit) || !par->forked_insn)
        /* Partitioned in an inner parallel, or PAR is not a
           partitioned region at all: neuter block by block.  */
        to_neuter |= bit;
      else if (!par->parent || !par->parent->forked_insn
               || (par->parent->inner_mask & bit))
        /* Parent isn't a parallel, or contains this partitioning:
           skip the parallel at this level.  */
        to_skip |= bit;
      /* Otherwise the parent will skip this parallel itself.  */
    }

  if (to_neuter)
    {
      int count = par->blocks.length ();

      for (int i = 0; i != count; i++)
        {
          basic_block bb = par->blocks[i];

          nvptx_single (to_neuter, bb, bb);
        }
    }

  if (to_skip)
    nvptx_skip_par (to_skip, par);

  if (par->next)
    nvptx_neuter_pars (par->next, modes, outer);
}
/* PTX-specific reorganization
   - Scan and release reduction buffers
   - Split blocks at fork and join instructions
   - Compute live registers
   - Mark now-unused registers, so function begin doesn't declare
     unused registers.
   - Insert state propagation when entering partitioned mode
   - Insert neutering instructions when in single mode
   - Replace subregs with suitable sequences.
*/
...@@ -1989,19 +3047,60 @@ nvptx_reorg (void) ...@@ -1989,19 +3047,60 @@ nvptx_reorg (void)
thread_prologue_and_epilogue_insns (); thread_prologue_and_epilogue_insns ();
/* Split blocks and record interesting unspecs. */
bb_insn_map_t bb_insn_map;
nvptx_split_blocks (&bb_insn_map);
/* Compute live regs */ /* Compute live regs */
df_clear_flags (DF_LR_RUN_DCE); df_clear_flags (DF_LR_RUN_DCE);
df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS); df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
df_live_add_problem ();
df_live_set_all_dirty ();
df_analyze (); df_analyze ();
regstat_init_n_sets_and_refs (); regstat_init_n_sets_and_refs ();
int max_regs = max_reg_num (); if (dump_file)
df_dump (dump_file);
/* Mark unused regs as unused. */ /* Mark unused regs as unused. */
int max_regs = max_reg_num ();
for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++) for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0) if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
regno_reg_rtx[i] = const0_rtx; regno_reg_rtx[i] = const0_rtx;
/* Determine launch dimensions of the function. If it is not an
offloaded function (i.e. this is a regular compiler), the
function has no neutering. */
tree attr = get_oacc_fn_attrib (current_function_decl);
if (attr)
{
/* If we determined this mask before RTL expansion, we could
elide emission of some levels of forks and joins. */
unsigned mask = 0;
tree dims = TREE_VALUE (attr);
unsigned ix;
for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
{
int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
tree allowed = TREE_PURPOSE (dims);
if (size != 1 && !(allowed && integer_zerop (allowed)))
mask |= GOMP_DIM_MASK (ix);
}
/* If there is worker neutering, there must be vector
neutering. Otherwise the hardware will fail. */
gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
|| (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
/* Discover & process partitioned regions. */
parallel *pars = nvptx_discover_pars (&bb_insn_map);
nvptx_process_pars (pars);
nvptx_neuter_pars (pars, mask, 0);
delete pars;
}
/* Replace subregs. */ /* Replace subregs. */
nvptx_reorg_subreg (); nvptx_reorg_subreg ();
...@@ -2053,6 +3152,26 @@ nvptx_vector_alignment (const_tree type) ...@@ -2053,6 +3152,26 @@ nvptx_vector_alignment (const_tree type)
return MIN (align, BIGGEST_ALIGNMENT); return MIN (align, BIGGEST_ALIGNMENT);
} }
/* Return true when INSN is one of the insns that cannot be
   duplicated: the shuffles, the barrier and the four fork/join
   markers.  Every other insn may be copied.  */
static bool
nvptx_cannot_copy_insn_p (rtx_insn *insn)
{
  bool no_copy;

  switch (recog_memoized (insn))
    {
    default:
      no_copy = false;
      break;

    case CODE_FOR_nvptx_shufflesi:
    case CODE_FOR_nvptx_shufflesf:
    case CODE_FOR_nvptx_barsync:
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
      no_copy = true;
      break;
    }

  return no_copy;
}
/* Record a symbol for mkoffload to enter into the mapping table. */ /* Record a symbol for mkoffload to enter into the mapping table. */
static void static void
...@@ -2129,6 +3248,19 @@ nvptx_file_end (void) ...@@ -2129,6 +3248,19 @@ nvptx_file_end (void)
FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter) FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
nvptx_record_fndecl (decl, true); nvptx_record_fndecl (decl, true);
fputs (func_decls.str().c_str(), asm_out_file); fputs (func_decls.str().c_str(), asm_out_file);
if (worker_bcast_size)
{
/* Define the broadcast buffer. */
worker_bcast_size = (worker_bcast_size + worker_bcast_align - 1)
& ~(worker_bcast_align - 1);
fprintf (asm_out_file, "// BEGIN VAR DEF: %s\n", worker_bcast_name);
fprintf (asm_out_file, ".shared .align %d .u8 %s[%d];\n",
worker_bcast_align,
worker_bcast_name, worker_bcast_size);
}
} }
/* Validate compute dimensions of an OpenACC offload or routine, fill /* Validate compute dimensions of an OpenACC offload or routine, fill
...@@ -2141,12 +3273,32 @@ nvptx_goacc_validate_dims (tree ARG_UNUSED (decl), int *ARG_UNUSED (dims), ...@@ -2141,12 +3273,32 @@ nvptx_goacc_validate_dims (tree ARG_UNUSED (decl), int *ARG_UNUSED (dims),
{ {
bool changed = false; bool changed = false;
/* TODO: Leave dimensions unaltered. Partitioned execution needs /* TODO: Leave dimensions unaltered. Reductions need
porting before filtering dimensions makes sense. */ porting before filtering dimensions makes sense. */
return changed; return changed;
} }
/* Decide whether the fork or join CALL is needed.  The axis is read
   from the call's third argument and DIMS holds the size of each
   axis.  A fork/join is wanted only for the worker and vector axes,
   and only when that axis has a size other than 1.  */
static bool
nvptx_goacc_fork_join (gcall *call, const int dims[],
                       bool ARG_UNUSED (is_fork))
{
  tree axis_arg = gimple_call_arg (call, 2);
  unsigned axis = TREE_INT_CST_LOW (axis_arg);

  /* Axes below the worker axis are of no interest, and an axis of
     size 1 is not partitioned.  */
  return axis >= GOMP_DIM_WORKER && dims[axis] != 1;
}
#undef TARGET_OPTION_OVERRIDE #undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override #define TARGET_OPTION_OVERRIDE nvptx_option_override
...@@ -2233,9 +3385,15 @@ nvptx_goacc_validate_dims (tree ARG_UNUSED (decl), int *ARG_UNUSED (dims), ...@@ -2233,9 +3385,15 @@ nvptx_goacc_validate_dims (tree ARG_UNUSED (decl), int *ARG_UNUSED (dims),
#undef TARGET_VECTOR_ALIGNMENT #undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
#undef TARGET_GOACC_VALIDATE_DIMS #undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
struct gcc_target targetm = TARGET_INITIALIZER; struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-nvptx.h" #include "gt-nvptx.h"
...@@ -230,6 +230,7 @@ struct GTY(()) machine_function ...@@ -230,6 +230,7 @@ struct GTY(()) machine_function
HOST_WIDE_INT outgoing_stdarg_size; HOST_WIDE_INT outgoing_stdarg_size;
int ret_reg_mode; /* machine_mode not defined yet. */ int ret_reg_mode; /* machine_mode not defined yet. */
int punning_buffer_size; int punning_buffer_size;
rtx axis_predicate[2];
}; };
#endif #endif
......
...@@ -49,14 +49,27 @@ ...@@ -49,14 +49,27 @@
UNSPEC_ALLOCA UNSPEC_ALLOCA
UNSPEC_NTID UNSPEC_DIM_SIZE
UNSPEC_TID
UNSPEC_SHARED_DATA
UNSPEC_BIT_CONV
UNSPEC_SHUFFLE
UNSPEC_BR_UNIFIED
]) ])
(define_c_enum "unspecv" [ (define_c_enum "unspecv" [
UNSPECV_LOCK UNSPECV_LOCK
UNSPECV_CAS UNSPECV_CAS
UNSPECV_XCHG UNSPECV_XCHG
UNSPECV_BARSYNC
UNSPECV_DIM_POS
UNSPECV_FORK
UNSPECV_FORKED
UNSPECV_JOINING
UNSPECV_JOIN
]) ])
(define_attr "subregs_ok" "false,true" (define_attr "subregs_ok" "false,true"
...@@ -246,6 +259,8 @@ ...@@ -246,6 +259,8 @@
(define_mode_iterator QHSIM [QI HI SI]) (define_mode_iterator QHSIM [QI HI SI])
(define_mode_iterator SDFM [SF DF]) (define_mode_iterator SDFM [SF DF])
(define_mode_iterator SDCM [SC DC]) (define_mode_iterator SDCM [SC DC])
(define_mode_iterator BITS [SI SF])
(define_mode_iterator BITD [DI DF])
;; This mode iterator allows :P to be used for patterns that operate on ;; This mode iterator allows :P to be used for patterns that operate on
;; pointer-sized quantities. Exactly one of the two alternatives will match. ;; pointer-sized quantities. Exactly one of the two alternatives will match.
...@@ -817,6 +832,23 @@ ...@@ -817,6 +832,23 @@
"" ""
"%J0\\tbra\\t%l1;") "%J0\\tbra\\t%l1;")
;; unified conditional branch
;; Jump to the label in operand 1 when the BImode register in operand 0,
;; wrapped in UNSPEC_BR_UNIFIED, is non-zero; otherwise fall through.
;; The output template uses the "bra.uni" mnemonic.
;; NOTE(review): %j0 presumably prints the predicate guard -- confirm
;; against nvptx_print_operand.
(define_insn "br_true_uni"
[(set (pc) (if_then_else
(ne (unspec:BI [(match_operand:BI 0 "nvptx_register_operand" "R")]
UNSPEC_BR_UNIFIED) (const_int 0))
(label_ref (match_operand 1 "" "")) (pc)))]
""
"%j0\\tbra.uni\\t%l1;")
;; Unified conditional branch, inverted sense: jump to the label in
;; operand 1 when the BImode register in operand 0, wrapped in
;; UNSPEC_BR_UNIFIED, equals zero; otherwise fall through.  The output
;; template uses the "bra.uni" mnemonic.
;; NOTE(review): %J0 presumably prints the negated predicate guard --
;; confirm against nvptx_print_operand.
(define_insn "br_false_uni"
[(set (pc) (if_then_else
(eq (unspec:BI [(match_operand:BI 0 "nvptx_register_operand" "R")]
UNSPEC_BR_UNIFIED) (const_int 0))
(label_ref (match_operand 1 "" "")) (pc)))]
""
"%J0\\tbra.uni\\t%l1;")
(define_expand "cbranch<mode>4" (define_expand "cbranch<mode>4"
[(set (pc) [(set (pc)
(if_then_else (match_operator 0 "nvptx_comparison_operator" (if_then_else (match_operator 0 "nvptx_comparison_operator"
...@@ -1308,36 +1340,134 @@ ...@@ -1308,36 +1340,134 @@
DONE; DONE;
}) })
(define_insn "*oacc_ntid_insn" (define_insn "oacc_dim_size"
[(set (match_operand:SI 0 "nvptx_register_operand" "=R") [(set (match_operand:SI 0 "nvptx_register_operand" "")
(unspec:SI [(match_operand:SI 1 "const_int_operand" "n")] UNSPEC_NTID))] (unspec:SI [(match_operand:SI 1 "const_int_operand" "")]
UNSPEC_DIM_SIZE))]
"" ""
"%.\\tmov.u32 %0, %%ntid%d1;") {
static const char *const asms[] =
{ /* Must match oacc_loop_levels ordering. */
"%.\\tmov.u32\\t%0, %%nctaid.x;", /* gang */
"%.\\tmov.u32\\t%0, %%ntid.y;", /* worker */
"%.\\tmov.u32\\t%0, %%ntid.x;", /* vector */
};
return asms[INTVAL (operands[1])];
})
(define_expand "oacc_ntid" (define_insn "oacc_dim_pos"
[(set (match_operand:SI 0 "nvptx_register_operand" "") [(set (match_operand:SI 0 "nvptx_register_operand" "")
(unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_NTID))] (unspec_volatile:SI [(match_operand:SI 1 "const_int_operand" "")]
UNSPECV_DIM_POS))]
"" ""
{ {
if (INTVAL (operands[1]) < 0 || INTVAL (operands[1]) > 2) static const char *const asms[] =
FAIL; { /* Must match oacc_loop_levels ordering. */
"%.\\tmov.u32\\t%0, %%ctaid.x;", /* gang */
"%.\\tmov.u32\\t%0, %%tid.y;", /* worker */
"%.\\tmov.u32\\t%0, %%tid.x;", /* vector */
};
return asms[INTVAL (operands[1])];
}) })
(define_insn "*oacc_tid_insn" (define_insn "nvptx_fork"
[(set (match_operand:SI 0 "nvptx_register_operand" "=R") [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
(unspec:SI [(match_operand:SI 1 "const_int_operand" "n")] UNSPEC_TID))] UNSPECV_FORK)]
"" ""
"%.\\tmov.u32 %0, %%tid%d1;") "// fork %0;"
)
(define_expand "oacc_tid" (define_insn "nvptx_forked"
[(set (match_operand:SI 0 "nvptx_register_operand" "") [(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
(unspec:SI [(match_operand:SI 1 "const_int_operand" "")] UNSPEC_TID))] UNSPECV_FORKED)]
""
"// forked %0;"
)
;; "joining" marker insn.  It is a volatile unspec (UNSPECV_JOINING)
;; carrying one constant integer operand, and its output template is
;; just a "//" line that prints that operand.
(define_insn "nvptx_joining"
[(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
UNSPECV_JOINING)]
""
"// joining %0;"
)
;; "join" marker insn.  It is a volatile unspec (UNSPECV_JOIN)
;; carrying one constant integer operand, and its output template is
;; just a "//" line that prints that operand.
(define_insn "nvptx_join"
[(unspec_volatile:SI [(match_operand:SI 0 "const_int_operand" "")]
UNSPECV_JOIN)]
""
"// join %0;"
)
(define_expand "oacc_fork"
[(set (match_operand:SI 0 "nvptx_nonmemory_operand" "")
(match_operand:SI 1 "nvptx_general_operand" ""))
(unspec_volatile:SI [(match_operand:SI 2 "const_int_operand" "")]
UNSPECV_FORKED)]
"" ""
{ {
if (INTVAL (operands[1]) < 0 || INTVAL (operands[1]) > 2) if (operands[0] != const0_rtx)
FAIL; emit_move_insn (operands[0], operands[1]);
nvptx_expand_oacc_fork (INTVAL (operands[2]));
DONE;
})
;; Expander for an OpenACC join.  Operand 0 is the destination,
;; operand 1 the source and operand 2 a constant integer that is handed
;; to nvptx_expand_oacc_join.  The RTL template is never emitted as
;; such: the preparation code does all the work and ends with DONE.
(define_expand "oacc_join"
[(set (match_operand:SI 0 "nvptx_nonmemory_operand" "")
(match_operand:SI 1 "nvptx_general_operand" ""))
(unspec_volatile:SI [(match_operand:SI 2 "const_int_operand" "")]
UNSPECV_JOIN)]
""
{
;; Copy operand 1 into operand 0 unless the destination is const0_rtx.
if (operands[0] != const0_rtx)
emit_move_insn (operands[0], operands[1]);
nvptx_expand_oacc_join (INTVAL (operands[2]));
DONE;
})
;; only 32-bit shuffles exist.
;; Operand 0 receives the shuffled value of register operand 1; both
;; use the BITS mode iterator.  Operand 2 is an SImode register or
;; immediate and operand 3 an SImode constant that is printed through
;; the %S modifier as a suffix of "shfl".  The last argument in the
;; template is the literal 31.
;; NOTE(review): operand 3 presumably selects the shuffle kind
;; (up/down/bfly/idx) -- confirm against the 'S' case of
;; nvptx_print_operand.
(define_insn "nvptx_shuffle<mode>"
[(set (match_operand:BITS 0 "nvptx_register_operand" "=R")
(unspec:BITS
[(match_operand:BITS 1 "nvptx_register_operand" "R")
(match_operand:SI 2 "nvptx_nonmemory_operand" "Ri")
(match_operand:SI 3 "const_int_operand" "n")]
UNSPEC_SHUFFLE))]
""
"%.\\tshfl%S3.b32\\t%0, %1, %2, 31;")
;; extract parts of a 64 bit object into 2 32-bit ints
;; Operand 2 is the source register (BITD mode iterator); operands 0
;; and 1 are the two SImode destination registers, tagged in the RTL
;; with part numbers 0 and 1 respectively.  A single "mov.b64" with a
;; {%0,%1} destination pair is output.
(define_insn "unpack<mode>si2"
[(set (match_operand:SI 0 "nvptx_register_operand" "=R")
(unspec:SI [(match_operand:BITD 2 "nvptx_register_operand" "R")
(const_int 0)] UNSPEC_BIT_CONV))
(set (match_operand:SI 1 "nvptx_register_operand" "=R")
(unspec:SI [(match_dup 2) (const_int 1)] UNSPEC_BIT_CONV))]
""
"%.\\tmov.b64\\t{%0,%1}, %2;")
;; pack 2 32-bit ints into a 64 bit object
;; Operands 1 and 2 are the two SImode source registers and operand 0
;; is the destination register (BITD mode iterator).  A single
;; "mov.b64" with a {%1,%2} source pair is output.
(define_insn "packsi<mode>2"
[(set (match_operand:BITD 0 "nvptx_register_operand" "=R")
(unspec:BITD [(match_operand:SI 1 "nvptx_register_operand" "R")
(match_operand:SI 2 "nvptx_register_operand" "R")]
UNSPEC_BIT_CONV))]
""
"%.\\tmov.b64\\t%0, {%1,%2};")
;; Load register operand 0 from memory operand 1.  The memory operand
;; is wrapped in UNSPEC_SHARED_DATA and the insn is output as an
;; "ld.shared" instruction.
(define_insn "worker_load<mode>"
[(set (match_operand:SDISDFM 0 "nvptx_register_operand" "=R")
(unspec:SDISDFM [(match_operand:SDISDFM 1 "memory_operand" "m")]
UNSPEC_SHARED_DATA))]
""
"%.\\tld.shared%u0\\t%0, %1;")
;; Store register operand 0 into memory operand 1.  Note the operand
;; numbering: the memory destination is operand 1 and the register
;; source is operand 0.  The memory operand is wrapped in
;; UNSPEC_SHARED_DATA and the insn is output as a "st.shared"
;; instruction.
(define_insn "worker_store<mode>"
[(set (unspec:SDISDFM [(match_operand:SDISDFM 1 "memory_operand" "=m")]
UNSPEC_SHARED_DATA)
(match_operand:SDISDFM 0 "nvptx_register_operand" "R"))]
""
"%.\\tst.shared%u1\\t%1, %0;")
;; Atomic insns. ;; Atomic insns.
(define_expand "atomic_compare_and_swap<mode>" (define_expand "atomic_compare_and_swap<mode>"
...@@ -1423,3 +1553,9 @@ ...@@ -1423,3 +1553,9 @@
(match_dup 1))] (match_dup 1))]
"0" "0"
"%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;") "%.\\tatom%A1.b%T0.<logic>\\t%0, %1, %2;")
;; Barrier insn.  It is a volatile unspec (UNSPECV_BARSYNC) whose
;; single constant integer operand is printed as the argument of
;; "bar.sync".  Unlike the neighbouring patterns, the template has no
;; leading "%." prefix.
(define_insn "nvptx_barsync"
[(unspec_volatile [(match_operand:SI 0 "const_int_operand" "")]
UNSPECV_BARSYNC)]
""
"\\tbar.sync\\t%0;")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment