Commit 080dc243 by Pekka Jääskeläinen

[BRIGFE] phsa-specific optimizations

Add flag -fassume-phsa that is on by default. If -fno-assume-phsa
is given, these optimizations are disabled.

With this flag, gccbrig can generate GENERIC that assumes we are
targeting a phsa-runtime based implementation, which allows us
to expose the work-item context accesses to retrieve WI IDs etc.
which helps optimizers.

First optimization that takes advantage of this is to get rid of
the setworkitemid calls whenever we have non-inlined calls that
use IDs internally.

Other optimizations added in this commit:

- expand absoluteid to similar level of simplicity as workitemid.
At the moment absoluteid is the best indexing ID to end up with
WG vectorization.
- propagate ID variables closer to their uses. This is mainly
to avoid known useless casts, which confuse at least scalar
evolution analysis.
- use signed long long for storing IDs. Unsigned integers have
defined wraparound semantics, which confuse at least scalar
evolution analysis, leading to unvectorizable WI loops.
- also refactor some BRIG function generation helpers to brig_function.
- no point in having the wi-loop as a for-loop. It's really
a do...while and SCEV can analyze it just fine still.
- add consts to ptrs etc. in BRIG builtin defs.
Improves optimization opportunities.
- add qualifiers to generated function parameters.
Const and restrict on the hidden local/private pointers,
the arg buffer and the context pointer help some optimizations.

From-SVN: r259957
parent 1e25c5a9
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
* brig-builtins.def: Add consts to ptrs etc. in BRIG builtin defs.
To improve optimization opportunities.
* builtin-types.def: The new needed builtin types for the above.
2018-05-04 Richard Biener <rguenther@suse.de> 2018-05-04 Richard Biener <rguenther@suse.de>
* bb-reorder.c (sanitize_hot_paths): Release hot_bbs_to_check. * bb-reorder.c (sanitize_hot_paths): Release hot_bbs_to_check.
......
...@@ -45,25 +45,25 @@ DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_GRIDSIZE, BRIG_OPCODE_GRIDSIZE, ...@@ -45,25 +45,25 @@ DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_GRIDSIZE, BRIG_OPCODE_GRIDSIZE,
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U32, DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U32,
BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U32, BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U32,
"__hsail_workitemflatabsid_u32", BT_FN_UINT_PTR, "__hsail_workitemflatabsid_u32", BT_FN_UINT_CONST_PTR,
ATTR_NOTHROW_LEAF_LIST) ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U64, DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U64,
BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U64, BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U64,
"__hsail_workitemflatabsid_u64", BT_FN_ULONG_PTR, "__hsail_workitemflatabsid_u64", BT_FN_ULONG_CONST_PTR,
ATTR_NOTHROW_LEAF_LIST) ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATID, BRIG_OPCODE_WORKITEMFLATID, DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATID, BRIG_OPCODE_WORKITEMFLATID,
BRIG_TYPE_U32, "__hsail_workitemflatid", BT_FN_UINT_PTR, BRIG_TYPE_U32, "__hsail_workitemflatid", BT_FN_UINT_CONST_PTR,
ATTR_NOTHROW_LEAF_LIST) ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMID, BRIG_OPCODE_WORKITEMID, DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMID, BRIG_OPCODE_WORKITEMID,
BRIG_TYPE_U32, "__hsail_workitemid", BT_FN_UINT_UINT_PTR, BRIG_TYPE_U32, "__hsail_workitemid",
ATTR_NOTHROW_LEAF_LIST) BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPID, BRIG_OPCODE_WORKGROUPID, DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPID, BRIG_OPCODE_WORKGROUPID,
BRIG_TYPE_U32, "__hsail_workgroupid", BT_FN_UINT_UINT_PTR, BRIG_TYPE_U32, "__hsail_workgroupid",
ATTR_PURE_NOTHROW_LEAF_LIST) BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKITEMFLATID, DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKITEMFLATID,
BRIG_OPCODE_CURRENTWORKITEMFLATID, BRIG_OPCODE_CURRENTWORKITEMFLATID,
...@@ -90,11 +90,12 @@ DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_PACKETCOMPLETIONSIG_SIG32, ...@@ -90,11 +90,12 @@ DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_PACKETCOMPLETIONSIG_SIG32,
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE, DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE,
BRIG_OPCODE_CURRENTWORKGROUPSIZE, BRIG_TYPE_U32, BRIG_OPCODE_CURRENTWORKGROUPSIZE, BRIG_TYPE_U32,
"__hsail_currentworkgroupsize", BT_FN_UINT_UINT_PTR, "__hsail_currentworkgroupsize", BT_FN_UINT_UINT_CONST_PTR,
ATTR_PURE_NOTHROW_LEAF_LIST) ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPSIZE, BRIG_OPCODE_WORKGROUPSIZE, DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPSIZE, BRIG_OPCODE_WORKGROUPSIZE,
BRIG_TYPE_U32, "__hsail_workgroupsize", BT_FN_UINT_UINT_PTR, BRIG_TYPE_U32, "__hsail_workgroupsize",
BT_FN_UINT_UINT_CONST_PTR,
ATTR_PURE_NOTHROW_LEAF_LIST) ATTR_PURE_NOTHROW_LEAF_LIST)
DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_DIM, BRIG_OPCODE_DIM, DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_DIM, BRIG_OPCODE_DIM,
...@@ -565,7 +566,7 @@ DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_SETWORKITEMID, "__hsail_setworkitemid", ...@@ -565,7 +566,7 @@ DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_SETWORKITEMID, "__hsail_setworkitemid",
DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_WG_FUNC, DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_WG_FUNC,
"__hsail_launch_wg_function", "__hsail_launch_wg_function",
BT_FN_VOID_PTR_PTR_PTR, ATTR_NOTHROW_LEAF_LIST) BT_FN_VOID_PTR_PTR_UINT32, ATTR_NOTHROW_LEAF_LIST)
DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_KERNEL, DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_KERNEL,
"__hsail_launch_kernel", "__hsail_launch_kernel",
......
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com> 2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
Add flag -fassume-phsa that is on by default. If -fno-assume-phsa
is given, these optimizations are disabled. With this flag, gccbrig
can generate GENERIC that assumes we are targeting a phsa-runtime
based implementation, which allows us to expose the work-item context
accesses to retrieve WI IDs etc. which helps optimizers.
First optimization that takes advantage of this is to get rid of
the setworkitemid calls whenever we have non-inlined calls that
use IDs internally. Other optimizations added in this commit:
- expand absoluteid to similar level of simplicity as workitemid.
At the moment absoluteid is the best indexing ID to end up with
WG vectorization.
- propagate ID variables closer to their uses. This is mainly
to avoid known useless casts, which confuse at least scalar
evolution analysis.
- use signed long long for storing IDs. Unsigned integers have
defined wraparound semantics, which confuse at least scalar
evolution analysis, leading to unvectorizable WI loops.
- also refactor some BRIG function generation helpers to brig_function.
- no point in having the wi-loop as a for-loop. It's really
a do...while and SCEV can analyze it just fine still.
- add consts to ptrs etc. in BRIG builtin defs.
Improves optimization opportunities.
- add qualifiers to generated function parameters.
Const and restrict on the hidden local/private pointers,
the arg buffer and the context pointer help some optimizations.
* brig/brigfrontend/brig-basic-inst-handler.cc: See above.
* brig/brigfrontend/brig-branch-inst-handler.cc: See above.
* brig/brigfrontend/brig-cmp-inst-handler.cc: See above.
* brig/brigfrontend/brig-code-entry-handler.cc: See above.
* brig/brigfrontend/brig-code-entry-handler.h: See above.
* brig/brigfrontend/brig-control-handler.cc: See above.
* brig/brigfrontend/brig-cvt-inst-handler.cc: See above.
* brig/brigfrontend/brig-function-handler.cc: See above.
* brig/brigfrontend/brig-function.cc: See above.
* brig/brigfrontend/brig-function.h: See above.
* brig/brigfrontend/brig-label-handler.cc: See above.
* brig/brigfrontend/brig-lane-inst-handler.cc: See above.
* brig/brigfrontend/brig-mem-inst-handler.cc: See above.
* brig/brigfrontend/phsa.h: See above.
* brig/lang.opt: See above.
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
* brig/brigfrontend/brig-function-handler.cc: Skip multiple forward * brig/brigfrontend/brig-function-handler.cc: Skip multiple forward
declarations of the same function. declarations of the same function.
......
...@@ -105,7 +105,8 @@ brig_basic_inst_handler::build_shuffle (tree arith_type, ...@@ -105,7 +105,8 @@ brig_basic_inst_handler::build_shuffle (tree arith_type,
/* Unpack the tightly packed mask elements to BIT_FIELD_REFs /* Unpack the tightly packed mask elements to BIT_FIELD_REFs
from which to construct the mask vector as understood by from which to construct the mask vector as understood by
VEC_PERM_EXPR. */ VEC_PERM_EXPR. */
tree mask_operand = add_temp_var ("shuffle_mask", operands[2]); tree mask_operand
= m_parent.m_cf->add_temp_var ("shuffle_mask", operands[2]);
tree mask_element_type tree mask_element_type
= build_nonstandard_integer_type (input_mask_element_size, true); = build_nonstandard_integer_type (input_mask_element_size, true);
...@@ -219,10 +220,11 @@ brig_basic_inst_handler::build_pack (tree_stl_vec &operands) ...@@ -219,10 +220,11 @@ brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
tree wide_type = build_nonstandard_integer_type (vecsize, 1); tree wide_type = build_nonstandard_integer_type (vecsize, 1);
tree src_vect = build_resize_convert_view (wide_type, operands[0]); tree src_vect = build_resize_convert_view (wide_type, operands[0]);
src_vect = add_temp_var ("src_vect", src_vect); src_vect = m_parent.m_cf->add_temp_var ("src_vect", src_vect);
tree scalar = operands[1]; tree scalar = operands[1];
scalar = add_temp_var ("scalar", convert_to_integer (wide_type, scalar)); scalar = m_parent.m_cf->add_temp_var ("scalar",
convert_to_integer (wide_type, scalar));
tree pos = operands[2]; tree pos = operands[2];
...@@ -230,21 +232,22 @@ brig_basic_inst_handler::build_pack (tree_stl_vec &operands) ...@@ -230,21 +232,22 @@ brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
Zero them for well-defined semantics. */ Zero them for well-defined semantics. */
tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2], tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
build_int_cstu (TREE_TYPE (pos), ecount - 1)); build_int_cstu (TREE_TYPE (pos), ecount - 1));
pos = add_temp_var ("pos", convert (wide_type, t)); pos = m_parent.m_cf->add_temp_var ("pos", convert (wide_type, t));
tree element_type = TREE_TYPE (TREE_TYPE (operands[0])); tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT; size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
tree ewidth = build_int_cstu (wide_type, element_width); tree ewidth = build_int_cstu (wide_type, element_width);
tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos); tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
bitoffset = add_temp_var ("offset", bitoffset); bitoffset = m_parent.m_cf->add_temp_var ("offset", bitoffset);
uint64_t mask_int uint64_t mask_int
= element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1; = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;
tree mask = build_int_cstu (wide_type, mask_int); tree mask = build_int_cstu (wide_type, mask_int);
mask = add_temp_var ("mask", convert_to_integer (wide_type, mask)); mask = m_parent.m_cf->add_temp_var ("mask",
convert_to_integer (wide_type, mask));
tree clearing_mask tree clearing_mask
= build1 (BIT_NOT_EXPR, wide_type, = build1 (BIT_NOT_EXPR, wide_type,
...@@ -311,7 +314,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode, ...@@ -311,7 +314,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
tree arith_type, tree arith_type,
tree_stl_vec &operands) tree_stl_vec &operands)
{ {
tree_code opcode = get_tree_code_for_hsa_opcode (brig_opcode, brig_type); tree_code opcode
= brig_function::get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK; BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;
...@@ -388,8 +392,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode, ...@@ -388,8 +392,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
on which cannot be used in general to remain HSAIL compliant. on which cannot be used in general to remain HSAIL compliant.
Perhaps a builtin call would be better option here. */ Perhaps a builtin call would be better option here. */
return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type), return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
expand_or_call_builtin (BRIG_OPCODE_SQRT, brig_type, m_parent.m_cf->expand_or_call_builtin
arith_type, operands)); (BRIG_OPCODE_SQRT, brig_type, arith_type, operands));
} }
else if (brig_opcode == BRIG_OPCODE_NRCP) else if (brig_opcode == BRIG_OPCODE_NRCP)
{ {
...@@ -410,8 +414,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode, ...@@ -410,8 +414,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
gcc_unreachable (); gcc_unreachable ();
} }
else if (opcode == CALL_EXPR) else if (opcode == CALL_EXPR)
return expand_or_call_builtin (brig_opcode, brig_type, arith_type, return m_parent.m_cf->expand_or_call_builtin (brig_opcode, brig_type,
operands); arith_type, operands);
else if (output_count == 1) else if (output_count == 1)
{ {
if (input_count == 1) if (input_count == 1)
...@@ -520,7 +524,8 @@ brig_basic_inst_handler::operator () (const BrigBase *base) ...@@ -520,7 +524,8 @@ brig_basic_inst_handler::operator () (const BrigBase *base)
in_operands[0] = build_lower_element_broadcast (in_operands[0]); in_operands[0] = build_lower_element_broadcast (in_operands[0]);
tree_code opcode tree_code opcode
= get_tree_code_for_hsa_opcode (brig_inst->opcode, brig_inst_type); = brig_function::get_tree_code_for_hsa_opcode (brig_inst->opcode,
brig_inst_type);
if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT) if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
{ {
...@@ -566,11 +571,11 @@ brig_basic_inst_handler::operator () (const BrigBase *base) ...@@ -566,11 +571,11 @@ brig_basic_inst_handler::operator () (const BrigBase *base)
*/ */
tree_stl_vec operand0_elements; tree_stl_vec operand0_elements;
if (input_count > 0) if (input_count > 0)
unpack (in_operands[0], operand0_elements); m_parent.m_cf->unpack (in_operands[0], operand0_elements);
tree_stl_vec operand1_elements; tree_stl_vec operand1_elements;
if (input_count > 1) if (input_count > 1)
unpack (in_operands[1], operand1_elements); m_parent.m_cf->unpack (in_operands[1], operand1_elements);
tree_stl_vec result_elements; tree_stl_vec result_elements;
...@@ -617,7 +622,7 @@ brig_basic_inst_handler::operator () (const BrigBase *base) ...@@ -617,7 +622,7 @@ brig_basic_inst_handler::operator () (const BrigBase *base)
result_elements.push_back (convert (scalar_type, scalar_expr)); result_elements.push_back (convert (scalar_type, scalar_expr));
} }
instr_expr = pack (result_elements); instr_expr = m_parent.m_cf->pack (result_elements);
} }
else else
{ {
...@@ -728,140 +733,3 @@ brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand) ...@@ -728,140 +733,3 @@ brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
vec_operand, mask); vec_operand, mask);
} }
/* Returns the tree code that should be used to implement the given
HSA instruction opcode (BRIG_OPCODE) for the given type of instruction
(BRIG_TYPE). In case the opcode cannot be mapped to a TREE node directly,
returns TREE_LIST (if it can be emulated with a simple chain of tree
nodes) or CALL_EXPR if the opcode should be implemented using a builtin
call. */
tree_code
brig_basic_inst_handler::get_tree_code_for_hsa_opcode
(BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
{
BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
switch (brig_opcode)
{
case BRIG_OPCODE_NOP:
return NOP_EXPR;
case BRIG_OPCODE_ADD:
return PLUS_EXPR;
case BRIG_OPCODE_CMOV:
if (brig_inner_type == brig_type)
return COND_EXPR;
else
return VEC_COND_EXPR;
case BRIG_OPCODE_SUB:
return MINUS_EXPR;
case BRIG_OPCODE_MUL:
case BRIG_OPCODE_MUL24:
return MULT_EXPR;
case BRIG_OPCODE_MULHI:
case BRIG_OPCODE_MUL24HI:
return MULT_HIGHPART_EXPR;
case BRIG_OPCODE_DIV:
if (gccbrig_is_float_type (brig_inner_type))
return RDIV_EXPR;
else
return TRUNC_DIV_EXPR;
case BRIG_OPCODE_NEG:
return NEGATE_EXPR;
case BRIG_OPCODE_MIN:
if (gccbrig_is_float_type (brig_inner_type))
return CALL_EXPR;
else
return MIN_EXPR;
case BRIG_OPCODE_MAX:
if (gccbrig_is_float_type (brig_inner_type))
return CALL_EXPR;
else
return MAX_EXPR;
case BRIG_OPCODE_FMA:
return FMA_EXPR;
case BRIG_OPCODE_ABS:
return ABS_EXPR;
case BRIG_OPCODE_SHL:
return LSHIFT_EXPR;
case BRIG_OPCODE_SHR:
return RSHIFT_EXPR;
case BRIG_OPCODE_OR:
return BIT_IOR_EXPR;
case BRIG_OPCODE_XOR:
return BIT_XOR_EXPR;
case BRIG_OPCODE_AND:
return BIT_AND_EXPR;
case BRIG_OPCODE_NOT:
return BIT_NOT_EXPR;
case BRIG_OPCODE_RET:
return RETURN_EXPR;
case BRIG_OPCODE_MOV:
case BRIG_OPCODE_LDF:
return MODIFY_EXPR;
case BRIG_OPCODE_LD:
case BRIG_OPCODE_ST:
return MEM_REF;
case BRIG_OPCODE_BR:
return GOTO_EXPR;
case BRIG_OPCODE_REM:
if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
return TRUNC_MOD_EXPR;
else
return CALL_EXPR;
case BRIG_OPCODE_NRCP:
case BRIG_OPCODE_NRSQRT:
/* Implement as 1/f (x). gcc should pattern detect that and
use a native instruction, if available, for it. */
return TREE_LIST;
case BRIG_OPCODE_FLOOR:
case BRIG_OPCODE_CEIL:
case BRIG_OPCODE_SQRT:
case BRIG_OPCODE_NSQRT:
case BRIG_OPCODE_RINT:
case BRIG_OPCODE_TRUNC:
case BRIG_OPCODE_POPCOUNT:
case BRIG_OPCODE_COPYSIGN:
case BRIG_OPCODE_NCOS:
case BRIG_OPCODE_NSIN:
case BRIG_OPCODE_NLOG2:
case BRIG_OPCODE_NEXP2:
case BRIG_OPCODE_NFMA:
/* Class has type B1 regardless of the float type, thus
the below builtin map search cannot find it. */
case BRIG_OPCODE_CLASS:
case BRIG_OPCODE_WORKITEMABSID:
return CALL_EXPR;
default:
/* Some BRIG opcodes can use the same builtins for unsigned and
signed types. Force these cases to unsigned types.
*/
if (brig_opcode == BRIG_OPCODE_BORROW
|| brig_opcode == BRIG_OPCODE_CARRY
|| brig_opcode == BRIG_OPCODE_LASTBIT
|| brig_opcode == BRIG_OPCODE_BITINSERT)
{
if (brig_type == BRIG_TYPE_S32)
brig_type = BRIG_TYPE_U32;
else if (brig_type == BRIG_TYPE_S64)
brig_type = BRIG_TYPE_U64;
}
builtin_map::const_iterator i
= s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
if (i != s_custom_builtins.end ())
return CALL_EXPR;
else if (s_custom_builtins.find
(std::make_pair (brig_opcode, brig_inner_type))
!= s_custom_builtins.end ())
return CALL_EXPR;
if (brig_inner_type == BRIG_TYPE_F16
&& s_custom_builtins.find
(std::make_pair (brig_opcode, BRIG_TYPE_F32))
!= s_custom_builtins.end ())
return CALL_EXPR;
break;
}
return TREE_LIST; /* Emulate using a chain of nodes. */
}
...@@ -119,10 +119,11 @@ brig_branch_inst_handler::operator () (const BrigBase *base) ...@@ -119,10 +119,11 @@ brig_branch_inst_handler::operator () (const BrigBase *base)
memory. */ memory. */
tree group_local_offset tree group_local_offset
= add_temp_var ("group_local_offset", = m_parent.m_cf->add_temp_var ("group_local_offset",
build_int_cst build_int_cst
(uint32_type_node, (uint32_type_node,
m_parent.m_cf->m_local_group_variables.size())); m_parent.m_cf->
m_local_group_variables.size()));
/* TODO: ensure the callee's frame is aligned! */ /* TODO: ensure the callee's frame is aligned! */
...@@ -152,6 +153,7 @@ brig_branch_inst_handler::operator () (const BrigBase *base) ...@@ -152,6 +153,7 @@ brig_branch_inst_handler::operator () (const BrigBase *base)
m_parent.m_cf->m_called_functions.push_back (func_ref); m_parent.m_cf->m_called_functions.push_back (func_ref);
if (DECL_EXTERNAL (func_ref)) if (DECL_EXTERNAL (func_ref))
m_parent.add_decl_call (call); m_parent.add_decl_call (call);
m_parent.m_cf->start_new_bb ();
return base->byteCount; return base->byteCount;
} }
...@@ -216,18 +218,21 @@ brig_branch_inst_handler::operator () (const BrigBase *base) ...@@ -216,18 +218,21 @@ brig_branch_inst_handler::operator () (const BrigBase *base)
ensure the barrier won't be duplicated or moved out of loops etc. ensure the barrier won't be duplicated or moved out of loops etc.
Like the 'noduplicate' of LLVM. Same goes for fbarriers. */ Like the 'noduplicate' of LLVM. Same goes for fbarriers. */
m_parent.m_cf->append_statement m_parent.m_cf->append_statement
(expand_or_call_builtin (brig_inst->opcode, BRIG_TYPE_NONE, NULL_TREE, (m_parent.m_cf->expand_or_call_builtin (brig_inst->opcode,
call_operands)); BRIG_TYPE_NONE, NULL_TREE,
call_operands));
} }
else if (brig_inst->opcode >= BRIG_OPCODE_ARRIVEFBAR else if (brig_inst->opcode >= BRIG_OPCODE_ARRIVEFBAR
&& brig_inst->opcode <= BRIG_OPCODE_WAITFBAR) && brig_inst->opcode <= BRIG_OPCODE_WAITFBAR)
{ {
m_parent.m_cf->m_has_barriers = true; m_parent.m_cf->m_has_barriers = true;
m_parent.m_cf->append_statement m_parent.m_cf->append_statement
(expand_or_call_builtin (brig_inst->opcode, BRIG_TYPE_NONE, (m_parent.m_cf->expand_or_call_builtin (brig_inst->opcode,
uint32_type_node, operands)); BRIG_TYPE_NONE,
uint32_type_node, operands));
} }
else else
gcc_unreachable (); gcc_unreachable ();
m_parent.m_cf->start_new_bb ();
return base->byteCount; return base->byteCount;
} }
...@@ -180,17 +180,17 @@ brig_cmp_inst_handler::operator () (const BrigBase *base) ...@@ -180,17 +180,17 @@ brig_cmp_inst_handler::operator () (const BrigBase *base)
results, we must now truncate the result vector to S16s so it results, we must now truncate the result vector to S16s so it
fits to the destination register. We can build the target vector fits to the destination register. We can build the target vector
type from the f16 storage type (unsigned ints). */ type from the f16 storage type (unsigned ints). */
expr = add_temp_var ("wide_cmp_result", expr); expr = m_parent.m_cf->add_temp_var ("wide_cmp_result", expr);
tree_stl_vec wide_elements; tree_stl_vec wide_elements;
tree_stl_vec shrunk_elements; tree_stl_vec shrunk_elements;
unpack (expr, wide_elements); m_parent.m_cf->unpack (expr, wide_elements);
for (size_t i = 0; i < wide_elements.size (); ++i) for (size_t i = 0; i < wide_elements.size (); ++i)
{ {
tree wide = wide_elements.at (i); tree wide = wide_elements.at (i);
shrunk_elements.push_back shrunk_elements.push_back
(convert_to_integer (short_integer_type_node, wide)); (convert_to_integer (short_integer_type_node, wide));
} }
expr = pack (shrunk_elements); expr = m_parent.m_cf->pack (shrunk_elements);
} }
build_output_assignment (*inst_base, operands[0], expr); build_output_assignment (*inst_base, operands[0], expr);
......
...@@ -35,8 +35,6 @@ class tree_element_unary_visitor; ...@@ -35,8 +35,6 @@ class tree_element_unary_visitor;
class brig_code_entry_handler : public brig_entry_handler class brig_code_entry_handler : public brig_entry_handler
{ {
public: public:
typedef std::map<std::pair<BrigOpcode16_t, BrigType16_t>, tree> builtin_map;
brig_code_entry_handler (brig_to_generic &parent); brig_code_entry_handler (brig_to_generic &parent);
/* Handles the brig_code data at the given pointer and adds it to the /* Handles the brig_code data at the given pointer and adds it to the
...@@ -51,8 +49,6 @@ protected: ...@@ -51,8 +49,6 @@ protected:
tree get_tree_expr_type_for_hsa_type (BrigType16_t brig_type) const; tree get_tree_expr_type_for_hsa_type (BrigType16_t brig_type) const;
tree get_tree_cst_for_hsa_operand (const BrigOperandConstantBytes *brigConst, tree get_tree_cst_for_hsa_operand (const BrigOperandConstantBytes *brigConst,
tree type) const; tree type) const;
tree get_builtin_for_hsa_opcode (tree type, BrigOpcode16_t brig_opcode,
BrigType16_t brig_type) const;
tree get_comparison_result_type (tree source_type); tree get_comparison_result_type (tree source_type);
tree build_code_ref (const BrigBase &ref); tree build_code_ref (const BrigBase &ref);
...@@ -73,16 +69,6 @@ protected: ...@@ -73,16 +69,6 @@ protected:
bool needs_workitem_context_data (BrigOpcode16_t brig_opcode) const; bool needs_workitem_context_data (BrigOpcode16_t brig_opcode) const;
void unpack (tree value, tree_stl_vec &elements);
tree pack (tree_stl_vec &elements);
bool can_expand_builtin (BrigOpcode16_t brig_opcode) const;
tree expand_builtin (BrigOpcode16_t brig_opcode, tree_stl_vec &operands);
tree expand_or_call_builtin (BrigOpcode16_t brig_opcode,
BrigType16_t brig_type, tree arith_type,
tree_stl_vec &operands);
tree add_temp_var (std::string name, tree expr); tree add_temp_var (std::string name, tree expr);
tree build_f2h_conversion (tree source); tree build_f2h_conversion (tree source);
...@@ -100,10 +86,6 @@ protected: ...@@ -100,10 +86,6 @@ protected:
tree extend_int (tree input, tree dest_type, tree src_type); tree extend_int (tree input, tree dest_type, tree src_type);
/* HSAIL-specific builtin functions not yet integrated to gcc. */
static builtin_map s_custom_builtins;
private: private:
tree_stl_vec build_or_analyze_operands (const BrigInstBase &brig_inst, tree_stl_vec build_or_analyze_operands (const BrigInstBase &brig_inst,
...@@ -299,9 +281,6 @@ private: ...@@ -299,9 +281,6 @@ private:
tree build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode, tree arith_type, tree build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode, tree arith_type,
tree_stl_vec &operands); tree_stl_vec &operands);
tree_code get_tree_code_for_hsa_opcode (BrigOpcode16_t brig_opcode,
BrigType16_t brig_type) const;
}; };
class brig_cvt_inst_handler : public brig_inst_mod_handler class brig_cvt_inst_handler : public brig_inst_mod_handler
......
...@@ -53,45 +53,45 @@ brig_directive_control_handler::operator () (const BrigBase *base) ...@@ -53,45 +53,45 @@ brig_directive_control_handler::operator () (const BrigBase *base)
case BRIG_CONTROL_MAXDYNAMICGROUPSIZE: case BRIG_CONTROL_MAXDYNAMICGROUPSIZE:
{ {
m_parent.m_cf->m_descriptor.max_dynamic_group_size m_parent.m_cf->m_descriptor.max_dynamic_group_size
= int_constant_value (operands.at (0)); = brig_function::int_constant_value (operands.at (0));
break; break;
} }
case BRIG_CONTROL_MAXFLATGRIDSIZE: case BRIG_CONTROL_MAXFLATGRIDSIZE:
{ {
m_parent.m_cf->m_descriptor.max_flat_grid_size m_parent.m_cf->m_descriptor.max_flat_grid_size
= int_constant_value (operands.at (0)); = brig_function::int_constant_value (operands.at (0));
break; break;
} }
case BRIG_CONTROL_MAXFLATWORKGROUPSIZE: case BRIG_CONTROL_MAXFLATWORKGROUPSIZE:
{ {
m_parent.m_cf->m_descriptor.max_flat_workgroup_size m_parent.m_cf->m_descriptor.max_flat_workgroup_size
= int_constant_value (operands.at (0)); = brig_function::int_constant_value (operands.at (0));
break; break;
} }
case BRIG_CONTROL_REQUIREDDIM: case BRIG_CONTROL_REQUIREDDIM:
{ {
m_parent.m_cf->m_descriptor.required_dim m_parent.m_cf->m_descriptor.required_dim
= int_constant_value (operands.at (0)); = brig_function::int_constant_value (operands.at (0));
break; break;
} }
case BRIG_CONTROL_REQUIREDGRIDSIZE: case BRIG_CONTROL_REQUIREDGRIDSIZE:
{ {
m_parent.m_cf->m_descriptor.required_grid_size[0] m_parent.m_cf->m_descriptor.required_grid_size[0]
= int_constant_value (operands.at (0)); = brig_function::int_constant_value (operands.at (0));
m_parent.m_cf->m_descriptor.required_grid_size[1] m_parent.m_cf->m_descriptor.required_grid_size[1]
= int_constant_value (operands.at (1)); = brig_function::int_constant_value (operands.at (1));
m_parent.m_cf->m_descriptor.required_grid_size[2] m_parent.m_cf->m_descriptor.required_grid_size[2]
= int_constant_value (operands.at (2)); = brig_function::int_constant_value (operands.at (2));
break; break;
} }
case BRIG_CONTROL_REQUIREDWORKGROUPSIZE: case BRIG_CONTROL_REQUIREDWORKGROUPSIZE:
{ {
m_parent.m_cf->m_descriptor.required_workgroup_size[0] m_parent.m_cf->m_descriptor.required_workgroup_size[0]
= int_constant_value (operands.at (0)); = brig_function::int_constant_value (operands.at (0));
m_parent.m_cf->m_descriptor.required_workgroup_size[1] m_parent.m_cf->m_descriptor.required_workgroup_size[1]
= int_constant_value (operands.at (1)); = brig_function::int_constant_value (operands.at (1));
m_parent.m_cf->m_descriptor.required_workgroup_size[2] m_parent.m_cf->m_descriptor.required_workgroup_size[2]
= int_constant_value (operands.at (2)); = brig_function::int_constant_value (operands.at (2));
break; break;
} }
case BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS: case BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS:
......
...@@ -83,6 +83,12 @@ brig_cvt_inst_handler::generate (const BrigBase *base) ...@@ -83,6 +83,12 @@ brig_cvt_inst_handler::generate (const BrigBase *base)
tree &input = operands.at (1); tree &input = operands.at (1);
tree &output = operands.at (0); tree &output = operands.at (0);
if (m_parent.m_cf->is_id_val (input))
{
input = m_parent.m_cf->id_val (input);
src_type = TREE_TYPE (input);
}
size_t conv_src_size = int_size_in_bytes (src_type); size_t conv_src_size = int_size_in_bytes (src_type);
size_t conv_dst_size = int_size_in_bytes (dest_type); size_t conv_dst_size = int_size_in_bytes (dest_type);
size_t src_reg_size = int_size_in_bytes (TREE_TYPE (input)); size_t src_reg_size = int_size_in_bytes (TREE_TYPE (input));
......
...@@ -93,6 +93,25 @@ brig_directive_function_handler::operator () (const BrigBase *base) ...@@ -93,6 +93,25 @@ brig_directive_function_handler::operator () (const BrigBase *base)
represent HSAIL registers. */ represent HSAIL registers. */
tree bind_expr = build3 (BIND_EXPR, void_type_node, NULL, stmt_list, NULL); tree bind_expr = build3 (BIND_EXPR, void_type_node, NULL, stmt_list, NULL);
tree restrict_char_ptr
= build_qualified_type (build_pointer_type (char_type_node),
TYPE_QUAL_RESTRICT);
tree restrict_void_ptr
= build_qualified_type (build_pointer_type (void_type_node),
TYPE_QUAL_RESTRICT);
tree restrict_const_char_ptr
= build_qualified_type (build_pointer_type
(build_qualified_type (char_type_node,
TYPE_QUAL_CONST)),
TYPE_QUAL_RESTRICT);
tree restrict_const_void_ptr
= build_qualified_type (build_pointer_type
(build_qualified_type (void_type_node,
TYPE_QUAL_CONST)),
TYPE_QUAL_RESTRICT);
if (is_kernel) if (is_kernel)
{ {
tree name_identifier tree name_identifier
...@@ -107,12 +126,11 @@ brig_directive_function_handler::operator () (const BrigBase *base) ...@@ -107,12 +126,11 @@ brig_directive_function_handler::operator () (const BrigBase *base)
3) a void* parameter that contains the first flat address of the group 3) a void* parameter that contains the first flat address of the group
region allocated to the current work-group. */ region allocated to the current work-group. */
tree char_ptr_type_node = build_pointer_type (char_type_node);
fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier, fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
build_function_type_list (void_type_node, build_function_type_list (void_type_node,
char_ptr_type_node, restrict_const_char_ptr,
ptr_type_node, restrict_void_ptr,
ptr_type_node, NULL_TREE)); restrict_char_ptr, NULL_TREE));
SET_DECL_ASSEMBLER_NAME (fndecl, name_identifier); SET_DECL_ASSEMBLER_NAME (fndecl, name_identifier);
...@@ -125,9 +143,10 @@ brig_directive_function_handler::operator () (const BrigBase *base) ...@@ -125,9 +143,10 @@ brig_directive_function_handler::operator () (const BrigBase *base)
= gccbrig_get_target_addr_space_id (BRIG_SEGMENT_KERNARG); = gccbrig_get_target_addr_space_id (BRIG_SEGMENT_KERNARG);
tree arg_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL, tree arg_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
get_identifier ("__args"), char_ptr_type_node); get_identifier ("__args"),
restrict_const_char_ptr);
DECL_ARGUMENTS (fndecl) = arg_arg; DECL_ARGUMENTS (fndecl) = arg_arg;
DECL_ARG_TYPE (arg_arg) = char_ptr_type_node; DECL_ARG_TYPE (arg_arg) = restrict_const_char_ptr;
DECL_CONTEXT (arg_arg) = fndecl; DECL_CONTEXT (arg_arg) = fndecl;
DECL_ARTIFICIAL (arg_arg) = 1; DECL_ARTIFICIAL (arg_arg) = 1;
TREE_READONLY (arg_arg) = 1; TREE_READONLY (arg_arg) = 1;
...@@ -189,7 +208,7 @@ brig_directive_function_handler::operator () (const BrigBase *base) ...@@ -189,7 +208,7 @@ brig_directive_function_handler::operator () (const BrigBase *base)
if (arg_decls == NULL_TREE) if (arg_decls == NULL_TREE)
arg_decls = arg_var; arg_decls = arg_var;
else else
chainon (arg_decls, arg_var); arg_decls = chainon (arg_decls, arg_var);
m_parent.m_cf->add_arg_variable (brigVar, arg_var); m_parent.m_cf->add_arg_variable (brigVar, arg_var);
...@@ -230,18 +249,13 @@ brig_directive_function_handler::operator () (const BrigBase *base) ...@@ -230,18 +249,13 @@ brig_directive_function_handler::operator () (const BrigBase *base)
vec_safe_push (args, TREE_TYPE (arg_var)); vec_safe_push (args, TREE_TYPE (arg_var));
m_parent.m_cf->add_arg_variable (brigVar, arg_var); m_parent.m_cf->add_arg_variable (brigVar, arg_var);
arg_decls = chainon (arg_decls, arg_var);
if (arg_decls == NULL_TREE)
arg_decls = arg_var;
else
chainon (arg_decls, arg_var);
} }
} }
vec_safe_push (args, restrict_void_ptr);
vec_safe_push (args, ptr_type_node); vec_safe_push (args, restrict_char_ptr);
vec_safe_push (args, ptr_type_node); vec_safe_push (args, uint32_type_node);
vec_safe_push (args, ptr_type_node); vec_safe_push (args, restrict_char_ptr);
vec_safe_push (args, ptr_type_node);
fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier, fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
build_function_type_vec (ret_type, args)); build_function_type_vec (ret_type, args));
...@@ -254,26 +268,30 @@ brig_directive_function_handler::operator () (const BrigBase *base) ...@@ -254,26 +268,30 @@ brig_directive_function_handler::operator () (const BrigBase *base)
/* All functions need the hidden __context argument passed on /* All functions need the hidden __context argument passed on
because they might call WI-specific functions which need because they might call WI-specific functions which need
the context info. */ the context info. Only kernels can write it, if they need
to update the local ids in the work-item loop. */
tree context_arg_type
= true ? restrict_void_ptr : restrict_const_void_ptr;
tree context_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL, tree context_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
get_identifier ("__context"), ptr_type_node); get_identifier ("__context"),
if (DECL_ARGUMENTS (fndecl) == NULL_TREE) context_arg_type);
DECL_ARGUMENTS (fndecl) = context_arg; DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), context_arg);
else
chainon (DECL_ARGUMENTS (fndecl), context_arg);
DECL_CONTEXT (context_arg) = fndecl; DECL_CONTEXT (context_arg) = fndecl;
DECL_ARG_TYPE (context_arg) = ptr_type_node; DECL_ARG_TYPE (context_arg) = context_arg_type;
DECL_ARTIFICIAL (context_arg) = 1; DECL_ARTIFICIAL (context_arg) = 1;
TREE_READONLY (context_arg) = 1; TREE_READONLY (context_arg) = 1;
TREE_USED (context_arg) = 1; TREE_USED (context_arg) = 1;
m_parent.m_cf->m_context_arg = context_arg;
/* They can also access group memory, so we need to pass the /* They can also access group memory, so we need to pass the
group pointer along too. */ group pointer along too. */
tree group_base_arg tree group_base_arg
= build_decl (UNKNOWN_LOCATION, PARM_DECL, = build_decl (UNKNOWN_LOCATION, PARM_DECL,
get_identifier ("__group_base_addr"), ptr_type_node); get_identifier ("__group_base_addr"),
chainon (DECL_ARGUMENTS (fndecl), group_base_arg); restrict_char_ptr);
DECL_ARG_TYPE (group_base_arg) = ptr_type_node; DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), group_base_arg);
DECL_ARG_TYPE (group_base_arg) = restrict_char_ptr;
DECL_CONTEXT (group_base_arg) = fndecl; DECL_CONTEXT (group_base_arg) = fndecl;
DECL_ARTIFICIAL (group_base_arg) = 1; DECL_ARTIFICIAL (group_base_arg) = 1;
TREE_READONLY (group_base_arg) = 1; TREE_READONLY (group_base_arg) = 1;
...@@ -288,7 +306,7 @@ brig_directive_function_handler::operator () (const BrigBase *base) ...@@ -288,7 +306,7 @@ brig_directive_function_handler::operator () (const BrigBase *base)
tree group_local_offset_arg tree group_local_offset_arg
= build_decl (UNKNOWN_LOCATION, PARM_DECL, = build_decl (UNKNOWN_LOCATION, PARM_DECL,
get_identifier ("__group_local_offset"), uint32_type_node); get_identifier ("__group_local_offset"), uint32_type_node);
chainon (DECL_ARGUMENTS (fndecl), group_local_offset_arg); DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), group_local_offset_arg);
DECL_ARG_TYPE (group_local_offset_arg) = uint32_type_node; DECL_ARG_TYPE (group_local_offset_arg) = uint32_type_node;
DECL_CONTEXT (group_local_offset_arg) = fndecl; DECL_CONTEXT (group_local_offset_arg) = fndecl;
DECL_ARTIFICIAL (group_local_offset_arg) = 1; DECL_ARTIFICIAL (group_local_offset_arg) = 1;
...@@ -299,24 +317,25 @@ brig_directive_function_handler::operator () (const BrigBase *base) ...@@ -299,24 +317,25 @@ brig_directive_function_handler::operator () (const BrigBase *base)
/* Same for private. */ /* Same for private. */
tree private_base_arg tree private_base_arg
= build_decl (UNKNOWN_LOCATION, PARM_DECL, = build_decl (UNKNOWN_LOCATION, PARM_DECL,
get_identifier ("__private_base_addr"), ptr_type_node); get_identifier ("__private_base_addr"), restrict_char_ptr);
chainon (DECL_ARGUMENTS (fndecl), private_base_arg); DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), private_base_arg);
DECL_ARG_TYPE (private_base_arg) = ptr_type_node; DECL_ARG_TYPE (private_base_arg) = restrict_char_ptr;
DECL_CONTEXT (private_base_arg) = fndecl; DECL_CONTEXT (private_base_arg) = fndecl;
DECL_ARTIFICIAL (private_base_arg) = 1; DECL_ARTIFICIAL (private_base_arg) = 1;
TREE_READONLY (private_base_arg) = 1; TREE_READONLY (private_base_arg) = 1;
TREE_USED (private_base_arg) = 1; TREE_USED (private_base_arg) = 1;
m_parent.m_cf->m_private_base_arg = private_base_arg;
DECL_SAVED_TREE (fndecl) = bind_expr; DECL_SAVED_TREE (fndecl) = bind_expr;
set_externally_visible (fndecl);
if (base->kind == BRIG_KIND_DIRECTIVE_FUNCTION) if (base->kind == BRIG_KIND_DIRECTIVE_FUNCTION)
{ {
TREE_STATIC (fndecl) = 0; TREE_STATIC (fndecl) = 0;
TREE_PUBLIC (fndecl) = 1; TREE_PUBLIC (fndecl) = 1;
DECL_EXTERNAL (fndecl) = 0; DECL_EXTERNAL (fndecl) = 0;
DECL_DECLARED_INLINE_P (fndecl) = 1; DECL_DECLARED_INLINE_P (fndecl) = 1;
set_inline (fndecl);
set_externally_visible (fndecl);
} }
else if (base->kind == BRIG_KIND_DIRECTIVE_KERNEL) else if (base->kind == BRIG_KIND_DIRECTIVE_KERNEL)
{ {
...@@ -330,6 +349,7 @@ brig_directive_function_handler::operator () (const BrigBase *base) ...@@ -330,6 +349,7 @@ brig_directive_function_handler::operator () (const BrigBase *base)
TREE_STATIC (fndecl) = 0; TREE_STATIC (fndecl) = 0;
TREE_PUBLIC (fndecl) = 1; TREE_PUBLIC (fndecl) = 1;
DECL_EXTERNAL (fndecl) = 1; DECL_EXTERNAL (fndecl) = 1;
set_inline (fndecl);
} }
else if (base->kind == BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION) else if (base->kind == BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION)
{ {
...@@ -371,11 +391,8 @@ brig_directive_function_handler::operator () (const BrigBase *base) ...@@ -371,11 +391,8 @@ brig_directive_function_handler::operator () (const BrigBase *base)
} }
m_parent.start_function (fndecl); m_parent.start_function (fndecl);
m_parent.m_cf->m_func_decl = fndecl; m_parent.m_cf->m_func_decl = fndecl;
m_parent.m_cf->m_current_bind_expr = bind_expr; m_parent.m_cf->m_current_bind_expr = bind_expr;
m_parent.m_cf->m_context_arg = context_arg;
m_parent.m_cf->m_private_base_arg = private_base_arg;
if (ret_value != NULL_TREE && TREE_TYPE (ret_value) != void_type_node) if (ret_value != NULL_TREE && TREE_TYPE (ret_value) != void_type_node)
{ {
......
...@@ -105,6 +105,30 @@ public: ...@@ -105,6 +105,30 @@ public:
void analyze_calls (); void analyze_calls ();
tree expand_builtin (BrigOpcode16_t brig_opcode, tree_stl_vec &operands);
tree expand_or_call_builtin (BrigOpcode16_t brig_opcode,
BrigType16_t brig_type, tree arith_type,
tree_stl_vec &operands);
bool can_expand_builtin (BrigOpcode16_t brig_opcode) const;
tree get_builtin_for_hsa_opcode (tree type, BrigOpcode16_t brig_opcode,
BrigType16_t brig_type) const;
void unpack (tree value, tree_stl_vec &elements);
tree pack (tree_stl_vec &elements);
tree add_temp_var (std::string name, tree expr);
static bool needs_workitem_context_data (BrigOpcode16_t brig_opcode);
static HOST_WIDE_INT int_constant_value (tree node);
static tree_code get_tree_code_for_hsa_opcode (BrigOpcode16_t brig_opcode,
BrigType16_t brig_type);
void start_new_bb ();
void add_reg_var_update (tree reg_var, tree val);
bool is_id_val (tree reg_var);
tree id_val (tree reg_var);
const BrigDirectiveExecutable *m_brig_def; const BrigDirectiveExecutable *m_brig_def;
bool m_is_kernel; bool m_is_kernel;
...@@ -183,6 +207,11 @@ public: ...@@ -183,6 +207,11 @@ public:
tree m_wg_id_vars[3]; tree m_wg_id_vars[3];
tree m_wg_size_vars[3]; tree m_wg_size_vars[3];
tree m_grid_size_vars[3]; tree m_grid_size_vars[3];
/* Explicitly computed WG base for the absolute IDs which is used
as the initial value when looping that dimension. We update
the abs id with ++ to make it easy for the vectorizer. */
tree m_abs_id_base_vars[3];
tree m_abs_id_vars[3];
/* Set to true in case the kernel contains at least one dispatch packet /* Set to true in case the kernel contains at least one dispatch packet
(work-item ID-related) builtin call that could not be expanded to (work-item ID-related) builtin call that could not be expanded to
...@@ -219,6 +248,20 @@ private: ...@@ -219,6 +248,20 @@ private:
/* Bookkeeping for the different HSA registers and their tree declarations /* Bookkeeping for the different HSA registers and their tree declarations
for the currently generated function. */ for the currently generated function. */
reg_decl_index_entry *m_regs[BRIG_2_TREE_HSAIL_TOTAL_REG_COUNT]; reg_decl_index_entry *m_regs[BRIG_2_TREE_HSAIL_TOTAL_REG_COUNT];
/* Map for keeping book reads of ID variables, which can be propagated
to uses in address expressions to produce cleaner indexing functions
with unnecessary casts stripped off, etc. */
typedef std::map<tree, tree> id_val_map;
/* Keeps track of ID values alive in registers in the currently
processed BB. */
id_val_map m_id_val_defs;
/* HSAIL-specific builtin functions not yet integrated to gcc. */
typedef std::map<std::pair<BrigOpcode16_t, BrigType16_t>, tree> builtin_map;
static builtin_map s_custom_builtins;
}; };
#endif #endif
...@@ -31,7 +31,10 @@ brig_directive_label_handler::operator () (const BrigBase *base) ...@@ -31,7 +31,10 @@ brig_directive_label_handler::operator () (const BrigBase *base)
std::string label_str ((const char *) (label_name->bytes), std::string label_str ((const char *) (label_name->bytes),
label_name->byteCount); label_name->byteCount);
m_parent.m_cf->start_new_bb ();
tree stmt = build_stmt (LABEL_EXPR, m_parent.m_cf->label (label_str)); tree stmt = build_stmt (LABEL_EXPR, m_parent.m_cf->label (label_str));
m_parent.m_cf->append_statement (stmt); m_parent.m_cf->append_statement (stmt);
return base->byteCount; return base->byteCount;
} }
...@@ -59,7 +59,7 @@ brig_lane_inst_handler::operator () (const BrigBase *base) ...@@ -59,7 +59,7 @@ brig_lane_inst_handler::operator () (const BrigBase *base)
elements.push_back (zero_cst); elements.push_back (zero_cst);
elements.push_back (zero_cst); elements.push_back (zero_cst);
expr = pack (elements); expr = m_parent.m_cf->pack (elements);
} }
else if (inst.base.opcode == BRIG_OPCODE_ACTIVELANEPERMUTE) else if (inst.base.opcode == BRIG_OPCODE_ACTIVELANEPERMUTE)
{ {
......
...@@ -63,7 +63,7 @@ brig_mem_inst_handler::build_mem_access (const BrigInstBase *brig_inst, ...@@ -63,7 +63,7 @@ brig_mem_inst_handler::build_mem_access (const BrigInstBase *brig_inst,
{ {
/* Add a temporary variable so there won't be multiple /* Add a temporary variable so there won't be multiple
reads in case of vector unpack. */ reads in case of vector unpack. */
mem_ref = add_temp_var ("mem_read", mem_ref); mem_ref = m_parent.m_cf->add_temp_var ("mem_read", mem_ref);
return build_output_assignment (*brig_inst, data, mem_ref); return build_output_assignment (*brig_inst, data, mem_ref);
} }
else else
...@@ -95,8 +95,9 @@ brig_mem_inst_handler::operator () (const BrigBase *base) ...@@ -95,8 +95,9 @@ brig_mem_inst_handler::operator () (const BrigBase *base)
inputs.push_back (operands[1]); inputs.push_back (operands[1]);
inputs.push_back (align_opr); inputs.push_back (align_opr);
tree builtin_call tree builtin_call
= expand_or_call_builtin (BRIG_OPCODE_ALLOCA, BRIG_TYPE_U32, = m_parent.m_cf->expand_or_call_builtin (BRIG_OPCODE_ALLOCA,
uint32_type_node, inputs); BRIG_TYPE_U32,
uint32_type_node, inputs);
build_output_assignment (*brig_inst, operands[0], builtin_call); build_output_assignment (*brig_inst, operands[0], builtin_call);
m_parent.m_cf->m_has_allocas = true; m_parent.m_cf->m_has_allocas = true;
return base->byteCount; return base->byteCount;
......
...@@ -58,13 +58,22 @@ typedef struct __attribute__((__packed__)) ...@@ -58,13 +58,22 @@ typedef struct __attribute__((__packed__))
/* The prefix to use in the ELF section containing descriptor for /* The prefix to use in the ELF section containing descriptor for
a function. */ a function. */
#define PHSA_DESC_SECTION_PREFIX "phsa.desc." #define PHSA_DESC_SECTION_PREFIX "phsa.desc."
#define PHSA_HOST_DEF_PTR_PREFIX "__phsa.host_def." #define PHSA_HOST_DEF_PTR_PREFIX "__phsa.host_def."
/* The frontend error messages are parsed by the host runtime. Known /* The frontend error messages are parsed by the host runtime. Known
prefix strings are used to separate the different runtime error prefix strings are used to separate the different runtime error
codes. */ codes. */
#define PHSA_ERROR_PREFIX_INCOMPATIBLE_MODULE "Incompatible module: " #define PHSA_ERROR_PREFIX_INCOMPATIBLE_MODULE "Incompatible module: "
#define PHSA_ERROR_PREFIX_CORRUPTED_MODULE "Corrupted module: " #define PHSA_ERROR_PREFIX_CORRUPTED_MODULE "Corrupted module: "
/* Offsets of attributes in the PHSA context structs.
Used by -fphsa-wi-context-opt. */
#define PHSA_CONTEXT_OFFS_WI_IDS 0
#define PHSA_CONTEXT_OFFS_WG_IDS (PHSA_CONTEXT_OFFS_WI_IDS + 3 * 4)
#define PHSA_CONTEXT_WG_SIZES (PHSA_CONTEXT_OFFS_WG_IDS + 3 * 4)
#define PHSA_CONTEXT_CURRENT_WG_SIZES (PHSA_CONTEXT_WG_SIZES + 3 * 4)
#endif #endif
...@@ -31,6 +31,11 @@ BRIG Separate Alias(d) ...@@ -31,6 +31,11 @@ BRIG Separate Alias(d)
-dump= -dump=
BRIG Joined Alias(d) BRIG Joined Alias(d)
fassume-phsa
BRIG Report Var(flag_assume_phsa) Init(1) Optimization
Assume we are finalizing for phsa and its libhsail-rt. Enables additional
phsa-specific optimizations (default).
L L
BRIG Joined Separate BRIG Joined Separate
; Not documented ; Not documented
......
...@@ -283,7 +283,9 @@ DEF_FUNCTION_TYPE_1 (BT_FN_UINT_INT, BT_UINT, BT_INT) ...@@ -283,7 +283,9 @@ DEF_FUNCTION_TYPE_1 (BT_FN_UINT_INT, BT_UINT, BT_INT)
DEF_FUNCTION_TYPE_1 (BT_FN_UINT_ULONG, BT_UINT, BT_ULONG) DEF_FUNCTION_TYPE_1 (BT_FN_UINT_ULONG, BT_UINT, BT_ULONG)
DEF_FUNCTION_TYPE_1 (BT_FN_UINT_LONG, BT_UINT, BT_LONG) DEF_FUNCTION_TYPE_1 (BT_FN_UINT_LONG, BT_UINT, BT_LONG)
DEF_FUNCTION_TYPE_1 (BT_FN_UINT_PTR, BT_UINT, BT_PTR) DEF_FUNCTION_TYPE_1 (BT_FN_UINT_PTR, BT_UINT, BT_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_UINT_CONST_PTR, BT_UINT, BT_CONST_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_PTR, BT_ULONG, BT_PTR) DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_PTR, BT_ULONG, BT_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_CONST_PTR, BT_ULONG, BT_CONST_PTR)
DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_ULONG, BT_ULONG, BT_ULONG) DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_ULONG, BT_ULONG, BT_ULONG)
DEF_FUNCTION_TYPE_1 (BT_FN_ULONGLONG_ULONGLONG, BT_ULONGLONG, BT_ULONGLONG) DEF_FUNCTION_TYPE_1 (BT_FN_ULONGLONG_ULONGLONG, BT_ULONGLONG, BT_ULONGLONG)
DEF_FUNCTION_TYPE_1 (BT_FN_INT8_FLOAT, BT_INT8, BT_FLOAT) DEF_FUNCTION_TYPE_1 (BT_FN_INT8_FLOAT, BT_INT8, BT_FLOAT)
...@@ -480,6 +482,7 @@ DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_SIZE_CONST_VPTR, BT_BOOL, BT_SIZE, ...@@ -480,6 +482,7 @@ DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_SIZE_CONST_VPTR, BT_BOOL, BT_SIZE,
DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_INT_BOOL, BT_BOOL, BT_INT, BT_BOOL) DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_INT_BOOL, BT_BOOL, BT_INT, BT_BOOL)
DEF_FUNCTION_TYPE_2 (BT_FN_VOID_UINT_UINT, BT_VOID, BT_UINT, BT_UINT) DEF_FUNCTION_TYPE_2 (BT_FN_VOID_UINT_UINT, BT_VOID, BT_UINT, BT_UINT)
DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_PTR, BT_UINT, BT_UINT, BT_PTR) DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_PTR, BT_UINT, BT_UINT, BT_PTR)
DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_CONST_PTR, BT_UINT, BT_UINT, BT_CONST_PTR)
DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_SIZE, BT_PTR, BT_CONST_PTR, BT_SIZE) DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_SIZE, BT_PTR, BT_CONST_PTR, BT_SIZE)
DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_CONST_PTR, BT_PTR, BT_CONST_PTR, BT_CONST_PTR) DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_CONST_PTR, BT_PTR, BT_CONST_PTR, BT_CONST_PTR)
DEF_FUNCTION_TYPE_2 (BT_FN_VOID_PTRPTR_CONST_PTR, BT_VOID, BT_PTR_PTR, BT_CONST_PTR) DEF_FUNCTION_TYPE_2 (BT_FN_VOID_PTRPTR_CONST_PTR, BT_VOID, BT_PTR_PTR, BT_CONST_PTR)
...@@ -569,6 +572,7 @@ DEF_FUNCTION_TYPE_3 (BT_FN_VOID_DOUBLE_DOUBLEPTR_DOUBLEPTR, ...@@ -569,6 +572,7 @@ DEF_FUNCTION_TYPE_3 (BT_FN_VOID_DOUBLE_DOUBLEPTR_DOUBLEPTR,
DEF_FUNCTION_TYPE_3 (BT_FN_VOID_LONGDOUBLE_LONGDOUBLEPTR_LONGDOUBLEPTR, DEF_FUNCTION_TYPE_3 (BT_FN_VOID_LONGDOUBLE_LONGDOUBLEPTR_LONGDOUBLEPTR,
BT_VOID, BT_LONGDOUBLE, BT_LONGDOUBLE_PTR, BT_LONGDOUBLE_PTR) BT_VOID, BT_LONGDOUBLE, BT_LONGDOUBLE_PTR, BT_LONGDOUBLE_PTR)
DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_PTR, BT_VOID, BT_PTR, BT_PTR, BT_PTR) DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_PTR, BT_VOID, BT_PTR, BT_PTR, BT_PTR)
DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_UINT32, BT_VOID, BT_PTR, BT_PTR, BT_UINT32)
DEF_FUNCTION_TYPE_3 (BT_FN_INT_CONST_STRING_PTR_CONST_STRING_PTR_CONST_STRING, DEF_FUNCTION_TYPE_3 (BT_FN_INT_CONST_STRING_PTR_CONST_STRING_PTR_CONST_STRING,
BT_INT, BT_CONST_STRING, BT_PTR_CONST_STRING, BT_PTR_CONST_STRING) BT_INT, BT_CONST_STRING, BT_PTR_CONST_STRING, BT_PTR_CONST_STRING)
DEF_FUNCTION_TYPE_3 (BT_FN_INT_INT_CONST_STRING_VALIST_ARG, DEF_FUNCTION_TYPE_3 (BT_FN_INT_INT_CONST_STRING_VALIST_ARG,
......
2018-05-04 Carl Love <cel@us.ibm.com> 2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
* testsuite/brig.dg/test/gimple/smoke_test.hsail: Fix the test
to match the currently produced gimple.
2018-05-04 Carl Love <cel@us.ibm.com>
* gcc.target/powerpc/vsx-vector-6.h (foo): Add test for vec_max, * gcc.target/powerpc/vsx-vector-6.h (foo): Add test for vec_max,
vec_trunc. vec_trunc.
* gcc.target/powerpc/vsx-vector-6-le.c (dg-final): Update xvcmpeqdp, * gcc.target/powerpc/vsx-vector-6-le.c (dg-final): Update xvcmpeqdp,
......
...@@ -41,15 +41,15 @@ prog kernel &KernelWithBarrier(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr) ...@@ -41,15 +41,15 @@ prog kernel &KernelWithBarrier(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr)
}; };
/* The kernel function itself should have a fingerprint as follows */ /* The kernel function itself should have a fingerprint as follows */
/* _Kernel (unsigned char * __args, void * __context, void * __group_base_addr, void * __private_base_addr) */ /* _Kernel (const unsigned char * restrict __args, void * restrict __context, unsigned char * restrict __group_base_addr, unsigned int __group_local_offset, unsigned char * restrict __private_base_addr) */
/* { dg-final { scan-tree-dump "_Kernel \\\(unsigned char \\\* __args, void \\\* __context, void \\\* __group_base_addr, unsigned int __group_local_offset, void \\\* __private_base_addr\\\)" "gimple"} } */ /* { dg-final { scan-tree-dump "_Kernel \\\(const unsigned char \\\* restrict __args, void \\\* restrict __context, unsigned char \\\* restrict __group_base_addr, unsigned int __group_local_offset, unsigned char \\\* restrict __private_base_addr\\\)" "gimple"} } */
/* ld_kernarg: mem_read.0 = MEM[(unsigned long *)__args]; */ /* ld_kernarg: mem_read.0 = MEM[(unsigned long *)__args]; */
/* { dg-final { scan-tree-dump "mem_read.\[0-9\] = MEM\\\[\\\(unsigned long \\\*\\\)__args\\\];" "gimple"} } */ /* { dg-final { scan-tree-dump "mem_read.\[0-9\] = MEM\\\[\\\(unsigned long \\\*\\\)__args\\\];" "gimple"} } */
/* The latter ld_global_u32 should be visible as a pointer dereference (after pointer arithmetics on a temporary var): */ /* The latter ld_global_u32 should be visible as a pointer dereference (after pointer arithmetics on a temporary var): */
/* mem_read.2 = *D.1691; */ /* mem_read.2 = *D.1691; */
/* { dg-final { scan-tree-dump "mem_read.\[0-9\] = \\\*\[_0-9\]+;" "gimple"} } */ /* { dg-final { scan-tree-dump "mem_read.\[0-9\]+ = \\\*\[_0-9\]+;" "gimple"} } */
/* add_u32s should generate +operators */ /* add_u32s should generate +operators */
/* { dg-final { scan-tree-dump "s2 = s0 \\\+ s1;" "gimple"} } */ /* { dg-final { scan-tree-dump "s2 = s0 \\\+ s1;" "gimple"} } */
...@@ -71,8 +71,8 @@ prog kernel &KernelWithBarrier(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr) ...@@ -71,8 +71,8 @@ prog kernel &KernelWithBarrier(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr)
/* { dg-final { scan-tree-dump "if \\\(__local_z < __cur_wg_size_z\\\) goto __wi_loop_z; else goto" "gimple"} } */ /* { dg-final { scan-tree-dump "if \\\(__local_z < __cur_wg_size_z\\\) goto __wi_loop_z; else goto" "gimple"} } */
/* The launcher should call __hsail_launch_wg_function in this case: */ /* The launcher should call __hsail_launch_wg_function in this case: */
/* Kernel (void * __context, void * __group_base_addr) */ /* Kernel (void * restrict __context, unsigned char * restrict __group_base_addr) */
/* { dg-final { scan-tree-dump "Kernel \\\(void \\\* __context, void \\\* __group_base_addr\\\)" "gimple"} } */ /* { dg-final { scan-tree-dump "Kernel \\\(void \\\* restrict __context, unsigned char \\\* restrict __group_base_addr\\\)" "gimple"} } */
/* { dg-final { scan-tree-dump "__hsail_launch_wg_function \\\(_Kernel, __context, __group_base_addr, group_local_offset.*\\\);" "gimple"} }*/ /* { dg-final { scan-tree-dump "__hsail_launch_wg_function \\\(_Kernel, __context, __group_base_addr, group_local_offset.*\\\);" "gimple"} }*/
/* The kernel should have the magic metadata section injected to the ELF. */ /* The kernel should have the magic metadata section injected to the ELF. */
......
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com> 2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
* include/internal/phsa-rt.h: Whitespace cleanup.
* include/internal/workitems.h: Store work item ID data to easily
accessible locations.
* rt/workitems.c: Same.
2018-05-04 Pekka Jääskeläinen <pekka.jaaskelainen@parmance.com>
* rt/workitems.c: Fix an alloca stack underflow. * rt/workitems.c: Fix an alloca stack underflow.
2018-04-18 David Malcolm <dmalcolm@redhat.com> 2018-04-18 David Malcolm <dmalcolm@redhat.com>
......
...@@ -54,7 +54,6 @@ typedef void (*gccbrigKernelFunc) (unsigned char *, void *, void *, uint32_t, ...@@ -54,7 +54,6 @@ typedef void (*gccbrigKernelFunc) (unsigned char *, void *, void *, uint32_t,
*/ */
typedef struct typedef struct
{ {
/* Data set by the HSA Runtime's kernel launcher. */ /* Data set by the HSA Runtime's kernel launcher. */
hsa_kernel_dispatch_packet_t *dp; hsa_kernel_dispatch_packet_t *dp;
......
...@@ -45,11 +45,6 @@ ...@@ -45,11 +45,6 @@
typedef struct typedef struct
{ {
/* The group id of the currently executed WG. */
size_t x;
size_t y;
size_t z;
/* This is 1 in case there are more work groups to execute. /* This is 1 in case there are more work groups to execute.
If 0, the work-item threads should finish themselves. */ If 0, the work-item threads should finish themselves. */
int more_wgs; int more_wgs;
...@@ -89,6 +84,16 @@ typedef struct ...@@ -89,6 +84,16 @@ typedef struct
stack frame. Initialized to point outside the private segment. */ stack frame. Initialized to point outside the private segment. */
uint32_t alloca_frame_p; uint32_t alloca_frame_p;
/* The group id of the currently executed WG. This is for fiber based
execution. The group ids are duplicated also to the per WI context
struct for simplified single pointer access in the GCCBRIG produced
code.
*/
uint32_t x;
uint32_t y;
uint32_t z;
} PHSAWorkGroup; } PHSAWorkGroup;
/* Data identifying a single work-item, passed to the work-item thread in case /* Data identifying a single work-item, passed to the work-item thread in case
...@@ -96,17 +101,42 @@ typedef struct ...@@ -96,17 +101,42 @@ typedef struct
typedef struct typedef struct
{ {
/* NOTE: These members STARTing here should not be moved as they are
accessed directly by code emitted by BRIG FE. */
/* The local id of the current WI. */
uint32_t x;
uint32_t y;
uint32_t z;
/* The group id of the currently executed WG. */
uint32_t group_x;
uint32_t group_y;
uint32_t group_z;
/* The local size of a complete WG. */
uint32_t wg_size_x;
uint32_t wg_size_y;
uint32_t wg_size_z;
/* The local size of the current WG. */
uint32_t cur_wg_size_x;
uint32_t cur_wg_size_y;
uint32_t cur_wg_size_z;
/* NOTE: Fixed members END here. */
PHSAKernelLaunchData *launch_data; PHSAKernelLaunchData *launch_data;
/* Identifies and keeps book of the currently executed WG of the WI swarm. */ /* Identifies and keeps book of the currently executed WG of the WI swarm. */
volatile PHSAWorkGroup *wg; volatile PHSAWorkGroup *wg;
/* The local id of the current WI. */
size_t x;
size_t y;
size_t z;
#ifdef HAVE_FIBERS #ifdef HAVE_FIBERS
fiber_t fiber; fiber_t fiber;
#endif #endif
} PHSAWorkItem; } __attribute__((packed)) PHSAWorkItem;
#endif #endif
...@@ -107,11 +107,20 @@ phsa_work_item_thread (int arg0, int arg1) ...@@ -107,11 +107,20 @@ phsa_work_item_thread (int arg0, int arg1)
the current_work_group_* is set to point to the WG executed next. */ the current_work_group_* is set to point to the WG executed next. */
if (!wi->wg->more_wgs) if (!wi->wg->more_wgs)
break; break;
wi->group_x = wg->x;
wi->group_y = wg->y;
wi->group_z = wg->z;
wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
#ifdef DEBUG_PHSA_RT #ifdef DEBUG_PHSA_RT
printf ( printf (
"Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n", "Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
wi->x, wi->y, wi->z, wg->x, wg->y, wg->z, l_data->wg_max_x, wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z,
l_data->wg_max_y, l_data->wg_max_z); l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z);
#endif #endif
if (wi->x < __hsail_currentworkgroupsize (0, wi) if (wi->x < __hsail_currentworkgroupsize (0, wi)
...@@ -180,6 +189,13 @@ phsa_work_item_thread (int arg0, int arg1) ...@@ -180,6 +189,13 @@ phsa_work_item_thread (int arg0, int arg1)
else else
wg->x++; wg->x++;
#endif #endif
wi->group_x = wg->x;
wi->group_y = wg->y;
wi->group_z = wg->z;
wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
/* Reinitialize the work-group barrier according to the new WG's /* Reinitialize the work-group barrier according to the new WG's
size, which might not be the same as the previous ones, due size, which might not be the same as the previous ones, due
...@@ -233,6 +249,7 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr, ...@@ -233,6 +249,7 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
PHSAWorkItem *wi_threads = NULL; PHSAWorkItem *wi_threads = NULL;
PHSAWorkGroup wg; PHSAWorkGroup wg;
size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z; size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
uint32_t group_x, group_y, group_z;
fiber_barrier_t wg_start_barrier; fiber_barrier_t wg_start_barrier;
fiber_barrier_t wg_completion_barrier; fiber_barrier_t wg_completion_barrier;
fiber_barrier_t wg_sync_barrier; fiber_barrier_t wg_sync_barrier;
...@@ -257,13 +274,13 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr, ...@@ -257,13 +274,13 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
wg.initial_group_offset = group_local_offset; wg.initial_group_offset = group_local_offset;
#ifdef EXECUTE_WGS_BACKWARDS #ifdef EXECUTE_WGS_BACKWARDS
wg.x = context->wg_max_x - 1; group_x = context->wg_max_x - 1;
wg.y = context->wg_max_y - 1; group_y = context->wg_max_y - 1;
wg.z = context->wg_max_z - 1; group_z = context->wg_max_z - 1;
#else #else
wg.x = context->wg_min_x; group_x = context->wg_min_x;
wg.y = context->wg_min_y; group_y = context->wg_min_y;
wg.z = context->wg_min_z; group_z = context->wg_min_z;
#endif #endif
fiber_barrier_init (&wg_sync_barrier, wg_size); fiber_barrier_init (&wg_sync_barrier, wg_size);
...@@ -290,6 +307,19 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr, ...@@ -290,6 +307,19 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
PHSAWorkItem *wi = &wi_threads[flat_wi_id]; PHSAWorkItem *wi = &wi_threads[flat_wi_id];
wi->launch_data = context; wi->launch_data = context;
wi->wg = &wg; wi->wg = &wg;
wg.x = wi->group_x = group_x;
wg.y = wi->group_y = group_y;
wg.z = wi->group_z = group_z;
wi->wg_size_x = context->dp->workgroup_size_x;
wi->wg_size_y = context->dp->workgroup_size_y;
wi->wg_size_z = context->dp->workgroup_size_z;
wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
wi->x = x; wi->x = x;
wi->y = y; wi->y = y;
wi->z = z; wi->z = z;
...@@ -467,9 +497,17 @@ phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr, ...@@ -467,9 +497,17 @@ phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr,
for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y) for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x) for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
{ {
wi.wg->x = wg_x; wi.group_x = wg_x;
wi.wg->y = wg_y; wi.group_y = wg_y;
wi.wg->z = wg_z; wi.group_z = wg_z;
wi.wg_size_x = context->dp->workgroup_size_x;
wi.wg_size_y = context->dp->workgroup_size_y;
wi.wg_size_z = context->dp->workgroup_size_z;
wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi);
wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi);
wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi);
context->kernel (context->kernarg_addr, &wi, group_base_ptr, context->kernel (context->kernarg_addr, &wi, group_base_ptr,
group_local_offset, private_base_ptr); group_local_offset, private_base_ptr);
...@@ -564,15 +602,15 @@ __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context) ...@@ -564,15 +602,15 @@ __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
default: default:
case 0: case 0:
/* Overflow semantics in the case of WG dim > grid dim. */ /* Overflow semantics in the case of WG dim > grid dim. */
id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x) id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
% dp->grid_size_x; % dp->grid_size_x;
break; break;
case 1: case 1:
id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y) id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
% dp->grid_size_y; % dp->grid_size_y;
break; break;
case 2: case 2:
id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z) id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
% dp->grid_size_z; % dp->grid_size_z;
break; break;
} }
...@@ -590,15 +628,15 @@ __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context) ...@@ -590,15 +628,15 @@ __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
default: default:
case 0: case 0:
/* Overflow semantics in the case of WG dim > grid dim. */ /* Overflow semantics in the case of WG dim > grid dim. */
id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x) id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
% dp->grid_size_x; % dp->grid_size_x;
break; break;
case 1: case 1:
id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y) id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
% dp->grid_size_y; % dp->grid_size_y;
break; break;
case 2: case 2:
id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z) id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
% dp->grid_size_z; % dp->grid_size_z;
break; break;
} }
...@@ -738,19 +776,19 @@ __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi) ...@@ -738,19 +776,19 @@ __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
{ {
default: default:
case 0: case 0:
if ((uint64_t) wi->wg->x < dp->grid_size_x / dp->workgroup_size_x) if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x)
wg_size = dp->workgroup_size_x; /* Full WG. */ wg_size = dp->workgroup_size_x; /* Full WG. */
else else
wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */ wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG. */
break; break;
case 1: case 1:
if ((uint64_t) wi->wg->y < dp->grid_size_y / dp->workgroup_size_y) if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y)
wg_size = dp->workgroup_size_y; /* Full WG. */ wg_size = dp->workgroup_size_y; /* Full WG. */
else else
wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */ wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG. */
break; break;
case 2: case 2:
if ((uint64_t) wi->wg->z < dp->grid_size_z / dp->workgroup_size_z) if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z)
wg_size = dp->workgroup_size_z; /* Full WG. */ wg_size = dp->workgroup_size_z; /* Full WG. */
else else
wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */ wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG. */
...@@ -798,11 +836,11 @@ __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi) ...@@ -798,11 +836,11 @@ __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
{ {
default: default:
case 0: case 0:
return wi->wg->x; return wi->group_x;
case 1: case 1:
return wi->wg->y; return wi->group_y;
case 2: case 2:
return wi->wg->z; return wi->group_z;
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment