Commit bce29d65 by Alejandro Martinez Committed by Alejandro Martinez

[Vectorizer] Support masking fold left reductions

This patch adds support in the vectorizer for masking fold left reductions.
This avoids the need to insert a conditional assignment with some identity
value.

From-SVN: r272407
parent 9553f0d2
2019-06-18 Alejandro Martinez <alejandro.martinezvicente@arm.com>
* config/aarch64/aarch64-sve.md (mask_fold_left_plus_<mode>): Renamed
from "*fold_left_plus_<mode>", updated operands order.
* doc/md.texi (mask_fold_left_plus_@var{m}): Documented new optab.
* internal-fn.c (mask_fold_left_direct): New define.
(expand_mask_fold_left_optab_fn): Likewise.
(direct_mask_fold_left_optab_supported_p): Likewise.
* internal-fn.def (MASK_FOLD_LEFT_PLUS): New internal function.
* optabs.def (mask_fold_left_plus_optab): New optab.
	* tree-vect-loop.c (get_masked_reduction_fn): New function to get a
	masked internal_fn for a reduction ifn.
(vectorize_fold_left_reduction): Add support for masking reductions.
2019-06-18 Kewen Lin <linkw@gcc.gnu.org> 2019-06-18 Kewen Lin <linkw@gcc.gnu.org>
PR middle-end/80791 PR middle-end/80791
......
...@@ -2180,14 +2180,14 @@ ...@@ -2180,14 +2180,14 @@
) )
;; In-order FP reductions predicated with PTRUE. ;; In-order FP reductions predicated with PTRUE.
(define_insn "*fold_left_plus_<mode>" (define_insn "mask_fold_left_plus_<mode>"
[(set (match_operand:<VEL> 0 "register_operand" "=w") [(set (match_operand:<VEL> 0 "register_operand" "=w")
(unspec:<VEL> [(match_operand:<VPRED> 1 "register_operand" "Upl") (unspec:<VEL> [(match_operand:<VPRED> 3 "register_operand" "Upl")
(match_operand:<VEL> 2 "register_operand" "0") (match_operand:<VEL> 1 "register_operand" "0")
(match_operand:SVE_F 3 "register_operand" "w")] (match_operand:SVE_F 2 "register_operand" "w")]
UNSPEC_FADDA))] UNSPEC_FADDA))]
"TARGET_SVE" "TARGET_SVE"
"fadda\t%<Vetype>0, %1, %<Vetype>0, %3.<Vetype>" "fadda\t%<Vetype>0, %3, %<Vetype>0, %2.<Vetype>"
) )
;; Predicated form of the above in-order reduction. ;; Predicated form of the above in-order reduction.
......
...@@ -5417,6 +5417,11 @@ mode @var{m} and the scalars have the mode appropriate for one ...@@ -5417,6 +5417,11 @@ mode @var{m} and the scalars have the mode appropriate for one
element of @var{m}. The operation is strictly in-order: there is element of @var{m}. The operation is strictly in-order: there is
no reassociation. no reassociation.
@cindex @code{mask_fold_left_plus_@var{m}} instruction pattern
@item @code{mask_fold_left_plus_@var{m}}
Like @samp{fold_left_plus_@var{m}}, but takes an additional mask operand
(operand 3) that specifies which elements of the source vector should be added.
@cindex @code{sdot_prod@var{m}} instruction pattern @cindex @code{sdot_prod@var{m}} instruction pattern
@item @samp{sdot_prod@var{m}} @item @samp{sdot_prod@var{m}}
@cindex @code{udot_prod@var{m}} instruction pattern @cindex @code{udot_prod@var{m}} instruction pattern
......
...@@ -117,6 +117,7 @@ init_internal_fns () ...@@ -117,6 +117,7 @@ init_internal_fns ()
#define while_direct { 0, 2, false } #define while_direct { 0, 2, false }
#define fold_extract_direct { 2, 2, false } #define fold_extract_direct { 2, 2, false }
#define fold_left_direct { 1, 1, false } #define fold_left_direct { 1, 1, false }
#define mask_fold_left_direct { 1, 1, false }
const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = { const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = {
#define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) not_direct, #define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) not_direct,
...@@ -3000,6 +3001,9 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab) ...@@ -3000,6 +3001,9 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
#define expand_fold_left_optab_fn(FN, STMT, OPTAB) \ #define expand_fold_left_optab_fn(FN, STMT, OPTAB) \
expand_direct_optab_fn (FN, STMT, OPTAB, 2) expand_direct_optab_fn (FN, STMT, OPTAB, 2)
#define expand_mask_fold_left_optab_fn(FN, STMT, OPTAB) \
expand_direct_optab_fn (FN, STMT, OPTAB, 3)
/* RETURN_TYPE and ARGS are a return type and argument list that are /* RETURN_TYPE and ARGS are a return type and argument list that are
in principle compatible with FN (which satisfies direct_internal_fn_p). in principle compatible with FN (which satisfies direct_internal_fn_p).
Return the types that should be used to determine whether the Return the types that should be used to determine whether the
...@@ -3088,6 +3092,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, ...@@ -3088,6 +3092,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
#define direct_while_optab_supported_p convert_optab_supported_p #define direct_while_optab_supported_p convert_optab_supported_p
#define direct_fold_extract_optab_supported_p direct_optab_supported_p #define direct_fold_extract_optab_supported_p direct_optab_supported_p
#define direct_fold_left_optab_supported_p direct_optab_supported_p #define direct_fold_left_optab_supported_p direct_optab_supported_p
#define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
/* Return the optab used by internal function FN. */ /* Return the optab used by internal function FN. */
......
...@@ -199,6 +199,9 @@ DEF_INTERNAL_OPTAB_FN (FOLD_EXTRACT_LAST, ECF_CONST | ECF_NOTHROW, ...@@ -199,6 +199,9 @@ DEF_INTERNAL_OPTAB_FN (FOLD_EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
DEF_INTERNAL_OPTAB_FN (FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW, DEF_INTERNAL_OPTAB_FN (FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW,
fold_left_plus, fold_left) fold_left_plus, fold_left)
DEF_INTERNAL_OPTAB_FN (MASK_FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW,
mask_fold_left_plus, mask_fold_left)
/* Unary math functions. */ /* Unary math functions. */
DEF_INTERNAL_FLT_FN (ACOS, ECF_CONST, acos, unary) DEF_INTERNAL_FLT_FN (ACOS, ECF_CONST, acos, unary)
DEF_INTERNAL_FLT_FN (ACOSH, ECF_CONST, acosh, unary) DEF_INTERNAL_FLT_FN (ACOSH, ECF_CONST, acosh, unary)
......
...@@ -323,6 +323,7 @@ OPTAB_D (reduc_and_scal_optab, "reduc_and_scal_$a") ...@@ -323,6 +323,7 @@ OPTAB_D (reduc_and_scal_optab, "reduc_and_scal_$a")
OPTAB_D (reduc_ior_scal_optab, "reduc_ior_scal_$a") OPTAB_D (reduc_ior_scal_optab, "reduc_ior_scal_$a")
OPTAB_D (reduc_xor_scal_optab, "reduc_xor_scal_$a") OPTAB_D (reduc_xor_scal_optab, "reduc_xor_scal_$a")
OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a") OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a")
OPTAB_D (mask_fold_left_plus_optab, "mask_fold_left_plus_$a")
OPTAB_D (extract_last_optab, "extract_last_$a") OPTAB_D (extract_last_optab, "extract_last_$a")
OPTAB_D (fold_extract_last_optab, "fold_extract_last_$a") OPTAB_D (fold_extract_last_optab, "fold_extract_last_$a")
......
2019-06-18 Alejandro Martinez <alejandro.martinezvicente@arm.com>
* gcc.target/aarch64/sve/fadda_1.c: New test.
2019-06-17 Jakub Jelinek <jakub@redhat.com> 2019-06-17 Jakub Jelinek <jakub@redhat.com>
* gcc.dg/vect/vect-simd-8.c: New test. * gcc.dg/vect/vect-simd-8.c: New test.
......
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
/* Define fold_<TYPE>: a strict in-order (fold-left) sum of COUNT elements
   of SRC.  Because the accumulation is sequential, the vectorizer may not
   reassociate it; with masked fold-left reductions it should emit a single
   predicated FADDA per type and no "sel" (see dg-final checks below).  */
#define DO_OPS(TYPE) \
TYPE fold_##TYPE (TYPE *src, int count) \
{ \
TYPE res = 0; \
for (int i = 0; i < count; ++i) \
res += src[i]; \
return res; \
}
/* Instantiate one reduction per SVE floating-point element type.  */
DO_OPS (_Float16)
DO_OPS (float)
DO_OPS (double)
/* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d\n} 1 } } */
/* { dg-final { scan-assembler-not "sel" } } */
...@@ -5916,6 +5916,30 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, ...@@ -5916,6 +5916,30 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
return lhs; return lhs;
} }
/* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
   type of the vector input.  Return IFN_LAST if REDUC_FN has no masked
   counterpart, or if the target cannot expand that counterpart directly
   for VECTYPE_IN.  */

static internal_fn
get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
{
  internal_fn mask_reduc_fn;

  /* Only in-order plus reductions have a masked variant so far.  */
  if (reduc_fn == IFN_FOLD_LEFT_PLUS)
    mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
  else
    return IFN_LAST;

  /* The mapping is only useful if the target provides the masked optab
     for this vector mode.  */
  return (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
					  OPTIMIZE_FOR_SPEED)
	  ? mask_reduc_fn : IFN_LAST);
}
/* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
statement that sets the live-out value. REDUC_DEF_STMT is the phi statement that sets the live-out value. REDUC_DEF_STMT is the phi
statement. CODE is the operation performed by STMT_INFO and OPS are statement. CODE is the operation performed by STMT_INFO and OPS are
...@@ -5938,6 +5962,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info, ...@@ -5938,6 +5962,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
stmt_vec_info new_stmt_info = NULL; stmt_vec_info new_stmt_info = NULL;
internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
int ncopies; int ncopies;
if (slp_node) if (slp_node)
...@@ -6014,16 +6039,21 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info, ...@@ -6014,16 +6039,21 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
def0 = negated; def0 = negated;
} }
if (mask) if (mask && mask_reduc_fn == IFN_LAST)
def0 = merge_with_identity (gsi, mask, vectype_out, def0, def0 = merge_with_identity (gsi, mask, vectype_out, def0,
vector_identity); vector_identity);
/* On the first iteration the input is simply the scalar phi /* On the first iteration the input is simply the scalar phi
result, and for subsequent iterations it is the output of result, and for subsequent iterations it is the output of
the preceding operation. */ the preceding operation. */
if (reduc_fn != IFN_LAST) if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
{ {
new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0); if (mask && mask_reduc_fn != IFN_LAST)
new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
def0, mask);
else
new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
def0);
/* For chained SLP reductions the output of the previous reduction /* For chained SLP reductions the output of the previous reduction
operation serves as the input of the next. For the final statement operation serves as the input of the next. For the final statement
the output cannot be a temporary - we reuse the original the output cannot be a temporary - we reuse the original
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment