Commit bce29d65 by Alejandro Martinez

[Vectorizer] Support masking fold left reductions

This patch adds support in the vectorizer for masking fold left reductions.
This avoids the need to insert a conditional assignment with some identity
value.

From-SVN: r272407
parent 9553f0d2
gcc/ChangeLog:

2019-06-18  Alejandro Martinez  <alejandro.martinezvicente@arm.com>

	* config/aarch64/aarch64-sve.md (mask_fold_left_plus_<mode>): Renamed
	from "*fold_left_plus_<mode>", updated operands order.
	* doc/md.texi (mask_fold_left_plus_@var{m}): Documented new optab.
	* internal-fn.c (mask_fold_left_direct): New define.
	(expand_mask_fold_left_optab_fn): Likewise.
	(direct_mask_fold_left_optab_supported_p): Likewise.
	* internal-fn.def (MASK_FOLD_LEFT_PLUS): New internal function.
	* optabs.def (mask_fold_left_plus_optab): New optab.
	* tree-vect-loop.c (get_masked_reduction_fn): New function to get a
	masked internal_fn for a reduction ifn.
	(vectorize_fold_left_reduction): Add support for masking reductions.
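To make the "conditional assignment with some identity value" point concrete: for a fully-masked loop, the old and new lowering strategies behave like the two scalar models below. This is an illustrative sketch only; the function names, the use of double, and the explicit active[] mask array are inventions for the example, not GCC internals.

/* Old strategy: blend inactive lanes with the additive identity (0.0),
   then fold unconditionally.  Hypothetical scalar model of one vector
   iteration; VF and active[] are illustrative.  */
static double
fold_step_with_identity (double acc, const double *lane,
                         const _Bool *active, int vf)
{
  for (int i = 0; i < vf; ++i)
    {
      double elt = active[i] ? lane[i] : 0.0;  /* the conditional assignment */
      acc += elt;                              /* strictly in-order add */
    }
  return acc;
}

/* New strategy: the mask is an operand of the reduction itself, so
   inactive lanes are simply skipped (what a predicated FADDA does).  */
static double
mask_fold_step (double acc, const double *lane,
                const _Bool *active, int vf)
{
  for (int i = 0; i < vf; ++i)
    if (active[i])
      acc += lane[i];
  return acc;
}

The first form costs an extra select (a SEL on AArch64 SVE) per vector iteration; the second maps straight onto the predicated FADDA pattern in the diff below.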
gcc/config/aarch64/aarch64-sve.md
@@ -2180,14 +2180,14 @@
 )
 
 ;; In-order FP reductions predicated with PTRUE.
-(define_insn "*fold_left_plus_<mode>"
+(define_insn "mask_fold_left_plus_<mode>"
   [(set (match_operand:<VEL> 0 "register_operand" "=w")
-	(unspec:<VEL> [(match_operand:<VPRED> 1 "register_operand" "Upl")
-		       (match_operand:<VEL> 2 "register_operand" "0")
-		       (match_operand:SVE_F 3 "register_operand" "w")]
+	(unspec:<VEL> [(match_operand:<VPRED> 3 "register_operand" "Upl")
+		       (match_operand:<VEL> 1 "register_operand" "0")
+		       (match_operand:SVE_F 2 "register_operand" "w")]
 		      UNSPEC_FADDA))]
   "TARGET_SVE"
-  "fadda\t%<Vetype>0, %1, %<Vetype>0, %3.<Vetype>"
+  "fadda\t%<Vetype>0, %3, %<Vetype>0, %2.<Vetype>"
 )
 
 ;; Predicated form of the above in-order reduction.
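The renamed pattern still emits FADDA; the change is that the governing predicate is now operand 3, supplied by the caller, rather than a PTRUE chosen by the expander. For a feel of the instruction's semantics, here is a rough sketch in terms of the Arm C Language Extensions (arm_sve.h). It illustrates FADDA itself, with made-up function and variable names, and is not what the vectorizer generates.

#include <arm_sve.h>
#include <stdint.h>

/* Strictly in-order sum of n doubles using a predicated FADDA.
   Assumes an SVE target, e.g. -march=armv8.2-a+sve.  */
double
inorder_sum (const double *src, int64_t n)
{
  double acc = 0.0;
  for (int64_t i = 0; i < n; i += svcntd ())
    {
      svbool_t pg = svwhilelt_b64_s64 (i, n);   /* mask for the active lanes */
      svfloat64_t v = svld1_f64 (pg, src + i);  /* load the active lanes */
      acc = svadda_f64 (pg, acc, v);            /* in-order add of active lanes */
    }
  return acc;
}

Here pg plays the role of operand 3 of mask_fold_left_plus_<mode>, and of the mask operand documented for the new optab below.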
gcc/doc/md.texi
@@ -5417,6 +5417,11 @@ mode @var{m} and the scalars have the mode appropriate for one
 element of @var{m}.  The operation is strictly in-order: there is
 no reassociation.
 
+@cindex @code{mask_fold_left_plus_@var{m}} instruction pattern
+@item @code{mask_fold_left_plus_@var{m}}
+Like @samp{fold_left_plus_@var{m}}, but takes an additional mask operand
+(operand 3) that specifies which elements of the source vector should be added.
+
 @cindex @code{sdot_prod@var{m}} instruction pattern
 @item @samp{sdot_prod@var{m}}
 @cindex @code{udot_prod@var{m}} instruction pattern
gcc/internal-fn.c
@@ -117,6 +117,7 @@ init_internal_fns ()
 #define while_direct { 0, 2, false }
 #define fold_extract_direct { 2, 2, false }
 #define fold_left_direct { 1, 1, false }
+#define mask_fold_left_direct { 1, 1, false }
 
 const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = {
 #define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) not_direct,
@@ -3000,6 +3001,9 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 #define expand_fold_left_optab_fn(FN, STMT, OPTAB) \
   expand_direct_optab_fn (FN, STMT, OPTAB, 2)
 
+#define expand_mask_fold_left_optab_fn(FN, STMT, OPTAB) \
+  expand_direct_optab_fn (FN, STMT, OPTAB, 3)
+
 /* RETURN_TYPE and ARGS are a return type and argument list that are
    in principle compatible with FN (which satisfies direct_internal_fn_p).
    Return the types that should be used to determine whether the
@@ -3088,6 +3092,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_while_optab_supported_p convert_optab_supported_p
 #define direct_fold_extract_optab_supported_p direct_optab_supported_p
 #define direct_fold_left_optab_supported_p direct_optab_supported_p
+#define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
 
 /* Return the optab used by internal function FN.  */
gcc/internal-fn.def
@@ -199,6 +199,9 @@ DEF_INTERNAL_OPTAB_FN (FOLD_EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
 DEF_INTERNAL_OPTAB_FN (FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW,
 		       fold_left_plus, fold_left)
 
+DEF_INTERNAL_OPTAB_FN (MASK_FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW,
+		       mask_fold_left_plus, mask_fold_left)
+
 /* Unary math functions.  */
 DEF_INTERNAL_FLT_FN (ACOS, ECF_CONST, acos, unary)
 DEF_INTERNAL_FLT_FN (ACOSH, ECF_CONST, acosh, unary)
gcc/optabs.def
@@ -323,6 +323,7 @@ OPTAB_D (reduc_and_scal_optab, "reduc_and_scal_$a")
 OPTAB_D (reduc_ior_scal_optab, "reduc_ior_scal_$a")
 OPTAB_D (reduc_xor_scal_optab, "reduc_xor_scal_$a")
 OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a")
+OPTAB_D (mask_fold_left_plus_optab, "mask_fold_left_plus_$a")
 OPTAB_D (extract_last_optab, "extract_last_$a")
 OPTAB_D (fold_extract_last_optab, "fold_extract_last_$a")
gcc/testsuite/ChangeLog:

2019-06-18  Alejandro Martinez  <alejandro.martinezvicente@arm.com>

	* gcc.target/aarch64/sve/fadda_1.c: New test.
gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c (new file):

/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
#define DO_OPS(TYPE) \
TYPE fold_##TYPE (TYPE *src, int count) \
{ \
TYPE res = 0; \
for (int i = 0; i < count; ++i) \
res += src[i]; \
return res; \
}
DO_OPS (_Float16)
DO_OPS (float)
DO_OPS (double)
/* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d\n} 1 } } */
/* { dg-final { scan-assembler-not "sel" } } */
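For the double case, the vectorized loop would be expected to have roughly the following shape. This is a hand-written sketch of what the scan-assembler patterns are looking for, not actual compiler output; register choices, addressing and scheduling will differ.

/* Illustrative loop shape only:

	whilelo	p0.d, x2, x1
   .L3:	ld1d	z0.d, p0/z, [x0, x2, lsl 3]
	fadda	d1, p0, d1, z0.d
	incd	x2
	whilelo	p0.d, x2, x1
	b.any	.L3

   The loop mask feeds FADDA directly, so no SEL is needed to blend
   inactive lanes with 0.0, which is what the final scan-assembler-not
   check verifies.  */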
gcc/tree-vect-loop.c
@@ -5916,6 +5916,30 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
   return lhs;
 }
 
+/* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
+   type of the vector input.  */
+static internal_fn
+get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
+{
+  internal_fn mask_reduc_fn;
+
+  switch (reduc_fn)
+    {
+    case IFN_FOLD_LEFT_PLUS:
+      mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
+      break;
+
+    default:
+      return IFN_LAST;
+    }
+
+  if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
+				      OPTIMIZE_FOR_SPEED))
+    return mask_reduc_fn;
+  return IFN_LAST;
+}
+
 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
    statement.  CODE is the operation performed by STMT_INFO and OPS are
@@ -5938,6 +5962,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
   stmt_vec_info new_stmt_info = NULL;
+  internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
   int ncopies;
 
   if (slp_node)
@@ -6014,16 +6039,21 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
 	  def0 = negated;
 	}
 
-      if (mask)
+      if (mask && mask_reduc_fn == IFN_LAST)
 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
 				    vector_identity);
 
      /* On the first iteration the input is simply the scalar phi
	 result, and for subsequent iterations it is the output of
	 the preceding operation.  */
-      if (reduc_fn != IFN_LAST)
+      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
	{
-	  new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
+	  if (mask && mask_reduc_fn != IFN_LAST)
+	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
+						   def0, mask);
+	  else
+	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
+						   def0);
	  /* For chained SLP reductions the output of the previous reduction
	     operation serves as the input of the next.  For the final statement
	     the output cannot be a temporary - we reuse the original
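Taken together, for a fully-masked loop the reduction statement now becomes a single three-argument internal call instead of a blend followed by a two-argument call. A hedged sketch of the resulting GIMPLE follows, shown as a comment because the SSA names and the ancillary MASK_LOAD arguments are made up for illustration.

/* Illustrative GIMPLE, not an actual dump:

     loop_mask_20 = .WHILE_ULT (i_18, n_13, { 0, ... });
     vect__4.9_21 = .MASK_LOAD (ptr_19, 8B, loop_mask_20);
     res_22 = .MASK_FOLD_LEFT_PLUS (res_17, vect__4.9_21, loop_mask_20);

   The argument order (accumulator, vector, mask) matches the
   gimple_build_call_internal (mask_reduc_fn, 3, reduc_var, def0, mask)
   call above.  Without a masked optab the same loop would need a
   VEC_COND_EXPR to merge inactive lanes with 0.0 before a two-argument
   .FOLD_LEFT_PLUS.  */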