Commit 9feeafd7 by Alejandro Martinez

[AArch64][SVE] Dot product support

This patch does two things. For the general vectoriser, it adds support to
perform fully masked reductions over expressions that don't support masking.
This is achieved by using VEC_COND_EXPR where possible.  At the moment this is
implemented for DOT_PROD_EXPR only, but the framework is there to extend it to
other expressions.
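As a scalar sketch of why this is sound (my illustration, not part of the
patch; the helper name is made up): a zero element contributes nothing to a
dot-product accumulation, so replacing the inactive lanes of one input with
zero is equivalent to masking the whole multiply-accumulate.

#include <stdint.h>

/* Scalar model of the masking trick: instead of predicating the
   multiply-accumulate itself, replace the inactive lanes of one input
   with zero.  Since 0 * y[i] == 0, the sum is unchanged.  This is what
   the VEC_COND_EXPR does at the vector level.  */
static uint32_t
masked_dotprod (const uint8_t *x, const uint8_t *y, const _Bool *active,
		int n)
{
  uint32_t sum = 0;
  for (int i = 0; i < n; i++)
    {
      uint8_t xi = active[i] ? x[i] : 0;  /* the VEC_COND_EXPR select */
      sum += xi * y[i];                   /* 0 * y[i] adds nothing */
    }
  return sum;
}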

Related to that, this patch adds support for vectorizing the dot product
using SVE.  It also uses the new functionality to ensure that the resulting
loop is fully masked.

Given this input code:

uint32_t
dotprod (uint8_t *restrict x, uint8_t *restrict y, int n)
{
  uint32_t sum = 0;

  for (int i = 0; i < n; i++)
    {
      sum += x[i] * y[i];
    }

  return sum;
}

The resulting SVE code is:

0000000000000000 <dotprod>:
   0:	7100005f 	cmp	w2, #0x0
   4:	5400024d 	b.le	4c <dotprod+0x4c>
   8:	d2800003 	mov	x3, #0x0                   	// #0
   c:	93407c42 	sxtw	x2, w2
  10:	2538c001 	mov	z1.b, #0
  14:	25221fe0 	whilelo	p0.b, xzr, x2
  18:	2538c003 	mov	z3.b, #0
  1c:	d503201f 	nop
  20:	a4034002 	ld1b	{z2.b}, p0/z, [x0, x3]
  24:	a4034020 	ld1b	{z0.b}, p0/z, [x1, x3]
  28:	0430e3e3 	incb	x3
  2c:	0523c000 	sel	z0.b, p0, z0.b, z3.b
  30:	25221c60 	whilelo	p0.b, x3, x2
  34:	44820401 	udot	z1.s, z0.b, z2.b
  38:	54ffff41 	b.ne	20 <dotprod+0x20>  // b.any
  3c:	2598e3e0 	ptrue	p0.s
  40:	04812021 	uaddv	d1, p0, z1.s
  44:	1e260020 	fmov	w0, s1
  48:	d65f03c0 	ret
  4c:	1e2703e1 	fmov	s1, wzr
  50:	1e260020 	fmov	w0, s1
  54:	d65f03c0 	ret

Notice how udot is used inside a fully masked loop: the sel instruction
replaces the inactive lanes of one input with zero (z3 holds all zeroes), so
those lanes contribute nothing to the accumulator.

I tested this patch on an aarch64 machine by bootstrapping the compiler and
running the testsuite.

gcc/ChangeLog:

2019-05-02  Alejandro Martinez  <alejandro.martinezvicente@arm.com>

	* config/aarch64/aarch64-sve.md (<sur>dot_prod<vsi2qi>): Taken from SVE
	ACLE branch.
	* config/aarch64/iterators.md: Copied Vetype_fourth, VSI2QI and vsi2qi from
	SVE ACLE branch.
	* tree-vect-loop.c (use_mask_by_cond_expr_p): New function to check
	whether a VEC_COND_EXPR can be inserted to emulate a conditional
	internal function.
	(build_vect_cond_expr): Emit the VEC_COND_EXPR.
	(vectorizable_reduction): Use the functions above to vectorize, in a
	fully masked loop, codes that don't have a conditional internal
	function.

gcc/testsuite/ChangeLog:

2019-05-02  Alejandro Martinez  <alejandro.martinezvicente@arm.com>

	* gcc.target/aarch64/sve/dot_1.c: New test for dot product.

From-SVN: r270790
gcc/config/aarch64/aarch64-sve.md:

@@ -3132,3 +3132,19 @@
     DONE;
   }
 )
+
+;; Unpredicated DOT product.
+(define_insn "<sur>dot_prod<vsi2qi>"
+  [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w")
+	(plus:SVE_SDI
+	  (unspec:SVE_SDI
+	    [(match_operand:<VSI2QI> 1 "register_operand" "w, w")
+	     (match_operand:<VSI2QI> 2 "register_operand" "w, w")]
+	    DOTPROD)
+	  (match_operand:SVE_SDI 3 "register_operand" "0, w")))]
+  "TARGET_SVE"
+  "@
+   <sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth>
+   movprfx\t%0, %3\;<sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth>"
+  [(set_attr "movprfx" "*,yes")]
+)
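A note on the two alternatives (my reading of the pattern, not spelled out in
the commit message): the SVE dot instructions are destructive, accumulating
into their destination register, so the first alternative ties the accumulator
(operand 3, constraint "0") to operand 0.  When the register allocator can't
arrange that, the second alternative first copies the accumulator into the
destination with movprfx and then issues the dot, which is what the "movprfx"
attribute records.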
gcc/config/aarch64/iterators.md:

@@ -663,6 +663,9 @@
 			 (QI "b") (HI "h")
 			 (SI "s") (DI "d")])
 
+;; Like Vetype, but map to types that are a quarter of the element size.
+(define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")])
+
 ;; Equivalent of "size" for a vector element.

@@ -1029,8 +1032,10 @@
 			(V2SF "p") (V4SF "v")
 			(V4HF "v") (V8HF "v")])
 
-(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
-(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")
+			  (VNx4SI "vnx16qi") (VNx2DI "vnx8hi")])
+(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")
+			  (VNx4SI "VNx16QI") (VNx2DI "VNx8HI")])
 
 ;; Register suffix for DOTPROD input types from the return type.
gcc/testsuite/gcc.target/aarch64/sve/dot_1.c (new file):

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */

#include <stdint.h>

#define DEF_DOT(TYPE1, TYPE2)						\
TYPE1 __attribute__ ((noinline, noclone))				\
dot_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n)	\
{									\
  TYPE1 sum = 0;							\
  for (int i = 0; i < n; i++)						\
    {									\
      sum += x[i] * y[i];						\
    }									\
  return sum;								\
}

DEF_DOT(uint32_t, uint8_t)
DEF_DOT(int32_t, int8_t)
DEF_DOT(int64_t, int16_t)

/* The uint16_t->uint64_t dot product requires a cast to satisfy the C
   language rules.  */
uint64_t __attribute__ ((noinline, noclone))
dot_uint64_t_uint16_t (uint16_t *restrict x, uint16_t *restrict y, int n)
{
  uint64_t sum = 0;
  for (int i = 0; i < n; i++)
    {
      sum += (unsigned int) x[i] * y[i];
    }
  return sum;
}

/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
/* { dg-final { scan-assembler-times {\twhilelo\t} 8 } } */
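For anyone trying this outside the testsuite: the dg-options above don't
enable SVE by themselves; as far as I know, the aarch64/sve test harness
supplies the appropriate SVE -march flags when the toolchain doesn't default
to them.  The whilelo count of 8 corresponds to two per function, the initial
predicate set-up plus the one inside the loop, as in the disassembly above.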
gcc/tree-vect-loop.c:

@@ -5958,6 +5958,55 @@ is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
 	  <= TYPE_PRECISION (lhs_type));
 }
 
+/* Check if masking can be supported by inserting a conditional expression.
+   CODE is the code for the operation.  COND_FN is the conditional internal
+   function, if it exists.  VECTYPE_IN is the type of the vector input.  */
+static bool
+use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
+			 tree vectype_in)
+{
+  if (cond_fn != IFN_LAST
+      && direct_internal_fn_supported_p (cond_fn, vectype_in,
+					 OPTIMIZE_FOR_SPEED))
+    return false;
+
+  switch (code)
+    {
+    case DOT_PROD_EXPR:
+      return true;
+
+    default:
+      return false;
+    }
+}
+
+/* Insert a conditional expression to enable masked vectorization.  CODE is
+   the code for the operation.  VOP is the array of operands.  MASK is the
+   loop mask.  GSI is a statement iterator used to place the new conditional
+   expression.  */
+static void
+build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
+		      gimple_stmt_iterator *gsi)
+{
+  switch (code)
+    {
+    case DOT_PROD_EXPR:
+      {
+	tree vectype = TREE_TYPE (vop[1]);
+	tree zero = build_zero_cst (vectype);
+	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
+	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
+					       mask, vop[1], zero);
+	gsi_insert_before (gsi, select, GSI_SAME_STMT);
+	vop[1] = masked_op1;
+	break;
+      }
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
 /* Function vectorizable_reduction.
 
    Check if STMT_INFO performs a reduction operation that can be vectorized.
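To make the transformation concrete, the statements that build_vect_cond_expr
inserts ahead of the reduction should look roughly like this in the vectorized
loop body (SSA names are illustrative, following the "masked_op1" stem used
above):

  masked_op1_10 = VEC_COND_EXPR <loop_mask_8, vect_y_6, { 0, ... }>;
  vect_sum_12 = DOT_PROD_EXPR <vect_x_4, masked_op1_10, vect_sum_9>;

Only one multiplication input is selected against zero; the DOT_PROD_EXPR
itself stays unconditional, which is what lets the unpredicated dot pattern
above match.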
@@ -6931,6 +6980,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 
   internal_fn cond_fn = get_conditional_internal_fn (code);
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
 
   if (!vec_stmt) /* transformation not required.  */
     {

@@ -6938,6 +6988,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
 	{
 	  if (reduction_type != FOLD_LEFT_REDUCTION
+	      && !mask_by_cond_expr
 	      && (cond_fn == IFN_LAST
 		  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
 						      OPTIMIZE_FOR_SPEED)))

@@ -7101,7 +7152,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
     {
       tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-      if (masked_loop_p)
+      if (masked_loop_p && !mask_by_cond_expr)
 	{
 	  /* Make sure that the reduction accumulator is vop[0].  */
 	  if (reduc_index == 1)

@@ -7125,6 +7176,14 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	  if (op_type == ternary_op)
 	    vop[2] = vec_oprnds2[i];
 
+	  if (masked_loop_p && mask_by_cond_expr)
+	    {
+	      tree mask = vect_get_loop_mask (gsi, masks,
+					      vec_num * ncopies,
+					      vectype_in, i * ncopies + j);
+	      build_vect_cond_expr (code, vop, mask, gsi);
+	    }
+
 	  gassign *new_stmt = gimple_build_assign (vec_dest, code,
 						   vop[0], vop[1], vop[2]);
 	  new_temp = make_ssa_name (vec_dest, new_stmt);