Commit 58cc9876 by Yuliang Wang Committed by Richard Sandiford

Vectorise multiply high with scaling operations (PR 89386)

2019-09-12  Yuliang Wang  <yuliang.wang@arm.com>

gcc/
	PR tree-optimization/89386
	* config/aarch64/aarch64-sve2.md (<su>mull<bt><Vwide>)
	(<r>shrnb<mode>, <r>shrnt<mode>): New SVE2 patterns.
	(<su>mulh<r>s<mode>3): New pattern for MULHRS.
	* config/aarch64/iterators.md (UNSPEC_SMULLB, UNSPEC_SMULLT)
	(UNSPEC_UMULLB, UNSPEC_UMULLT, UNSPEC_SHRNB, UNSPEC_SHRNT)
	(UNSPEC_RSHRNB, UNSPEC_RSHRNT, UNSPEC_SMULHS, UNSPEC_SMULHRS)
	UNSPEC_UMULHS, UNSPEC_UMULHRS): New unspecs.
	(MULLBT, SHRNB, SHRNT, MULHRS): New int iterators.
	(su, r): Handle the unspecs above.
	(bt): New int attribute.
	* internal-fn.def (IFN_MULHS, IFN_MULHRS): New internal functions.
	* internal-fn.c (first_commutative_argument): Commutativity info for
	above.
	* optabs.def (smulhs_optab, smulhrs_optab, umulhs_optab)
	(umulhrs_optab): New optabs.
	* doc/md.texi (smulhs@var{m3}, umulhs@var{m3})
	(smulhrs@var{m3}, umulhrs@var{m3}): Documentation for the above.
	* tree-vect-patterns.c (vect_recog_mulhs_pattern): New pattern
	function.
	(vect_vect_recog_func_ptrs): Add it.
	* testsuite/gcc.target/aarch64/sve2/mulhrs_1.c: New test.
	* testsuite/gcc.dg/vect/vect-mulhrs-1.c: As above.
	* testsuite/gcc.dg/vect/vect-mulhrs-2.c: As above.
	* testsuite/gcc.dg/vect/vect-mulhrs-3.c: As above.
	* testsuite/gcc.dg/vect/vect-mulhrs-4.c: As above.
	* doc/sourcebuild.texi (vect_mulhrs_hi): Document new target selector.
	* testsuite/lib/target-supports.exp
	(check_effective_target_vect_mulhrs_hi): Return true for AArch64
	with SVE2.

From-SVN: r275682
parent 8c58d9d8
2019-09-12 Yuliang Wang <yuliang.wang@arm.com>
PR tree-optimization/89386
* config/aarch64/aarch64-sve2.md (<su>mull<bt><Vwide>)
(<r>shrnb<mode>, <r>shrnt<mode>): New SVE2 patterns.
(<su>mulh<r>s<mode>3): New pattern for MULHRS.
* config/aarch64/iterators.md (UNSPEC_SMULLB, UNSPEC_SMULLT)
(UNSPEC_UMULLB, UNSPEC_UMULLT, UNSPEC_SHRNB, UNSPEC_SHRNT)
(UNSPEC_RSHRNB, UNSPEC_RSHRNT, UNSPEC_SMULHS, UNSPEC_SMULHRS)
UNSPEC_UMULHS, UNSPEC_UMULHRS): New unspecs.
(MULLBT, SHRNB, SHRNT, MULHRS): New int iterators.
(su, r): Handle the unspecs above.
(bt): New int attribute.
* internal-fn.def (IFN_MULHS, IFN_MULHRS): New internal functions.
* internal-fn.c (first_commutative_argument): Commutativity info for
above.
* optabs.def (smulhs_optab, smulhrs_optab, umulhs_optab)
(umulhrs_optab): New optabs.
* doc/md.texi (smulhs@var{m3}, umulhs@var{m3})
(smulhrs@var{m3}, umulhrs@var{m3}): Documentation for the above.
* tree-vect-patterns.c (vect_recog_mulhs_pattern): New pattern
function.
(vect_vect_recog_func_ptrs): Add it.
* testsuite/gcc.target/aarch64/sve2/mulhrs_1.c: New test.
* testsuite/gcc.dg/vect/vect-mulhrs-1.c: As above.
* testsuite/gcc.dg/vect/vect-mulhrs-2.c: As above.
* testsuite/gcc.dg/vect/vect-mulhrs-3.c: As above.
* testsuite/gcc.dg/vect/vect-mulhrs-4.c: As above.
* doc/sourcebuild.texi (vect_mulhrs_hi): Document new target selector.
* testsuite/lib/target-supports.exp
(check_effective_target_vect_mulhrs_hi): Return true for AArch64
with SVE2.
2019-09-11 Michael Meissner <meissner@linux.ibm.com>
* config/rs6000/predicates.md (non_add_cint_operand): Simplify the
......
...@@ -63,3 +63,63 @@ ...@@ -63,3 +63,63 @@
movprfx\t%0, %2\;<sur>h<addsub>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>" movprfx\t%0, %2\;<sur>h<addsub>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
[(set_attr "movprfx" "*,yes")] [(set_attr "movprfx" "*,yes")]
) )
;; Multiply long top / bottom.
(define_insn "<su>mull<bt><Vwide>"
[(set (match_operand:<VWIDE> 0 "register_operand" "=w")
(unspec:<VWIDE> [(match_operand:SVE_BHSI 1 "register_operand" "w")
(match_operand:SVE_BHSI 2 "register_operand" "w")]
MULLBT))]
"TARGET_SVE2"
"<su>mull<bt>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
)
;; (Rounding) Right shift narrow bottom.
(define_insn "<r>shrnb<mode>"
[(set (match_operand:SVE_BHSI 0 "register_operand" "=w")
(unspec:SVE_BHSI
[(match_operand:<VWIDE> 1 "register_operand" "w")
(match_operand 2 "aarch64_simd_shift_imm_offset_<Vel>" "")]
SHRNB))]
"TARGET_SVE2"
"<r>shrnb\t%0.<Vetype>, %1.<Vewtype>, #%2"
)
;; (Rounding) Right shift narrow top.
(define_insn "<r>shrnt<mode>"
[(set (match_operand:SVE_BHSI 0 "register_operand" "=w")
(unspec:SVE_BHSI
[(match_operand:SVE_BHSI 1 "register_operand" "0")
(match_operand:<VWIDE> 2 "register_operand" "w")
(match_operand 3 "aarch64_simd_shift_imm_offset_<Vel>" "i")]
SHRNT))]
"TARGET_SVE2"
"<r>shrnt\t%0.<Vetype>, %2.<Vewtype>, #%3"
)
;; Unpredicated integer multiply-high-with-(round-and-)scale.
(define_expand "<su>mulh<r>s<mode>3"
[(set (match_operand:SVE_BHSI 0 "register_operand")
(unspec:SVE_BHSI
[(match_dup 3)
(unspec:SVE_BHSI [(match_operand:SVE_BHSI 1 "register_operand")
(match_operand:SVE_BHSI 2 "register_operand")]
MULHRS)]
UNSPEC_PRED_X))]
"TARGET_SVE2"
{
operands[3] = aarch64_ptrue_reg (<VPRED>mode);
rtx prod_b = gen_reg_rtx (<VWIDE>mode);
rtx prod_t = gen_reg_rtx (<VWIDE>mode);
emit_insn (gen_<su>mullb<Vwide> (prod_b, operands[1], operands[2]));
emit_insn (gen_<su>mullt<Vwide> (prod_t, operands[1], operands[2]));
rtx shift = GEN_INT (GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1);
emit_insn (gen_<r>shrnb<mode> (operands[0], prod_b, shift));
emit_insn (gen_<r>shrnt<mode> (operands[0], operands[0], prod_t, shift));
DONE;
}
)
...@@ -378,6 +378,10 @@ ...@@ -378,6 +378,10 @@
UNSPEC_RSUBHN2 ; Used in aarch64-simd.md. UNSPEC_RSUBHN2 ; Used in aarch64-simd.md.
UNSPEC_SQDMULH ; Used in aarch64-simd.md. UNSPEC_SQDMULH ; Used in aarch64-simd.md.
UNSPEC_SQRDMULH ; Used in aarch64-simd.md. UNSPEC_SQRDMULH ; Used in aarch64-simd.md.
UNSPEC_SMULLB ; Used in aarch64-sve2.md.
UNSPEC_SMULLT ; Used in aarch64-sve2.md.
UNSPEC_UMULLB ; Used in aarch64-sve2.md.
UNSPEC_UMULLT ; Used in aarch64-sve2.md.
UNSPEC_PMUL ; Used in aarch64-simd.md. UNSPEC_PMUL ; Used in aarch64-simd.md.
UNSPEC_FMULX ; Used in aarch64-simd.md. UNSPEC_FMULX ; Used in aarch64-simd.md.
UNSPEC_USQADD ; Used in aarch64-simd.md. UNSPEC_USQADD ; Used in aarch64-simd.md.
...@@ -400,6 +404,10 @@ ...@@ -400,6 +404,10 @@
UNSPEC_UQSHRN ; Used in aarch64-simd.md. UNSPEC_UQSHRN ; Used in aarch64-simd.md.
UNSPEC_SQRSHRN ; Used in aarch64-simd.md. UNSPEC_SQRSHRN ; Used in aarch64-simd.md.
UNSPEC_UQRSHRN ; Used in aarch64-simd.md. UNSPEC_UQRSHRN ; Used in aarch64-simd.md.
UNSPEC_SHRNB ; Used in aarch64-sve2.md.
UNSPEC_SHRNT ; Used in aarch64-sve2.md.
UNSPEC_RSHRNB ; Used in aarch64-sve2.md.
UNSPEC_RSHRNT ; Used in aarch64-sve2.md.
UNSPEC_SSHL ; Used in aarch64-simd.md. UNSPEC_SSHL ; Used in aarch64-simd.md.
UNSPEC_USHL ; Used in aarch64-simd.md. UNSPEC_USHL ; Used in aarch64-simd.md.
UNSPEC_SRSHL ; Used in aarch64-simd.md. UNSPEC_SRSHL ; Used in aarch64-simd.md.
...@@ -523,6 +531,10 @@ ...@@ -523,6 +531,10 @@
UNSPEC_FCMLA90 ; Used in aarch64-simd.md. UNSPEC_FCMLA90 ; Used in aarch64-simd.md.
UNSPEC_FCMLA180 ; Used in aarch64-simd.md. UNSPEC_FCMLA180 ; Used in aarch64-simd.md.
UNSPEC_FCMLA270 ; Used in aarch64-simd.md. UNSPEC_FCMLA270 ; Used in aarch64-simd.md.
UNSPEC_SMULHS ; Used in aarch64-sve2.md.
UNSPEC_SMULHRS ; Used in aarch64-sve2.md.
UNSPEC_UMULHS ; Used in aarch64-sve2.md.
UNSPEC_UMULHRS ; Used in aarch64-sve2.md.
]) ])
;; ------------------------------------------------------------------ ;; ------------------------------------------------------------------
...@@ -1588,6 +1600,13 @@ ...@@ -1588,6 +1600,13 @@
(define_int_iterator RHADD [UNSPEC_SRHADD UNSPEC_URHADD]) (define_int_iterator RHADD [UNSPEC_SRHADD UNSPEC_URHADD])
(define_int_iterator MULLBT [UNSPEC_SMULLB UNSPEC_UMULLB
UNSPEC_SMULLT UNSPEC_UMULLT])
(define_int_iterator SHRNB [UNSPEC_SHRNB UNSPEC_RSHRNB])
(define_int_iterator SHRNT [UNSPEC_SHRNT UNSPEC_RSHRNT])
(define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT]) (define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT])
(define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN (define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN
...@@ -1607,6 +1626,9 @@ ...@@ -1607,6 +1626,9 @@
(define_int_iterator VQDMULH [UNSPEC_SQDMULH UNSPEC_SQRDMULH]) (define_int_iterator VQDMULH [UNSPEC_SQDMULH UNSPEC_SQRDMULH])
(define_int_iterator MULHRS [UNSPEC_SMULHS UNSPEC_UMULHS
UNSPEC_SMULHRS UNSPEC_UMULHRS])
(define_int_iterator USSUQADD [UNSPEC_SUQADD UNSPEC_USQADD]) (define_int_iterator USSUQADD [UNSPEC_SUQADD UNSPEC_USQADD])
(define_int_iterator SUQMOVN [UNSPEC_SQXTN UNSPEC_UQXTN]) (define_int_iterator SUQMOVN [UNSPEC_SQXTN UNSPEC_UQXTN])
...@@ -1872,7 +1894,11 @@ ...@@ -1872,7 +1894,11 @@
(UNSPEC_COND_FCVTZS "s") (UNSPEC_COND_FCVTZS "s")
(UNSPEC_COND_FCVTZU "u") (UNSPEC_COND_FCVTZU "u")
(UNSPEC_COND_SCVTF "s") (UNSPEC_COND_SCVTF "s")
(UNSPEC_COND_UCVTF "u")]) (UNSPEC_COND_UCVTF "u")
(UNSPEC_SMULLB "s") (UNSPEC_UMULLB "u")
(UNSPEC_SMULLT "s") (UNSPEC_UMULLT "u")
(UNSPEC_SMULHS "s") (UNSPEC_UMULHS "u")
(UNSPEC_SMULHRS "s") (UNSPEC_UMULHRS "u")])
(define_int_attr sur [(UNSPEC_SHADD "s") (UNSPEC_UHADD "u") (define_int_attr sur [(UNSPEC_SHADD "s") (UNSPEC_UHADD "u")
(UNSPEC_SRHADD "sr") (UNSPEC_URHADD "ur") (UNSPEC_SRHADD "sr") (UNSPEC_URHADD "ur")
...@@ -1910,6 +1936,10 @@ ...@@ -1910,6 +1936,10 @@
(UNSPEC_SQRSHRN "r") (UNSPEC_UQRSHRN "r") (UNSPEC_SQRSHRN "r") (UNSPEC_UQRSHRN "r")
(UNSPEC_SQSHL "") (UNSPEC_UQSHL "") (UNSPEC_SQSHL "") (UNSPEC_UQSHL "")
(UNSPEC_SQRSHL "r")(UNSPEC_UQRSHL "r") (UNSPEC_SQRSHL "r")(UNSPEC_UQRSHL "r")
(UNSPEC_SHRNB "") (UNSPEC_SHRNT "")
(UNSPEC_RSHRNB "r") (UNSPEC_RSHRNT "r")
(UNSPEC_SMULHS "") (UNSPEC_UMULHS "")
(UNSPEC_SMULHRS "r") (UNSPEC_UMULHRS "r")
]) ])
(define_int_attr lr [(UNSPEC_SSLI "l") (UNSPEC_USLI "l") (define_int_attr lr [(UNSPEC_SSLI "l") (UNSPEC_USLI "l")
...@@ -1922,6 +1952,9 @@ ...@@ -1922,6 +1952,9 @@
(UNSPEC_SHADD "") (UNSPEC_UHADD "u") (UNSPEC_SHADD "") (UNSPEC_UHADD "u")
(UNSPEC_SRHADD "") (UNSPEC_URHADD "u")]) (UNSPEC_SRHADD "") (UNSPEC_URHADD "u")])
(define_int_attr bt [(UNSPEC_SMULLB "b") (UNSPEC_UMULLB "b")
(UNSPEC_SMULLT "t") (UNSPEC_UMULLT "t")])
(define_int_attr addsub [(UNSPEC_SHADD "add") (define_int_attr addsub [(UNSPEC_SHADD "add")
(UNSPEC_UHADD "add") (UNSPEC_UHADD "add")
(UNSPEC_SRHADD "add") (UNSPEC_SRHADD "add")
......
...@@ -5387,6 +5387,33 @@ operand 1. Add operand 1 to operand 2 and place the widened result in ...@@ -5387,6 +5387,33 @@ operand 1. Add operand 1 to operand 2 and place the widened result in
operand 0. (This is used express accumulation of elements into an accumulator operand 0. (This is used express accumulation of elements into an accumulator
of a wider mode.) of a wider mode.)
@cindex @code{smulhs@var{m3}} instruction pattern
@item @samp{smulhs@var{m3}}
@cindex @code{umulhs@var{m3}} instruction pattern
@itemx @samp{umulhs@var{m3}}
Signed/unsigned multiply high with scale. This is equivalent to the C code:
@smallexample
narrow op0, op1, op2;
@dots{}
op0 = (narrow) (((wide) op1 * (wide) op2) >> (N / 2 - 1));
@end smallexample
where the sign of @samp{narrow} determines whether this is a signed
or unsigned operation, and @var{N} is the size of @samp{wide} in bits.
@cindex @code{smulhrs@var{m3}} instruction pattern
@item @samp{smulhrs@var{m3}}
@cindex @code{umulhrs@var{m3}} instruction pattern
@itemx @samp{umulhrs@var{m3}}
Signed/unsigned multiply high with round and scale. This is
equivalent to the C code:
@smallexample
narrow op0, op1, op2;
@dots{}
op0 = (narrow) (((((wide) op1 * (wide) op2) >> (N / 2 - 2)) + 1) >> 1);
@end smallexample
where the sign of @samp{narrow} determines whether this is a signed
or unsigned operation, and @var{N} is the size of @samp{wide} in bits.
@cindex @code{vec_shl_insert_@var{m}} instruction pattern @cindex @code{vec_shl_insert_@var{m}} instruction pattern
@item @samp{vec_shl_insert_@var{m}} @item @samp{vec_shl_insert_@var{m}}
Shift the elements in vector input operand 1 left one element (i.e.@: Shift the elements in vector input operand 1 left one element (i.e.@:
......
...@@ -1442,6 +1442,10 @@ vector alignment. ...@@ -1442,6 +1442,10 @@ vector alignment.
Target supports both signed and unsigned averaging operations on vectors Target supports both signed and unsigned averaging operations on vectors
of bytes. of bytes.
@item vect_mulhrs_hi
Target supports both signed and unsigned multiply-high-with-round-and-scale
operations on vectors of half-words.
@item vect_condition @item vect_condition
Target supports vector conditional operations. Target supports vector conditional operations.
......
...@@ -3210,6 +3210,8 @@ first_commutative_argument (internal_fn fn) ...@@ -3210,6 +3210,8 @@ first_commutative_argument (internal_fn fn)
case IFN_FNMS: case IFN_FNMS:
case IFN_AVG_FLOOR: case IFN_AVG_FLOOR:
case IFN_AVG_CEIL: case IFN_AVG_CEIL:
case IFN_MULHS:
case IFN_MULHRS:
case IFN_FMIN: case IFN_FMIN:
case IFN_FMAX: case IFN_FMAX:
return 0; return 0;
......
...@@ -149,6 +149,11 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_FLOOR, ECF_CONST | ECF_NOTHROW, first, ...@@ -149,6 +149,11 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_FLOOR, ECF_CONST | ECF_NOTHROW, first,
DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_CEIL, ECF_CONST | ECF_NOTHROW, first, DEF_INTERNAL_SIGNED_OPTAB_FN (AVG_CEIL, ECF_CONST | ECF_NOTHROW, first,
savg_ceil, uavg_ceil, binary) savg_ceil, uavg_ceil, binary)
DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
smulhs, umulhs, binary)
DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
smulhrs, umulhrs, binary)
DEF_INTERNAL_OPTAB_FN (COND_ADD, ECF_CONST, cond_add, cond_binary) DEF_INTERNAL_OPTAB_FN (COND_ADD, ECF_CONST, cond_add, cond_binary)
DEF_INTERNAL_OPTAB_FN (COND_SUB, ECF_CONST, cond_sub, cond_binary) DEF_INTERNAL_OPTAB_FN (COND_SUB, ECF_CONST, cond_sub, cond_binary)
DEF_INTERNAL_OPTAB_FN (COND_MUL, ECF_CONST, cond_smul, cond_binary) DEF_INTERNAL_OPTAB_FN (COND_MUL, ECF_CONST, cond_smul, cond_binary)
......
...@@ -343,6 +343,10 @@ OPTAB_D (udot_prod_optab, "udot_prod$I$a") ...@@ -343,6 +343,10 @@ OPTAB_D (udot_prod_optab, "udot_prod$I$a")
OPTAB_D (usum_widen_optab, "widen_usum$I$a3") OPTAB_D (usum_widen_optab, "widen_usum$I$a3")
OPTAB_D (usad_optab, "usad$I$a") OPTAB_D (usad_optab, "usad$I$a")
OPTAB_D (ssad_optab, "ssad$I$a") OPTAB_D (ssad_optab, "ssad$I$a")
OPTAB_D (smulhs_optab, "smulhs$a3")
OPTAB_D (smulhrs_optab, "smulhrs$a3")
OPTAB_D (umulhs_optab, "umulhs$a3")
OPTAB_D (umulhrs_optab, "umulhrs$a3")
OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a") OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a") OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")
OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a") OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a")
......
/* { dg-require-effective-target vect_int } */
#include "tree-vect.h"
#ifndef SIGNEDNESS
#define SIGNEDNESS signed
#endif
#ifndef BIAS
#define BIAS 0
#endif
#define HRS(x) ((((x) >> (15 - BIAS)) + BIAS) >> BIAS)
void __attribute__ ((noipa))
f (SIGNEDNESS short *restrict a, SIGNEDNESS short *restrict b,
SIGNEDNESS short *restrict c, __INTPTR_TYPE__ n)
{
for (__INTPTR_TYPE__ i = 0; i < n; ++i)
a[i] = HRS((SIGNEDNESS int) b[i] * (SIGNEDNESS int) c[i]);
}
#define N 50
#define BASE1 ((SIGNEDNESS int) -1 < 0 ? -126 : 4)
#define BASE2 ((SIGNEDNESS int) -1 < 0 ? -101 : 26)
#define CONST1 0x01AB
#define CONST2 0x01CD
int
main (void)
{
check_vect ();
SIGNEDNESS short a[N], b[N], c[N];
for (int i = 0; i < N; ++i)
{
b[i] = BASE1 + i * CONST1;
c[i] = BASE2 + i * CONST2;
asm volatile ("" ::: "memory");
}
f (a, b, c, N);
for (int i = 0; i < N; ++i)
if (a[i] != HRS(BASE1 * BASE2 + i * i * (CONST1 * CONST2)
+ i * (BASE1 * CONST2 + BASE2 * CONST1)))
__builtin_abort ();
return 0;
}
/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.MULHS} "vect" { target vect_mulhrs_hi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS unsigned
#include "vect-mulhrs-1.c"
/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.MULHS} "vect" { target vect_mulhrs_hi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */
/* { dg-require-effective-target vect_int } */
#define BIAS 1
#include "vect-mulhrs-1.c"
/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.MULHRS} "vect" { target vect_mulhrs_hi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */
/* { dg-require-effective-target vect_int } */
#define SIGNEDNESS unsigned
#define BIAS 1
#include "vect-mulhrs-1.c"
/* { dg-final { scan-tree-dump "vect_recog_mulhs_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump {\.MULHRS} "vect" { target vect_mulhrs_hi } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target vect_mulhrs_hi } } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details --save-temps" } */
#include <stdint.h>
#define MULTHI(TYPE, BIGGER, RND) \
TYPE __attribute__ ((noinline, noclone)) \
mulhs_##TYPE##_##RND (TYPE *restrict x, \
TYPE *restrict y, TYPE *restrict z, int n) \
{ \
for (int i = 0; i < n; i++) \
{ \
z[i] = ((((BIGGER)x[i] * (BIGGER)y[i]) >> \
(sizeof(BIGGER)*8/2-2)) + RND) >> 1; \
} \
}
MULTHI (int8_t, int16_t, 0)
MULTHI (int16_t, int32_t, 0)
MULTHI (int32_t, int64_t, 0)
MULTHI (uint8_t, uint16_t, 0)
MULTHI (uint16_t, uint32_t, 0)
MULTHI (uint32_t, uint64_t, 0)
MULTHI (int8_t, int16_t, 1)
MULTHI (int16_t, int32_t, 1)
MULTHI (int32_t, int64_t, 1)
MULTHI (uint8_t, uint16_t, 1)
MULTHI (uint16_t, uint32_t, 1)
MULTHI (uint32_t, uint64_t, 1)
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 12 "vect" } } */
/* { dg-final { scan-assembler-times {\tsmullb\tz[0-9]+\.h, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
/* { dg-final { scan-assembler-times {\tsmullt\tz[0-9]+\.h, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
/* { dg-final { scan-assembler-times {\tsmullb\tz[0-9]+\.s, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
/* { dg-final { scan-assembler-times {\tsmullt\tz[0-9]+\.s, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
/* { dg-final { scan-assembler-times {\tsmullb\tz[0-9]+\.d, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
/* { dg-final { scan-assembler-times {\tsmullt\tz[0-9]+\.d, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
/* { dg-final { scan-assembler-times {\tshrnb\tz[0-9]+\.b, z[0-9]+\.h, #7\n} 2 } } */
/* { dg-final { scan-assembler-times {\tshrnt\tz[0-9]+\.b, z[0-9]+\.h, #7\n} 2 } } */
/* { dg-final { scan-assembler-times {\tshrnb\tz[0-9]+\.h, z[0-9]+\.s, #15\n} 2 } } */
/* { dg-final { scan-assembler-times {\tshrnt\tz[0-9]+\.h, z[0-9]+\.s, #15\n} 2 } } */
/* { dg-final { scan-assembler-times {\tshrnb\tz[0-9]+\.s, z[0-9]+\.d, #31\n} 2 } } */
/* { dg-final { scan-assembler-times {\tshrnt\tz[0-9]+\.s, z[0-9]+\.d, #31\n} 2 } } */
/* { dg-final { scan-assembler-times {\tumullb\tz[0-9]+\.h, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
/* { dg-final { scan-assembler-times {\tumullt\tz[0-9]+\.h, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
/* { dg-final { scan-assembler-times {\tumullb\tz[0-9]+\.s, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
/* { dg-final { scan-assembler-times {\tumullt\tz[0-9]+\.s, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
/* { dg-final { scan-assembler-times {\tumullb\tz[0-9]+\.d, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
/* { dg-final { scan-assembler-times {\tumullt\tz[0-9]+\.d, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
/* { dg-final { scan-assembler-times {\trshrnb\tz[0-9]+\.b, z[0-9]+\.h, #7\n} 2 } } */
/* { dg-final { scan-assembler-times {\trshrnt\tz[0-9]+\.b, z[0-9]+\.h, #7\n} 2 } } */
/* { dg-final { scan-assembler-times {\trshrnb\tz[0-9]+\.h, z[0-9]+\.s, #15\n} 2 } } */
/* { dg-final { scan-assembler-times {\trshrnt\tz[0-9]+\.h, z[0-9]+\.s, #15\n} 2 } } */
/* { dg-final { scan-assembler-times {\trshrnb\tz[0-9]+\.s, z[0-9]+\.d, #31\n} 2 } } */
/* { dg-final { scan-assembler-times {\trshrnt\tz[0-9]+\.s, z[0-9]+\.d, #31\n} 2 } } */
...@@ -6175,6 +6175,15 @@ proc check_effective_target_vect_avg_qi {} { ...@@ -6175,6 +6175,15 @@ proc check_effective_target_vect_avg_qi {} {
&& ![check_effective_target_aarch64_sve1_only] }] && ![check_effective_target_aarch64_sve1_only] }]
} }
# Return 1 if the target plus current options supports both signed
# and unsigned multiply-high-with-round-and-scale operations
# on vectors of half-words.
proc check_effective_target_vect_mulhrs_hi {} {
return [expr { [istarget aarch64*-*-*]
&& [check_effective_target_aarch64_sve2] }]
}
# Return 1 if the target plus current options supports a vector # Return 1 if the target plus current options supports a vector
# demotion (packing) of shorts (to chars) and ints (to shorts) # demotion (packing) of shorts (to chars) and ints (to shorts)
# using modulo arithmetic, 0 otherwise. # using modulo arithmetic, 0 otherwise.
......
...@@ -1723,6 +1723,175 @@ vect_recog_over_widening_pattern (stmt_vec_info last_stmt_info, tree *type_out) ...@@ -1723,6 +1723,175 @@ vect_recog_over_widening_pattern (stmt_vec_info last_stmt_info, tree *type_out)
return pattern_stmt; return pattern_stmt;
} }
/* Recognize the following patterns:
ATYPE a; // narrower than TYPE
BTYPE b; // narrower than TYPE
1) Multiply high with scaling
TYPE res = ((TYPE) a * (TYPE) b) >> c;
2) ... or also with rounding
TYPE res = (((TYPE) a * (TYPE) b) >> d + 1) >> 1;
where only the bottom half of res is used. */
static gimple *
vect_recog_mulhs_pattern (stmt_vec_info last_stmt_info, tree *type_out)
{
/* Check for a right shift. */
gassign *last_stmt = dyn_cast <gassign *> (last_stmt_info->stmt);
if (!last_stmt
|| gimple_assign_rhs_code (last_stmt) != RSHIFT_EXPR)
return NULL;
vec_info *vinfo = last_stmt_info->vinfo;
/* Check that the shift result is wider than the users of the
result need (i.e. that narrowing would be a natural choice). */
tree lhs_type = TREE_TYPE (gimple_assign_lhs (last_stmt));
unsigned int target_precision
= vect_element_precision (last_stmt_info->min_output_precision);
if (!INTEGRAL_TYPE_P (lhs_type)
|| target_precision >= TYPE_PRECISION (lhs_type))
return NULL;
/* Look through any change in sign on the outer shift input. */
vect_unpromoted_value unprom_rshift_input;
tree rshift_input = vect_look_through_possible_promotion
(vinfo, gimple_assign_rhs1 (last_stmt), &unprom_rshift_input);
if (!rshift_input
|| TYPE_PRECISION (TREE_TYPE (rshift_input))
!= TYPE_PRECISION (lhs_type))
return NULL;
/* Get the definition of the shift input. */
stmt_vec_info rshift_input_stmt_info
= vect_get_internal_def (vinfo, rshift_input);
if (!rshift_input_stmt_info)
return NULL;
gassign *rshift_input_stmt
= dyn_cast <gassign *> (rshift_input_stmt_info->stmt);
if (!rshift_input_stmt)
return NULL;
stmt_vec_info mulh_stmt_info;
tree scale_term;
internal_fn ifn;
unsigned int expect_offset;
/* Check for the presence of the rounding term. */
if (gimple_assign_rhs_code (rshift_input_stmt) == PLUS_EXPR)
{
/* Check that the outer shift was by 1. */
if (!integer_onep (gimple_assign_rhs2 (last_stmt)))
return NULL;
/* Check that the second operand of the PLUS_EXPR is 1. */
if (!integer_onep (gimple_assign_rhs2 (rshift_input_stmt)))
return NULL;
/* Look through any change in sign on the addition input. */
vect_unpromoted_value unprom_plus_input;
tree plus_input = vect_look_through_possible_promotion
(vinfo, gimple_assign_rhs1 (rshift_input_stmt), &unprom_plus_input);
if (!plus_input
|| TYPE_PRECISION (TREE_TYPE (plus_input))
!= TYPE_PRECISION (TREE_TYPE (rshift_input)))
return NULL;
/* Get the definition of the multiply-high-scale part. */
stmt_vec_info plus_input_stmt_info
= vect_get_internal_def (vinfo, plus_input);
if (!plus_input_stmt_info)
return NULL;
gassign *plus_input_stmt
= dyn_cast <gassign *> (plus_input_stmt_info->stmt);
if (!plus_input_stmt
|| gimple_assign_rhs_code (plus_input_stmt) != RSHIFT_EXPR)
return NULL;
/* Look through any change in sign on the scaling input. */
vect_unpromoted_value unprom_scale_input;
tree scale_input = vect_look_through_possible_promotion
(vinfo, gimple_assign_rhs1 (plus_input_stmt), &unprom_scale_input);
if (!scale_input
|| TYPE_PRECISION (TREE_TYPE (scale_input))
!= TYPE_PRECISION (TREE_TYPE (plus_input)))
return NULL;
/* Get the definition of the multiply-high part. */
mulh_stmt_info = vect_get_internal_def (vinfo, scale_input);
if (!mulh_stmt_info)
return NULL;
/* Get the scaling term. */
scale_term = gimple_assign_rhs2 (plus_input_stmt);
expect_offset = target_precision + 2;
ifn = IFN_MULHRS;
}
else
{
mulh_stmt_info = rshift_input_stmt_info;
scale_term = gimple_assign_rhs2 (last_stmt);
expect_offset = target_precision + 1;
ifn = IFN_MULHS;
}
/* Check that the scaling factor is correct. */
if (TREE_CODE (scale_term) != INTEGER_CST
|| wi::to_widest (scale_term) + expect_offset
!= TYPE_PRECISION (lhs_type))
return NULL;
/* Check whether the scaling input term can be seen as two widened
inputs multiplied together. */
vect_unpromoted_value unprom_mult[2];
tree new_type;
unsigned int nops
= vect_widened_op_tree (mulh_stmt_info, MULT_EXPR, WIDEN_MULT_EXPR,
false, 2, unprom_mult, &new_type);
if (nops != 2)
return NULL;
vect_pattern_detected ("vect_recog_mulhs_pattern", last_stmt);
/* Adjust output precision. */
if (TYPE_PRECISION (new_type) < target_precision)
new_type = build_nonstandard_integer_type
(target_precision, TYPE_UNSIGNED (new_type));
/* Check for target support. */
tree new_vectype = get_vectype_for_scalar_type (new_type);
if (!new_vectype
|| !direct_internal_fn_supported_p
(ifn, new_vectype, OPTIMIZE_FOR_SPEED))
return NULL;
/* The IR requires a valid vector type for the cast result, even though
it's likely to be discarded. */
*type_out = get_vectype_for_scalar_type (lhs_type);
if (!*type_out)
return NULL;
/* Generate the IFN_MULHRS call. */
tree new_var = vect_recog_temp_ssa_var (new_type, NULL);
tree new_ops[2];
vect_convert_inputs (last_stmt_info, 2, new_ops, new_type,
unprom_mult, new_vectype);
gcall *mulhrs_stmt
= gimple_build_call_internal (ifn, 2, new_ops[0], new_ops[1]);
gimple_call_set_lhs (mulhrs_stmt, new_var);
gimple_set_location (mulhrs_stmt, gimple_location (last_stmt));
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"created pattern stmt: %G", mulhrs_stmt);
return vect_convert_output (last_stmt_info, lhs_type,
mulhrs_stmt, new_vectype);
}
/* Recognize the patterns: /* Recognize the patterns:
ATYPE a; // narrower than TYPE ATYPE a; // narrower than TYPE
...@@ -4713,6 +4882,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = { ...@@ -4713,6 +4882,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
/* Must come after over_widening, which narrows the shift as much as /* Must come after over_widening, which narrows the shift as much as
possible beforehand. */ possible beforehand. */
{ vect_recog_average_pattern, "average" }, { vect_recog_average_pattern, "average" },
{ vect_recog_mulhs_pattern, "mult_high" },
{ vect_recog_cast_forwprop_pattern, "cast_forwprop" }, { vect_recog_cast_forwprop_pattern, "cast_forwprop" },
{ vect_recog_widen_mult_pattern, "widen_mult" }, { vect_recog_widen_mult_pattern, "widen_mult" },
{ vect_recog_dot_prod_pattern, "dot_prod" }, { vect_recog_dot_prod_pattern, "dot_prod" },
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment