Commit 779aea46 by James Greenhalgh Committed by James Greenhalgh

[AArch64] Implement vmul<q>_lane<q>_<fsu><16,32,64> intrinsics in C

gcc/
	* config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New.
	(aarch64_mul3_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_mul3_elt_to_128df): Likewise.
	(aarch64_mul3_elt_to_64v2df): Likewise.
	* config/aarch64/iterators.md (VEL): Also handle DFmode.
	(VMUL): New.
	(VMUL_CHANGE_NLANES): Likewise.
	(h_con): Likewise.
	(f): Likewise.
	* config/aarch64/arm_neon.h
	(vmul<q>_lane<q>_<suf><16,32,64>): Convert to C implementation.

gcc/testsuite/
	* gcc.target/aarch64/mul_intrinsic_1.c: New.
	* gcc.target/aarch64/fmul_intrinsic_1.c: Likewise.

From-SVN: r202624
parent a407a750
2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>
* config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New.
(aarch64_mul3_elt_<vswap_width_name><mode>): Likewise.
(aarch64_mul3_elt_to_128df): Likewise.
(aarch64_mul3_elt_to_64v2df): Likewise.
* config/aarch64/iterators.md (VEL): Also handle DFmode.
(VMUL): New.
(VMUL_CHANGE_NLANES): Likewise.
(h_con): Likewise.
(f): Likewise.
* config/aarch64/arm_neon.h
(vmul<q>_lane<q>_<suf><16,32,64>): Convert to C implementation.
2013-09-16 James Greenhalgh <james.greenhalgh@arm.com>
* config/aarch64/arm_neon.h
(vcvtx_high_f32_f64): Fix parameters.
......
...@@ -582,6 +582,59 @@ ...@@ -582,6 +582,59 @@
(set_attr "simd_mode" "<MODE>")] (set_attr "simd_mode" "<MODE>")]
) )
;; Multiply-by-element: each lane of vector operand 3 is multiplied by
;; the single element selected from lane %2 of vector operand 1, then the
;; product is written to operand 0.  VMUL covers V4HI V8HI V2SI V4SI
;; V2SF V4SF V2DF; <h_con> constrains the HI modes to V0-V15 ('x'), as
;; the by-element encoding requires, and 'w' elsewhere.  <f> prefixes
;; "mul" with "f" for the floating-point modes.
(define_insn "*aarch64_mul3_elt<mode>"
[(set (match_operand:VMUL 0 "register_operand" "=w")
(mult:VMUL
(vec_duplicate:VMUL
(vec_select:<VEL>
(match_operand:VMUL 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VMUL 3 "register_operand" "w")))]
"TARGET_SIMD"
"<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]"
[(set_attr "simd_type" "simd_<f>mul_elt")
(set_attr "simd_mode" "<MODE>")]
)
;; As above, but the element is taken from a vector of the "swapped"
;; width (<VSWAP_WIDTH>): e.g. a V2SF result with the scalar selected
;; from a V4SF register, and vice versa.  Only the lane-count-changing
;; modes (VMUL_CHANGE_NLANES) participate; V2DF is excluded and handled
;; by the dedicated to_128df/to_64v2df patterns below.
(define_insn "*aarch64_mul3_elt_<vswap_width_name><mode>"
[(set (match_operand:VMUL_CHANGE_NLANES 0 "register_operand" "=w")
(mult:VMUL_CHANGE_NLANES
(vec_duplicate:VMUL_CHANGE_NLANES
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VMUL_CHANGE_NLANES 3 "register_operand" "w")))]
"TARGET_SIMD"
"<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]"
[(set_attr "simd_type" "simd_<f>mul_elt")
(set_attr "simd_mode" "<MODE>")]
)
;; V2DF result from a DF scalar times a V2DF vector: the scalar lives in
;; a SIMD register, so it is addressed as lane 0 of that register
;; ("%2.d[0]") rather than duplicated explicitly.
(define_insn "*aarch64_mul3_elt_to_128df"
[(set (match_operand:V2DF 0 "register_operand" "=w")
(mult:V2DF
(vec_duplicate:V2DF
(match_operand:DF 2 "register_operand" "w"))
(match_operand:V2DF 1 "register_operand" "w")))]
"TARGET_SIMD"
"fmul\\t%0.2d, %1.2d, %2.d[0]"
[(set_attr "simd_type" "simd_fmul_elt")
(set_attr "simd_mode" "V2DF")]
)
;; DF scalar result: lane %2 of V2DF operand 1 multiplied by DF
;; operand 3.
;; NOTE(review): operands 0 and 3 are scalar DF, yet the template prints
;; them with the ".2d" vector arrangement ("%0.2d"/"%3.2d"); the scalar
;; by-element form would be "fmul\t%d0, %d3, %1.d[%2]".  Confirm the
;; emitted assembly actually assembles as intended.
(define_insn "*aarch64_mul3_elt_to_64v2df"
[(set (match_operand:DF 0 "register_operand" "=w")
(mult:DF
(vec_select:DF
(match_operand:V2DF 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand")]))
(match_operand:DF 3 "register_operand" "w")))]
"TARGET_SIMD"
"fmul\\t%0.2d, %3.2d, %1.d[%2]"
[(set_attr "simd_type" "simd_fmul_elt")
(set_attr "simd_mode" "V2DF")]
)
(define_insn "neg<mode>2" (define_insn "neg<mode>2"
[(set (match_operand:VDQ 0 "register_operand" "=w") [(set (match_operand:VDQ 0 "register_operand" "=w")
(neg:VDQ (match_operand:VDQ 1 "register_operand" "w")))] (neg:VDQ (match_operand:VDQ 1 "register_operand" "w")))]
......
...@@ -169,6 +169,12 @@ ...@@ -169,6 +169,12 @@
;; Double scalar modes ;; Double scalar modes
(define_mode_iterator DX [DI DF]) (define_mode_iterator DX [DI DF])
;; Modes available for <f>mul lane operations.  Matches the entries of
;; the h_con and f mode attributes below.
(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
;; Modes available for <f>mul lane operations changing lane count
;; (VMUL minus V2DF, which has dedicated patterns).
(define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF])
;; ------------------------------------------------------------------ ;; ------------------------------------------------------------------
;; Unspec enumerations for Advance SIMD. These could well go into ;; Unspec enumerations for Advance SIMD. These could well go into
;; aarch64.md but for their use in int_iterators here. ;; aarch64.md but for their use in int_iterators here.
...@@ -358,7 +364,7 @@ ...@@ -358,7 +364,7 @@
(V2SI "SI") (V4SI "SI")
(DI "DI") (V2DI "DI")
(V2SF "SF") (V4SF "SF")
(V2DF "DF") (DF "DF")
(SI "SI") (HI "HI")
(QI "QI")])
...@@ -541,6 +547,22 @@ ...@@ -541,6 +547,22 @@
(V2SF "to_128") (V4SF "to_64") (V2SF "to_128") (V4SF "to_64")
(DF "to_128") (V2DF "to_64")]) (DF "to_128") (V2DF "to_64")])
;; For certain vector-by-element multiplication instructions we must
;; constrain the HI cases to use only V0-V15. This is covered by
;; the 'x' constraint. All other modes may use the 'w' constraint.
;; Used as the operand constraint on the lane-vector operand of the
;; mul3_elt patterns above.
(define_mode_attr h_con [(V2SI "w") (V4SI "w")
(V4HI "x") (V8HI "x")
(V2SF "w") (V4SF "w")
(V2DF "w") (DF "w")])
;; Defined to 'f' for types whose element type is a float type.
;; Expands "<f>mul" to "fmul" for FP modes and plain "mul" otherwise.
(define_mode_attr f [(V8QI "") (V16QI "")
(V4HI "") (V8HI "")
(V2SI "") (V4SI "")
(DI "") (V2DI "")
(V2SF "f") (V4SF "f")
(V2DF "f") (DF "f")])
;; ------------------------------------------------------------------- ;; -------------------------------------------------------------------
;; Code Iterators ;; Code Iterators
;; ------------------------------------------------------------------- ;; -------------------------------------------------------------------
......
2013-09-16 James Greenhalgh <james.greenhalgh@arm.com>
* gcc.target/aarch64/mul_intrinsic_1.c: New.
* gcc.target/aarch64/fmul_intrinsic_1.c: Likewise.
2013-09-16 Richard Biener <rguenther@suse.de> 2013-09-16 Richard Biener <rguenther@suse.de>
* gcc.dg/tree-ssa/ldist-22.c: New testcase. * gcc.dg/tree-ssa/ldist-22.c: New testcase.
......
/* { dg-do run } */
/* { dg-options "-O3 --save-temps" } */
#include <arm_neon.h>
#define DELTA 0.0001
extern void abort (void);
extern double fabs (double);
/* Generate one harness test_vmul<q1>_lane<q2>_f<size>: load vectors from
   IN1 and IN2, multiply B by a lane of C with the intrinsic under test,
   and store the product to RES.  When the second input has more than one
   lane we use lane 1; for the single-lane case we load from IN2 + 1 and
   use lane 0 — either way the selected multiplier is pool element 1.  */
#define TEST_VMUL(q1, q2, size, in1_lanes, in2_lanes) \
static void \
test_vmul##q1##_lane##q2##_f##size (float##size##_t * res, \
const float##size##_t *in1, \
const float##size##_t *in2) \
{ \
float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res); \
float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1); \
float##size##x##in2_lanes##_t c; \
if (in2_lanes > 1) \
{ \
c = vld1##q2##_f##size (in2); \
a = vmul##q1##_lane##q2##_f##size (b, c, 1); \
} \
else \
{ \
c = vld1##q2##_f##size (in2 + 1); \
a = vmul##q1##_lane##q2##_f##size (b, c, 0); \
} \
vst1##q1##_f##size (res, a); \
}
/* Instantiate all four d/q destination x d/q lane-vector combinations
   for one element width, then expand for float32 and float64.  */
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
TEST_VMUL ( , , width, n_half_lanes, n_half_lanes) \
TEST_VMUL (q, , width, n_lanes, n_half_lanes) \
TEST_VMUL ( , q, width, n_half_lanes, n_lanes) \
TEST_VMUL (q, q, width, n_lanes, n_lanes)
BUILD_VARS (32, 4, 2)
BUILD_VARS (64, 2, 1)
/* Input pool and zeroed result buffers, sized per lane count.  */
#define POOL2 {0.0, 1.0}
#define POOL4 {0.0, 1.0, 2.0, 3.0}
#define EMPTY2 {0.0, 0.0}
#define EMPTY4 {0.0, 0.0, 0.0, 0.0}
/* Run each generated harness on POOL and compare against the expected
   products.  The harnesses always multiply by pool element 1 (== 1.0),
   so every result lane must equal the corresponding POOL element to
   within DELTA.  The asm barriers keep the multiply and the checking
   subtraction from being constant-folded or fused (e.g. into fmls),
   which would break the scan-assembler counts below.  */
#define BUILD_TEST(size, lanes) \
static void \
test_f##size (void) \
{ \
int i; \
float##size##_t pool[lanes] = POOL##lanes; \
float##size##_t res[lanes] = EMPTY##lanes; \
float##size##_t res2[lanes] = EMPTY##lanes; \
float##size##_t res3[lanes] = EMPTY##lanes; \
float##size##_t res4[lanes] = EMPTY##lanes; \
\
/* Avoid constant folding the multiplication. */ \
asm volatile ("" : : : "memory"); \
test_vmul_lane_f##size (res, pool, pool); \
/* Avoid fusing multiplication and subtraction. */ \
asm volatile ("" : :"Q" (res) : "memory"); \
for (i = 0; i < lanes / 2; i++) \
if (fabs (res[i] - pool[i]) > DELTA) \
abort (); \
\
test_vmulq_lane_f##size (res2, pool, pool); \
/* Avoid fusing multiplication and subtraction. */ \
asm volatile ("" : :"Q" (res2) : "memory"); \
for (i = 0; i < lanes; i++) \
if (fabs (res2[i] - pool[i]) > DELTA) \
abort (); \
\
test_vmul_laneq_f##size (res3, pool, pool); \
/* Avoid fusing multiplication and subtraction. */ \
asm volatile ("" : :"Q" (res3) : "memory"); \
for (i = 0; i < lanes / 2; i++) \
if (fabs (res3[i] - pool[i]) > DELTA) \
abort (); \
\
test_vmulq_laneq_f##size (res4, pool, pool); \
/* Avoid fusing multiplication and subtraction. */ \
asm volatile ("" : :"Q" (res4) : "memory"); \
for (i = 0; i < lanes; i++) \
if (fabs (res4[i] - pool[i]) > DELTA) \
abort (); \
}
BUILD_TEST (32, 4)
BUILD_TEST (64, 2)
/* Entry point: run the float32 and float64 lane-multiply tests.
   Each test aborts on the first mismatch, so reaching the return
   means every check passed.  */
int main (int argc, char **argv)
{
  /* argc/argv are unused; the generated tests take no parameters.  */
  test_f32 ();
  test_f64 ();
  return 0;
}
/* vmul_laneq_f32.
vmul_lane_f32. */
/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 2 } } */
/* vmulq_lane_f32.
vmulq_laneq_f32. */
/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 2 } } */
/* vmul_lane_f64. */
/* { dg-final { scan-assembler-times "fmul\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 1 } } */
/* vmul_laneq_f64.
vmulq_lane_f64.
vmulq_laneq_f64. */
/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 3 } } */
/* { dg-final { cleanup-saved-temps } } */
/* { dg-do run } */
/* { dg-options "-O3 --save-temps" } */
#include <arm_neon.h>
extern void abort (void);
/* MAPs/MAPu paste together the intrinsic vector typedefs
   int<size><xx>_t / uint<size><xx>_t, so one macro body can cover
   both signed and unsigned variants.  */
#define MAPs(size, xx) int##size##xx##_t
#define MAPu(size, xx) uint##size##xx##_t
/* Generate one harness test_vmulq_lane<q>_<su><size>: load a full-width
   vector from IN1 and a <q>-width lane vector from IN2, multiply by
   lane 1 with the intrinsic under test, and store to RES.
   in1_lanes is unused in the body — the a/b element counts come from
   in1_lanes/in2_lanes respectively.  */
#define TEST_VMUL(q, su, size, in1_lanes, in2_lanes) \
static void \
test_vmulq_lane##q##_##su##size (MAP##su (size, ) * res, \
const MAP##su(size, ) *in1, \
const MAP##su(size, ) *in2) \
{ \
MAP##su (size, x##in1_lanes) a = vld1q_##su##size (in1); \
MAP##su (size, x##in2_lanes) b = vld1##q##_##su##size (in2); \
a = vmulq_lane##q##_##su##size (a, b, 1); \
vst1q_##su##size (res, a); \
}
/* Instantiate signed and unsigned, lane and laneq variants for one
   element width.  NOTE(review): the final TEST_VMUL line ends in '\',
   so the definition only terminates at the next line; the comment
   below supplies that boundary (the original file had a blank line
   here).  */
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
TEST_VMUL (, s, width, n_lanes, n_half_lanes) \
TEST_VMUL (q, s, width, n_lanes, n_lanes) \
TEST_VMUL (, u, width, n_lanes, n_half_lanes) \
TEST_VMUL (q, u, width, n_lanes, n_lanes) \
/* End of BUILD_VARS definition — terminates the '\' continuation.  */
BUILD_VARS (32, 4, 2)
BUILD_VARS (16, 8, 4)
/* Input pool and zeroed result buffers, sized per lane count.  */
#define POOL4 {0, 1, 2, 3}
#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7}
#define EMPTY4 {0, 0, 0, 0}
#define EMPTY8 {0, 0, 0, 0, 0, 0, 0, 0}
/* Run the lane and laneq harnesses on POOL: the multiplier is always
   pool element 1 (== 1), so every result lane must equal the
   corresponding POOL element exactly.  The asm barriers stop the
   compiler constant-folding the multiply away, which would break the
   scan-assembler counts below.  */
#define BUILD_TEST(su, size, lanes) \
static void \
test_##su##size (void) \
{ \
int i; \
MAP##su (size,) pool[lanes] = POOL##lanes; \
MAP##su (size,) res[lanes] = EMPTY##lanes; \
MAP##su (size,) res2[lanes] = EMPTY##lanes; \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vmulq_lane_##su##size (res, pool, pool); \
for (i = 0; i < lanes; i++) \
if (res[i] != pool[i]) \
abort (); \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vmulq_laneq_##su##size (res2, pool, pool); \
for (i = 0; i < lanes; i++) \
if (res2[i] != pool[i]) \
abort (); \
}
/* Re-purpose BUILD_VARS to expand BUILD_TEST for both signednesses,
   then instantiate the 32-bit/4-lane and 16-bit/8-lane checkers.  */
#undef BUILD_VARS
#define BUILD_VARS(size, lanes) \
BUILD_TEST (s, size, lanes) \
BUILD_TEST (u, size, lanes)
BUILD_VARS (32, 4)
BUILD_VARS (16, 8)
/* Entry point: run every generated integer lane-multiply test.
   Each test aborts on the first mismatch, so reaching the return
   means all variants passed.  */
int main (int argc, char **argv)
{
  /* argc/argv are unused; the generated tests take no parameters.  */
  test_s32 ();
  test_u32 ();
  test_s16 ();
  test_u16 ();
  return 0;
}
/* { dg-final { scan-assembler-times "mul\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 4 } } */
/* { dg-final { scan-assembler-times "mul\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.h\\\[\[0-9\]+\\\]" 4 } } */
/* { dg-final { cleanup-saved-temps } } */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment