Commit 779aea46 by James Greenhalgh, committed by James Greenhalgh

[AArch64] Implement vmul<q>_lane<q>_<fsu><16,32,64> intrinsics in C

gcc/
	* config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New.
	(aarch64_mul3_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_mul3_elt_to_128df): Likewise.
	(aarch64_mul3_elt_to_64v2df): Likewise.
	* config/aarch64/iterators.md (VEL): Also handle DFmode.
	(VMUL): New.
	(VMUL_CHANGE_NLANES): Likewise.
	(h_con): Likewise.
	(f): Likewise.
	* config/aarch64/arm_neon.h
	(vmul<q>_lane<q>_<suf><16,32,64>): Convert to C implementation.

gcc/testsuite/
	* gcc.target/aarch64/mul_intrinsic_1.c: New.
	* gcc.target/aarch64/fmul_intrinsic_1.c: Likewise.

From-SVN: r202624
parent a407a750
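For context, a minimal usage sketch in C (hypothetical example, not part of the patch): with the intrinsics implemented in C rather than inline asm, calls like the ones below can be constant-folded and optimized as ordinary arithmetic, while at -O2 the by-element multiply forms exercised by the new tests are still expected to be emitted. The function names scale_by_lane and mul_by_lane are illustrative only.

#include <arm_neon.h>

float32x4_t
scale_by_lane (float32x4_t v, float32x4_t s)
{
  /* Multiply every element of v by lane 1 of s; expected to emit an
     instruction of the form "fmul v0.4s, v0.4s, v1.s[1]".  */
  return vmulq_laneq_f32 (v, s, 1);
}

int16x8_t
mul_by_lane (int16x8_t v, int16x4_t s)
{
  /* 16-bit by-element multiply; the indexed register must come from
     V0-V15, expected form "mul v0.8h, v0.8h, v1.h[3]".  */
  return vmulq_lane_s16 (v, s, 3);
}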
gcc/ChangeLog:
2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>
* config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New.
(aarch64_mul3_elt_<vswap_width_name><mode>): Likewise.
(aarch64_mul3_elt_to_128df): Likewise.
(aarch64_mul3_elt_to_64v2df): Likewise.
* config/aarch64/iterators.md (VEL): Also handle DFmode.
(VMUL): New.
(VMUL_CHANGE_NLANES): Likewise.
(h_con): Likewise.
(f): Likewise.
* config/aarch64/arm_neon.h
(vmul<q>_lane<q>_<suf><16,32,64>): Convert to C implementation.
2013-09-16 James Greenhalgh <james.greenhalgh@arm.com>
* config/aarch64/arm_neon.h
(vcvtx_high_f32_f64): Fix parameters.
config/aarch64/aarch64-simd.md:
@@ -582,6 +582,59 @@
   (set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_mul3_elt<mode>"
[(set (match_operand:VMUL 0 "register_operand" "=w")
(mult:VMUL
(vec_duplicate:VMUL
(vec_select:<VEL>
(match_operand:VMUL 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VMUL 3 "register_operand" "w")))]
"TARGET_SIMD"
"<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]"
[(set_attr "simd_type" "simd_<f>mul_elt")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_mul3_elt_<vswap_width_name><mode>"
[(set (match_operand:VMUL_CHANGE_NLANES 0 "register_operand" "=w")
(mult:VMUL_CHANGE_NLANES
(vec_duplicate:VMUL_CHANGE_NLANES
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VMUL_CHANGE_NLANES 3 "register_operand" "w")))]
"TARGET_SIMD"
"<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]"
[(set_attr "simd_type" "simd_<f>mul_elt")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_mul3_elt_to_128df"
[(set (match_operand:V2DF 0 "register_operand" "=w")
(mult:V2DF
(vec_duplicate:V2DF
(match_operand:DF 2 "register_operand" "w"))
(match_operand:V2DF 1 "register_operand" "w")))]
"TARGET_SIMD"
"fmul\\t%0.2d, %1.2d, %2.d[0]"
[(set_attr "simd_type" "simd_fmul_elt")
(set_attr "simd_mode" "V2DF")]
)
(define_insn "*aarch64_mul3_elt_to_64v2df"
[(set (match_operand:DF 0 "register_operand" "=w")
(mult:DF
(vec_select:DF
(match_operand:V2DF 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand")]))
(match_operand:DF 3 "register_operand" "w")))]
"TARGET_SIMD"
"fmul\\t%0.2d, %3.2d, %1.d[%2]"
[(set_attr "simd_type" "simd_fmul_elt")
(set_attr "simd_mode" "V2DF")]
)
(define_insn "neg<mode>2" (define_insn "neg<mode>2"
[(set (match_operand:VDQ 0 "register_operand" "=w") [(set (match_operand:VDQ 0 "register_operand" "=w")
(neg:VDQ (match_operand:VDQ 1 "register_operand" "w")))] (neg:VDQ (match_operand:VDQ 1 "register_operand" "w")))]
config/aarch64/arm_neon.h:
@@ -9501,136 +9501,6 @@ vmovq_n_u64 (uint64_t a)
  return result;
}
#define vmul_lane_f32(a, b, c) \
__extension__ \
({ \
float32x2_t b_ = (b); \
float32x2_t a_ = (a); \
float32x2_t result; \
__asm__ ("fmul %0.2s,%1.2s,%2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmul_lane_s16(a, b, c) \
__extension__ \
({ \
int16x4_t b_ = (b); \
int16x4_t a_ = (a); \
int16x4_t result; \
__asm__ ("mul %0.4h,%1.4h,%2.h[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmul_lane_s32(a, b, c) \
__extension__ \
({ \
int32x2_t b_ = (b); \
int32x2_t a_ = (a); \
int32x2_t result; \
__asm__ ("mul %0.2s,%1.2s,%2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmul_lane_u16(a, b, c) \
__extension__ \
({ \
uint16x4_t b_ = (b); \
uint16x4_t a_ = (a); \
uint16x4_t result; \
__asm__ ("mul %0.4h,%1.4h,%2.h[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmul_lane_u32(a, b, c) \
__extension__ \
({ \
uint32x2_t b_ = (b); \
uint32x2_t a_ = (a); \
uint32x2_t result; \
__asm__ ("mul %0.2s, %1.2s, %2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmul_laneq_f32(a, b, c) \
__extension__ \
({ \
float32x4_t b_ = (b); \
float32x2_t a_ = (a); \
float32x2_t result; \
__asm__ ("fmul %0.2s, %1.2s, %2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmul_laneq_s16(a, b, c) \
__extension__ \
({ \
int16x8_t b_ = (b); \
int16x4_t a_ = (a); \
int16x4_t result; \
__asm__ ("mul %0.4h, %1.4h, %2.h[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmul_laneq_s32(a, b, c) \
__extension__ \
({ \
int32x4_t b_ = (b); \
int32x2_t a_ = (a); \
int32x2_t result; \
__asm__ ("mul %0.2s, %1.2s, %2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmul_laneq_u16(a, b, c) \
__extension__ \
({ \
uint16x8_t b_ = (b); \
uint16x4_t a_ = (a); \
uint16x4_t result; \
__asm__ ("mul %0.4h, %1.4h, %2.h[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmul_laneq_u32(a, b, c) \
__extension__ \
({ \
uint32x4_t b_ = (b); \
uint32x2_t a_ = (a); \
uint32x2_t result; \
__asm__ ("mul %0.2s, %1.2s, %2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmul_n_f32 (float32x2_t a, float32_t b)
{
@@ -10149,162 +10019,6 @@ vmull_u32 (uint32x2_t a, uint32x2_t b)
  return result;
}
#define vmulq_lane_f32(a, b, c) \
__extension__ \
({ \
float32x2_t b_ = (b); \
float32x4_t a_ = (a); \
float32x4_t result; \
__asm__ ("fmul %0.4s, %1.4s, %2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmulq_lane_f64(a, b, c) \
__extension__ \
({ \
float64x1_t b_ = (b); \
float64x2_t a_ = (a); \
float64x2_t result; \
__asm__ ("fmul %0.2d,%1.2d,%2.d[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmulq_lane_s16(a, b, c) \
__extension__ \
({ \
int16x4_t b_ = (b); \
int16x8_t a_ = (a); \
int16x8_t result; \
__asm__ ("mul %0.8h,%1.8h,%2.h[%3]" \
: "=w"(result) \
: "w"(a_), "x"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmulq_lane_s32(a, b, c) \
__extension__ \
({ \
int32x2_t b_ = (b); \
int32x4_t a_ = (a); \
int32x4_t result; \
__asm__ ("mul %0.4s,%1.4s,%2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmulq_lane_u16(a, b, c) \
__extension__ \
({ \
uint16x4_t b_ = (b); \
uint16x8_t a_ = (a); \
uint16x8_t result; \
__asm__ ("mul %0.8h,%1.8h,%2.h[%3]" \
: "=w"(result) \
: "w"(a_), "x"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmulq_lane_u32(a, b, c) \
__extension__ \
({ \
uint32x2_t b_ = (b); \
uint32x4_t a_ = (a); \
uint32x4_t result; \
__asm__ ("mul %0.4s, %1.4s, %2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmulq_laneq_f32(a, b, c) \
__extension__ \
({ \
float32x4_t b_ = (b); \
float32x4_t a_ = (a); \
float32x4_t result; \
__asm__ ("fmul %0.4s, %1.4s, %2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmulq_laneq_f64(a, b, c) \
__extension__ \
({ \
float64x2_t b_ = (b); \
float64x2_t a_ = (a); \
float64x2_t result; \
__asm__ ("fmul %0.2d,%1.2d,%2.d[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmulq_laneq_s16(a, b, c) \
__extension__ \
({ \
int16x8_t b_ = (b); \
int16x8_t a_ = (a); \
int16x8_t result; \
__asm__ ("mul %0.8h, %1.8h, %2.h[%3]" \
: "=w"(result) \
: "w"(a_), "x"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmulq_laneq_s32(a, b, c) \
__extension__ \
({ \
int32x4_t b_ = (b); \
int32x4_t a_ = (a); \
int32x4_t result; \
__asm__ ("mul %0.4s, %1.4s, %2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmulq_laneq_u16(a, b, c) \
__extension__ \
({ \
uint16x8_t b_ = (b); \
uint16x8_t a_ = (a); \
uint16x8_t result; \
__asm__ ("mul %0.8h, %1.8h, %2.h[%3]" \
: "=w"(result) \
: "w"(a_), "x"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
#define vmulq_laneq_u32(a, b, c) \
__extension__ \
({ \
uint32x4_t b_ = (b); \
uint32x4_t a_ = (a); \
uint32x4_t result; \
__asm__ ("mul %0.4s, %1.4s, %2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmulq_n_f32 (float32x4_t a, float32_t b)
{
@@ -21435,6 +21149,158 @@ vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
  return a - b * c;
}
/* vmul_lane */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __lane)
{
return __a * __aarch64_vget_lane_f32 (__b, __lane);
}
__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
vmul_lane_f64 (float64x1_t __a, float64x1_t __b, const int __lane)
{
return __a * __b;
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __lane)
{
return __a * __aarch64_vget_lane_s16 (__b, __lane);
}
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __lane)
{
return __a * __aarch64_vget_lane_s32 (__b, __lane);
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __lane)
{
return __a * __aarch64_vget_lane_u16 (__b, __lane);
}
__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __lane)
{
return __a * __aarch64_vget_lane_u32 (__b, __lane);
}
/* vmul_laneq */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmul_laneq_f32 (float32x2_t __a, float32x4_t __b, const int __lane)
{
return __a * __aarch64_vgetq_lane_f32 (__b, __lane);
}
__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
vmul_laneq_f64 (float64x1_t __a, float64x2_t __b, const int __lane)
{
return __a * __aarch64_vgetq_lane_f64 (__b, __lane);
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vmul_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __lane)
{
return __a * __aarch64_vgetq_lane_s16 (__b, __lane);
}
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vmul_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __lane)
{
return __a * __aarch64_vgetq_lane_s32 (__b, __lane);
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
vmul_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __lane)
{
return __a * __aarch64_vgetq_lane_u16 (__b, __lane);
}
__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
vmul_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __lane)
{
return __a * __aarch64_vgetq_lane_u32 (__b, __lane);
}
/* vmulq_lane */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane)
{
return __a * __aarch64_vget_lane_f32 (__b, __lane);
}
__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
vmulq_lane_f64 (float64x2_t __a, float64x1_t __b, const int __lane)
{
return __a * __b;
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __lane)
{
return __a * __aarch64_vget_lane_s16 (__b, __lane);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __lane)
{
return __a * __aarch64_vget_lane_s32 (__b, __lane);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __lane)
{
return __a * __aarch64_vget_lane_u16 (__b, __lane);
}
__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __lane)
{
return __a * __aarch64_vget_lane_u32 (__b, __lane);
}
/* vmulq_laneq */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmulq_laneq_f32 (float32x4_t __a, float32x4_t __b, const int __lane)
{
return __a * __aarch64_vgetq_lane_f32 (__b, __lane);
}
__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
vmulq_laneq_f64 (float64x2_t __a, float64x2_t __b, const int __lane)
{
return __a * __aarch64_vgetq_lane_f64 (__b, __lane);
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vmulq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __lane)
{
return __a * __aarch64_vgetq_lane_s16 (__b, __lane);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmulq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __lane)
{
return __a * __aarch64_vgetq_lane_s32 (__b, __lane);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vmulq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __lane)
{
return __a * __aarch64_vgetq_lane_u16 (__b, __lane);
}
__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane)
{
return __a * __aarch64_vgetq_lane_u32 (__b, __lane);
}
/* vqabs */
__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
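Side note (not part of the patch): the new C implementations above use the __aarch64_vget_lane_* and __aarch64_vgetq_lane_* helpers defined earlier in arm_neon.h. Conceptually they just extract one element of a vector; a standalone sketch of the same effect, using GCC's vector subscripting rather than the header's actual definitions:

#include <arm_neon.h>

/* Illustrative helper only; the real __aarch64_vget_lane_f32 lives in
   arm_neon.h and is not this function.  */
static inline float32_t
get_lane_f32_sketch (float32x2_t __v, const int __lane)
{
  /* GCC vector types support element access with [].  */
  return __v[__lane];
}

/* With such an extraction, vmul_lane_f32 reduces to a plain
   vector-by-scalar multiply: __a * get_lane_f32_sketch (__b, __lane),
   which is exactly the shape of the implementations above.  */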
config/aarch64/iterators.md:
@@ -169,6 +169,12 @@
;; Double scalar modes
(define_mode_iterator DX [DI DF])
;; Modes available for <f>mul lane operations.
(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
;; Modes available for <f>mul lane operations changing lane count.
(define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF])
;; ------------------------------------------------------------------
;; Unspec enumerations for Advance SIMD. These could well go into
;; aarch64.md but for their use in int_iterators here.
@@ -358,7 +364,7 @@
(V2SI "SI") (V4SI "SI")
(DI "DI") (V2DI "DI")
(V2SF "SF") (V4SF "SF")
(V2DF "DF") (DF "DF")
(SI "SI") (HI "HI")
(QI "QI")])
@@ -541,6 +547,22 @@
(V2SF "to_128") (V4SF "to_64")
(DF "to_128") (V2DF "to_64")])
;; For certain vector-by-element multiplication instructions we must
;; constrain the HI cases to use only V0-V15. This is covered by
;; the 'x' constraint. All other modes may use the 'w' constraint.
(define_mode_attr h_con [(V2SI "w") (V4SI "w")
(V4HI "x") (V8HI "x")
(V2SF "w") (V4SF "w")
(V2DF "w") (DF "w")])
;; Defined to 'f' for types whose element type is a float type.
(define_mode_attr f [(V8QI "") (V16QI "")
(V4HI "") (V8HI "")
(V2SI "") (V4SI "")
(DI "") (V2DI "")
(V2SF "f") (V4SF "f")
(V2DF "f") (DF "f")])
;; -------------------------------------------------------------------
;; Code Iterators
;; -------------------------------------------------------------------
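Illustration (a hedged sketch, not part of the patch): the V0-V15 restriction documented for the h_con attribute above is the same one the removed arm_neon.h macros expressed with the "x" asm constraint on the indexed operand of 16-bit by-element multiplies:

#include <arm_neon.h>

/* Standalone sketch of the register restriction: the "x" constraint limits
   the indexed operand to V0-V15, matching the MUL (by element) encoding for
   16-bit elements; wider element sizes can use the full "w" class.  */
static inline int16x8_t
mul_h_by_element_sketch (int16x8_t a, int16x4_t b)
{
  int16x8_t result;
  __asm__ ("mul %0.8h, %1.8h, %2.h[2]"
           : "=w" (result)
           : "w" (a), "x" (b)
           : /* No clobbers.  */);
  return result;
}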
gcc/testsuite/ChangeLog:
2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>
* gcc.target/aarch64/mul_intrinsic_1.c: New.
* gcc.target/aarch64/fmul_intrinsic_1.c: Likewise.
2013-09-16  Richard Biener  <rguenther@suse.de>
	* gcc.dg/tree-ssa/ldist-22.c: New testcase.
gcc/testsuite/gcc.target/aarch64/fmul_intrinsic_1.c (new file):
/* { dg-do run } */
/* { dg-options "-O3 --save-temps" } */
#include <arm_neon.h>
#define DELTA 0.0001
extern void abort (void);
extern double fabs (double);
#define TEST_VMUL(q1, q2, size, in1_lanes, in2_lanes) \
static void \
test_vmul##q1##_lane##q2##_f##size (float##size##_t * res, \
const float##size##_t *in1, \
const float##size##_t *in2) \
{ \
float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res); \
float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1); \
float##size##x##in2_lanes##_t c; \
if (in2_lanes > 1) \
{ \
c = vld1##q2##_f##size (in2); \
a = vmul##q1##_lane##q2##_f##size (b, c, 1); \
} \
else \
{ \
c = vld1##q2##_f##size (in2 + 1); \
a = vmul##q1##_lane##q2##_f##size (b, c, 0); \
} \
vst1##q1##_f##size (res, a); \
}
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
TEST_VMUL ( , , width, n_half_lanes, n_half_lanes) \
TEST_VMUL (q, , width, n_lanes, n_half_lanes) \
TEST_VMUL ( , q, width, n_half_lanes, n_lanes) \
TEST_VMUL (q, q, width, n_lanes, n_lanes)
BUILD_VARS (32, 4, 2)
BUILD_VARS (64, 2, 1)
#define POOL2 {0.0, 1.0}
#define POOL4 {0.0, 1.0, 2.0, 3.0}
#define EMPTY2 {0.0, 0.0}
#define EMPTY4 {0.0, 0.0, 0.0, 0.0}
#define BUILD_TEST(size, lanes) \
static void \
test_f##size (void) \
{ \
int i; \
float##size##_t pool[lanes] = POOL##lanes; \
float##size##_t res[lanes] = EMPTY##lanes; \
float##size##_t res2[lanes] = EMPTY##lanes; \
float##size##_t res3[lanes] = EMPTY##lanes; \
float##size##_t res4[lanes] = EMPTY##lanes; \
\
/* Avoid constant folding the multiplication. */ \
asm volatile ("" : : : "memory"); \
test_vmul_lane_f##size (res, pool, pool); \
/* Avoid fusing multiplication and subtraction. */ \
asm volatile ("" : :"Q" (res) : "memory"); \
for (i = 0; i < lanes / 2; i++) \
if (fabs (res[i] - pool[i]) > DELTA) \
abort (); \
\
test_vmulq_lane_f##size (res2, pool, pool); \
/* Avoid fusing multiplication and subtraction. */ \
asm volatile ("" : :"Q" (res2) : "memory"); \
for (i = 0; i < lanes; i++) \
if (fabs (res2[i] - pool[i]) > DELTA) \
abort (); \
\
test_vmul_laneq_f##size (res3, pool, pool); \
/* Avoid fusing multiplication and subtraction. */ \
asm volatile ("" : :"Q" (res3) : "memory"); \
for (i = 0; i < lanes / 2; i++) \
if (fabs (res3[i] - pool[i]) > DELTA) \
abort (); \
\
test_vmulq_laneq_f##size (res4, pool, pool); \
/* Avoid fusing multiplication and subtraction. */ \
asm volatile ("" : :"Q" (res4) : "memory"); \
for (i = 0; i < lanes; i++) \
if (fabs (res4[i] - pool[i]) > DELTA) \
abort (); \
}
BUILD_TEST (32, 4)
BUILD_TEST (64, 2)
int
main (int argc, char **argv)
{
test_f32 ();
test_f64 ();
return 0;
}
/* vmul_laneq_f32.
vmul_lane_f32. */
/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 2 } } */
/* vmulq_lane_f32.
vmulq_laneq_f32. */
/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 2 } } */
/* vmul_lane_f64. */
/* { dg-final { scan-assembler-times "fmul\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 1 } } */
/* vmul_laneq_f64.
vmulq_lane_f64.
vmulq_laneq_f64. */
/* { dg-final { scan-assembler-times "fmul\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.d\\\[\[0-9\]+\\\]" 3 } } */
/* { dg-final { cleanup-saved-temps } } */
gcc/testsuite/gcc.target/aarch64/mul_intrinsic_1.c (new file):
/* { dg-do run } */
/* { dg-options "-O3 --save-temps" } */
#include <arm_neon.h>
extern void abort (void);
#define MAPs(size, xx) int##size##xx##_t
#define MAPu(size, xx) uint##size##xx##_t
#define TEST_VMUL(q, su, size, in1_lanes, in2_lanes) \
static void \
test_vmulq_lane##q##_##su##size (MAP##su (size, ) * res, \
const MAP##su(size, ) *in1, \
const MAP##su(size, ) *in2) \
{ \
MAP##su (size, x##in1_lanes) a = vld1q_##su##size (in1); \
MAP##su (size, x##in2_lanes) b = vld1##q##_##su##size (in2); \
a = vmulq_lane##q##_##su##size (a, b, 1); \
vst1q_##su##size (res, a); \
}
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
TEST_VMUL (, s, width, n_lanes, n_half_lanes) \
TEST_VMUL (q, s, width, n_lanes, n_lanes) \
TEST_VMUL (, u, width, n_lanes, n_half_lanes) \
TEST_VMUL (q, u, width, n_lanes, n_lanes)
BUILD_VARS (32, 4, 2)
BUILD_VARS (16, 8, 4)
#define POOL4 {0, 1, 2, 3}
#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7}
#define EMPTY4 {0, 0, 0, 0}
#define EMPTY8 {0, 0, 0, 0, 0, 0, 0, 0}
#define BUILD_TEST(su, size, lanes) \
static void \
test_##su##size (void) \
{ \
int i; \
MAP##su (size,) pool[lanes] = POOL##lanes; \
MAP##su (size,) res[lanes] = EMPTY##lanes; \
MAP##su (size,) res2[lanes] = EMPTY##lanes; \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vmulq_lane_##su##size (res, pool, pool); \
for (i = 0; i < lanes; i++) \
if (res[i] != pool[i]) \
abort (); \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vmulq_laneq_##su##size (res2, pool, pool); \
for (i = 0; i < lanes; i++) \
if (res2[i] != pool[i]) \
abort (); \
}
#undef BUILD_VARS
#define BUILD_VARS(size, lanes) \
BUILD_TEST (s, size, lanes) \
BUILD_TEST (u, size, lanes)
BUILD_VARS (32, 4)
BUILD_VARS (16, 8)
int
main (int argc, char **argv)
{
test_s32 ();
test_u32 ();
test_s16 ();
test_u16 ();
return 0;
}
/* { dg-final { scan-assembler-times "mul\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.s\\\[\[0-9\]+\\\]" 4 } } */
/* { dg-final { scan-assembler-times "mul\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.h\\\[\[0-9\]+\\\]" 4 } } */
/* { dg-final { cleanup-saved-temps } } */