Commit 828e70c1 by James Greenhalgh (committed by James Greenhalgh)

[AArch64] Improve arm_neon.h vml<as>_lane handling.
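
Previously these lane intrinsics were inline-asm macros, which GCC treats as opaque: they cannot be scheduled, combined or folded with surrounding code. They are now plain C on top of a handful of new combine patterns. As a condensed before/after drawn from the hunks below, vfma_lane_f32 changes from

#define vfma_lane_f32(a, b, c, d)                    \
  __extension__                                      \
    ({ /* ... */                                     \
       __asm__ ("fmla %0.2s,%2.2s,%3.s[%4]"          \
                : "=w"(result)                       \
                : "0"(a_), "w"(b_), "w"(c_), "i"(d)  \
                : /* No clobbers */);                \
       result;                                       \
    })

to

__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vfma_lane_f32 (float32x2_t __a, float32x2_t __b,
               float32x2_t __c, const int __lane)
{
  return __builtin_aarch64_fmav2sf (__b,
                                    __aarch64_vdup_lane_f32 (__c, __lane),
                                    __a);
}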

gcc/
	* config/aarch64/aarch64-simd-builtins.def (fma): New.
	* config/aarch64/aarch64-simd.md
	(aarch64_mla_elt<mode>): New.
	(aarch64_mla_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_mls_elt<mode>): Likewise.
	(aarch64_mls_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_fma4_elt<mode>): Likewise.
	(aarch64_fma4_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_fma4_elt_to_128df): Likewise.
	(aarch64_fma4_elt_to_64v2df): Likewise.
	(fnma<mode>4): Likewise.
	(aarch64_fnma4_elt<mode>): Likewise.
	(aarch64_fnma4_elt_<vswap_width_name><mode>): Likewise.
	(aarch64_fnma4_elt_to_128df): Likewise.
	(aarch64_fnma4_elt_to_64v2df): Likewise.
	* config/aarch64/iterators.md (VDQSF): New.
	* config/aarch64/arm_neon.h
	(vfm<as><sdq>_lane<q>_f<32, 64>): Convert to C implementation.
	(vml<sa><q>_lane<q>_<fsu><16, 32, 64>): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/fmla-intrinsic.c: New.
	* gcc.target/aarch64/mla-intrinsic.c: Likewise.
	* gcc.target/aarch64/fmls-intrinsic.c: Likewise.
	* gcc.target/aarch64/mls-intrinsic.c: Likewise.

From-SVN: r202625
parent 779aea46
gcc/ChangeLog
2013-09-16 James Greenhalgh <james.greenhalgh@arm.com>
* config/aarch64/aarch64-simd-builtins.def (fma): New.
* config/aarch64/aarch64-simd.md
(aarch64_mla_elt<mode>): New.
(aarch64_mla_elt_<vswap_width_name><mode>): Likewise.
(aarch64_mls_elt<mode>): Likewise.
(aarch64_mls_elt_<vswap_width_name><mode>): Likewise.
(aarch64_fma4_elt<mode>): Likewise.
(aarch64_fma4_elt_<vswap_width_name><mode>): Likewise.
(aarch64_fma4_elt_to_128df): Likewise.
(aarch64_fma4_elt_to_64v2df): Likewise.
(fnma<mode>4): Likewise.
(aarch64_fnma4_elt<mode>): Likewise.
(aarch64_fnma4_elt_<vswap_width_name><mode>): Likewise.
(aarch64_fnma4_elt_to_128df): Likewise.
(aarch64_fnma4_elt_to_64v2df): Likewise.
* config/aarch64/iterators.md (VDQSF): New.
* config/aarch64/arm_neon.h
(vfm<as><sdq>_lane<q>_f<32, 64>): Convert to C implementation.
(vml<sa><q>_lane<q>_<fsu><16, 32, 64>): Likewise.
2013-09-16 James Greenhalgh <james.greenhalgh@arm.com>
* config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New.
(aarch64_mul3_elt_<vswap_width_name><mode>): Likewise.
(aarch64_mul3_elt_to_128df): Likewise.
gcc/config/aarch64/aarch64-simd-builtins.def
@@ -359,3 +359,6 @@
/* Implemented by aarch64_st1<VALL:mode>. */
BUILTIN_VALL (STORE1, st1, 0)
/* Implemented by fma<mode>4. */
BUILTIN_VDQF (TERNOP, fma, 4)
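This one line instantiates the builtin across the VDQF modes (V2SF, V4SF, V2DF; see the iterators.md hunk below), giving __builtin_aarch64_fmav2sf, __builtin_aarch64_fmav4sf and __builtin_aarch64_fmav2df, which the rewritten header calls directly. A minimal sketch of the operand convention (function name invented; assumes an aarch64 GCC with this patch applied):

#include <arm_neon.h>

/* a + b * c, expressed through the new builtin exactly as the
   rewritten vfma_f32/vfma_lane_f32 implementations do.  */
float32x2_t
fma_builtin_sketch (float32x2_t a, float32x2_t b, float32x2_t c)
{
  return __builtin_aarch64_fmav2sf (b, c, a);
}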
gcc/config/aarch64/aarch64-simd.md
@@ -1070,6 +1070,38 @@
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_mla_elt<mode>"
[(set (match_operand:VDQHS 0 "register_operand" "=w")
(plus:VDQHS
(mult:VDQHS
(vec_duplicate:VDQHS
(vec_select:<VEL>
(match_operand:VDQHS 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQHS 3 "register_operand" "w"))
(match_operand:VDQHS 4 "register_operand" "0")))]
"TARGET_SIMD"
"mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_mla")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_mla_elt_<vswap_width_name><mode>"
[(set (match_operand:VDQHS 0 "register_operand" "=w")
(plus:VDQHS
(mult:VDQHS
(vec_duplicate:VDQHS
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQHS 3 "register_operand" "w"))
(match_operand:VDQHS 4 "register_operand" "0")))]
"TARGET_SIMD"
"mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_mla")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "aarch64_mls<mode>"
[(set (match_operand:VQ_S 0 "register_operand" "=w")
(minus:VQ_S (match_operand:VQ_S 1 "register_operand" "0")
@@ -1081,6 +1113,38 @@
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_mls_elt<mode>"
[(set (match_operand:VDQHS 0 "register_operand" "=w")
(minus:VDQHS
(match_operand:VDQHS 4 "register_operand" "0")
(mult:VDQHS
(vec_duplicate:VDQHS
(vec_select:<VEL>
(match_operand:VDQHS 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQHS 3 "register_operand" "w"))))]
"TARGET_SIMD"
"mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_mla")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_mls_elt_<vswap_width_name><mode>"
[(set (match_operand:VDQHS 0 "register_operand" "=w")
(minus:VDQHS
(match_operand:VDQHS 4 "register_operand" "0")
(mult:VDQHS
(vec_duplicate:VDQHS
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQHS 3 "register_operand" "w"))))]
"TARGET_SIMD"
"mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_mla")
(set_attr "simd_mode" "<MODE>")]
)
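The leading "*" on these four patterns means they have no expander: they exist purely for combine, so that the open-coded a +/- b * c[lane] arithmetic in the rewritten vmla/vmls lane intrinsics (see the arm_neon.h hunks below) collapses back into one indexed instruction. A hedged sketch of the intended effect (function name invented):

#include <arm_neon.h>

int16x4_t
mla_lane_sketch (int16x4_t a, int16x4_t b, int16x4_t c)
{
  /* Now plain C: a + b * c[1].  Combine should match
     *aarch64_mla_elt<mode> and emit a single
     mla v0.4h, v1.4h, v2.4h[1]; mla-intrinsic.c below checks the
     8h and 4s variants of exactly this.  */
  return vmla_lane_s16 (a, b, c, 1);
}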
;; Max/Min operations.
(define_insn "<su><maxmin><mode>3"
[(set (match_operand:VQ_S 0 "register_operand" "=w")
@@ -1483,6 +1547,137 @@
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_fma4_elt<mode>"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(fma:VDQF
(vec_duplicate:VDQF
(vec_select:<VEL>
(match_operand:VDQF 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQF 3 "register_operand" "w")
(match_operand:VDQF 4 "register_operand" "0")))]
"TARGET_SIMD"
"fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_fma4_elt_<vswap_width_name><mode>"
[(set (match_operand:VDQSF 0 "register_operand" "=w")
(fma:VDQSF
(vec_duplicate:VDQSF
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQSF 3 "register_operand" "w")
(match_operand:VDQSF 4 "register_operand" "0")))]
"TARGET_SIMD"
"fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_fma4_elt_to_128df"
[(set (match_operand:V2DF 0 "register_operand" "=w")
(fma:V2DF
(vec_duplicate:V2DF
(match_operand:DF 1 "register_operand" "w"))
(match_operand:V2DF 2 "register_operand" "w")
(match_operand:V2DF 3 "register_operand" "0")))]
"TARGET_SIMD"
"fmla\\t%0.2d, %2.2d, %1.2d[0]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "V2DF")]
)
(define_insn "*aarch64_fma4_elt_to_64v2df"
[(set (match_operand:DF 0 "register_operand" "=w")
(fma:DF
(vec_select:DF
(match_operand:V2DF 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand")]))
(match_operand:DF 3 "register_operand" "w")
(match_operand:DF 4 "register_operand" "0")))]
"TARGET_SIMD"
"fmla\\t%0.2d, %3.2d, %1.2d[%2]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "V2DF")]
)
(define_insn "fnma<mode>4"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(fma:VDQF
(match_operand:VDQF 1 "register_operand" "w")
(neg:VDQF
(match_operand:VDQF 2 "register_operand" "w"))
(match_operand:VDQF 3 "register_operand" "0")))]
"TARGET_SIMD"
"fmls\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
[(set_attr "simd_type" "simd_fmla")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_fnma4_elt<mode>"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(fma:VDQF
(neg:VDQF
(match_operand:VDQF 3 "register_operand" "w"))
(vec_duplicate:VDQF
(vec_select:<VEL>
(match_operand:VDQF 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQF 4 "register_operand" "0")))]
"TARGET_SIMD"
"fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_fnma4_elt_<vswap_width_name><mode>"
[(set (match_operand:VDQSF 0 "register_operand" "=w")
(fma:VDQSF
(neg:VDQSF
(match_operand:VDQSF 3 "register_operand" "w"))
(vec_duplicate:VDQSF
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
(parallel [(match_operand:SI 2 "immediate_operand")])))
(match_operand:VDQSF 4 "register_operand" "0")))]
"TARGET_SIMD"
"fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "*aarch64_fnma4_elt_to_128df"
[(set (match_operand:V2DF 0 "register_operand" "=w")
(fma:V2DF
(neg:V2DF
(match_operand:V2DF 2 "register_operand" "w"))
(vec_duplicate:V2DF
(match_operand:DF 1 "register_operand" "w"))
(match_operand:V2DF 3 "register_operand" "0")))]
"TARGET_SIMD"
"fmls\\t%0.2d, %2.2d, %1.2d[0]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "V2DF")]
)
(define_insn "*aarch64_fnma4_elt_to_64v2df"
[(set (match_operand:DF 0 "register_operand" "=w")
(fma:DF
(vec_select:DF
(match_operand:V2DF 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand")]))
(neg:DF
(match_operand:DF 3 "register_operand" "w"))
(match_operand:DF 4 "register_operand" "0")))]
"TARGET_SIMD"
"fmls\\t%0.2d, %3.2d, %1.2d[%2]"
[(set_attr "simd_type" "simd_fmla_elt")
(set_attr "simd_mode" "V2DF")]
)
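The fnma<mode>4 expander exposes a negated-multiplicand FMA under its standard name, and the starred fmls patterns again serve combine. The sign convention matters: the new vfms* implementations negate the multiplicand, not the accumulator. A sketch (function name invented):

#include <arm_neon.h>

float32x2_t
fms_lane_sketch (float32x2_t a, float32x2_t b, float32x2_t c)
{
  /* Written in arm_neon.h as fma (-b, c[0], a), i.e. a - b * c[0],
     which combine should match against the (fma (neg ...)) patterns
     above and emit fmls v0.2s, v1.2s, v2.2s[0].  */
  return vfms_lane_f32 (a, b, c, 0);
}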
;; Vector versions of the floating-point frint patterns.
;; Expands to btrunc, ceil, floor, nearbyint, rint, round.
(define_insn "<frint_pattern><mode>2"
gcc/config/aarch64/arm_neon.h
@@ -6100,33 +6100,6 @@ vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
return result;
}
#define vfma_lane_f32(a, b, c, d) \
__extension__ \
({ \
float32x2_t c_ = (c); \
float32x2_t b_ = (b); \
float32x2_t a_ = (a); \
float32x2_t result; \
__asm__ ("fmla %0.2s,%2.2s,%3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vfmad_lane_f64(a, b, c) \
__extension__ \
({ \
float64x2_t b_ = (b); \
float64_t a_ = (a); \
float64_t result; \
__asm__ ("fmla %d0,%d1,%2.d[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
{
@@ -6149,47 +6122,6 @@ vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
return result;
}
#define vfmaq_lane_f32(a, b, c, d) \
__extension__ \
({ \
float32x4_t c_ = (c); \
float32x4_t b_ = (b); \
float32x4_t a_ = (a); \
float32x4_t result; \
__asm__ ("fmla %0.4s,%2.4s,%3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vfmaq_lane_f64(a, b, c, d) \
__extension__ \
({ \
float64x2_t c_ = (c); \
float64x2_t b_ = (b); \
float64x2_t a_ = (a); \
float64x2_t result; \
__asm__ ("fmla %0.2d,%2.2d,%3.d[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vfmas_lane_f32(a, b, c) \
__extension__ \
({ \
float32x4_t b_ = (b); \
float32_t a_ = (a); \
float32_t result; \
__asm__ ("fmla %s0,%s1,%2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vfma_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
{
@@ -6234,19 +6166,6 @@ vfms_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
return result;
}
#define vfmsd_lane_f64(a, b, c) \
__extension__ \
({ \
float64x2_t b_ = (b); \
float64_t a_ = (a); \
float64_t result; \
__asm__ ("fmls %d0,%d1,%2.d[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vfmsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
{
@@ -6269,19 +6188,6 @@ vfmsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
return result;
}
#define vfmss_lane_f32(a, b, c) \
__extension__ \
({ \
float32x4_t b_ = (b); \
float32_t a_ = (a); \
float32_t result; \
__asm__ ("fmls %s0,%s1,%2.s[%3]" \
: "=w"(result) \
: "w"(a_), "w"(b_), "i"(c) \
: /* No clobbers */); \
result; \
})
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vget_high_f32 (float32x4_t a)
{
@@ -7122,133 +7028,6 @@ vld1q_dup_u64 (const uint64_t * a)
result; \
})
#define vmla_lane_f32(a, b, c, d) \
__extension__ \
({ \
float32x2_t c_ = (c); \
float32x2_t b_ = (b); \
float32x2_t a_ = (a); \
float32x2_t result; \
float32x2_t t1; \
__asm__ ("fmul %1.2s, %3.2s, %4.s[%5]; fadd %0.2s, %0.2s, %1.2s" \
: "=w"(result), "=w"(t1) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmla_lane_s16(a, b, c, d) \
__extension__ \
({ \
int16x4_t c_ = (c); \
int16x4_t b_ = (b); \
int16x4_t a_ = (a); \
int16x4_t result; \
__asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmla_lane_s32(a, b, c, d) \
__extension__ \
({ \
int32x2_t c_ = (c); \
int32x2_t b_ = (b); \
int32x2_t a_ = (a); \
int32x2_t result; \
__asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmla_lane_u16(a, b, c, d) \
__extension__ \
({ \
uint16x4_t c_ = (c); \
uint16x4_t b_ = (b); \
uint16x4_t a_ = (a); \
uint16x4_t result; \
__asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmla_lane_u32(a, b, c, d) \
__extension__ \
({ \
uint32x2_t c_ = (c); \
uint32x2_t b_ = (b); \
uint32x2_t a_ = (a); \
uint32x2_t result; \
__asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmla_laneq_s16(a, b, c, d) \
__extension__ \
({ \
int16x8_t c_ = (c); \
int16x4_t b_ = (b); \
int16x4_t a_ = (a); \
int16x4_t result; \
__asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmla_laneq_s32(a, b, c, d) \
__extension__ \
({ \
int32x4_t c_ = (c); \
int32x2_t b_ = (b); \
int32x2_t a_ = (a); \
int32x2_t result; \
__asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmla_laneq_u16(a, b, c, d) \
__extension__ \
({ \
uint16x8_t c_ = (c); \
uint16x4_t b_ = (b); \
uint16x4_t a_ = (a); \
uint16x4_t result; \
__asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmla_laneq_u32(a, b, c, d) \
__extension__ \
({ \
uint32x4_t c_ = (c); \
uint32x2_t b_ = (b); \
uint32x2_t a_ = (a); \
uint32x2_t result; \
__asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
{
@@ -7815,133 +7594,6 @@ vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
return result;
}
#define vmlaq_lane_f32(a, b, c, d) \
__extension__ \
({ \
float32x4_t c_ = (c); \
float32x4_t b_ = (b); \
float32x4_t a_ = (a); \
float32x4_t result; \
float32x4_t t1; \
__asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fadd %0.4s, %0.4s, %1.4s" \
: "=w"(result), "=w"(t1) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlaq_lane_s16(a, b, c, d) \
__extension__ \
({ \
int16x8_t c_ = (c); \
int16x8_t b_ = (b); \
int16x8_t a_ = (a); \
int16x8_t result; \
__asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlaq_lane_s32(a, b, c, d) \
__extension__ \
({ \
int32x4_t c_ = (c); \
int32x4_t b_ = (b); \
int32x4_t a_ = (a); \
int32x4_t result; \
__asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlaq_lane_u16(a, b, c, d) \
__extension__ \
({ \
uint16x8_t c_ = (c); \
uint16x8_t b_ = (b); \
uint16x8_t a_ = (a); \
uint16x8_t result; \
__asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlaq_lane_u32(a, b, c, d) \
__extension__ \
({ \
uint32x4_t c_ = (c); \
uint32x4_t b_ = (b); \
uint32x4_t a_ = (a); \
uint32x4_t result; \
__asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlaq_laneq_s16(a, b, c, d) \
__extension__ \
({ \
int16x8_t c_ = (c); \
int16x8_t b_ = (b); \
int16x8_t a_ = (a); \
int16x8_t result; \
__asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlaq_laneq_s32(a, b, c, d) \
__extension__ \
({ \
int32x4_t c_ = (c); \
int32x4_t b_ = (b); \
int32x4_t a_ = (a); \
int32x4_t result; \
__asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlaq_laneq_u16(a, b, c, d) \
__extension__ \
({ \
uint16x8_t c_ = (c); \
uint16x8_t b_ = (b); \
uint16x8_t a_ = (a); \
uint16x8_t result; \
__asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlaq_laneq_u32(a, b, c, d) \
__extension__ \
({ \
uint32x4_t c_ = (c); \
uint32x4_t b_ = (b); \
uint32x4_t a_ = (a); \
uint32x4_t result; \
__asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
{
@@ -8046,106 +7698,35 @@ vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
{
uint8x16_t result;
__asm__ ("mla %0.16b, %2.16b, %3.16b"
: "=w"(result)
: "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
return result;
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
{
uint16x8_t result;
__asm__ ("mla %0.8h, %2.8h, %3.8h"
: "=w"(result)
: "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
return result;
}
__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
{
uint32x4_t result;
__asm__ ("mla %0.4s, %2.4s, %3.4s"
: "=w"(result)
: "0"(a), "w"(b), "w"(c)
: /* No clobbers */);
return result;
}
#define vmls_lane_f32(a, b, c, d) \
__extension__ \
({ \
float32x2_t c_ = (c); \
float32x2_t b_ = (b); \
float32x2_t a_ = (a); \
float32x2_t result; \
float32x2_t t1; \
__asm__ ("fmul %1.2s, %3.2s, %4.s[%5]; fsub %0.2s, %0.2s, %1.2s" \
: "=w"(result), "=w"(t1) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmls_lane_s16(a, b, c, d) \
__extension__ \
({ \
int16x4_t c_ = (c); \
int16x4_t b_ = (b); \
int16x4_t a_ = (a); \
int16x4_t result; \
__asm__ ("mls %0.4h,%2.4h,%3.h[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmls_lane_s32(a, b, c, d) \
__extension__ \
({ \
int32x2_t c_ = (c); \
int32x2_t b_ = (b); \
int32x2_t a_ = (a); \
int32x2_t result; \
__asm__ ("mls %0.2s,%2.2s,%3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmls_lane_u16(a, b, c, d) \
__extension__ \
({ \
uint16x4_t c_ = (c); \
uint16x4_t b_ = (b); \
uint16x4_t a_ = (a); \
uint16x4_t result; \
__asm__ ("mls %0.4h,%2.4h,%3.h[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmls_lane_u32(a, b, c, d) \
__extension__ \
({ \
uint32x2_t c_ = (c); \
uint32x2_t b_ = (b); \
uint32x2_t a_ = (a); \
uint32x2_t result; \
__asm__ ("mls %0.2s,%2.2s,%3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
@@ -8713,148 +8294,6 @@ vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
return result;
}
#define vmlsq_lane_f32(a, b, c, d) \
__extension__ \
({ \
float32x4_t c_ = (c); \
float32x4_t b_ = (b); \
float32x4_t a_ = (a); \
float32x4_t result; \
float32x4_t t1; \
__asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s" \
: "=w"(result), "=w"(t1) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlsq_lane_s16(a, b, c, d) \
__extension__ \
({ \
int16x8_t c_ = (c); \
int16x8_t b_ = (b); \
int16x8_t a_ = (a); \
int16x8_t result; \
__asm__ ("mls %0.8h,%2.8h,%3.h[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlsq_lane_s32(a, b, c, d) \
__extension__ \
({ \
int32x4_t c_ = (c); \
int32x4_t b_ = (b); \
int32x4_t a_ = (a); \
int32x4_t result; \
__asm__ ("mls %0.4s,%2.4s,%3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlsq_lane_u16(a, b, c, d) \
__extension__ \
({ \
uint16x8_t c_ = (c); \
uint16x8_t b_ = (b); \
uint16x8_t a_ = (a); \
uint16x8_t result; \
__asm__ ("mls %0.8h,%2.8h,%3.h[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "x"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlsq_lane_u32(a, b, c, d) \
__extension__ \
({ \
uint32x4_t c_ = (c); \
uint32x4_t b_ = (b); \
uint32x4_t a_ = (a); \
uint32x4_t result; \
__asm__ ("mls %0.4s,%2.4s,%3.s[%4]" \
: "=w"(result) \
: "0"(a_), "w"(b_), "w"(c_), "i"(d) \
: /* No clobbers */); \
result; \
})
#define vmlsq_laneq_f32(__a, __b, __c, __d) \
__extension__ \
({ \
float32x4_t __c_ = (__c); \
float32x4_t __b_ = (__b); \
float32x4_t __a_ = (__a); \
float32x4_t __result; \
float32x4_t __t1; \
__asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s" \
: "=w"(__result), "=w"(__t1) \
: "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \
: /* No clobbers */); \
__result; \
})
#define vmlsq_laneq_s16(__a, __b, __c, __d) \
__extension__ \
({ \
int16x8_t __c_ = (__c); \
int16x8_t __b_ = (__b); \
int16x8_t __a_ = (__a); \
int16x8_t __result; \
__asm__ ("mls %0.8h, %2.8h, %3.h[%4]" \
: "=w"(__result) \
: "0"(__a_), "w"(__b_), "x"(__c_), "i"(__d) \
: /* No clobbers */); \
__result; \
})
#define vmlsq_laneq_s32(__a, __b, __c, __d) \
__extension__ \
({ \
int32x4_t __c_ = (__c); \
int32x4_t __b_ = (__b); \
int32x4_t __a_ = (__a); \
int32x4_t __result; \
__asm__ ("mls %0.4s, %2.4s, %3.s[%4]" \
: "=w"(__result) \
: "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \
: /* No clobbers */); \
__result; \
})
#define vmlsq_laneq_u16(__a, __b, __c, __d) \
__extension__ \
({ \
uint16x8_t __c_ = (__c); \
uint16x8_t __b_ = (__b); \
uint16x8_t __a_ = (__a); \
uint16x8_t __result; \
__asm__ ("mls %0.8h, %2.8h, %3.h[%4]" \
: "=w"(__result) \
: "0"(__a_), "w"(__b_), "x"(__c_), "i"(__d) \
: /* No clobbers */); \
__result; \
})
#define vmlsq_laneq_u32(__a, __b, __c, __d) \
__extension__ \
({ \
uint32x4_t __c_ = (__c); \
uint32x4_t __b_ = (__b); \
uint32x4_t __a_ = (__a); \
uint32x4_t __result; \
__asm__ ("mls %0.4s, %2.4s, %3.s[%4]" \
: "=w"(__result) \
: "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \
: /* No clobbers */); \
__result; \
})
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
{
@@ -19488,130 +18927,334 @@ vduph_lane_p16 (poly16x4_t __a, const int __b)
return __aarch64_vget_lane_p16 (__a, __b);
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vduph_lane_s16 (int16x4_t __a, const int __b)
{
return __aarch64_vget_lane_s16 (__a, __b);
}
__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
vduph_lane_u16 (uint16x4_t __a, const int __b)
{
return __aarch64_vget_lane_u16 (__a, __b);
}
/* vdups_lane */
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vdups_lane_f32 (float32x2_t __a, const int __b)
{
return __aarch64_vget_lane_f32 (__a, __b);
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vdups_lane_s32 (int32x2_t __a, const int __b)
{
return __aarch64_vget_lane_s32 (__a, __b);
}
__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
vdups_lane_u32 (uint32x2_t __a, const int __b)
{
return __aarch64_vget_lane_u32 (__a, __b);
}
/* vdupd_lane */
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vdupd_lane_f64 (float64x1_t __a, const int __attribute__ ((unused)) __b)
{
return __a;
}
__extension__ static __inline int64_t __attribute__ ((__always_inline__))
vdupd_lane_s64 (int64x1_t __a, const int __attribute__ ((unused)) __b)
{
return __a;
}
__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
vdupd_lane_u64 (uint64x1_t __a, const int __attribute__ ((unused)) __b)
{
return __a;
}
/* vdupb_laneq */
__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
vdupb_laneq_p8 (poly8x16_t __a, const int __b)
{
return __aarch64_vgetq_lane_p8 (__a, __b);
}
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vdupb_laneq_s8 (int8x16_t __a, const int __attribute__ ((unused)) __b)
{
return __aarch64_vgetq_lane_s8 (__a, __b);
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
vdupb_laneq_u8 (uint8x16_t __a, const int __b)
{
return __aarch64_vgetq_lane_u8 (__a, __b);
}
/* vduph_laneq */
__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
vduph_laneq_p16 (poly16x8_t __a, const int __b)
{
return __aarch64_vgetq_lane_p16 (__a, __b);
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vduph_laneq_s16 (int16x8_t __a, const int __b)
{
return __aarch64_vgetq_lane_s16 (__a, __b);
}
__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
vduph_laneq_u16 (uint16x8_t __a, const int __b)
{
return __aarch64_vgetq_lane_u16 (__a, __b);
}
/* vdups_laneq */
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vdups_laneq_f32 (float32x4_t __a, const int __b)
{
return __aarch64_vgetq_lane_f32 (__a, __b);
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vdups_laneq_s32 (int32x4_t __a, const int __b)
{
return __aarch64_vgetq_lane_s32 (__a, __b);
}
__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
vdups_laneq_u32 (uint32x4_t __a, const int __b)
{
return __aarch64_vgetq_lane_u32 (__a, __b);
}
/* vdupd_laneq */
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vdupd_laneq_f64 (float64x2_t __a, const int __b)
{
return __aarch64_vgetq_lane_f64 (__a, __b);
}
__extension__ static __inline int64_t __attribute__ ((__always_inline__))
vdupd_laneq_s64 (int64x2_t __a, const int __b)
{
return __aarch64_vgetq_lane_s64 (__a, __b);
}
__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
vdupd_laneq_u64 (uint64x2_t __a, const int __b)
{
return __aarch64_vgetq_lane_u64 (__a, __b);
}
/* vfma_lane */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vfma_lane_f32 (float32x2_t __a, float32x2_t __b,
float32x2_t __c, const int __lane)
{
return __builtin_aarch64_fmav2sf (__b,
__aarch64_vdup_lane_f32 (__c, __lane),
__a);
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vfma_lane_f64 (float64_t __a, float64_t __b,
float64_t __c, const int __lane)
{
return __builtin_fma (__b, __c, __a);
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vfmad_lane_f64 (float64_t __a, float64_t __b,
float64_t __c, const int __lane)
{
return __builtin_fma (__b, __c, __a);
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vfmas_lane_f32 (float32_t __a, float32_t __b,
float32x2_t __c, const int __lane)
{
return __builtin_fmaf (__b, __aarch64_vget_lane_f32 (__c, __lane), __a);
}
/* vfma_laneq */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vfma_laneq_f32 (float32x2_t __a, float32x2_t __b,
float32x4_t __c, const int __lane)
{
return __builtin_aarch64_fmav2sf (__b,
__aarch64_vdup_laneq_f32 (__c, __lane),
__a);
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vfma_laneq_f64 (float64_t __a, float64_t __b,
float64x2_t __c, const int __lane)
{
return __builtin_fma (__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a);
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vfmad_laneq_f64 (float64_t __a, float64_t __b,
float64x2_t __c, const int __lane)
{
return __builtin_fma (__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a);
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vfmas_laneq_f32 (float32_t __a, float32_t __b,
float32x4_t __c, const int __lane)
{
return __builtin_fmaf (__b, __aarch64_vgetq_lane_f32 (__c, __lane), __a);
}
/* vfmaq_lane */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vfmaq_lane_f32 (float32x4_t __a, float32x4_t __b,
float32x2_t __c, const int __lane)
{
return __builtin_aarch64_fmav4sf (__b,
__aarch64_vdupq_lane_f32 (__c, __lane),
__a);
}
__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
vfmaq_lane_f64 (float64x2_t __a, float64x2_t __b,
float64_t __c, const int __lane)
{
return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a);
}
/* vfmaq_laneq */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vfmaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
float32x4_t __c, const int __lane)
{
return __builtin_aarch64_fmav4sf (__b,
__aarch64_vdupq_laneq_f32 (__c, __lane),
__a);
}
__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
vfmaq_laneq_f64 (float64x2_t __a, float64x2_t __b,
float64x2_t __c, const int __lane)
{
return __builtin_aarch64_fmav2df (__b,
__aarch64_vdupq_laneq_f64 (__c, __lane),
__a);
}
/* vfms_lane */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vfms_lane_f32 (float32x2_t __a, float32x2_t __b,
float32x2_t __c, const int __lane)
{
return __builtin_aarch64_fmav2sf (-__b,
__aarch64_vdup_lane_f32 (__c, __lane),
__a);
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vfms_lane_f64 (float64_t __a, float64_t __b,
float64_t __c, const int __lane)
{
return __builtin_fma (-__b, __c, __a);
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vfmsd_lane_f64 (float64_t __a, float64_t __b,
float64_t __c, const int __lane)
{
return __builtin_fma (-__b, __c, __a);
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vfmss_lane_f32 (float32_t __a, float32_t __b,
float32x2_t __c, const int __lane)
{
return __builtin_fmaf (-__b, __aarch64_vget_lane_f32 (__c, __lane), __a);
}
/* vfms_laneq */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vfms_laneq_f32 (float32x2_t __a, float32x2_t __b,
float32x4_t __c, const int __lane)
{
return __builtin_aarch64_fmav2sf (-__b,
__aarch64_vdup_laneq_f32 (__c, __lane),
__a);
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vfms_laneq_f64 (float64_t __a, float64_t __b,
float64x2_t __c, const int __lane)
{
return __builtin_fma (-__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a);
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vfmsd_laneq_f64 (float64_t __a, float64_t __b,
float64x2_t __c, const int __lane)
{
return __builtin_fma (-__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a);
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vfmss_laneq_f32 (float32_t __a, float32_t __b,
float32x4_t __c, const int __lane)
{
return __builtin_fmaf (-__b, __aarch64_vgetq_lane_f32 (__c, __lane), __a);
}
/* vfmsq_lane */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b,
float32x2_t __c, const int __lane)
{
return __builtin_aarch64_fmav4sf (-__b,
__aarch64_vdupq_lane_f32 (__c, __lane),
__a);
}
__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b,
float64_t __c, const int __lane)
{
return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a);
}
/* vfmsq_laneq */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
float32x4_t __c, const int __lane)
{
return __builtin_aarch64_fmav4sf (-__b,
__aarch64_vdupq_laneq_f32 (__c, __lane),
__a);
}
__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
float64x2_t __c, const int __lane)
{
return __builtin_aarch64_fmav2df (-__b,
__aarch64_vdupq_laneq_f64 (__c, __lane),
__a);
}
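Two details in this block are easy to miss. First, the scalar variants go through the generic __builtin_fma/__builtin_fmaf rather than an AArch64-specific builtin, so the plain _lane_f64 case can become a scalar fmadd/fmsub. Second, as noted above, the subtraction lives in the negated multiplicand. A sketch (function name invented):

#include <arm_neon.h>

float64_t
fms_scalar_sketch (float64_t a, float64_t b, float64_t c)
{
  /* __builtin_fma (-b, c, a) == a - b * c; fmls-intrinsic.c below
     expects this to become a single fmsub.  */
  return vfms_lane_f64 (a, b, c, 0);
}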
/* vld1 */
@@ -21131,6 +20774,156 @@ vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
return a + b * c;
}
/* vmla_lane */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmla_lane_f32 (float32x2_t __a, float32x2_t __b,
float32x2_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vget_lane_f32 (__c, __lane)));
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vmla_lane_s16 (int16x4_t __a, int16x4_t __b,
int16x4_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vget_lane_s16 (__c, __lane)));
}
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vmla_lane_s32 (int32x2_t __a, int32x2_t __b,
int32x2_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vget_lane_s32 (__c, __lane)));
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b,
uint16x4_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vget_lane_u16 (__c, __lane)));
}
__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b,
uint32x2_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vget_lane_u32 (__c, __lane)));
}
/* vmla_laneq */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmla_laneq_f32 (float32x2_t __a, float32x2_t __b,
float32x4_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vgetq_lane_f32 (__c, __lane)));
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vmla_laneq_s16 (int16x4_t __a, int16x4_t __b,
int16x8_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vgetq_lane_s16 (__c, __lane)));
}
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vmla_laneq_s32 (int32x2_t __a, int32x2_t __b,
int32x4_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vgetq_lane_s32 (__c, __lane)));
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
uint16x8_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vgetq_lane_u16 (__c, __lane)));
}
__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
uint32x4_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vgetq_lane_u32 (__c, __lane)));
}
/* vmlaq_lane */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b,
float32x2_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vget_lane_f32 (__c, __lane)));
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b,
int16x4_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vget_lane_s16 (__c, __lane)));
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b,
int32x2_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vget_lane_s32 (__c, __lane)));
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
uint16x4_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vget_lane_u16 (__c, __lane)));
}
__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
uint32x2_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vget_lane_u32 (__c, __lane)));
}
/* vmlaq_laneq */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
float32x4_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vgetq_lane_f32 (__c, __lane)));
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b,
int16x8_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vgetq_lane_s16 (__c, __lane)));
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b,
int32x4_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vgetq_lane_s32 (__c, __lane)));
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
uint16x8_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vgetq_lane_u16 (__c, __lane)));
}
__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
uint32x4_t __c, const int __lane)
{
return (__a + (__b * __aarch64_vgetq_lane_u32 (__c, __lane)));
}
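None of these vmla* lane forms needs a builtin any more: the integer variants are ordinary vector arithmetic that the *aarch64_mla_elt* combine patterns added above turn back into an indexed mla, while the float variants use the same open-coded form and take whatever mul/add or contraction the compiler's FP rules allow. For instance (sketch, function name invented):

#include <arm_neon.h>

uint32x4_t
mlaq_lane_sketch (uint32x4_t a, uint32x4_t b, uint32x2_t c)
{
  /* a + b * c[1], with the lane taken from a 64-bit vector; matched by
     *aarch64_mla_elt_<vswap_width_name><mode> and emitted as
     mla v0.4s, v1.4s, v2.4s[1] (see mla-intrinsic.c below).  */
  return vmlaq_lane_u32 (a, b, c, 1);
}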
/* vmls */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
{
@@ -21149,6 +20942,153 @@ vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
return a - b * c;
}
/* vmls_lane */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmls_lane_f32 (float32x2_t __a, float32x2_t __b,
float32x2_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vget_lane_f32 (__c, __lane)));
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vmls_lane_s16 (int16x4_t __a, int16x4_t __b,
int16x4_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vget_lane_s16 (__c, __lane)));
}
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vmls_lane_s32 (int32x2_t __a, int32x2_t __b,
int32x2_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vget_lane_s32 (__c, __lane)));
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b,
uint16x4_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vget_lane_u16 (__c, __lane)));
}
__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b,
uint32x2_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vget_lane_u32 (__c, __lane)));
}
/* vmls_laneq */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmls_laneq_f32 (float32x2_t __a, float32x2_t __b,
float32x4_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vgetq_lane_f32 (__c, __lane)));
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vmls_laneq_s16 (int16x4_t __a, int16x4_t __b,
int16x8_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vgetq_lane_s16 (__c, __lane)));
}
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vmls_laneq_s32 (int32x2_t __a, int32x2_t __b,
int32x4_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vgetq_lane_s32 (__c, __lane)));
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
uint16x8_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vgetq_lane_u16 (__c, __lane)));
}
__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
uint32x4_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vgetq_lane_u32 (__c, __lane)));
}
/* vmlsq_lane */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b,
float32x2_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vget_lane_f32 (__c, __lane)));
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b,
int16x4_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vget_lane_s16 (__c, __lane)));
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b,
int32x2_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vget_lane_s32 (__c, __lane)));
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
uint16x4_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vget_lane_u16 (__c, __lane)));
}
__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
uint32x2_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vget_lane_u32 (__c, __lane)));
}
/* vmlsq_laneq */
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
float32x4_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vgetq_lane_f32 (__c, __lane)));
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b,
int16x8_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vgetq_lane_s16 (__c, __lane)));
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b,
int32x4_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vgetq_lane_s32 (__c, __lane)));
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
uint16x8_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vgetq_lane_u16 (__c, __lane)));
}
__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
uint32x4_t __c, const int __lane)
{
return (__a - (__b * __aarch64_vgetq_lane_u32 (__c, __lane)));
}
/* vmul_lane */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
gcc/config/aarch64/iterators.md
@@ -89,6 +89,9 @@
;; Vector Float modes.
(define_mode_iterator VDQF [V2SF V4SF V2DF])
;; Vector single Float modes.
(define_mode_iterator VDQSF [V2SF V4SF])
;; Modes suitable to use as the return type of a vcond expression.
(define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])
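One note on the new iterator: VDQSF deliberately omits V2DF. The <vswap_width_name> fma/fnma patterns above pair each single-precision mode with its other-width partner, so a V2SF operation can take its lane from a V4SF register (vfma_laneq_f32) and a V4SF operation from a V2SF register (vfmaq_lane_f32); V2DF has no narrower partner, which is why the separate *_to_128df and *_to_64v2df patterns exist.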
gcc/testsuite/ChangeLog
2013-09-16 James Greenhalgh <james.greenhalgh@arm.com>
* gcc.target/aarch64/fmla-intrinsic.c: New.
* gcc.target/aarch64/mla-intrinsic.c: Likewise.
* gcc.target/aarch64/fmls-intrinsic.c: Likewise.
* gcc.target/aarch64/mls-intrinsic.c: Likewise.
2013-09-16 James Greenhalgh <james.greenhalgh@arm.com>
* gcc.target/aarch64/mul_intrinsic_1.c: New.
* gcc.target/aarch64/fmul_intrinsic_1.c: Likewise.
gcc/testsuite/gcc.target/aarch64/fmla-intrinsic.c (new file)
/* { dg-do run } */
/* { dg-options "-O3 --save-temps" } */
#include <arm_neon.h>
#define DELTA 0.0001
extern double fabs (double);
extern void abort (void);
#define TEST_VMLA(q1, q2, size, in1_lanes, in2_lanes) \
static void \
test_vfma##q1##_lane##q2##_f##size (float##size##_t * res, \
const float##size##_t *in1, \
const float##size##_t *in2) \
{ \
float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res); \
float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1); \
float##size##x##in2_lanes##_t c; \
if (in2_lanes > 1) \
{ \
c = vld1##q2##_f##size (in2); \
a = vfma##q1##_lane##q2##_f##size (a, b, c, 1); \
} \
else \
{ \
c = vld1##q2##_f##size (in2 + 1); \
a = vfma##q1##_lane##q2##_f##size (a, b, c, 0); \
} \
vst1##q1##_f##size (res, a); \
}
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
TEST_VMLA ( , , width, n_half_lanes, n_half_lanes) \
TEST_VMLA (q, , width, n_lanes, n_half_lanes) \
TEST_VMLA ( , q, width, n_half_lanes, n_lanes) \
TEST_VMLA (q, q, width, n_lanes, n_lanes) \
BUILD_VARS (32, 4, 2)
BUILD_VARS (64, 2, 1)
#define POOL2 {0.0, 1.0}
#define POOL4 {0.0, 1.0, 2.0, 3.0}
#define EMPTY2 {0.0, 0.0}
#define EMPTY4 {0.0, 0.0, 0.0, 0.0}
#define BUILD_TEST(size, lanes) \
static void \
test_f##size (void) \
{ \
int i; \
float##size##_t pool[lanes] = POOL##lanes; \
float##size##_t res[lanes] = EMPTY##lanes; \
float##size##_t res2[lanes] = EMPTY##lanes; \
float##size##_t res3[lanes] = EMPTY##lanes; \
float##size##_t res4[lanes] = EMPTY##lanes; \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vfma_lane_f##size (res, pool, pool); \
for (i = 0; i < lanes / 2; i++) \
if (fabs (res[i] - pool[i]) > DELTA) \
abort (); \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vfmaq_lane_f##size (res2, pool, pool); \
for (i = 0; i < lanes; i++) \
if (fabs (res2[i] - pool[i]) > DELTA) \
abort (); \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vfma_laneq_f##size (res3, pool, pool); \
for (i = 0; i < lanes / 2; i++) \
if (fabs (res3[i] - pool[i]) > DELTA) \
abort (); \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vfmaq_laneq_f##size (res4, pool, pool); \
for (i = 0; i < lanes; i++) \
if (fabs (res4[i] - pool[i]) > DELTA) \
abort (); \
}
BUILD_TEST (32, 4)
BUILD_TEST (64, 2)
int
main (int argc, char **argv)
{
test_f32 ();
test_f64 ();
return 0;
}
/* vfma_laneq_f32.
vfma_lane_f32. */
/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s\\\[\[0-9\]+\\\]" 2 } } */
/* vfmaq_lane_f32.
vfmaq_laneq_f32. */
/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 2 } } */
/* vfma_lane_f64. */
/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 1 } } */
/* vfmaq_lane_f64.
vfma_laneq_f64.
vfmaq_laneq_f64. */
/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
/* { dg-final { cleanup-saved-temps } } */
gcc/testsuite/gcc.target/aarch64/fmls-intrinsic.c (new file)
/* { dg-do run } */
/* { dg-options "-O3 --save-temps" } */
#include <arm_neon.h>
#define DELTA 0.0001
extern double fabs (double);
extern void abort (void);
#define TEST_VMLS(q1, q2, size, in1_lanes, in2_lanes) \
static void \
test_vfms##q1##_lane##q2##_f##size (float##size##_t * res, \
const float##size##_t *in1, \
const float##size##_t *in2) \
{ \
float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res); \
float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1); \
float##size##x##in2_lanes##_t c; \
if (in2_lanes > 1) \
{ \
c = vld1##q2##_f##size (in2); \
a = vfms##q1##_lane##q2##_f##size (a, b, c, 1); \
} \
else \
{ \
c = vld1##q2##_f##size (in2 + 1); \
a = vfms##q1##_lane##q2##_f##size (a, b, c, 0); \
} \
vst1##q1##_f##size (res, a); \
}
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
TEST_VMLS ( , , width, n_half_lanes, n_half_lanes) \
TEST_VMLS (q, , width, n_lanes, n_half_lanes) \
TEST_VMLS ( , q, width, n_half_lanes, n_lanes) \
TEST_VMLS (q, q, width, n_lanes, n_lanes) \
BUILD_VARS (32, 4, 2)
BUILD_VARS (64, 2, 1)
#define POOL2 {0.0, 1.0}
#define POOL4 {0.0, 1.0, 2.0, 3.0}
#define EMPTY2 {0.0, 0.0}
#define EMPTY4 {0.0, 0.0, 0.0, 0.0}
#define BUILD_TEST(size, lanes) \
static void \
test_f##size (void) \
{ \
int i; \
float##size##_t pool[lanes] = POOL##lanes; \
float##size##_t res[lanes] = EMPTY##lanes; \
float##size##_t res2[lanes] = EMPTY##lanes; \
float##size##_t res3[lanes] = EMPTY##lanes; \
float##size##_t res4[lanes] = EMPTY##lanes; \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vfms_lane_f##size (res, pool, pool); \
asm volatile ("" : :"Q" (res) : "memory"); \
for (i = 0; i < lanes / 2; i++) \
if (fabs (res[i] + pool[i]) > DELTA) \
abort (); \
\
/* Forcefully avoid optimization. */ \
test_vfmsq_lane_f##size (res2, pool, pool); \
asm volatile ("" : :"Q" (res2) : "memory"); \
for (i = 0; i < lanes; i++) \
if (fabs (res2[i] + pool[i]) > DELTA) \
abort (); \
\
/* Forcefully avoid optimization. */ \
test_vfms_laneq_f##size (res3, pool, pool); \
asm volatile ("" : :"Q" (res3) : "memory"); \
for (i = 0; i < lanes / 2; i++) \
if (fabs (res3[i] + pool[i]) > DELTA) \
abort (); \
\
/* Forcefully avoid optimization. */ \
test_vfmsq_laneq_f##size (res4, pool, pool); \
asm volatile ("" : :"Q" (res4) : "memory"); \
for (i = 0; i < lanes; i++) \
if (fabs (res4[i] + pool[i]) > DELTA) \
abort (); \
}
BUILD_TEST (32, 4)
BUILD_TEST (64, 2)
int
main (int argc, char **argv)
{
test_f32 ();
test_f64 ();
return 0;
}
/* vfms_laneq_f32.
vfms_lane_f32. */
/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s\\\[\[0-9\]+\\\]" 2 } } */
/* vfmsq_lane_f32.
vfmsq_laneq_f32. */
/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 2 } } */
/* vfms_lane_f64. */
/* { dg-final { scan-assembler-times "fmsub\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 1 } } */
/* vfmsq_lane_f64.
vfms_laneq_f64.
vfmsq_laneq_f64. */
/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
/* { dg-final { cleanup-saved-temps } } */
gcc/testsuite/gcc.target/aarch64/mla-intrinsic.c (new file)
/* { dg-do run } */
/* { dg-options "-O3 --save-temps" } */
#include <arm_neon.h>
extern void abort (void);
#define MAPs(size, xx) int##size##xx##_t
#define MAPu(size, xx) uint##size##xx##_t
#define TEST_VMLA(q, su, size, in1_lanes, in2_lanes) \
static void \
test_vmlaq_lane##q##_##su##size (MAP##su (size, ) * res, \
const MAP##su(size, ) *in1, \
const MAP##su(size, ) *in2) \
{ \
MAP##su (size, x##in1_lanes) a = vld1q_##su##size (res); \
MAP##su (size, x##in1_lanes) b = vld1q_##su##size (in1); \
MAP##su (size, x##in2_lanes) c = vld1##q##_##su##size (in2); \
a = vmlaq_lane##q##_##su##size (a, b, c, 1); \
vst1q_##su##size (res, a); \
}
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
TEST_VMLA (, s, width, n_lanes, n_half_lanes) \
TEST_VMLA (q, s, width, n_lanes, n_lanes) \
TEST_VMLA (, u, width, n_lanes, n_half_lanes) \
TEST_VMLA (q, u, width, n_lanes, n_lanes) \
BUILD_VARS (32, 4, 2)
BUILD_VARS (16, 8, 4)
#define POOL4 {0, 1, 2, 3}
#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7}
#define EMPTY4 {0, 0, 0, 0}
#define EMPTY8 {0, 0, 0, 0, 0, 0, 0, 0}
#define BUILD_TEST(su, size, lanes) \
static void \
test_##su##size (void) \
{ \
int i; \
MAP##su (size,) pool[lanes] = POOL##lanes; \
MAP##su (size,) res[lanes] = EMPTY##lanes; \
MAP##su (size,) res2[lanes] = EMPTY##lanes; \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vmlaq_lane_##su##size (res, pool, pool); \
for (i = 0; i < lanes; i++) \
if (res[i] != pool[i]) \
abort (); \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vmlaq_laneq_##su##size (res2, pool, pool); \
for (i = 0; i < lanes; i++) \
if (res2[i] != pool[i]) \
abort (); \
}
#undef BUILD_VARS
#define BUILD_VARS(size, lanes) \
BUILD_TEST (s, size, lanes) \
BUILD_TEST (u, size, lanes)
BUILD_VARS (32, 4)
BUILD_VARS (16, 8)
int
main (int argc, char **argv)
{
test_s32 ();
test_u32 ();
test_s16 ();
test_u16 ();
return 0;
}
/* { dg-final { scan-assembler-times "mla\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 4 } } */
/* { dg-final { scan-assembler-times "mla\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.8h\\\[\[0-9\]+\\\]" 4 } } */
/* { dg-final { cleanup-saved-temps } } */
gcc/testsuite/gcc.target/aarch64/mls-intrinsic.c (new file)
/* { dg-do run } */
/* { dg-options "-O3 --save-temps" } */
#include <arm_neon.h>
extern void abort (void);
#define MAPs(size, xx) int##size##xx##_t
#define MAPu(size, xx) uint##size##xx##_t
#define TEST_VMLS(q, su, size, in1_lanes, in2_lanes) \
static void \
test_vmlsq_lane##q##_##su##size (MAP##su (size, ) * res, \
const MAP##su(size, ) *in1, \
const MAP##su(size, ) *in2) \
{ \
MAP##su (size, x##in1_lanes) a = vld1q_##su##size (res); \
MAP##su (size, x##in1_lanes) b = vld1q_##su##size (in1); \
MAP##su (size, x##in2_lanes) c = vld1##q##_##su##size (in2); \
a = vmlsq_lane##q##_##su##size (a, b, c, 1); \
vst1q_##su##size (res, a); \
}
#define BUILD_VARS(width, n_lanes, n_half_lanes) \
TEST_VMLS (, s, width, n_lanes, n_half_lanes) \
TEST_VMLS (q, s, width, n_lanes, n_lanes) \
TEST_VMLS (, u, width, n_lanes, n_half_lanes) \
TEST_VMLS (q, u, width, n_lanes, n_lanes) \
BUILD_VARS (32, 4, 2)
BUILD_VARS (16, 8, 4)
#define MAP_OPs +
#define MAP_OPu -
#define POOL4 {0, 1, 2, 3}
#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7}
#define EMPTY4s {0, 0, 0, 0}
#define EMPTY8s {0, 0, 0, 0, 0, 0, 0, 0}
#define EMPTY4u {0, 2, 4, 6}
#define EMPTY8u {0, 2, 4, 6, 8, 10, 12, 14}
#define BUILD_TEST(su, size, lanes) \
static void \
test_##su##size (void) \
{ \
int i; \
MAP##su (size,) pool[lanes] = POOL##lanes; \
MAP##su (size,) res[lanes] = EMPTY##lanes##su; \
MAP##su (size,) res2[lanes] = EMPTY##lanes##su; \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vmlsq_lane_##su##size (res, pool, pool); \
for (i = 0; i < lanes; i++) \
if (res[i] MAP_OP##su pool[i] != 0) \
abort (); \
\
/* Forcefully avoid optimization. */ \
asm volatile ("" : : : "memory"); \
test_vmlsq_laneq_##su##size (res2, pool, pool); \
for (i = 0; i < lanes; i++) \
if (res2[i] MAP_OP##su pool[i] != 0) \
abort (); \
}
#undef BUILD_VARS
#define BUILD_VARS(size, lanes) \
BUILD_TEST (s, size, lanes) \
BUILD_TEST (u, size, lanes)
BUILD_VARS (32, 4)
BUILD_VARS (16, 8)
int
main (int argc, char **argv)
{
test_s32 ();
test_u32 ();
test_s16 ();
test_u16 ();
return 0;
}
/* { dg-final { scan-assembler-times "mls\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 4 } } */
/* { dg-final { scan-assembler-times "mls\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.8h\\\[\[0-9\]+\\\]" 4 } } */
/* { dg-final { cleanup-saved-temps } } */