Commit ab2e8f01 authored and committed by Jiong Wang

[AArch64][5/10] ARMv8.2-A FP16 lane vector intrinsics

gcc/
	* config/aarch64/aarch64-simd.md (*aarch64_mulx_elt_to_64v2df): Rename to
	"*aarch64_mulx_elt_from_dup<mode>".
	(*aarch64_mul3_elt<mode>): Update schedule type.
	(*aarch64_mul3_elt_from_dup<mode>): Likewise.
	(*aarch64_fma4_elt_from_dup<mode>): Likewise.
	(*aarch64_fnma4_elt_from_dup<mode>): Likewise.
	* config/aarch64/iterators.md (VMUL): Support half precision float modes.
	(f, fp): Support HF modes.
	* config/aarch64/arm_neon.h (vfma_lane_f16, vfmaq_lane_f16,
	vfma_laneq_f16, vfmaq_laneq_f16, vfma_n_f16, vfmaq_n_f16, vfms_lane_f16,
	vfmsq_lane_f16, vfms_laneq_f16, vfmsq_laneq_f16, vfms_n_f16,
	vfmsq_n_f16, vmul_lane_f16, vmulq_lane_f16, vmul_laneq_f16,
	vmulq_laneq_f16, vmul_n_f16, vmulq_n_f16, vmulx_lane_f16,
	vmulxq_lane_f16, vmulx_laneq_f16, vmulxq_laneq_f16): New.

From-SVN: r238719
gcc/ChangeLog
2016-07-25  Jiong Wang  <jiong.wang@arm.com>

	* config/aarch64/aarch64-simd.md (*aarch64_mulx_elt_to_64v2df): Rename to
	"*aarch64_mulx_elt_from_dup<mode>".
	(*aarch64_mul3_elt<mode>): Update schedule type.
	(*aarch64_mul3_elt_from_dup<mode>): Likewise.
	(*aarch64_fma4_elt_from_dup<mode>): Likewise.
	(*aarch64_fnma4_elt_from_dup<mode>): Likewise.
	* config/aarch64/iterators.md (VMUL): Support half precision float modes.
	(f, fp): Support HF modes.
	* config/aarch64/arm_neon.h (vfma_lane_f16, vfmaq_lane_f16,
	vfma_laneq_f16, vfmaq_laneq_f16, vfma_n_f16, vfmaq_n_f16, vfms_lane_f16,
	vfmsq_lane_f16, vfms_laneq_f16, vfmsq_laneq_f16, vfms_n_f16,
	vfmsq_n_f16, vmul_lane_f16, vmulq_lane_f16, vmul_laneq_f16,
	vmulq_laneq_f16, vmul_n_f16, vmulq_n_f16, vmulx_lane_f16,
	vmulxq_lane_f16, vmulx_laneq_f16, vmulxq_laneq_f16): New.

2016-07-25  Jiong Wang  <jiong.wang@arm.com>

	* config/aarch64/aarch64-simd-builtins.def: Register new builtins.
	* config/aarch64/aarch64-simd.md (fma<mode>4, fnma<mode>4): Extend to HF
	modes.
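As a quick illustration of how the new FP16 lane intrinsics are used from C, here is a minimal, hypothetical example (not part of the commit). It assumes a compiler with this series applied and a target selected via -march=armv8.2-a+fp16:

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  float16x4_t a = vdup_n_f16 ((float16_t) 1.5);
  float16x4_t b = vdup_n_f16 ((float16_t) 2.0);
  float16x4_t c = vdup_n_f16 ((float16_t) 4.0);

  /* d = a + b * c[0]; intended to become a by-element FMLA rather than
     a separate DUP followed by a vector FMLA.  */
  float16x4_t d = vfma_lane_f16 (a, b, c, 0);

  /* e = a * 3.0 via the _n (broadcast-scalar) form.  */
  float16x4_t e = vmul_n_f16 (a, (float16_t) 3.0);

  printf ("%f %f\n", (double) vget_lane_f16 (d, 0),
	  (double) vget_lane_f16 (e, 0));
  return 0;
}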
gcc/config/aarch64/aarch64-simd.md
@@ -351,7 +351,7 @@
     operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
     return "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]";
   }
-  [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )
 
 (define_insn "*aarch64_mul3_elt_<vswap_width_name><mode>"
@@ -379,7 +379,7 @@
       (match_operand:VMUL 2 "register_operand" "w")))]
   "TARGET_SIMD"
   "<f>mul\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]";
-  [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )
 
 (define_insn "aarch64_rsqrte<mode>"
@@ -1634,7 +1634,7 @@
       (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
   "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mla_<stype>_scalar<q>")]
 )
 
 (define_insn "*aarch64_fma4_elt_to_64v2df"
@@ -1712,7 +1712,7 @@
       (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
   "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mla_<stype>_scalar<q>")]
 )
 
 (define_insn "*aarch64_fnma4_elt_to_64v2df"
@@ -3101,20 +3101,18 @@
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
 )
 
-;; vmulxq_lane_f64
-(define_insn "*aarch64_mulx_elt_to_64v2df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-	(unspec:V2DF
-	 [(match_operand:V2DF 1 "register_operand" "w")
-	  (vec_duplicate:V2DF
-	    (match_operand:DF 2 "register_operand" "w"))]
+;; vmulxq_lane
+(define_insn "*aarch64_mulx_elt_from_dup<mode>"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(unspec:VHSDF
+	 [(match_operand:VHSDF 1 "register_operand" "w")
+	  (vec_duplicate:VHSDF
+	    (match_operand:<VEL> 2 "register_operand" "w"))]
 	 UNSPEC_FMULX))]
   "TARGET_SIMD"
-  {
-    return "fmulx\t%0.2d, %1.2d, %2.d[0]";
-  }
-  [(set_attr "type" "neon_fp_mul_d_scalar_q")]
+  "fmulx\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]";
+  [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )
 
 ;; vmulxs_lane_f32, vmulxs_laneq_f32
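The pattern above was previously limited to V2DF (the vmulxq_lane_f64 case); renaming it to *aarch64_mulx_elt_from_dup<mode> and iterating over VHSDF lets the HF and SF variants reuse it. As a hedged illustration (assumes -march=armv8.2-a+fp16; codegen may vary), the vmulx_n_f16 wrapper added below builds FMULX from a dup'ed scalar, which is the shape this pattern matches:

#include <arm_neon.h>

/* vdup_n_f16 followed by FMULX; expected to combine into something
   like "fmulx v0.4h, v0.4h, v1.h[0]" through the from-dup pattern.  */
float16x4_t
mulx_by_scalar (float16x4_t a, float16_t b)
{
  return vmulx_n_f16 (a, b);
}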
gcc/config/aarch64/arm_neon.h
@@ -26773,6 +26773,160 @@ vfmsq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
  return __builtin_aarch64_fnmav8hf (__b, __c, __a);
}

/* ARMv8.2-A FP16 lane vector intrinsics. */
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfma_lane_f16 (float16x4_t __a, float16x4_t __b,
float16x4_t __c, const int __lane)
{
return vfma_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmaq_lane_f16 (float16x8_t __a, float16x8_t __b,
float16x4_t __c, const int __lane)
{
return vfmaq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfma_laneq_f16 (float16x4_t __a, float16x4_t __b,
float16x8_t __c, const int __lane)
{
return vfma_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmaq_laneq_f16 (float16x8_t __a, float16x8_t __b,
float16x8_t __c, const int __lane)
{
return vfmaq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfma_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c)
{
return vfma_f16 (__a, __b, vdup_n_f16 (__c));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmaq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c)
{
return vfmaq_f16 (__a, __b, vdupq_n_f16 (__c));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfms_lane_f16 (float16x4_t __a, float16x4_t __b,
float16x4_t __c, const int __lane)
{
return vfms_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmsq_lane_f16 (float16x8_t __a, float16x8_t __b,
float16x4_t __c, const int __lane)
{
return vfmsq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfms_laneq_f16 (float16x4_t __a, float16x4_t __b,
float16x8_t __c, const int __lane)
{
return vfms_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmsq_laneq_f16 (float16x8_t __a, float16x8_t __b,
float16x8_t __c, const int __lane)
{
return vfmsq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfms_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c)
{
return vfms_f16 (__a, __b, vdup_n_f16 (__c));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmsq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c)
{
return vfmsq_f16 (__a, __b, vdupq_n_f16 (__c));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane)
{
return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane)
{
return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmul_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane)
{
return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane)
{
return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmul_n_f16 (float16x4_t __a, float16_t __b)
{
return vmul_lane_f16 (__a, vdup_n_f16 (__b), 0);
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulq_n_f16 (float16x8_t __a, float16_t __b)
{
return vmulq_laneq_f16 (__a, vdupq_n_f16 (__b), 0);
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmulx_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane)
{
return vmulx_f16 (__a, __aarch64_vdup_lane_f16 (__b, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulxq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane)
{
return vmulxq_f16 (__a, __aarch64_vdupq_lane_f16 (__b, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmulx_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane)
{
return vmulx_f16 (__a, __aarch64_vdup_laneq_f16 (__b, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulxq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane)
{
return vmulxq_f16 (__a, __aarch64_vdupq_laneq_f16 (__b, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmulx_n_f16 (float16x4_t __a, float16_t __b)
{
return vmulx_f16 (__a, vdup_n_f16 (__b));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulxq_n_f16 (float16x8_t __a, float16_t __b)
{
return vmulxq_f16 (__a, vdupq_n_f16 (__b));
}
#pragma GCC pop_options
#undef __aarch64_vget_lane_any
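Since the lane intrinsics above are defined as a lane (or scalar) broadcast followed by the corresponding base operation, a hand-written dup+mul sequence should give bit-identical results. A small, hypothetical sanity check (not part of the commit; requires -march=armv8.2-a+fp16):

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

int
main (void)
{
  const float16_t a_init[4] = { 1.0, 2.0, 3.0, 4.0 };
  const float16_t b_init[4] = { 0.5, 0.25, 8.0, -1.0 };
  float16x4_t a = vld1_f16 (a_init);
  float16x4_t b = vld1_f16 (b_init);

  /* Lane form of the multiply...  */
  float16x4_t via_lane = vmul_lane_f16 (a, b, 2);
  /* ...and its open-coded equivalent: broadcast lane 2, then multiply.  */
  float16x4_t via_dup = vmul_f16 (a, vdup_n_f16 (vget_lane_f16 (b, 2)));

  assert (memcmp (&via_lane, &via_dup, sizeof via_lane) == 0);
  return 0;
}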
gcc/config/aarch64/iterators.md
@@ -218,7 +218,10 @@
 (define_mode_iterator DX [DI DF])
 
 ;; Modes available for <f>mul lane operations.
-(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
+(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI
+                            (V4HF "TARGET_SIMD_F16INST")
+                            (V8HF "TARGET_SIMD_F16INST")
+                            V2SF V4SF V2DF])
 
 ;; Modes available for <f>mul lane operations changing lane count.
 (define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF])
@@ -730,6 +733,7 @@
                     (V4HI "") (V8HI "")
                     (V2SI "") (V4SI "")
                     (DI "") (V2DI "")
+                    (V4HF "f") (V8HF "f")
                     (V2SF "f") (V4SF "f")
                     (V2DF "f") (DF "f")])
@@ -738,6 +742,7 @@
                     (V4HI "") (V8HI "")
                     (V2SI "") (V4SI "")
                     (DI "") (V2DI "")
+                    (V4HF "_fp") (V8HF "_fp")
                     (V2SF "_fp") (V4SF "_fp")
                     (V2DF "_fp") (DF "_fp")
                     (SF "_fp")])