Commit eccf4d70 by Kyrylo Tkachov (committed by Kyrylo Tkachov)

[arm][3/3] Implement fp16fml lane intrinsics

This patch implements the lane-wise fp16fml intrinsics.
There's quite a few of them so I've split them up from
the other simpler fp16fml intrinsics.

These ones expose instructions such as

vfmal.f16 Dd, Sn, Sm[<index>]  0 <= index <= 1
vfmal.f16 Qd, Dn, Dm[<index>]  0 <= index <= 3
vfmsl.f16 Dd, Sn, Sm[<index>]  0 <= index <= 1
vfmsl.f16 Qd, Dn, Dm[<index>]  0 <= index <= 3

These instructions extract a single half-precision
floating-point value from one of the source regs
and perform a vfmal/vfmsl operation as per the
normal variant with that value.

The nuance here is that some of the intrinsics want
to do things like:

float32x2_t vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b, const int __index)


where the float16x8_t value of '__b' is held in a Q
register, so we need to be a bit smart about finding
the right D or S sub-register and translating the
lane number to a lane in that sub-register, instead
of just passing the language-level const-int down to
the assembly instruction.

That's where most of the complexity of this patch comes from
but hopefully it's orthogonal enough to make sense.

Bootstrapped and tested on arm-none-linux-gnueabihf as well as
armeb-none-eabi.

	* config/arm/arm_neon.h (vfmlal_lane_low_u32, vfmlal_lane_high_u32,
	vfmlalq_laneq_low_u32, vfmlalq_lane_low_u32, vfmlal_laneq_low_u32,
	vfmlalq_laneq_high_u32, vfmlalq_lane_high_u32, vfmlal_laneq_high_u32,
	vfmlsl_lane_low_u32, vfmlsl_lane_high_u32, vfmlslq_laneq_low_u32,
	vfmlslq_lane_low_u32, vfmlsl_laneq_low_u32, vfmlslq_laneq_high_u32,
	vfmlslq_lane_high_u32, vfmlsl_laneq_high_u32): Define.
	* config/arm/arm_neon_builtins.def (vfmal_lane_low,
	vfmal_lane_lowv4hf, vfmal_lane_lowv8hf, vfmal_lane_high,
	vfmal_lane_highv4hf, vfmal_lane_highv8hf, vfmsl_lane_low,
	vfmsl_lane_lowv4hf, vfmsl_lane_lowv8hf, vfmsl_lane_high,
	vfmsl_lane_highv4hf, vfmsl_lane_highv8hf): New sets of builtins.
	* config/arm/iterators.md (VFMLSEL2, vfmlsel2): New mode attributes.
	(V_lane_reg): Likewise.
	* config/arm/neon.md (neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>):
	New define_expand.
	(neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>): Likewise.
	(vfmal_lane_low<mode>_intrinsic,
	vfmal_lane_low<vfmlsel2><mode>_intrinsic,
	vfmal_lane_high<vfmlsel2><mode>_intrinsic,
	vfmal_lane_high<mode>_intrinsic, vfmsl_lane_low<mode>_intrinsic,
	vfmsl_lane_low<vfmlsel2><mode>_intrinsic,
	vfmsl_lane_high<vfmlsel2><mode>_intrinsic,
	vfmsl_lane_high<mode>_intrinsic): New define_insns.

	* gcc.target/arm/simd/fp16fml_lane_high.c: New test.
	* gcc.target/arm/simd/fp16fml_lane_low.c: New test.

From-SVN: r256540
parent 06e95715
2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/arm/arm_neon.h (vfmlal_lane_low_u32, vfmlal_lane_high_u32,
vfmlalq_laneq_low_u32, vfmlalq_lane_low_u32, vfmlal_laneq_low_u32,
vfmlalq_laneq_high_u32, vfmlalq_lane_high_u32, vfmlal_laneq_high_u32,
vfmlsl_lane_low_u32, vfmlsl_lane_high_u32, vfmlslq_laneq_low_u32,
vfmlslq_lane_low_u32, vfmlsl_laneq_low_u32, vfmlslq_laneq_high_u32,
vfmlslq_lane_high_u32, vfmlsl_laneq_high_u32): Define.
* config/arm/arm_neon_builtins.def (vfmal_lane_low,
vfmal_lane_lowv4hf, vfmal_lane_lowv8hf, vfmal_lane_high,
vfmal_lane_highv4hf, vfmal_lane_highv8hf, vfmsl_lane_low,
vfmsl_lane_lowv4hf, vfmsl_lane_lowv8hf, vfmsl_lane_high,
vfmsl_lane_highv4hf, vfmsl_lane_highv8hf): New sets of builtins.
* config/arm/iterators.md (VFMLSEL2, vfmlsel2): New mode attributes.
(V_lane_reg): Likewise.
* config/arm/neon.md (neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>):
New define_expand.
(neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>): Likewise.
(vfmal_lane_low<mode>_intrinsic,
vfmal_lane_low<vfmlsel2><mode>_intrinsic,
vfmal_lane_high<vfmlsel2><mode>_intrinsic,
vfmal_lane_high<mode>_intrinsic, vfmsl_lane_low<mode>_intrinsic,
vfmsl_lane_low<vfmlsel2><mode>_intrinsic,
vfmsl_lane_high<vfmlsel2><mode>_intrinsic,
vfmsl_lane_high<mode>_intrinsic): New define_insns.
2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/arm/arm-cpus.in (fp16fml): New feature.
(ALL_SIMD): Add fp16fml.
(armv8.2-a): Add fp16fml as an option.
......
...@@ -18160,6 +18160,150 @@ vfmlslq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
return __builtin_neon_vfmsl_highv4sf (__r, __a, __b);
}
/* Lane-wise vfmlal intrinsics.  Each extracts lane __index from __b
   (__builtin_arm_lane_check validates __index against the number of FP16
   lanes in __b: 4 for a 64-bit vector, 8 for a 128-bit vector) and
   multiply-accumulates it against the low or high half of __a, widening
   the FP16 products to single precision, as selected by the builtin name.  */

/* D-reg result; __b has 4 FP16 lanes.  */
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlal_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmal_lane_lowv2sf (__r, __a, __b, __index);
}
/* D-reg result; high half of __a; __b has 4 FP16 lanes.  */
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlal_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmal_lane_highv2sf (__r, __a, __b, __index);
}
/* Q-reg result; __b has 8 FP16 lanes.  */
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlalq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmal_lane_lowv4sf (__r, __a, __b, __index);
}
/* "Awkward" form: Q-reg result but the lane source __b is only 4 lanes,
   hence the v4hf-infixed builtin (see neon.md).  */
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlalq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmal_lane_lowv4hfv4sf (__r, __a, __b, __index);
}
/* "Awkward" form: D-reg result but the lane source __b has 8 lanes,
   hence the v8hf-infixed builtin (see neon.md).  */
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmal_lane_lowv8hfv2sf (__r, __a, __b, __index);
}
/* Q-reg result; high half of __a; __b has 8 FP16 lanes.  */
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlalq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmal_lane_highv4sf (__r, __a, __b, __index);
}
/* "Awkward" high-half form: Q-reg result, 4-lane __b.  */
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlalq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmal_lane_highv4hfv4sf (__r, __a, __b, __index);
}
/* "Awkward" high-half form: D-reg result, 8-lane __b.  */
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlal_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmal_lane_highv8hfv2sf (__r, __a, __b, __index);
}
/* Lane-wise vfmlsl intrinsics: the multiply-subtract mirror of the vfmlal
   set above.  Lane range is checked against the FP16 lane count of __b
   (4 or 8) by __builtin_arm_lane_check.  */

/* D-reg result; __b has 4 FP16 lanes.  */
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlsl_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmsl_lane_lowv2sf (__r, __a, __b, __index);
}
/* D-reg result; high half of __a; __b has 4 FP16 lanes.  */
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlsl_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmsl_lane_highv2sf (__r, __a, __b, __index);
}
/* Q-reg result; __b has 8 FP16 lanes.  */
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlslq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmsl_lane_lowv4sf (__r, __a, __b, __index);
}
/* "Awkward" form: Q-reg result, 4-lane __b (v4hf-infixed builtin).  */
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlslq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmsl_lane_lowv4hfv4sf (__r, __a, __b, __index);
}
/* "Awkward" form: D-reg result, 8-lane __b (v8hf-infixed builtin).  */
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlsl_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmsl_lane_lowv8hfv2sf (__r, __a, __b, __index);
}
/* Q-reg result; high half of __a; __b has 8 FP16 lanes.  */
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlslq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmsl_lane_highv4sf (__r, __a, __b, __index);
}
/* "Awkward" high-half form: Q-reg result, 4-lane __b.  */
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlslq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmsl_lane_highv4hfv4sf (__r, __a, __b, __index);
}
/* "Awkward" high-half form: D-reg result, 8-lane __b.  */
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlsl_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmsl_lane_highv8hfv2sf (__r, __a, __b, __index);
}
#pragma GCC pop_options
#endif
......
...@@ -55,6 +55,18 @@ VAR2 (TERNOP, vfmal_low, v2sf, v4sf)
VAR2 (TERNOP, vfmal_high, v2sf, v4sf)
VAR2 (TERNOP, vfmsl_low, v2sf, v4sf)
VAR2 (TERNOP, vfmsl_high, v2sf, v4sf)
/* Lane-wise fp16fml builtins.  The plain vfm{a,s}l_lane_{low,high} sets are
   the forms where accumulator and lane source are the same register size;
   the v4hf/v8hf-infixed sets are the "awkward" forms where the lane-source
   vector differs in size from the accumulator (see neon.md).  */
VAR2 (MAC_LANE, vfmal_lane_low, v2sf, v4sf)
VAR1 (MAC_LANE, vfmal_lane_lowv4hf, v4sf)
VAR1 (MAC_LANE, vfmal_lane_lowv8hf, v2sf)
VAR2 (MAC_LANE, vfmal_lane_high, v2sf, v4sf)
VAR1 (MAC_LANE, vfmal_lane_highv4hf, v4sf)
VAR1 (MAC_LANE, vfmal_lane_highv8hf, v2sf)
VAR2 (MAC_LANE, vfmsl_lane_low, v2sf, v4sf)
VAR1 (MAC_LANE, vfmsl_lane_lowv4hf, v4sf)
VAR1 (MAC_LANE, vfmsl_lane_lowv8hf, v2sf)
VAR2 (MAC_LANE, vfmsl_lane_high, v2sf, v4sf)
VAR1 (MAC_LANE, vfmsl_lane_highv4hf, v4sf)
VAR1 (MAC_LANE, vfmsl_lane_highv8hf, v2sf)
VAR3 (BINOP, vmullp, v8qi, v4hi, v2si)
VAR3 (BINOP, vmulls, v8qi, v4hi, v2si)
VAR3 (BINOP, vmullu, v8qi, v4hi, v2si)
......
...@@ -484,6 +484,12 @@
;; Mode mapping for VFM[A,S]L instructions for the vec_select result.
(define_mode_attr VFMLSEL [(V2SF "V2HF") (V4SF "V4HF")])
;; Mode mapping for VFM[A,S]L instructions for some awkward lane-wise forms:
;; the V2SF (D-reg accumulator) forms take their lane from a V8HF (Q-reg)
;; source while the V4SF (Q-reg) forms take it from a V4HF (D-reg) source.
(define_mode_attr VFMLSEL2 [(V2SF "V8HF") (V4SF "V4HF")])
;; Same as the above, but lowercase.
(define_mode_attr vfmlsel2 [(V2SF "v8hf") (V4SF "v4hf")])
;; Similar, for three elements.
(define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
(V4HI "BLK") (V8HI "BLK")
...@@ -516,6 +522,10 @@
;; Output template to select the low VFP register of a mult-register value.
(define_mode_attr V_lo [(V2SF "") (V4SF "e")])
;; Helper attribute for printing output templates for awkward forms of
;; vfmlal/vfmlsl intrinsics.
;; NOTE(review): "" prints the plain (S-sized) register name for the V2SF
;; forms; "P" appears to be the print_operand modifier selecting the D
;; register name for the V4SF forms -- confirm against arm.c print_operand.
(define_mode_attr V_lane_reg [(V2SF "") (V4SF "P")])
;; Wider modes with the same number of elements.
(define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])
......
...@@ -2382,6 +2382,314 @@
[(set_attr "type" "neon_fp_mla_s<q>")]
)
;; Expander for the lane-wise fp16fml intrinsics where the accumulator's
;; widening operand and the lane-source vector are the same register size.
;; Operand 4 is the language-level lane number: it is remapped for
;; big-endian via NEON_ENDIAN_LANE_N, and the low/high half selection of
;; operand 2 is materialized as a vec_select parallel here.
(define_expand "neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>"
[(set:VCVTF (match_operand:VCVTF 0 "s_register_operand")
(unspec:VCVTF
[(match_operand:VCVTF 1 "s_register_operand")
(PLUSMINUS:<VFML>
(match_operand:<VFML> 2 "s_register_operand")
(match_operand:<VFML> 3 "s_register_operand"))
(match_operand:SI 4 "const_int_operand")] VFMLHALVES))]
"TARGET_FP16FML"
{
rtx lane = GEN_INT (NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[4])));
rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half><mode>_intrinsic
(operands[0], operands[1],
operands[2], operands[3],
half, lane));
DONE;
})
;; vfmal of the low half of operand 2 by lane 5 of operand 3.  If the lane
;; index falls into the high D register of a quad operand 3, rebase the
;; index into that D register and print the high sub-register (<V_hi>)
;; instead of the low one.
(define_insn "vfmal_lane_low<mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_low" "")))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFML> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
{
operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
return "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]";
}
else
{
operands[5] = GEN_INT (lane);
return "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]";
}
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
;; As the expander above, but for the "awkward" forms where the lane-source
;; vector (operand 3, mode <VFMLSEL2>) is a different register size from the
;; widening operand, e.g. vfmlalq_lane_low_u32 / vfmlal_laneq_low_u32.
(define_expand "neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>"
[(set:VCVTF (match_operand:VCVTF 0 "s_register_operand")
(unspec:VCVTF
[(match_operand:VCVTF 1 "s_register_operand")
(PLUSMINUS:<VFML>
(match_operand:<VFML> 2 "s_register_operand")
(match_operand:<VFMLSEL2> 3 "s_register_operand"))
(match_operand:SI 4 "const_int_operand")] VFMLHALVES))]
"TARGET_FP16FML"
{
rtx lane
= GEN_INT (NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[4])));
rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>_intrinsic
(operands[0], operands[1], operands[2], operands[3],
half, lane));
DONE;
})
;; Used to implement the intrinsics:
;; float32x4_t vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
;; float32x2_t vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
;; Needs a bit of care to get the modes of the different sub-expressions right
;; due to 'a' and 'b' having different sizes and make sure we use the right
;; S or D subregister to select the appropriate lane from.
;; Operand 5 is rewritten in place: lane % elts_per_reg within the
;; sub-register lane / elts_per_reg registers above the base.
(define_insn "vfmal_lane_low<vfmlsel2><mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_low" "")))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
int new_lane = lane % elts_per_reg;
int regdiff = lane / elts_per_reg;
operands[5] = GEN_INT (new_lane);
/* We re-create operands[2] and operands[3] in the halved VFMLSEL modes
because we want the print_operand code to print the appropriate
S or D register prefix. */
operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
operands[2] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[2]));
return "vfmal.f16\\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]";
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
;; Used to implement the intrinsics:
;; float32x4_t vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
;; float32x2_t vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
;; Needs a bit of care to get the modes of the different sub-expressions right
;; due to 'a' and 'b' having different sizes and make sure we use the right
;; S or D subregister to select the appropriate lane from.
(define_insn "vfmal_lane_high<vfmlsel2><mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_high" "")))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
int new_lane = lane % elts_per_reg;
int regdiff = lane / elts_per_reg;
operands[5] = GEN_INT (new_lane);
/* We re-create operands[3] in the halved VFMLSEL mode
because we've calculated the correct half-width subreg to extract
the lane from and we want to print *that* subreg instead. */
operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]";
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
;; vfmal of the high half of operand 2 by lane 5 of operand 3; both source
;; vectors are the same register size here.  A lane index that falls into
;; the high D register of a quad operand 3 is rebased and printed via
;; <V_hi>, mirroring the low-half pattern above.
(define_insn "vfmal_lane_high<mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_high" "")))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFML> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
{
operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]";
}
else
{
operands[5] = GEN_INT (lane);
return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]";
}
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
;; Multiply-subtract mirror of vfmal_lane_low<mode>_intrinsic: the selected
;; half of operand 2 is negated before the widening fma.  Lane indices into
;; the high D register of a quad operand 3 are rebased and printed via
;; <V_hi>.
(define_insn "vfmsl_lane_low<mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:<VFMLSEL>
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_low" ""))))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFML> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
{
operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
return "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]";
}
else
{
operands[5] = GEN_INT (lane);
return "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]";
}
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
;; Used to implement the intrinsics:
;; float32x4_t vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
;; float32x2_t vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
;; Needs a bit of care to get the modes of the different sub-expressions right
;; due to 'a' and 'b' having different sizes and make sure we use the right
;; S or D subregister to select the appropriate lane from.
(define_insn "vfmsl_lane_low<vfmlsel2><mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:<VFMLSEL>
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_low" ""))))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
int new_lane = lane % elts_per_reg;
int regdiff = lane / elts_per_reg;
operands[5] = GEN_INT (new_lane);
/* We re-create operands[2] and operands[3] in the halved VFMLSEL modes
because we want the print_operand code to print the appropriate
S or D register prefix. */
operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
operands[2] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[2]));
return "vfmsl.f16\\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]";
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
;; Used to implement the intrinsics:
;; float32x4_t vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
;; float32x2_t vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
;; Needs a bit of care to get the modes of the different sub-expressions right
;; due to 'a' and 'b' having different sizes and make sure we use the right
;; S or D subregister to select the appropriate lane from.
(define_insn "vfmsl_lane_high<vfmlsel2><mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:<VFMLSEL>
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_high" ""))))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
int new_lane = lane % elts_per_reg;
int regdiff = lane / elts_per_reg;
operands[5] = GEN_INT (new_lane);
/* We re-create operands[3] in the halved VFMLSEL mode
because we've calculated the correct half-width subreg to extract
the lane from and we want to print *that* subreg instead. */
operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]";
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
;; Multiply-subtract mirror of vfmal_lane_high<mode>_intrinsic: both source
;; vectors are the same register size; lane indices beyond the low D
;; register of a quad operand 3 are rebased and printed via <V_hi>.
(define_insn "vfmsl_lane_high<mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:<VFMLSEL>
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_high" ""))))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFML> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
{
operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]";
}
else
{
operands[5] = GEN_INT (lane);
return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]";
}
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
; Used for intrinsics when flag_unsafe_math_optimizations is false.
(define_insn "neon_vmla<mode>_unspec"
......
2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.target/arm/simd/fp16fml_lane_high.c: New test.
* gcc.target/arm/simd/fp16fml_lane_low.c: New test.
2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.target/arm/multilib.exp: Add combination tests for fp16fml.
* gcc.target/arm/simd/fp16fml_high.c: New test.
* gcc.target/arm/simd/fp16fml_low.c: Likewise.
......
/* { dg-do compile } */
/* { dg-require-effective-target arm_fp16fml_neon_ok } */
/* { dg-add-options arm_fp16fml_neon } */
#include "arm_neon.h"
/* Lane 0 of a 4-lane b: exercises the D-reg vfmal.f16 form.  */
float32x2_t
test_vfmlal_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
{
return vfmlal_lane_high_u32 (r, a, b, 0);
}
/* Lane 0 of a 4-lane b: exercises the D-reg vfmsl.f16 form.
   Renamed from the misspelled "tets_vfmlsl_lane_high_u32" to follow the
   test_* naming convention of this file; the dg-final scans match the
   generated assembly, not function names, so behavior is unchanged.  */
float32x2_t
test_vfmlsl_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
{
return vfmlsl_lane_high_u32 (r, a, b, 0);
}
/* laneq forms: lane 6 of an 8-lane b lives in the high D register of the
   Q register, checking the sub-register/lane remapping in neon.md.  */
float32x2_t
test_vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
{
return vfmlal_laneq_high_u32 (r, a, b, 6);
}
float32x2_t
test_vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
{
return vfmlsl_laneq_high_u32 (r, a, b, 6);
}
/* Q-reg accumulator with a 4-lane (D-reg) lane source, lane 1.  */
float32x4_t
test_vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
{
return vfmlalq_lane_high_u32 (r, a, b, 1);
}
float32x4_t
test_vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
{
return vfmlslq_lane_high_u32 (r, a, b, 1);
}
/* Q-reg accumulator with an 8-lane lane source, lane 7 (high D reg).  */
float32x4_t
test_vfmlalq_laneq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
{
return vfmlalq_laneq_high_u32 (r, a, b, 7);
}
float32x4_t
test_vfmlslq_laneq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
{
return vfmlslq_laneq_high_u32 (r, a, b, 7);
}
/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[02468]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[0-9]+\[1\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]\[3\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[02468]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[0-9]+\[1\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]\[3\]} 1 } } */
/* { dg-do compile } */
/* { dg-require-effective-target arm_fp16fml_neon_ok } */
/* { dg-add-options arm_fp16fml_neon } */
#include "arm_neon.h"
/* Low-half counterparts of the fp16fml_lane_high tests: same lane choices,
   using the *_low_u32 intrinsics so the dg-final scans below can check the
   even (low-half) source sub-registers.  */
float32x2_t
test_vfmlal_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
{
return vfmlal_lane_low_u32 (r, a, b, 0);
}
float32x2_t
test_vfmlsl_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
{
return vfmlsl_lane_low_u32 (r, a, b, 0);
}
/* laneq forms: lane 6 of an 8-lane b (high D register of the Q reg).  */
float32x2_t
test_vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
{
return vfmlal_laneq_low_u32 (r, a, b, 6);
}
float32x2_t
test_vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
{
return vfmlsl_laneq_low_u32 (r, a, b, 6);
}
/* Q-reg accumulator with a 4-lane (D-reg) lane source, lane 1.  */
float32x4_t
test_vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
{
return vfmlalq_lane_low_u32 (r, a, b, 1);
}
float32x4_t
test_vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
{
return vfmlslq_lane_low_u32 (r, a, b, 1);
}
/* Q-reg accumulator with an 8-lane lane source, lane 7 (high D reg).  */
float32x4_t
test_vfmlalq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
{
return vfmlalq_laneq_low_u32 (r, a, b, 7);
}
float32x4_t
test_vfmlslq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
{
return vfmlslq_laneq_low_u32 (r, a, b, 7);
}
/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[13579]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[0-9]+\[1\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[123]?[13579]\[3\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[13579]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[0-9]+\[1\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[123]?[13579]\[3\]} 1 } } */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment