Commit eccf4d70, authored and committed by Kyrylo Tkachov

[arm][3/3] Implement fp16fml lane intrinsics

This patch implements the lane-wise fp16fml intrinsics.
There are quite a few of them, so I've split them out from
the other, simpler fp16fml intrinsics.

These ones expose instructions such as

vfmal.f16 Dd, Sn, Sm[<index>]  0 <= index <= 1
vfmal.f16 Qd, Dn, Dm[<index>]  0 <= index <= 3
vfmsl.f16 Dd, Sn, Sm[<index>]  0 <= index <= 1
vfmsl.f16 Qd, Dn, Dm[<index>]  0 <= index <= 3

These instructions extract a single half-precision
floating-point value from one of the source registers
and perform a vfmal/vfmsl operation, as per the
normal variant, with that value.
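
As a rough scalar sketch of the semantics (illustrative only, not the
implementation; it assumes the GNU C vector element access GCC provides
for the NEON vector types), the D-register accumulate form behaves like:

#include <arm_neon.h>

/* Hypothetical model of vfmlal_lane_low_u32 (r, a, b, idx): widen
   the low half of 'a' to single precision, multiply each element
   by lane 'idx' of 'b' and accumulate into 'r'.  */
float32x2_t
vfmlal_lane_low_u32_model (float32x2_t r, float16x4_t a,
                           float16x4_t b, const int idx)
{
  for (int i = 0; i < 2; i++)
    r[i] += (float) a[i] * (float) b[idx];
  return r;
}

The vfmsl forms subtract the widened products from 'r' instead, and the
'high' forms use the high half of 'a' (elements 2-3 here, or 4-7 for the
Q-register forms).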

The nuance here is that some of the intrinsics want
to do things like:

float32x2_t vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b, const int __index)


where the float16x8_t value of '__b' is held in a Q
register, so we need to be a bit smart about finding
the right D or S sub-register and translating the
lane number to a lane in that sub-register, instead
of just passing the language-level const-int down to
the assembly instruction.

That's where most of the complexity of this patch comes from,
but hopefully it's orthogonal enough to make sense.
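
The translation itself is just modular arithmetic, mirroring what the
output templates in neon.md below do (a sketch with illustrative names,
not code from the patch):

/* Map a language-level lane number within a Q register of
   half-precision values to a (sub-register offset, lane) pair.
   For the D-form instruction each S sub-register holds 2 halves;
   for the Q-form each D sub-register holds 4.  */
static void
map_q_lane (int lane, int elts_per_subreg,
            int *subreg_off, int *sub_lane)
{
  *subreg_off = lane / elts_per_subreg; /* which S or D sub-register */
  *sub_lane = lane % elts_per_subreg;   /* lane within that sub-register */
}

For example, on a little-endian target, lane 6 of the float16x8_t '__b'
above maps to lane 0 of the fourth S sub-register (offset 3), which is
what the define_insns below compute via lane / elts_per_reg and
lane % elts_per_reg.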

Bootstrapped and tested on arm-none-linux-gnueabihf as well as
armeb-none-eabi.

	* config/arm/arm_neon.h (vfmlal_lane_low_u32, vfmlal_lane_high_u32,
	vfmlalq_laneq_low_u32, vfmlalq_lane_low_u32, vfmlal_laneq_low_u32,
	vfmlalq_laneq_high_u32, vfmlalq_lane_high_u32, vfmlal_laneq_high_u32,
	vfmlsl_lane_low_u32, vfmlsl_lane_high_u32, vfmlslq_laneq_low_u32,
	vfmlslq_lane_low_u32, vfmlsl_laneq_low_u32, vfmlslq_laneq_high_u32,
	vfmlslq_lane_high_u32, vfmlsl_laneq_high_u32): Define.
	* config/arm/arm_neon_builtins.def (vfmal_lane_low,
	vfmal_lane_lowv4hf, vfmal_lane_lowv8hf, vfmal_lane_high,
	vfmal_lane_highv4hf, vfmal_lane_highv8hf, vfmsl_lane_low,
	vfmsl_lane_lowv4hf, vfmsl_lane_lowv8hf, vfmsl_lane_high,
	vfmsl_lane_highv4hf, vfmsl_lane_highv8hf): New sets of builtins.
	* config/arm/iterators.md (VFMLSEL2, vfmlsel2): New mode attributes.
	(V_lane_reg): Likewise.
	* config/arm/neon.md (neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>):
	New define_expand.
	(neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>): Likewise.
	(vfmal_lane_low<mode>_intrinsic,
	vfmal_lane_low<vfmlsel2><mode>_intrinsic,
	vfmal_lane_high<vfmlsel2><mode>_intrinsic,
	vfmal_lane_high<mode>_intrinsic, vfmsl_lane_low<mode>_intrinsic,
	vfmsl_lane_low<vfmlsel2><mode>_intrinsic,
	vfmsl_lane_high<vfmlsel2><mode>_intrinsic,
	vfmsl_lane_high<mode>_intrinsic): New define_insns.

	* gcc.target/arm/simd/fp16fml_lane_high.c: New test.
	* gcc.target/arm/simd/fp16fml_lane_low.c: New test.

From-SVN: r256540
2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/arm/arm_neon.h (vfmlal_lane_low_u32, vfmlal_lane_high_u32,
vfmlalq_laneq_low_u32, vfmlalq_lane_low_u32, vfmlal_laneq_low_u32,
vfmlalq_laneq_high_u32, vfmlalq_lane_high_u32, vfmlal_laneq_high_u32,
vfmlsl_lane_low_u32, vfmlsl_lane_high_u32, vfmlslq_laneq_low_u32,
vfmlslq_lane_low_u32, vfmlsl_laneq_low_u32, vfmlslq_laneq_high_u32,
vfmlslq_lane_high_u32, vfmlsl_laneq_high_u32): Define.
* config/arm/arm_neon_builtins.def (vfmal_lane_low,
vfmal_lane_lowv4hf, vfmal_lane_lowv8hf, vfmal_lane_high,
vfmal_lane_highv4hf, vfmal_lane_highv8hf, vfmsl_lane_low,
vfmsl_lane_lowv4hf, vfmsl_lane_lowv8hf, vfmsl_lane_high,
vfmsl_lane_highv4hf, vfmsl_lane_highv8hf): New sets of builtins.
* config/arm/iterators.md (VFMLSEL2, vfmlsel2): New mode attributes.
(V_lane_reg): Likewise.
* config/arm/neon.md (neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>):
New define_expand.
(neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>): Likewise.
(vfmal_lane_low<mode>_intrinsic,
vfmal_lane_low<vfmlsel2><mode>_intrinsic,
vfmal_lane_high<vfmlsel2><mode>_intrinsic,
vfmal_lane_high<mode>_intrinsic, vfmsl_lane_low<mode>_intrinsic,
vfmsl_lane_low<vfmlsel2><mode>_intrinsic,
vfmsl_lane_high<vfmlsel2><mode>_intrinsic,
vfmsl_lane_high<mode>_intrinsic): New define_insns.
2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/arm/arm-cpus.in (fp16fml): New feature.
(ALL_SIMD): Add fp16fml.
(armv8.2-a): Add fp16fml as an option.
......
@@ -18160,6 +18160,150 @@ vfmlslq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
return __builtin_neon_vfmsl_highv4sf (__r, __a, __b);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlal_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmal_lane_lowv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlal_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmal_lane_highv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlalq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmal_lane_lowv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlalq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmal_lane_lowv4hfv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmal_lane_lowv8hfv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlalq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmal_lane_highv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlalq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmal_lane_highv4hfv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlal_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmal_lane_highv8hfv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlsl_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmsl_lane_lowv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlsl_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmsl_lane_highv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlslq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmsl_lane_lowv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlslq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmsl_lane_lowv4hfv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlsl_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmsl_lane_lowv8hfv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlslq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmsl_lane_highv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlslq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmsl_lane_highv4hfv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlsl_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmsl_lane_highv8hfv2sf (__r, __a, __b, __index);
}
#pragma GCC pop_options
#endif
......
@@ -55,6 +55,18 @@ VAR2 (TERNOP, vfmal_low, v2sf, v4sf)
VAR2 (TERNOP, vfmal_high, v2sf, v4sf)
VAR2 (TERNOP, vfmsl_low, v2sf, v4sf)
VAR2 (TERNOP, vfmsl_high, v2sf, v4sf)
VAR2 (MAC_LANE, vfmal_lane_low, v2sf, v4sf)
VAR1 (MAC_LANE, vfmal_lane_lowv4hf, v4sf)
VAR1 (MAC_LANE, vfmal_lane_lowv8hf, v2sf)
VAR2 (MAC_LANE, vfmal_lane_high, v2sf, v4sf)
VAR1 (MAC_LANE, vfmal_lane_highv4hf, v4sf)
VAR1 (MAC_LANE, vfmal_lane_highv8hf, v2sf)
VAR2 (MAC_LANE, vfmsl_lane_low, v2sf, v4sf)
VAR1 (MAC_LANE, vfmsl_lane_lowv4hf, v4sf)
VAR1 (MAC_LANE, vfmsl_lane_lowv8hf, v2sf)
VAR2 (MAC_LANE, vfmsl_lane_high, v2sf, v4sf)
VAR1 (MAC_LANE, vfmsl_lane_highv4hf, v4sf)
VAR1 (MAC_LANE, vfmsl_lane_highv8hf, v2sf)
VAR3 (BINOP, vmullp, v8qi, v4hi, v2si)
VAR3 (BINOP, vmulls, v8qi, v4hi, v2si)
VAR3 (BINOP, vmullu, v8qi, v4hi, v2si)
......
@@ -484,6 +484,12 @@
;; Mode mapping for VFM[A,S]L instructions for the vec_select result.
(define_mode_attr VFMLSEL [(V2SF "V2HF") (V4SF "V4HF")])
;; Mode mapping for VFM[A,S]L instructions for some awkward lane-wise forms.
(define_mode_attr VFMLSEL2 [(V2SF "V8HF") (V4SF "V4HF")])
;; Same as the above, but lowercase.
(define_mode_attr vfmlsel2 [(V2SF "v8hf") (V4SF "v4hf")])
;; Similar, for three elements.
(define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
(V4HI "BLK") (V8HI "BLK")
......
@@ -516,6 +522,10 @@
;; Output template to select the low VFP register of a mult-register value.
(define_mode_attr V_lo [(V2SF "") (V4SF "e")])
;; Helper attribute for printing output templates for awkward forms of
;; vfmlal/vfmlsl intrinsics.
(define_mode_attr V_lane_reg [(V2SF "") (V4SF "P")])
;; Wider modes with the same number of elements.
(define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])
......
@@ -2382,6 +2382,314 @@
[(set_attr "type" "neon_fp_mla_s<q>")]
)
(define_expand "neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>"
[(set:VCVTF (match_operand:VCVTF 0 "s_register_operand")
(unspec:VCVTF
[(match_operand:VCVTF 1 "s_register_operand")
(PLUSMINUS:<VFML>
(match_operand:<VFML> 2 "s_register_operand")
(match_operand:<VFML> 3 "s_register_operand"))
(match_operand:SI 4 "const_int_operand")] VFMLHALVES))]
"TARGET_FP16FML"
{
rtx lane = GEN_INT (NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[4])));
rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half><mode>_intrinsic
(operands[0], operands[1],
operands[2], operands[3],
half, lane));
DONE;
})
(define_insn "vfmal_lane_low<mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_low" "")))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFML> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
{
operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
return "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]";
}
else
{
operands[5] = GEN_INT (lane);
return "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]";
}
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
(define_expand "neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>"
[(set:VCVTF (match_operand:VCVTF 0 "s_register_operand")
(unspec:VCVTF
[(match_operand:VCVTF 1 "s_register_operand")
(PLUSMINUS:<VFML>
(match_operand:<VFML> 2 "s_register_operand")
(match_operand:<VFMLSEL2> 3 "s_register_operand"))
(match_operand:SI 4 "const_int_operand")] VFMLHALVES))]
"TARGET_FP16FML"
{
rtx lane
= GEN_INT (NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[4])));
rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>_intrinsic
(operands[0], operands[1], operands[2], operands[3],
half, lane));
DONE;
})
;; Used to implement the intrinsics:
;; float32x4_t vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
;; float32x2_t vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
;; Needs a bit of care to get the modes of the different sub-expressions right
;; due to 'a' and 'b' having different sizes and make sure we use the right
;; S or D subregister to select the appropriate lane from.
(define_insn "vfmal_lane_low<vfmlsel2><mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_low" "")))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
int new_lane = lane % elts_per_reg;
int regdiff = lane / elts_per_reg;
operands[5] = GEN_INT (new_lane);
/* We re-create operands[2] and operands[3] in the halved VFMLSEL modes
because we want the print_operand code to print the appropriate
S or D register prefix. */
operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
operands[2] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[2]));
return "vfmal.f16\\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]";
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
;; Used to implement the intrinsics:
;; float32x4_t vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
;; float32x2_t vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
;; Needs a bit of care to get the modes of the different sub-expressions right
;; due to 'a' and 'b' having different sizes and make sure we use the right
;; S or D subregister to select the appropriate lane from.
(define_insn "vfmal_lane_high<vfmlsel2><mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_high" "")))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
int new_lane = lane % elts_per_reg;
int regdiff = lane / elts_per_reg;
operands[5] = GEN_INT (new_lane);
/* We re-create operands[3] in the halved VFMLSEL mode
because we've calculated the correct half-width subreg to extract
the lane from and we want to print *that* subreg instead. */
operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]";
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
(define_insn "vfmal_lane_high<mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_high" "")))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFML> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
{
operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]";
}
else
{
operands[5] = GEN_INT (lane);
return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]";
}
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
(define_insn "vfmsl_lane_low<mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:<VFMLSEL>
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_low" ""))))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFML> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
{
operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
return "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]";
}
else
{
operands[5] = GEN_INT (lane);
return "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]";
}
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
;; Used to implement the intrinsics:
;; float32x4_t vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
;; float32x2_t vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
;; Needs a bit of care to get the modes of the different sub-expressions right
;; due to 'a' and 'b' having different sizes and make sure we use the right
;; S or D subregister to select the appropriate lane from.
(define_insn "vfmsl_lane_low<vfmlsel2><mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:<VFMLSEL>
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_low" ""))))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
int new_lane = lane % elts_per_reg;
int regdiff = lane / elts_per_reg;
operands[5] = GEN_INT (new_lane);
/* We re-create operands[2] and operands[3] in the halved VFMLSEL modes
because we want the print_operand code to print the appropriate
S or D register prefix. */
operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
operands[2] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[2]));
return "vfmsl.f16\\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]";
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
;; Used to implement the intrinsics:
;; float32x4_t vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
;; float32x2_t vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
;; Needs a bit of care to get the modes of the different sub-expressions right
;; due to 'a' and 'b' having different sizes and make sure we use the right
;; S or D subregister to select the appropriate lane from.
(define_insn "vfmsl_lane_high<vfmlsel2><mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:<VFMLSEL>
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_high" ""))))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
int new_lane = lane % elts_per_reg;
int regdiff = lane / elts_per_reg;
operands[5] = GEN_INT (new_lane);
/* We re-create operands[3] in the halved VFMLSEL mode
because we've calculated the correct half-width subreg to extract
the lane from and we want to print *that* subreg instead. */
operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]";
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
(define_insn "vfmsl_lane_high<mode>_intrinsic"
[(set (match_operand:VCVTF 0 "s_register_operand" "=w")
(fma:VCVTF
(float_extend:VCVTF
(neg:<VFMLSEL>
(vec_select:<VFMLSEL>
(match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
(match_operand:<VFML> 4 "vect_par_constant_high" ""))))
(float_extend:VCVTF
(vec_duplicate:<VFMLSEL>
(vec_select:HF
(match_operand:<VFML> 3 "s_register_operand" "x")
(parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
(match_operand:VCVTF 1 "s_register_operand" "0")))]
"TARGET_FP16FML"
{
int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
{
operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]";
}
else
{
operands[5] = GEN_INT (lane);
return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]";
}
}
[(set_attr "type" "neon_fp_mla_s<q>")]
)
; Used for intrinsics when flag_unsafe_math_optimizations is false.
(define_insn "neon_vmla<mode>_unspec"
......
2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.target/arm/simd/fp16fml_lane_high.c: New test.
* gcc.target/arm/simd/fp16fml_lane_low.c: New test.
2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.target/arm/multilib.exp: Add combination tests for fp16fml.
* gcc.target/arm/simd/fp16fml_high.c: New test.
* gcc.target/arm/simd/fp16fml_low.c: Likewise.
......
/* { dg-do compile } */
/* { dg-require-effective-target arm_fp16fml_neon_ok } */
/* { dg-add-options arm_fp16fml_neon } */
#include "arm_neon.h"
float32x2_t
test_vfmlal_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
{
return vfmlal_lane_high_u32 (r, a, b, 0);
}
float32x2_t
test_vfmlsl_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
{
return vfmlsl_lane_high_u32 (r, a, b, 0);
}
float32x2_t
test_vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
{
return vfmlal_laneq_high_u32 (r, a, b, 6);
}
float32x2_t
test_vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
{
return vfmlsl_laneq_high_u32 (r, a, b, 6);
}
float32x4_t
test_vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
{
return vfmlalq_lane_high_u32 (r, a, b, 1);
}
float32x4_t
test_vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
{
return vfmlslq_lane_high_u32 (r, a, b, 1);
}
float32x4_t
test_vfmlalq_laneq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
{
return vfmlalq_laneq_high_u32 (r, a, b, 7);
}
float32x4_t
test_vfmlslq_laneq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
{
return vfmlslq_laneq_high_u32 (r, a, b, 7);
}
/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[02468]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[0-9]+\[1\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]\[3\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[02468]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[0-9]+\[1\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]\[3\]} 1 } } */
/* { dg-do compile } */
/* { dg-require-effective-target arm_fp16fml_neon_ok } */
/* { dg-add-options arm_fp16fml_neon } */
#include "arm_neon.h"
float32x2_t
test_vfmlal_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
{
return vfmlal_lane_low_u32 (r, a, b, 0);
}
float32x2_t
test_vfmlsl_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
{
return vfmlsl_lane_low_u32 (r, a, b, 0);
}
float32x2_t
test_vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
{
return vfmlal_laneq_low_u32 (r, a, b, 6);
}
float32x2_t
test_vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
{
return vfmlsl_laneq_low_u32 (r, a, b, 6);
}
float32x4_t
test_vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
{
return vfmlalq_lane_low_u32 (r, a, b, 1);
}
float32x4_t
test_vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
{
return vfmlslq_lane_low_u32 (r, a, b, 1);
}
float32x4_t
test_vfmlalq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
{
return vfmlalq_laneq_low_u32 (r, a, b, 7);
}
float32x4_t
test_vfmlslq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
{
return vfmlslq_laneq_low_u32 (r, a, b, 7);
}
/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[13579]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[0-9]+\[1\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[123]?[13579]\[3\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[13579]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[0-9]+\[1\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[123]?[13579]\[3\]} 1 } } */