Commit eccf4d70 by Kyrylo Tkachov Committed by Kyrylo Tkachov

[arm][3/3] Implement fp16fml lane intrinsics

This patch implements the lane-wise fp16fml intrinsics.
There's quite a few of them so I've split them up from
the other simpler fp16fml intrinsics.

These ones expose instructions such as

vfmal.f16 Dd, Sn, Sm[<index>]  0 <= index <= 1
vfmal.f16 Qd, Dn, Dm[<index>]  0 <= index <= 3
vfmsl.f16 Dd, Sn, Sm[<index>]  0 <= index <= 1
vfmsl.f16 Qd, Dn, Dm[<index>]  0 <= index <= 3

These instructions extract a single half-precision
floating-point value from one of the source regs
and perform a vfmal/vfmsl operation as per the
normal variant with that value.

The nuance here is that some of the intrinsics want
to do things like:

float32x2_t vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b, const int __index)


where the float16x8_t value of '__b' is held in a Q
register, so we need to be a bit smart about finding
the right D or S sub-register and translating the
lane number to a lane in that sub-register, instead
of just passing the language-level const-int down to
the assembly instruction.

That's where most of the complexity of this patch comes from
but hopefully it's orthogonal enough to make sense.

Bootstrapped and tested on arm-none-linux-gnueabihf as well as
armeb-none-eabi.

	* config/arm/arm_neon.h (vfmlal_lane_low_u32, vfmlal_lane_high_u32,
	vfmlalq_laneq_low_u32, vfmlalq_lane_low_u32, vfmlal_laneq_low_u32,
	vfmlalq_laneq_high_u32, vfmlalq_lane_high_u32, vfmlal_laneq_high_u32,
	vfmlsl_lane_low_u32, vfmlsl_lane_high_u32, vfmlslq_laneq_low_u32,
	vfmlslq_lane_low_u32, vfmlsl_laneq_low_u32, vfmlslq_laneq_high_u32,
	vfmlslq_lane_high_u32, vfmlsl_laneq_high_u32): Define.
	* config/arm/arm_neon_builtins.def (vfmal_lane_low,
	vfmal_lane_lowv4hf, vfmal_lane_lowv8hf, vfmal_lane_high,
	vfmal_lane_highv4hf, vfmal_lane_highv8hf, vfmsl_lane_low,
	vfmsl_lane_lowv4hf, vfmsl_lane_lowv8hf, vfmsl_lane_high,
	vfmsl_lane_highv4hf, vfmsl_lane_highv8hf): New sets of builtins.
	* config/arm/iterators.md (VFMLSEL2, vfmlsel2): New mode attributes.
	(V_lane_reg): Likewise.
	* config/arm/neon.md (neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>):
	New define_expand.
	(neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>): Likewise.
	(vfmal_lane_low<mode>_intrinsic,
	vfmal_lane_low<vfmlsel2><mode>_intrinsic,
	vfmal_lane_high<vfmlsel2><mode>_intrinsic,
	vfmal_lane_high<mode>_intrinsic, vfmsl_lane_low<mode>_intrinsic,
	vfmsl_lane_low<vfmlsel2><mode>_intrinsic,
	vfmsl_lane_high<vfmlsel2><mode>_intrinsic,
	vfmsl_lane_high<mode>_intrinsic): New define_insns.

	* gcc.target/arm/simd/fp16fml_lane_high.c: New test.
	* gcc.target/arm/simd/fp16fml_lane_low.c: New test.

From-SVN: r256540
parent 06e95715
2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com> 2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/arm/arm_neon.h (vfmlal_lane_low_u32, vfmlal_lane_high_u32,
vfmlalq_laneq_low_u32, vfmlalq_lane_low_u32, vfmlal_laneq_low_u32,
vfmlalq_laneq_high_u32, vfmlalq_lane_high_u32, vfmlal_laneq_high_u32,
vfmlsl_lane_low_u32, vfmlsl_lane_high_u32, vfmlslq_laneq_low_u32,
vfmlslq_lane_low_u32, vfmlsl_laneq_low_u32, vfmlslq_laneq_high_u32,
vfmlslq_lane_high_u32, vfmlsl_laneq_high_u32): Define.
* config/arm/arm_neon_builtins.def (vfmal_lane_low,
vfmal_lane_lowv4hf, vfmal_lane_lowv8hf, vfmal_lane_high,
vfmal_lane_highv4hf, vfmal_lane_highv8hf, vfmsl_lane_low,
vfmsl_lane_lowv4hf, vfmsl_lane_lowv8hf, vfmsl_lane_high,
vfmsl_lane_highv4hf, vfmsl_lane_highv8hf): New sets of builtins.
* config/arm/iterators.md (VFMLSEL2, vfmlsel2): New mode attributes.
(V_lane_reg): Likewise.
* config/arm/neon.md (neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>):
New define_expand.
(neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>): Likewise.
(vfmal_lane_low<mode>_intrinsic,
vfmal_lane_low<vfmlsel2><mode>_intrinsic,
vfmal_lane_high<vfmlsel2><mode>_intrinsic,
vfmal_lane_high<mode>_intrinsic, vfmsl_lane_low<mode>_intrinsic,
vfmsl_lane_low<vfmlsel2><mode>_intrinsic,
vfmsl_lane_high<vfmlsel2><mode>_intrinsic,
vfmsl_lane_high<mode>_intrinsic): New define_insns.
2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/arm/arm-cpus.in (fp16fml): New feature. * config/arm/arm-cpus.in (fp16fml): New feature.
(ALL_SIMD): Add fp16fml. (ALL_SIMD): Add fp16fml.
(armv8.2-a): Add fp16fml as an option. (armv8.2-a): Add fp16fml as an option.
......
...@@ -18160,6 +18160,150 @@ vfmlslq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b) ...@@ -18160,6 +18160,150 @@ vfmlslq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
return __builtin_neon_vfmsl_highv4sf (__r, __a, __b); return __builtin_neon_vfmsl_highv4sf (__r, __a, __b);
} }
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlal_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmal_lane_lowv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlal_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmal_lane_highv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlalq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmal_lane_lowv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlalq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmal_lane_lowv4hfv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmal_lane_lowv8hfv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlalq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmal_lane_highv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlalq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmal_lane_highv4hfv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlal_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmal_lane_highv8hfv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlsl_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmsl_lane_lowv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlsl_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmsl_lane_highv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlslq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmsl_lane_lowv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlslq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmsl_lane_lowv4hfv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlsl_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmsl_lane_lowv8hfv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlslq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmsl_lane_highv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlslq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
const int __index)
{
__builtin_arm_lane_check (4, __index);
return __builtin_neon_vfmsl_lane_highv4hfv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmlsl_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
const int __index)
{
__builtin_arm_lane_check (8, __index);
return __builtin_neon_vfmsl_lane_highv8hfv2sf (__r, __a, __b, __index);
}
#pragma GCC pop_options #pragma GCC pop_options
#endif #endif
......
...@@ -55,6 +55,18 @@ VAR2 (TERNOP, vfmal_low, v2sf, v4sf) ...@@ -55,6 +55,18 @@ VAR2 (TERNOP, vfmal_low, v2sf, v4sf)
VAR2 (TERNOP, vfmal_high, v2sf, v4sf) VAR2 (TERNOP, vfmal_high, v2sf, v4sf)
VAR2 (TERNOP, vfmsl_low, v2sf, v4sf) VAR2 (TERNOP, vfmsl_low, v2sf, v4sf)
VAR2 (TERNOP, vfmsl_high, v2sf, v4sf) VAR2 (TERNOP, vfmsl_high, v2sf, v4sf)
VAR2 (MAC_LANE, vfmal_lane_low, v2sf, v4sf)
VAR1 (MAC_LANE, vfmal_lane_lowv4hf, v4sf)
VAR1 (MAC_LANE, vfmal_lane_lowv8hf, v2sf)
VAR2 (MAC_LANE, vfmal_lane_high, v2sf, v4sf)
VAR1 (MAC_LANE, vfmal_lane_highv4hf, v4sf)
VAR1 (MAC_LANE, vfmal_lane_highv8hf, v2sf)
VAR2 (MAC_LANE, vfmsl_lane_low, v2sf, v4sf)
VAR1 (MAC_LANE, vfmsl_lane_lowv4hf, v4sf)
VAR1 (MAC_LANE, vfmsl_lane_lowv8hf, v2sf)
VAR2 (MAC_LANE, vfmsl_lane_high, v2sf, v4sf)
VAR1 (MAC_LANE, vfmsl_lane_highv4hf, v4sf)
VAR1 (MAC_LANE, vfmsl_lane_highv8hf, v2sf)
VAR3 (BINOP, vmullp, v8qi, v4hi, v2si) VAR3 (BINOP, vmullp, v8qi, v4hi, v2si)
VAR3 (BINOP, vmulls, v8qi, v4hi, v2si) VAR3 (BINOP, vmulls, v8qi, v4hi, v2si)
VAR3 (BINOP, vmullu, v8qi, v4hi, v2si) VAR3 (BINOP, vmullu, v8qi, v4hi, v2si)
......
...@@ -484,6 +484,12 @@ ...@@ -484,6 +484,12 @@
;; Mode mapping for VFM[A,S]L instructions for the vec_select result. ;; Mode mapping for VFM[A,S]L instructions for the vec_select result.
(define_mode_attr VFMLSEL [(V2SF "V2HF") (V4SF "V4HF")]) (define_mode_attr VFMLSEL [(V2SF "V2HF") (V4SF "V4HF")])
;; Mode mapping for VFM[A,S]L instructions for some awkward lane-wise forms.
(define_mode_attr VFMLSEL2 [(V2SF "V8HF") (V4SF "V4HF")])
;; Same as the above, but lowercase.
(define_mode_attr vfmlsel2 [(V2SF "v8hf") (V4SF "v4hf")])
;; Similar, for three elements. ;; Similar, for three elements.
(define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK") (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
(V4HI "BLK") (V8HI "BLK") (V4HI "BLK") (V8HI "BLK")
...@@ -516,6 +522,10 @@ ...@@ -516,6 +522,10 @@
;; Output template to select the low VFP register of a mult-register value. ;; Output template to select the low VFP register of a mult-register value.
(define_mode_attr V_lo [(V2SF "") (V4SF "e")]) (define_mode_attr V_lo [(V2SF "") (V4SF "e")])
;; Helper attribute for printing output templates for awkward forms of
;; vfmlal/vfmlsl intrinsics.
(define_mode_attr V_lane_reg [(V2SF "") (V4SF "P")])
;; Wider modes with the same number of elements. ;; Wider modes with the same number of elements.
(define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")]) (define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])
......
2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com> 2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.target/arm/simd/fp16fml_lane_high.c: New test.
* gcc.target/arm/simd/fp16fml_lane_low.c: New test.
2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.target/arm/multilib.exp: Add combination tests for fp16fml. * gcc.target/arm/multilib.exp: Add combination tests for fp16fml.
* gcc.target/arm/simd/fp16fml_high.c: New test. * gcc.target/arm/simd/fp16fml_high.c: New test.
* gcc.target/arm/simd/fp16fml_low.c: Likewise. * gcc.target/arm/simd/fp16fml_low.c: Likewise.
......
/* { dg-do compile } */
/* { dg-require-effective-target arm_fp16fml_neon_ok } */
/* { dg-add-options arm_fp16fml_neon } */
#include "arm_neon.h"
float32x2_t
test_vfmlal_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
{
return vfmlal_lane_high_u32 (r, a, b, 0);
}
float32x2_t
tets_vfmlsl_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
{
return vfmlsl_lane_high_u32 (r, a, b, 0);
}
float32x2_t
test_vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
{
return vfmlal_laneq_high_u32 (r, a, b, 6);
}
float32x2_t
test_vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
{
return vfmlsl_laneq_high_u32 (r, a, b, 6);
}
float32x4_t
test_vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
{
return vfmlalq_lane_high_u32 (r, a, b, 1);
}
float32x4_t
test_vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
{
return vfmlslq_lane_high_u32 (r, a, b, 1);
}
float32x4_t
test_vfmlalq_laneq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
{
return vfmlalq_laneq_high_u32 (r, a, b, 7);
}
float32x4_t
test_vfmlslq_laneq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
{
return vfmlslq_laneq_high_u32 (r, a, b, 7);
}
/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[02468]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[0-9]+\[1\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]\[3\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[02468]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[0-9]+\[1\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]\[3\]} 1 } } */
/* { dg-do compile } */
/* { dg-require-effective-target arm_fp16fml_neon_ok } */
/* { dg-add-options arm_fp16fml_neon } */
#include "arm_neon.h"
float32x2_t
test_vfmlal_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
{
return vfmlal_lane_low_u32 (r, a, b, 0);
}
float32x2_t
test_vfmlsl_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
{
return vfmlsl_lane_low_u32 (r, a, b, 0);
}
float32x2_t
test_vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
{
return vfmlal_laneq_low_u32 (r, a, b, 6);
}
float32x2_t
test_vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
{
return vfmlsl_laneq_low_u32 (r, a, b, 6);
}
float32x4_t
test_vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
{
return vfmlalq_lane_low_u32 (r, a, b, 1);
}
float32x4_t
test_vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
{
return vfmlslq_lane_low_u32 (r, a, b, 1);
}
float32x4_t
test_vfmlalq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
{
return vfmlalq_laneq_low_u32 (r, a, b, 7);
}
float32x4_t
test_vfmlslq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
{
return vfmlslq_laneq_low_u32 (r, a, b, 7);
}
/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[13579]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[0-9]+\[1\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[123]?[13579]\[3\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[13579]\[0\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[0-9]+\[1\]} 1 } } */
/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[123]?[13579]\[3\]} 1 } } */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment