[arm][3/3] Implement fp16fml lane intrinsics

This patch implements the lane-wise fp16fml intrinsics.
There are quite a few of them, so I've split them up from
the other, simpler fp16fml intrinsics.

These ones expose instructions such as

vfmal.f16 Dd, Sn, Sm[<index>]  0 <= index <= 1
vfmal.f16 Qd, Dn, Dm[<index>]  0 <= index <= 3
vfmsl.f16 Dd, Sn, Sm[<index>]  0 <= index <= 1
vfmsl.f16 Qd, Dn, Dm[<index>]  0 <= index <= 3

These instructions extract a single half-precision floating-point value
from one of the source regs and perform a vfmal/vfmsl operation as per the
normal variant with that value.

The nuance here is that some of the intrinsics want to do things like:

float32x2_t vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b, const int __index)

where the float16x8_t value of '__b' is held in a Q register, so we need to
be a bit smart about finding the right D or S sub-register and translating
the lane number to a lane in that sub-register, instead of just passing the
language-level const-int down to the assembly instruction.

That's where most of the complexity of this patch comes from but hopefully
it's orthogonal enough to make sense.

Bootstrapped and tested on arm-none-linux-gnueabihf as well as
armeb-none-eabi.

	* config/arm/arm_neon.h (vfmlal_lane_low_u32, vfmlal_lane_high_u32,
	vfmlalq_laneq_low_u32, vfmlalq_lane_low_u32, vfmlal_laneq_low_u32,
	vfmlalq_laneq_high_u32, vfmlalq_lane_high_u32, vfmlal_laneq_high_u32,
	vfmlsl_lane_low_u32, vfmlsl_lane_high_u32, vfmlslq_laneq_low_u32,
	vfmlslq_lane_low_u32, vfmlsl_laneq_low_u32, vfmlslq_laneq_high_u32,
	vfmlslq_lane_high_u32, vfmlsl_laneq_high_u32): Define.
	* config/arm/arm_neon_builtins.def (vfmal_lane_low,
	vfmal_lane_lowv4hf, vfmal_lane_lowv8hf, vfmal_lane_high,
	vfmal_lane_highv4hf, vfmal_lane_highv8hf, vfmsl_lane_low,
	vfmsl_lane_lowv4hf, vfmsl_lane_lowv8hf, vfmsl_lane_high,
	vfmsl_lane_highv4hf, vfmsl_lane_highv8hf): New sets of builtins.
	* config/arm/iterators.md (VFMLSEL2, vfmlsel2): New mode attributes.
	(V_lane_reg): Likewise.
	* config/arm/neon.md (neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>):
	New define_expand.
	(neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>): Likewise.
	(vfmal_lane_low<mode>_intrinsic,
	vfmal_lane_low<vfmlsel2><mode>_intrinsic,
	vfmal_lane_high<vfmlsel2><mode>_intrinsic,
	vfmal_lane_high<mode>_intrinsic, vfmsl_lane_low<mode>_intrinsic,
	vfmsl_lane_low<vfmlsel2><mode>_intrinsic,
	vfmsl_lane_high<vfmlsel2><mode>_intrinsic,
	vfmsl_lane_high<mode>_intrinsic): New define_insns.

	* gcc.target/arm/simd/fp16fml_lane_high.c: New test.
	* gcc.target/arm/simd/fp16fml_lane_low.c: New test.

From-SVN: r256540

[arm][3/3] Implement fp16fml lane intrinsics
This patch implements the lane-wise fp16fml intrinsics.
There are quite a few of them, so I've split them up from
the other, simpler fp16fml intrinsics.

These ones expose instructions such as

vfmal.f16 Dd, Sn, Sm[<index>]  0 <= index <= 1
vfmal.f16 Qd, Dn, Dm[<index>]  0 <= index <= 3
vfmsl.f16 Dd, Sn, Sm[<index>]  0 <= index <= 1
vfmsl.f16 Qd, Dn, Dm[<index>]  0 <= index <= 3

These instructions extract a single half-precision floating-point value
from one of the source regs and perform a vfmal/vfmsl operation as per the
normal variant with that value.

The nuance here is that some of the intrinsics want to do things like:

float32x2_t vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b, const int __index)

where the float16x8_t value of '__b' is held in a Q register, so we need to
be a bit smart about finding the right D or S sub-register and translating
the lane number to a lane in that sub-register, instead of just passing the
language-level const-int down to the assembly instruction.

That's where most of the complexity of this patch comes from but hopefully
it's orthogonal enough to make sense.

Bootstrapped and tested on arm-none-linux-gnueabihf as well as
armeb-none-eabi.

	* config/arm/arm_neon.h (vfmlal_lane_low_u32, vfmlal_lane_high_u32,
	vfmlalq_laneq_low_u32, vfmlalq_lane_low_u32, vfmlal_laneq_low_u32,
	vfmlalq_laneq_high_u32, vfmlalq_lane_high_u32, vfmlal_laneq_high_u32,
	vfmlsl_lane_low_u32, vfmlsl_lane_high_u32, vfmlslq_laneq_low_u32,
	vfmlslq_lane_low_u32, vfmlsl_laneq_low_u32, vfmlslq_laneq_high_u32,
	vfmlslq_lane_high_u32, vfmlsl_laneq_high_u32): Define.
	* config/arm/arm_neon_builtins.def (vfmal_lane_low,
	vfmal_lane_lowv4hf, vfmal_lane_lowv8hf, vfmal_lane_high,
	vfmal_lane_highv4hf, vfmal_lane_highv8hf, vfmsl_lane_low,
	vfmsl_lane_lowv4hf, vfmsl_lane_lowv8hf, vfmsl_lane_high,
	vfmsl_lane_highv4hf, vfmsl_lane_highv8hf): New sets of builtins.
	* config/arm/iterators.md (VFMLSEL2, vfmlsel2): New mode attributes.
	(V_lane_reg): Likewise.
	* config/arm/neon.md (neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>):
	New define_expand.
	(neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>): Likewise.
	(vfmal_lane_low<mode>_intrinsic,
	vfmal_lane_low<vfmlsel2><mode>_intrinsic,
	vfmal_lane_high<vfmlsel2><mode>_intrinsic,
	vfmal_lane_high<mode>_intrinsic, vfmsl_lane_low<mode>_intrinsic,
	vfmsl_lane_low<vfmlsel2><mode>_intrinsic,
	vfmsl_lane_high<vfmlsel2><mode>_intrinsic,
	vfmsl_lane_high<mode>_intrinsic): New define_insns.

	* gcc.target/arm/simd/fp16fml_lane_high.c: New test.
	* gcc.target/arm/simd/fp16fml_lane_low.c: New test.

From-SVN: r256540
eccf4d70 · Kyrylo Tkachov · Kyrylo Tkachov · 06e95715 · eccf4d70 · eccf4d70
Commit eccf4d70 authored Jan 11, 2018 by Kyrylo Tkachov Committed by Kyrylo Tkachov Jan 11, 2018
8 changed files
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
 2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

+	* config/arm/arm_neon.h (vfmlal_lane_low_u32, vfmlal_lane_high_u32,
+	vfmlalq_laneq_low_u32, vfmlalq_lane_low_u32, vfmlal_laneq_low_u32,
+	vfmlalq_laneq_high_u32, vfmlalq_lane_high_u32, vfmlal_laneq_high_u32,
+	vfmlsl_lane_low_u32, vfmlsl_lane_high_u32, vfmlslq_laneq_low_u32,
+	vfmlslq_lane_low_u32, vfmlsl_laneq_low_u32, vfmlslq_laneq_high_u32,
+	vfmlslq_lane_high_u32, vfmlsl_laneq_high_u32): Define.
+	* config/arm/arm_neon_builtins.def (vfmal_lane_low,
+	vfmal_lane_lowv4hf, vfmal_lane_lowv8hf, vfmal_lane_high,
+	vfmal_lane_highv4hf, vfmal_lane_highv8hf, vfmsl_lane_low,
+	vfmsl_lane_lowv4hf, vfmsl_lane_lowv8hf, vfmsl_lane_high,
+	vfmsl_lane_highv4hf, vfmsl_lane_highv8hf): New sets of builtins.
+	* config/arm/iterators.md (VFMLSEL2, vfmlsel2): New mode attributes.
+	(V_lane_reg): Likewise.
+	* config/arm/neon.md (neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>):
+	New define_expand.
+	(neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>): Likewise.
+	(vfmal_lane_low<mode>_intrinsic,
+	vfmal_lane_low<vfmlsel2><mode>_intrinsic,
+	vfmal_lane_high<vfmlsel2><mode>_intrinsic,
+	vfmal_lane_high<mode>_intrinsic, vfmsl_lane_low<mode>_intrinsic,
+	vfmsl_lane_low<vfmlsel2><mode>_intrinsic,
+	vfmsl_lane_high<vfmlsel2><mode>_intrinsic,
+	vfmsl_lane_high<mode>_intrinsic): New define_insns.
+
+2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
 	* config/arm/arm-cpus.in (fp16fml): New feature.
 	(ALL_SIMD): Add fp16fml.
 	(armv8.2-a): Add fp16fml as an option.

--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18160,6 +18160,150 @@ vfmlslq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
  return __builtin_neon_vfmsl_highv4sf (__r, __a, __b);
 }

+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+		     const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmal_lane_lowv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+		      const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmal_lane_highv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmal_lane_lowv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmal_lane_lowv4hfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmal_lane_lowv8hfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+			const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmal_lane_highv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmal_lane_highv4hfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmal_lane_highv8hfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+		     const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmsl_lane_lowv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+		      const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmsl_lane_highv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmsl_lane_lowv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmsl_lane_lowv4hfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmsl_lane_lowv8hfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+			const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmsl_lane_highv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmsl_lane_highv4hfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+		       const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmsl_lane_highv8hfv2sf (__r, __a, __b, __index);
+}
+
 #pragma GCC pop_options
 #endif


--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -55,6 +55,18 @@ VAR2 (TERNOP, vfmal_low, v2sf, v4sf)
 VAR2 (TERNOP, vfmal_high, v2sf, v4sf)
 VAR2 (TERNOP, vfmsl_low, v2sf, v4sf)
 VAR2 (TERNOP, vfmsl_high, v2sf, v4sf)
+VAR2 (MAC_LANE, vfmal_lane_low, v2sf, v4sf)
+VAR1 (MAC_LANE, vfmal_lane_lowv4hf, v4sf)
+VAR1 (MAC_LANE, vfmal_lane_lowv8hf, v2sf)
+VAR2 (MAC_LANE, vfmal_lane_high, v2sf, v4sf)
+VAR1 (MAC_LANE, vfmal_lane_highv4hf, v4sf)
+VAR1 (MAC_LANE, vfmal_lane_highv8hf, v2sf)
+VAR2 (MAC_LANE, vfmsl_lane_low, v2sf, v4sf)
+VAR1 (MAC_LANE, vfmsl_lane_lowv4hf, v4sf)
+VAR1 (MAC_LANE, vfmsl_lane_lowv8hf, v2sf)
+VAR2 (MAC_LANE, vfmsl_lane_high, v2sf, v4sf)
+VAR1 (MAC_LANE, vfmsl_lane_highv4hf, v4sf)
+VAR1 (MAC_LANE, vfmsl_lane_highv8hf, v2sf)
 VAR3 (BINOP, vmullp, v8qi, v4hi, v2si)
 VAR3 (BINOP, vmulls, v8qi, v4hi, v2si)
 VAR3 (BINOP, vmullu, v8qi, v4hi, v2si)

--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -484,6 +484,12 @@
 ;; Mode mapping for VFM[A,S]L instructions for the vec_select result.
 (define_mode_attr VFMLSEL [(V2SF "V2HF") (V4SF "V4HF")])

+;; Mode mapping for VFM[A,S]L instructions for some awkward lane-wise forms.
+(define_mode_attr VFMLSEL2 [(V2SF "V8HF") (V4SF "V4HF")])
+
+;; Same as the above, but lowercase.
+(define_mode_attr vfmlsel2 [(V2SF "v8hf") (V4SF "v4hf")])
+
 ;; Similar, for three elements.
 (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
                                (V4HI "BLK") (V8HI "BLK")
@@ -516,6 +522,10 @@
 ;; Output template to select the low VFP register of a mult-register value.
 (define_mode_attr V_lo [(V2SF "") (V4SF  "e")])

+;; Helper attribute for printing output templates for awkward forms of
+;; vfmlal/vfmlsl intrinsics.
+(define_mode_attr V_lane_reg [(V2SF "") (V4SF  "P")])
+
 ;; Wider modes with the same number of elements.
 (define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])


--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
 2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

+	* gcc.target/arm/simd/fp16fml_lane_high.c: New test.
+	* gcc.target/arm/simd/fp16fml_lane_low.c: New test.
+
+2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
 	* gcc.target/arm/multilib.exp: Add combination tests for fp16fml.
 	* gcc.target/arm/simd/fp16fml_high.c: New test.
 	* gcc.target/arm/simd/fp16fml_low.c: Likewise.

--- a/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_high.c
+++ b/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_high.c
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp16fml_neon_ok } */
+/* { dg-add-options arm_fp16fml_neon }  */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlal_lane_high_u32 (r, a, b, 0);
+}
+
+/* Exercise vfmlsl_lane_high_u32 with lane 0 of the float16x4_t operand B.  */
+float32x2_t
+test_vfmlsl_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlsl_lane_high_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlal_laneq_high_u32 (r, a, b, 6);
+}
+
+float32x2_t
+test_vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlsl_laneq_high_u32 (r, a, b, 6);
+}
+
+float32x4_t
+test_vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlalq_lane_high_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlslq_lane_high_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlalq_laneq_high_u32  (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlalq_laneq_high_u32 (r, a, b, 7);
+}
+
+float32x4_t
+test_vfmlslq_laneq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlslq_laneq_high_u32 (r, a, b, 7);
+}
+
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[02468]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[0-9]+\[1\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]\[3\]} 1 } } */
+
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[02468]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[0-9]+\[1\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]\[3\]} 1 } } */
--- a/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_low.c
+++ b/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_low.c
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp16fml_neon_ok } */
+/* { dg-add-options arm_fp16fml_neon }  */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlal_lane_low_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlsl_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlsl_lane_low_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlal_laneq_low_u32 (r, a, b, 6);
+}
+
+float32x2_t
+test_vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlsl_laneq_low_u32 (r, a, b, 6);
+}
+
+float32x4_t
+test_vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlalq_lane_low_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlslq_lane_low_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlalq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlalq_laneq_low_u32 (r, a, b, 7);
+}
+
+float32x4_t
+test_vfmlslq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlslq_laneq_low_u32 (r, a, b, 7);
+}
+
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[13579]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[0-9]+\[1\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[123]?[13579]\[3\]} 1 } } */
+
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[13579]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[0-9]+\[1\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[123]?[13579]\[3\]} 1 } } */