aarch64: Add bfloat16 vdup and vreinterpret ACLE intrinsics

This patch adds support for the bf16 duplicate and reinterpret intrinsics. ACLE documents are at https://developer.arm.com/docs/101028/latest ISA documents are at https://developer.arm.com/docs/ddi0596/latest 2020-02-25 Mihail Ionescu <mihail.ionescu@arm.com> gcc/ * config/aarch64/iterators.md (VDQF_F16) Add V4BF and V8BF. (VALL_F16): Likewise. (VALLDI_F16): Likewise. (Vtype): Likewise. (Vetype): Likewise. (vswap_width_name): Likewise. (VSWAP_WIDTH): Likewise. (Vel): Likewise. (VEL): Likewise. (q): Likewise. * config/aarch64/arm_neon.h (vset_lane_bf16, vsetq_lane_bf16): New. (vget_lane_bf16, vgetq_lane_bf16): New. (vcreate_bf16): New. (vdup_n_bf16, vdupq_n_bf16): New. (vdup_lane_bf16, vdup_laneq_bf16): New. (vdupq_lane_bf16, vdupq_laneq_bf16): New. (vduph_lane_bf16, vduph_laneq_bf16): New. (vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New. (vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New. (vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New. (vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New. (vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New. (vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New. (vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New. (vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New. (vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New. (vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New. (vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New (vreinterpret_bf16_f16, vreinterpretq_bf16_f16): New (vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New. (vreinterpret_bf16_f64, vreinterpretq_bf16_f64): New. (vreinterpretq_bf16_p128): New. (vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New. (vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New. (vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New. (vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New. (vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New. (vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New. (vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New. (vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New. (vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New. (vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New. (vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New. (vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New. (vreinterpret_f64_bf16,vreinterpretq_f64_bf16): New. (vreinterpret_f16_bf16,vreinterpretq_f16_bf16): New. (vreinterpretq_p128_bf16): New. gcc/testsuite/ * gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c: New test. * gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c: New test.

aarch64: Add bfloat16 vdup and vreinterpret ACLE intrinsics
This patch adds support for the bf16 duplicate and reinterpret intrinsics. ACLE documents are at https://developer.arm.com/docs/101028/latest ISA documents are at https://developer.arm.com/docs/ddi0596/latest 2020-02-25 Mihail Ionescu <mihail.ionescu@arm.com> gcc/ * config/aarch64/iterators.md (VDQF_F16) Add V4BF and V8BF. (VALL_F16): Likewise. (VALLDI_F16): Likewise. (Vtype): Likewise. (Vetype): Likewise. (vswap_width_name): Likewise. (VSWAP_WIDTH): Likewise. (Vel): Likewise. (VEL): Likewise. (q): Likewise. * config/aarch64/arm_neon.h (vset_lane_bf16, vsetq_lane_bf16): New. (vget_lane_bf16, vgetq_lane_bf16): New. (vcreate_bf16): New. (vdup_n_bf16, vdupq_n_bf16): New. (vdup_lane_bf16, vdup_laneq_bf16): New. (vdupq_lane_bf16, vdupq_laneq_bf16): New. (vduph_lane_bf16, vduph_laneq_bf16): New. (vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New. (vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New. (vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New. (vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New. (vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New. (vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New. (vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New. (vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New. (vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New. (vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New. (vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New (vreinterpret_bf16_f16, vreinterpretq_bf16_f16): New (vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New. (vreinterpret_bf16_f64, vreinterpretq_bf16_f64): New. (vreinterpretq_bf16_p128): New. (vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New. (vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New. (vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New. (vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New. (vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New. (vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New. (vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New. (vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New. (vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New. (vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New. (vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New. (vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New. (vreinterpret_f64_bf16,vreinterpretq_f64_bf16): New. (vreinterpret_f16_bf16,vreinterpretq_f16_bf16): New. (vreinterpretq_p128_bf16): New. gcc/testsuite/ * gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c: New test. * gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c: New test.
8ea6c1b8 · Mihail Ionescu · Richard Sandiford · 76a8c0f6 · 8ea6c1b8 · 8ea6c1b8
Commit 8ea6c1b8 authored Feb 18, 2020 by Mihail Ionescu Committed by Richard Sandiford Feb 25, 2020
6 changed files
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
+2020-02-25  Mihail Ionescu  <mihail.ionescu@arm.com>
+	* config/aarch64/iterators.md (VDQF_F16) Add V4BF and V8BF.
+	(VALL_F16): Likewise.
+	(VALLDI_F16): Likewise.
+	(Vtype): Likewise.
+	(Vetype): Likewise.
+	(vswap_width_name): Likewise.
+	(VSWAP_WIDTH): Likewise.
+	(Vel): Likewise.
+	(VEL): Likewise.
+	(q): Likewise.
+	* config/aarch64/arm_neon.h (vset_lane_bf16, vsetq_lane_bf16): New.
+	(vget_lane_bf16, vgetq_lane_bf16): New.
+	(vcreate_bf16): New.
+	(vdup_n_bf16, vdupq_n_bf16): New.
+	(vdup_lane_bf16, vdup_laneq_bf16): New.
+	(vdupq_lane_bf16, vdupq_laneq_bf16): New.
+	(vduph_lane_bf16, vduph_laneq_bf16): New.
+	(vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New.
+	(vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New.
+	(vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New.
+	(vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New.
+	(vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New.
+	(vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New.
+	(vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New.
+	(vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New.
+	(vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New.
+	(vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New.
+	(vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New
+	(vreinterpret_bf16_f16, vreinterpretq_bf16_f16): New
+	(vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New.
+	(vreinterpret_bf16_f64, vreinterpretq_bf16_f64): New.
+	(vreinterpretq_bf16_p128): New.
+	(vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New.
+	(vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New.
+	(vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New.
+	(vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New.
+	(vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New.
+	(vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New.
+	(vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New.
+	(vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New.
+	(vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New.
+	(vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New.
+	(vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New.
+	(vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New.
+	(vreinterpret_f64_bf16,vreinterpretq_f64_bf16): New.
+	(vreinterpret_f16_bf16,vreinterpretq_f16_bf16): New.
+	(vreinterpretq_p128_bf16): New.
 2020-02-25  Dennis Zhang  <dennis.zhang@arm.com>
 	* config/arm/arm_neon.h (vbfdot_f32, vbfdotq_f32): New

--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -139,7 +139,8 @@
 (define_mode_iterator PTR [(SI "ptr_mode == SImode") (DI "ptr_mode == DImode")])
 ;; Advanced SIMD Float modes suitable for moving, loading and storing.
-(define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF])
+(define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF
+				V4BF V8BF])
 ;; Advanced SIMD Float modes.
 (define_mode_iterator VDQF [V2SF V4SF V2DF])
@@ -180,7 +181,7 @@
 ;; All Advanced SIMD modes suitable for moving, loading, and storing.
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
-				V4HF V8HF V2SF V4SF V2DF])
+				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
 ;; All Advanced SIMD modes suitable for moving, loading, and storing,
 ;; including special Bfloat vector types.
@@ -196,7 +197,7 @@
 ;; All Advanced SIMD modes and DI.
 (define_mode_iterator VALLDI_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
-				  V4HF V8HF V2SF V4SF V2DF DI])
+				  V4HF V8HF V4BF V8BF V2SF V4SF V2DF DI])
 ;; All Advanced SIMD modes, plus DI and DF.
 (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI
@@ -972,6 +973,7 @@
 (define_mode_attr Vtype [(V8QI "8b") (V16QI "16b")
 			 (V4HI "4h") (V8HI  "8h")
+			 (V4BF "4h") (V8BF  "8h")
                         (V2SI "2s") (V4SI  "4s")
                         (DI   "1d") (DF    "1d")
                         (V2DI "2d") (V2SF "2s")
@@ -1015,6 +1017,7 @@
 			  (VNx4SF "s") (VNx2SF "s")
 			  (VNx2DI "d")
 			  (VNx2DF "d")
+			  (BF "h") (V4BF "h") (V8BF "h")
 			  (HF "h")
 			  (SF "s") (DF "d")
 			  (QI "b") (HI "h")
@@ -1083,6 +1086,7 @@
 		       (DF   "DF") (V2DF  "DF")
 		       (SI   "SI") (HI    "HI")
 		       (QI   "QI")
+		       (V4BF "BF") (V8BF "BF")
 		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
 		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
 		       (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF")
@@ -1102,6 +1106,7 @@
 		       (V2DF "df") (DF   "df")
 		       (SI   "si") (HI   "hi")
 		       (QI   "qi")
+		       (V4BF "bf") (V8BF "bf")
 		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
 		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
 		       (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf")
@@ -1422,6 +1427,7 @@
 (define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI")
 				(V4HI "V8HI") (V8HI  "V4HI")
+				(V8BF "V4BF") (V4BF  "V8BF")
 				(V2SI "V4SI") (V4SI  "V2SI")
 				(DI   "V2DI") (V2DI  "DI")
 				(V2SF "V4SF") (V4SF  "V2SF")
@@ -1434,6 +1440,7 @@
 				    (DI   "to_128") (V2DI  "to_64")
 				    (V4HF "to_128") (V8HF  "to_64")
 				    (V2SF "to_128") (V4SF  "to_64")
+				    (V4BF "to_128") (V8BF  "to_64")
 				    (DF   "to_128") (V2DF  "to_64")])
 ;; For certain vector-by-element multiplication instructions we must
@@ -1467,6 +1474,7 @@
 ;; Defined to '_q' for 128-bit types.
 (define_mode_attr q [(V8QI "") (V16QI "_q")
 		     (V4HI "") (V8HI  "_q")
+		     (V4BF "") (V8BF  "_q")
 		     (V2SI "") (V4SI  "_q")
 		     (DI   "") (V2DI  "_q")
 		     (V4HF "") (V8HF "_q")

--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
+2020-02-25  Mihail Ionescu  <mihail.ionescu@arm.com>
+	* gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c: New test.
+	* gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c: New test.
 2020-02-25  Dennis Zhang  <dennis.zhang@arm.com>
 	* gcc.target/arm/simd/bf16_dot_1.c: New test.

--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-save-temps" } */
+#include <arm_neon.h>
+float32x2_t test_vcreate (float32x2_t r, uint64_t a, uint64_t b)
+{
+  bfloat16x4_t _a = vcreate_bf16(a);
+  bfloat16x4_t _b = vcreate_bf16(b);
+  return vbfdot_f32 (r, _a, _b);
+}
+/* { dg-final { scan-assembler {bfdot\tv[0-9]+.2s, v[0-9]+.4h, v[0-9]+.4h} } } */
+bfloat16x4_t test_vset_lane_bf16 (bfloat16_t a, bfloat16x4_t b)
+{
+  return vset_lane_bf16 (a, b, 3);
+}
+bfloat16x8_t test_vsetq_lane_bf16 (bfloat16_t a, bfloat16x8_t b)
+{
+  return vsetq_lane_bf16 (a, b, 7);
+}
+/* { dg-final { scan-assembler-times "ins\\t" 2 } } */
+bfloat16x4_t vdup_test (bfloat16_t a)
+{
+  return vdup_n_bf16 (a);
+}
+/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.4h, v\[0-9\]+.h\\\[0\\\]" } } */
+bfloat16x8_t vdupq_test (bfloat16_t a)
+{
+  return vdupq_n_bf16 (a);
+}
+bfloat16x8_t test_vdupq_lane_bf16 (bfloat16x4_t a)
+{
+  return vdupq_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler-times "dup\\tv\[0-9\]+\.8h, v\[0-9\]+.h\\\[0\\\]" 2 } } */
+bfloat16_t test_vget_lane_bf16 (bfloat16x4_t a)
+{
+  return vget_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[1\\\]" 2 } } */
+bfloat16x4_t test_vdup_lane_bf16 (bfloat16x4_t a)
+{
+  return vdup_lane_bf16 (a, 1);
+}
+/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.4h, v\[0-9\]+\.h\\\[1\\\]" } } */
+bfloat16x4_t test_vdup_laneq_bf16 (bfloat16x8_t a)
+{
+  return vdup_laneq_bf16 (a, 7);
+}
+/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.8h, v\[0-9\]+\.h\\\[7\\\]" } } */
+bfloat16x8_t test_vdupq_laneq_bf16 (bfloat16x8_t a)
+{
+  return vdupq_laneq_bf16 (a, 5);
+}
+/* { dg-final { scan-assembler "dup\\tv\[0-9\]+\.8h, v\[0-9\]+\.h\\\[5\\\]" } } */
+bfloat16_t test_vduph_lane_bf16 (bfloat16x4_t a)
+{
+  return vduph_lane_bf16 (a, 3);
+}
+/* { dg-final { scan-assembler "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[3\\\]" } } */
+bfloat16_t test_vgetq_lane_bf16 (bfloat16x8_t a)
+{
+  return vgetq_lane_bf16 (a, 7);
+}
+bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a)
+{
+  return vduph_laneq_bf16 (a, 7);
+}
+/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[7\\\]" 2 } } */
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c