Commit e603cd43 by Mihail Ionescu Committed by Richard Sandiford

aarch64: Add bfloat16 vldn/vstn intrinsics

This patch adds the load/store bfloat16 intrinsics to the AArch64 back-end.
ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest

2020-02-25  Mihail Ionescu  <mihail.ionescu@arm.com>

gcc/
	* config/aarch64/aarch64-builtins.c (aarch64_scalar_builtin_types):
	Add simd_bf.
	(aarch64_init_simd_builtin_scalar_types): Register simd_bf.
	(VAR15, VAR16): New.
	* config/aarch64/iterators.md (VALLDIF): Enable for V4BF and V8BF.
	(VD): Enable for V4BF.
	(VDC): Likewise.
	(VQ): Enable for V8BF.
	(VQ2): Likewise.
	(VQ_NO2E): Likewise.
	(VDBL, Vdbl): Add V4BF.
	(V_INT_EQUIV, v_int_equiv): Add V4BF and V8BF.
	* config/aarch64/arm_neon.h (bfloat16x4x2_t): New typedef.
	(bfloat16x8x2_t): Likewise.
	(bfloat16x4x3_t): Likewise.
	(bfloat16x8x3_t): Likewise.
	(bfloat16x4x4_t): Likewise.
	(bfloat16x8x4_t): Likewise.
	(vcombine_bf16): New.
	(vld1_bf16, vld1_bf16_x2): New.
	(vld1_bf16_x3, vld1_bf16_x4): New.
	(vld1q_bf16, vld1q_bf16_x2): New.
	(vld1q_bf16_x3, vld1q_bf16_x4): New.
	(vld1_lane_bf16): New.
	(vld1q_lane_bf16): New.
	(vld1_dup_bf16): New.
	(vld1q_dup_bf16): New.
	(vld2_bf16): New.
	(vld2q_bf16): New.
	(vld2_dup_bf16): New.
	(vld2q_dup_bf16): New.
	(vld3_bf16): New.
	(vld3q_bf16): New.
	(vld3_dup_bf16): New.
	(vld3q_dup_bf16): New.
	(vld4_bf16): New.
	(vld4q_bf16): New.
	(vld4_dup_bf16): New.
	(vld4q_dup_bf16): New.
	(vst1_bf16, vst1_bf16_x2): New.
	(vst1_bf16_x3, vst1_bf16_x4): New.
	(vst1q_bf16, vst1q_bf16_x2): New.
	(vst1q_bf16_x3, vst1q_bf16_x4): New.
	(vst1_lane_bf16): New.
	(vst1q_lane_bf16): New.
	(vst2_bf16): New.
	(vst2q_bf16): New.
	(vst3_bf16): New.
	(vst3q_bf16): New.
	(vst4_bf16): New.
	(vst4q_bf16): New.

gcc/testsuite/
	* gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c: New test.
parent 8ea6c1b8
2020-02-25 Mihail Ionescu <mihail.ionescu@arm.com> 2020-02-25 Mihail Ionescu <mihail.ionescu@arm.com>
* config/aarch64/aarch64-builtins.c (aarch64_scalar_builtin_types):
Add simd_bf.
(aarch64_init_simd_builtin_scalar_types): Register simd_bf.
(VAR15, VAR16): New.
* config/aarch64/iterators.md (VALLDIF): Enable for V4BF and V8BF.
(VD): Enable for V4BF.
(VDC): Likewise.
(VQ): Enable for V8BF.
(VQ2): Likewise.
(VQ_NO2E): Likewise.
(VDBL, Vdbl): Add V4BF.
(V_INT_EQUIV, v_int_equiv): Add V4BF and V8BF.
* config/aarch64/arm_neon.h (bfloat16x4x2_t): New typedef.
(bfloat16x8x2_t): Likewise.
(bfloat16x4x3_t): Likewise.
(bfloat16x8x3_t): Likewise.
(bfloat16x4x4_t): Likewise.
(bfloat16x8x4_t): Likewise.
(vcombine_bf16): New.
(vld1_bf16, vld1_bf16_x2): New.
(vld1_bf16_x3, vld1_bf16_x4): New.
(vld1q_bf16, vld1q_bf16_x2): New.
(vld1q_bf16_x3, vld1q_bf16_x4): New.
(vld1_lane_bf16): New.
(vld1q_lane_bf16): New.
(vld1_dup_bf16): New.
(vld1q_dup_bf16): New.
(vld2_bf16): New.
(vld2q_bf16): New.
(vld2_dup_bf16): New.
(vld2q_dup_bf16): New.
(vld3_bf16): New.
(vld3q_bf16): New.
(vld3_dup_bf16): New.
(vld3q_dup_bf16): New.
(vld4_bf16): New.
(vld4q_bf16): New.
(vld4_dup_bf16): New.
(vld4q_dup_bf16): New.
(vst1_bf16, vst1_bf16_x2): New.
(vst1_bf16_x3, vst1_bf16_x4): New.
(vst1q_bf16, vst1q_bf16_x2): New.
(vst1q_bf16_x3, vst1q_bf16_x4): New.
(vst1_lane_bf16): New.
(vst1q_lane_bf16): New.
(vst2_bf16): New.
(vst2q_bf16): New.
(vst3_bf16): New.
(vst3q_bf16): New.
(vst4_bf16): New.
(vst4q_bf16): New.
2020-02-25 Mihail Ionescu <mihail.ionescu@arm.com>
* config/aarch64/iterators.md (VDQF_F16) Add V4BF and V8BF. * config/aarch64/iterators.md (VDQF_F16) Add V4BF and V8BF.
(VALL_F16): Likewise. (VALL_F16): Likewise.
(VALLDI_F16): Likewise. (VALLDI_F16): Likewise.
......
...@@ -370,6 +370,12 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] ...@@ -370,6 +370,12 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
#define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \ #define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \ VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \
VAR1 (T, X, MAP, N) VAR1 (T, X, MAP, N)
#define VAR15(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
VAR14 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
VAR1 (T, X, MAP, O)
#define VAR16(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
VAR15 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
VAR1 (T, X, MAP, P)
#include "aarch64-builtin-iterators.h" #include "aarch64-builtin-iterators.h"
...@@ -534,6 +540,7 @@ const char *aarch64_scalar_builtin_types[] = { ...@@ -534,6 +540,7 @@ const char *aarch64_scalar_builtin_types[] = {
"__builtin_aarch64_simd_oi", "__builtin_aarch64_simd_oi",
"__builtin_aarch64_simd_ci", "__builtin_aarch64_simd_ci",
"__builtin_aarch64_simd_xi", "__builtin_aarch64_simd_xi",
"__builtin_aarch64_simd_bf",
NULL NULL
}; };
...@@ -847,6 +854,8 @@ aarch64_init_simd_builtin_scalar_types (void) ...@@ -847,6 +854,8 @@ aarch64_init_simd_builtin_scalar_types (void)
"__builtin_aarch64_simd_poly128"); "__builtin_aarch64_simd_poly128");
(*lang_hooks.types.register_builtin_type) (intTI_type_node, (*lang_hooks.types.register_builtin_type) (intTI_type_node,
"__builtin_aarch64_simd_ti"); "__builtin_aarch64_simd_ti");
(*lang_hooks.types.register_builtin_type) (aarch64_bf16_type_node,
"__builtin_aarch64_simd_bf");
/* Unsigned integer types for various mode sizes. */ /* Unsigned integer types for various mode sizes. */
(*lang_hooks.types.register_builtin_type) (unsigned_intQI_type_node, (*lang_hooks.types.register_builtin_type) (unsigned_intQI_type_node,
"__builtin_aarch64_simd_uqi"); "__builtin_aarch64_simd_uqi");
......
...@@ -87,7 +87,7 @@ ...@@ -87,7 +87,7 @@
(define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI]) (define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI])
;; Double vector modes. ;; Double vector modes.
(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF]) (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF V4BF])
;; Double vector modes suitable for moving. Includes BFmode. ;; Double vector modes suitable for moving. Includes BFmode.
(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF]) (define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
...@@ -105,10 +105,10 @@ ...@@ -105,10 +105,10 @@
(define_mode_iterator VDQ_BHSI [V8QI V16QI V4HI V8HI V2SI V4SI]) (define_mode_iterator VDQ_BHSI [V8QI V16QI V4HI V8HI V2SI V4SI])
;; Quad vector modes. ;; Quad vector modes.
(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF]) (define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF])
;; Copy of the above. ;; Copy of the above.
(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF]) (define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
;; Quad vector modes suitable for moving. Includes BFmode. ;; Quad vector modes suitable for moving. Includes BFmode.
(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF]) (define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
...@@ -120,7 +120,7 @@ ...@@ -120,7 +120,7 @@
(define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI]) (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
;; VQ without 2 element modes. ;; VQ without 2 element modes.
(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF]) (define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])
;; Quad vector with only 2 element modes. ;; Quad vector with only 2 element modes.
(define_mode_iterator VQ_2E [V2DI V2DF]) (define_mode_iterator VQ_2E [V2DI V2DF])
...@@ -200,7 +200,7 @@ ...@@ -200,7 +200,7 @@
V4HF V8HF V4BF V8BF V2SF V4SF V2DF DI]) V4HF V8HF V4BF V8BF V2SF V4SF V2DF DI])
;; All Advanced SIMD modes, plus DI and DF. ;; All Advanced SIMD modes, plus DI and DF.
(define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI (define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI V4BF V8BF
V2DI V4HF V8HF V2SF V4SF V2DF DI DF]) V2DI V4HF V8HF V2SF V4SF V2DF DI DF])
;; Advanced SIMD modes for Integer reduction across lanes. ;; Advanced SIMD modes for Integer reduction across lanes.
...@@ -226,7 +226,7 @@ ...@@ -226,7 +226,7 @@
(define_mode_iterator VQW [V16QI V8HI V4SI]) (define_mode_iterator VQW [V16QI V8HI V4SI])
;; Double vector modes for combines. ;; Double vector modes for combines.
(define_mode_iterator VDC [V8QI V4HI V4HF V2SI V2SF DI DF]) (define_mode_iterator VDC [V8QI V4HI V4BF V4HF V2SI V2SF DI DF])
;; Advanced SIMD modes except double int. ;; Advanced SIMD modes except double int.
(define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF]) (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
...@@ -1171,7 +1171,7 @@ ...@@ -1171,7 +1171,7 @@
;; Double modes of vector modes. ;; Double modes of vector modes.
(define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI") (define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI")
(V4HF "V8HF") (V4HF "V8HF") (V4BF "V8BF")
(V2SI "V4SI") (V2SF "V4SF") (V2SI "V4SI") (V2SF "V4SF")
(SI "V2SI") (DI "V2DI") (SI "V2SI") (DI "V2DI")
(DF "V2DF")]) (DF "V2DF")])
...@@ -1181,7 +1181,7 @@ ...@@ -1181,7 +1181,7 @@
;; Double modes of vector modes (lower case). ;; Double modes of vector modes (lower case).
(define_mode_attr Vdbl [(V8QI "v16qi") (V4HI "v8hi") (define_mode_attr Vdbl [(V8QI "v16qi") (V4HI "v8hi")
(V4HF "v8hf") (V4HF "v8hf") (V4BF "v8bf")
(V2SI "v4si") (V2SF "v4sf") (V2SI "v4si") (V2SF "v4sf")
(SI "v2si") (DI "v2di") (SI "v2si") (DI "v2di")
(DF "v2df")]) (DF "v2df")])
...@@ -1314,6 +1314,7 @@ ...@@ -1314,6 +1314,7 @@
(V2SI "V2SI") (V4SI "V4SI") (V2SI "V2SI") (V4SI "V4SI")
(DI "DI") (V2DI "V2DI") (DI "DI") (V2DI "V2DI")
(V4HF "V4HI") (V8HF "V8HI") (V4HF "V4HI") (V8HF "V8HI")
(V4BF "V4HI") (V8BF "V8HI")
(V2SF "V2SI") (V4SF "V4SI") (V2SF "V2SI") (V4SF "V4SI")
(DF "DI") (V2DF "V2DI") (DF "DI") (V2DF "V2DI")
(SF "SI") (SI "SI") (SF "SI") (SI "SI")
...@@ -1331,6 +1332,7 @@ ...@@ -1331,6 +1332,7 @@
(V2SI "v2si") (V4SI "v4si") (V2SI "v2si") (V4SI "v4si")
(DI "di") (V2DI "v2di") (DI "di") (V2DI "v2di")
(V4HF "v4hi") (V8HF "v8hi") (V4HF "v4hi") (V8HF "v8hi")
(V4BF "v4hi") (V8BF "v8hi")
(V2SF "v2si") (V4SF "v4si") (V2SF "v2si") (V4SF "v4si")
(DF "di") (V2DF "v2di") (DF "di") (V2DF "v2di")
(SF "si") (SF "si")
......
2020-02-25 Mihail Ionescu <mihail.ionescu@arm.com> 2020-02-25 Mihail Ionescu <mihail.ionescu@arm.com>
* gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c: New test.
2020-02-25 Mihail Ionescu <mihail.ionescu@arm.com>
* gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c: New test. * gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c: New test. * gcc.target/aarch64/advsimd-intrinsics/bf16_reinterpret.c: New test.
......
/* { dg-do assemble { target { aarch64*-*-* } } } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
#include <arm_neon.h>
bfloat16x4_t
test_vld1_dup_bf16 (bfloat16_t * ptr)
{
return vld1_dup_bf16 (ptr);
}
bfloat16x8_t
test_vld1q_dup_bf16 (bfloat16_t * ptr)
{
return vld1q_dup_bf16 (ptr);
}
bfloat16x4_t
test_vld1_lane_bf16 (bfloat16_t * ptr, bfloat16x4_t src)
{
return vld1_lane_bf16 (ptr, src, 3);
}
bfloat16x8_t
test_vld1q_lane_bf16 (bfloat16_t * ptr, bfloat16x8_t src)
{
return vld1q_lane_bf16 (ptr, src, 7);
}
bfloat16x4_t
test_vld1_bf16 (bfloat16_t * ptr)
{
return vld1_bf16 (ptr);
}
bfloat16x8_t
test_vld1q_bf16 (bfloat16_t * ptr)
{
return vld1q_bf16 (ptr);
}
bfloat16x4x2_t
test_vld1_bf16_x2 (bfloat16_t * ptr)
{
return vld1_bf16_x2 (ptr);
}
bfloat16x8x2_t
test_vld1q_bf16_x2 (bfloat16_t * ptr)
{
return vld1q_bf16_x2 (ptr);
}
bfloat16x4x3_t
test_vld1_bf16_x3 (bfloat16_t * ptr)
{
return vld1_bf16_x3 (ptr);
}
bfloat16x8x3_t
test_vld1q_bf16_x3 (bfloat16_t * ptr)
{
return vld1q_bf16_x3 (ptr);
}
bfloat16x4x4_t
test_vld1_bf16_x4 (bfloat16_t * ptr)
{
return vld1_bf16_x4 (ptr);
}
bfloat16x8x4_t
test_vld1q_bf16_x4 (bfloat16_t * ptr)
{
return vld1q_bf16_x4 (ptr);
}
bfloat16x4x2_t
test_vld2_bf16 (bfloat16_t * ptr)
{
return vld2_bf16 (ptr);
}
bfloat16x8x2_t
test_vld2q_bf16 (bfloat16_t * ptr)
{
return vld2q_bf16 (ptr);
}
bfloat16x4x2_t
test_vld2_dup_bf16 (bfloat16_t * ptr)
{
return vld2_dup_bf16 (ptr);
}
bfloat16x8x2_t
test_vld2q_dup_bf16 (bfloat16_t * ptr)
{
return vld2q_dup_bf16 (ptr);
}
bfloat16x4x3_t
test_vld3_bf16 (bfloat16_t * ptr)
{
return vld3_bf16 (ptr);
}
bfloat16x8x3_t
test_vld3q_bf16 (bfloat16_t * ptr)
{
return vld3q_bf16 (ptr);
}
bfloat16x4x3_t
test_vld3_dup_bf16 (bfloat16_t * ptr)
{
return vld3_dup_bf16 (ptr);
}
bfloat16x8x3_t
test_vld3q_dup_bf16 (bfloat16_t * ptr)
{
return vld3q_dup_bf16 (ptr);
}
bfloat16x4x4_t
test_vld4_bf16 (bfloat16_t * ptr)
{
return vld4_bf16 (ptr);
}
bfloat16x8x4_t
test_vld4q_bf16 (bfloat16_t * ptr)
{
return vld4q_bf16 (ptr);
}
bfloat16x4x4_t
test_vld4_dup_bf16 (bfloat16_t * ptr)
{
return vld4_dup_bf16 (ptr);
}
bfloat16x8x4_t
test_vld4q_dup_bf16 (bfloat16_t * ptr)
{
return vld4q_dup_bf16 (ptr);
}
/* { dg-do assemble { target { aarch64*-*-* } } } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
#include <arm_neon.h>
void
test_vst1_bf16_x2 (bfloat16_t *ptr, bfloat16x4x2_t val)
{
vst1_bf16_x2 (ptr, val);
}
void
test_vst1q_bf16_x2 (bfloat16_t *ptr, bfloat16x8x2_t val)
{
vst1q_bf16_x2 (ptr, val);
}
void
test_vst1_bf16_x3 (bfloat16_t *ptr, bfloat16x4x3_t val)
{
vst1_bf16_x3 (ptr, val);
}
void
test_vst1q_bf16_x3 (bfloat16_t *ptr, bfloat16x8x3_t val)
{
vst1q_bf16_x3 (ptr, val);
}
void
test_vst1_bf16_x4 (bfloat16_t *ptr, bfloat16x4x4_t val)
{
vst1_bf16_x4 (ptr, val);
}
void
test_vst1q_bf16_x4 (bfloat16_t *ptr, bfloat16x8x4_t val)
{
vst1q_bf16_x4 (ptr, val);
}
void
test_vst1_lane_bf16 (bfloat16_t *ptr, bfloat16x4_t val)
{
vst1_lane_bf16 (ptr, val, 3);
}
void
test_vst1q_lane_bf16 (bfloat16_t *ptr, bfloat16x8_t val)
{
vst1q_lane_bf16 (ptr, val, 7);
}
void
test_vst1_bf16 (bfloat16_t *ptr, bfloat16x4_t val)
{
vst1_bf16 (ptr, val);
}
void
test_vst1q_bf16 (bfloat16_t *ptr, bfloat16x8_t val)
{
vst1q_bf16 (ptr, val);
}
void
test_vst2_bf16 (bfloat16_t *ptr, bfloat16x4x2_t val)
{
vst2_bf16 (ptr, val);
}
void
test_vst2q_bf16 (bfloat16_t *ptr, bfloat16x8x2_t val)
{
vst2q_bf16 (ptr, val);
}
void
test_vst3_bf16 (bfloat16_t *ptr, bfloat16x4x3_t val)
{
vst3_bf16 (ptr, val);
}
void
test_vst3q_bf16 (bfloat16_t *ptr, bfloat16x8x3_t val)
{
vst3q_bf16 (ptr, val);
}
void
test_vst4_bf16 (bfloat16_t *ptr, bfloat16x4x4_t val)
{
vst4_bf16 (ptr, val);
}
void
test_vst4q_bf16 (bfloat16_t *ptr, bfloat16x8x4_t val)
{
vst4q_bf16 (ptr, val);
}
int main()
{
return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment