Commit e603cd43 by Mihail Ionescu, committed by Richard Sandiford

aarch64: Add bfloat16 vldn/vstn intrinsics

This patch adds the bfloat16 vldN/vstN load/store intrinsics to the AArch64 back end.
ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest
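
As a quick illustration of what the new intrinsics enable (a hypothetical
helper, not part of the patch; it needs a -march setting that includes the
bf16 extension, e.g. armv8.2-a+bf16):

#include <arm_neon.h>

/* Copy eight bfloat16 values through one 128-bit register using the
   new vld1q/vst1q intrinsics.  */
void
copy8_bf16 (bfloat16_t *src, bfloat16_t *dst)
{
  bfloat16x8_t v = vld1q_bf16 (src);
  vst1q_bf16 (dst, v);
}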

2020-02-25  Mihail Ionescu  <mihail.ionescu@arm.com>

gcc/
	* config/aarch64/aarch64-builtins.c (aarch64_scalar_builtin_types):
	Add simd_bf.
	(aarch64_init_simd_builtin_scalar_types): Register simd_bf.
	(VAR15, VAR16): New.
	* config/aarch64/iterators.md (VALLDIF): Enable for V4BF and V8BF.
	(VD): Enable for V4BF.
	(VDC): Likewise.
	(VQ): Enable for V8BF.
	(VQ2): Likewise.
	(VQ_NO2E): Likewise.
	(VDBL, Vdbl): Add V4BF.
	(V_INT_EQUIV, v_int_equiv): Add V4BF and V8BF.
	* config/aarch64/arm_neon.h (bfloat16x4x2_t): New typedef.
	(bfloat16x8x2_t): Likewise.
	(bfloat16x4x3_t): Likewise.
	(bfloat16x8x3_t): Likewise.
	(bfloat16x4x4_t): Likewise.
	(bfloat16x8x4_t): Likewise.
	(vcombine_bf16): New.
	(vld1_bf16, vld1_bf16_x2): New.
	(vld1_bf16_x3, vld1_bf16_x4): New.
	(vld1q_bf16, vld1q_bf16_x2): New.
	(vld1q_bf16_x3, vld1q_bf16_x4): New.
	(vld1_lane_bf16): New.
	(vld1q_lane_bf16): New.
	(vld1_dup_bf16): New.
	(vld1q_dup_bf16): New.
	(vld2_bf16): New.
	(vld2q_bf16): New.
	(vld2_dup_bf16): New.
	(vld2q_dup_bf16): New.
	(vld3_bf16): New.
	(vld3q_bf16): New.
	(vld3_dup_bf16): New.
	(vld3q_dup_bf16): New.
	(vld4_bf16): New.
	(vld4q_bf16): New.
	(vld4_dup_bf16): New.
	(vld4q_dup_bf16): New.
	(vst1_bf16, vst1_bf16_x2): New.
	(vst1_bf16_x3, vst1_bf16_x4): New.
	(vst1q_bf16, vst1q_bf16_x2): New.
	(vst1q_bf16_x3, vst1q_bf16_x4): New.
	(vst1_lane_bf16): New.
	(vst1q_lane_bf16): New.
	(vst2_bf16): New.
	(vst2q_bf16): New.
	(vst3_bf16): New.
	(vst3q_bf16): New.
	(vst4_bf16): New.
	(vst4q_bf16): New.

gcc/testsuite/
	* gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c: New test.
	* gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c: New test.
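
The six new tuple typedefs follow the same ACLE convention as the existing
float16x4x2_t and friends: a struct wrapping an array of two, three or four
vectors named val.  A sketch of their shape (the authoritative declarations
are the ones the patch adds to arm_neon.h):

typedef struct bfloat16x4x2_t
{
  bfloat16x4_t val[2];
} bfloat16x4x2_t;

typedef struct bfloat16x8x2_t
{
  bfloat16x8_t val[2];
} bfloat16x8x2_t;

/* ...and likewise the x3/x4 variants, with val[3] and val[4].  */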
gcc/config/aarch64/aarch64-builtins.c:

@@ -370,6 +370,12 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
   VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \
   VAR1 (T, X, MAP, N)
+#define VAR15(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
+  VAR14 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \
+  VAR1 (T, X, MAP, O)
+#define VAR16(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
+  VAR15 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \
+  VAR1 (T, X, MAP, P)

 #include "aarch64-builtin-iterators.h"
@@ -534,6 +540,7 @@ const char *aarch64_scalar_builtin_types[] = {
   "__builtin_aarch64_simd_oi",
   "__builtin_aarch64_simd_ci",
   "__builtin_aarch64_simd_xi",
+  "__builtin_aarch64_simd_bf",
   NULL
 };

@@ -847,6 +854,8 @@ aarch64_init_simd_builtin_scalar_types (void)
                                              "__builtin_aarch64_simd_poly128");
   (*lang_hooks.types.register_builtin_type) (intTI_type_node,
                                              "__builtin_aarch64_simd_ti");
+  (*lang_hooks.types.register_builtin_type) (aarch64_bf16_type_node,
+                                             "__builtin_aarch64_simd_bf");

   /* Unsigned integer types for various mode sizes.  */
   (*lang_hooks.types.register_builtin_type) (unsigned_intQI_type_node,
                                              "__builtin_aarch64_simd_uqi");
gcc/config/aarch64/iterators.md:

@@ -87,7 +87,7 @@
 (define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI])

 ;; Double vector modes.
-(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
+(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF V4BF])

 ;; Double vector modes suitable for moving.  Includes BFmode.
 (define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])

@@ -105,10 +105,10 @@
 (define_mode_iterator VDQ_BHSI [V8QI V16QI V4HI V8HI V2SI V4SI])

 ;; Quad vector modes.
-(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
+(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF])

 ;; Copy of the above.
-(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
+(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])

 ;; Quad vector modes suitable for moving.  Includes BFmode.
 (define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])

@@ -120,7 +120,7 @@
 (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])

 ;; VQ without 2 element modes.
-(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF])
+(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF])

 ;; Quad vector with only 2 element modes.
 (define_mode_iterator VQ_2E [V2DI V2DF])

@@ -200,7 +200,7 @@
                              V4HF V8HF V4BF V8BF V2SF V4SF V2DF DI])

 ;; All Advanced SIMD modes, plus DI and DF.
-(define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI
+(define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI V4BF V8BF
                                V2DI V4HF V8HF V2SF V4SF V2DF DI DF])

 ;; Advanced SIMD modes for Integer reduction across lanes.

@@ -226,7 +226,7 @@
 (define_mode_iterator VQW [V16QI V8HI V4SI])

 ;; Double vector modes for combines.
-(define_mode_iterator VDC [V8QI V4HI V4HF V2SI V2SF DI DF])
+(define_mode_iterator VDC [V8QI V4HI V4BF V4HF V2SI V2SF DI DF])

 ;; Advanced SIMD modes except double int.
 (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF])

@@ -1171,7 +1171,7 @@
 ;; Double modes of vector modes.
 (define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI")
-                        (V4HF "V8HF")
+                        (V4HF "V8HF") (V4BF "V8BF")
                         (V2SI "V4SI") (V2SF "V4SF")
                         (SI "V2SI") (DI "V2DI")
                         (DF "V2DF")])

@@ -1181,7 +1181,7 @@
 ;; Double modes of vector modes (lower case).
 (define_mode_attr Vdbl [(V8QI "v16qi") (V4HI "v8hi")
-                        (V4HF "v8hf")
+                        (V4HF "v8hf") (V4BF "v8bf")
                         (V2SI "v4si") (V2SF "v4sf")
                         (SI "v2si") (DI "v2di")
                         (DF "v2df")])
@@ -1314,6 +1314,7 @@
                             (V2SI "V2SI") (V4SI "V4SI")
                             (DI "DI") (V2DI "V2DI")
                             (V4HF "V4HI") (V8HF "V8HI")
+                            (V4BF "V4HI") (V8BF "V8HI")
                             (V2SF "V2SI") (V4SF "V4SI")
                             (DF "DI") (V2DF "V2DI")
                             (SF "SI") (SI "SI")

@@ -1331,6 +1332,7 @@
                             (V2SI "v2si") (V4SI "v4si")
                             (DI "di") (V2DI "v2di")
                             (V4HF "v4hi") (V8HF "v8hi")
+                            (V4BF "v4hi") (V8BF "v8hi")
                             (V2SF "v2si") (V4SF "v4si")
                             (DF "di") (V2DF "v2di")
                             (SF "si")
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c (new test):
/* { dg-do assemble { target { aarch64*-*-* } } } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */

#include <arm_neon.h>

bfloat16x4_t
test_vld1_dup_bf16 (bfloat16_t * ptr)
{
  return vld1_dup_bf16 (ptr);
}

bfloat16x8_t
test_vld1q_dup_bf16 (bfloat16_t * ptr)
{
  return vld1q_dup_bf16 (ptr);
}

bfloat16x4_t
test_vld1_lane_bf16 (bfloat16_t * ptr, bfloat16x4_t src)
{
  return vld1_lane_bf16 (ptr, src, 3);
}

bfloat16x8_t
test_vld1q_lane_bf16 (bfloat16_t * ptr, bfloat16x8_t src)
{
  return vld1q_lane_bf16 (ptr, src, 7);
}

bfloat16x4_t
test_vld1_bf16 (bfloat16_t * ptr)
{
  return vld1_bf16 (ptr);
}

bfloat16x8_t
test_vld1q_bf16 (bfloat16_t * ptr)
{
  return vld1q_bf16 (ptr);
}

bfloat16x4x2_t
test_vld1_bf16_x2 (bfloat16_t * ptr)
{
  return vld1_bf16_x2 (ptr);
}

bfloat16x8x2_t
test_vld1q_bf16_x2 (bfloat16_t * ptr)
{
  return vld1q_bf16_x2 (ptr);
}

bfloat16x4x3_t
test_vld1_bf16_x3 (bfloat16_t * ptr)
{
  return vld1_bf16_x3 (ptr);
}

bfloat16x8x3_t
test_vld1q_bf16_x3 (bfloat16_t * ptr)
{
  return vld1q_bf16_x3 (ptr);
}

bfloat16x4x4_t
test_vld1_bf16_x4 (bfloat16_t * ptr)
{
  return vld1_bf16_x4 (ptr);
}

bfloat16x8x4_t
test_vld1q_bf16_x4 (bfloat16_t * ptr)
{
  return vld1q_bf16_x4 (ptr);
}

bfloat16x4x2_t
test_vld2_bf16 (bfloat16_t * ptr)
{
  return vld2_bf16 (ptr);
}

bfloat16x8x2_t
test_vld2q_bf16 (bfloat16_t * ptr)
{
  return vld2q_bf16 (ptr);
}

bfloat16x4x2_t
test_vld2_dup_bf16 (bfloat16_t * ptr)
{
  return vld2_dup_bf16 (ptr);
}

bfloat16x8x2_t
test_vld2q_dup_bf16 (bfloat16_t * ptr)
{
  return vld2q_dup_bf16 (ptr);
}

bfloat16x4x3_t
test_vld3_bf16 (bfloat16_t * ptr)
{
  return vld3_bf16 (ptr);
}

bfloat16x8x3_t
test_vld3q_bf16 (bfloat16_t * ptr)
{
  return vld3q_bf16 (ptr);
}

bfloat16x4x3_t
test_vld3_dup_bf16 (bfloat16_t * ptr)
{
  return vld3_dup_bf16 (ptr);
}

bfloat16x8x3_t
test_vld3q_dup_bf16 (bfloat16_t * ptr)
{
  return vld3q_dup_bf16 (ptr);
}

bfloat16x4x4_t
test_vld4_bf16 (bfloat16_t * ptr)
{
  return vld4_bf16 (ptr);
}

bfloat16x8x4_t
test_vld4q_bf16 (bfloat16_t * ptr)
{
  return vld4q_bf16 (ptr);
}

bfloat16x4x4_t
test_vld4_dup_bf16 (bfloat16_t * ptr)
{
  return vld4_dup_bf16 (ptr);
}

bfloat16x8x4_t
test_vld4q_dup_bf16 (bfloat16_t * ptr)
{
  return vld4q_dup_bf16 (ptr);
}
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c (new test):

/* { dg-do assemble { target { aarch64*-*-* } } } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */

#include <arm_neon.h>

void
test_vst1_bf16_x2 (bfloat16_t *ptr, bfloat16x4x2_t val)
{
  vst1_bf16_x2 (ptr, val);
}

void
test_vst1q_bf16_x2 (bfloat16_t *ptr, bfloat16x8x2_t val)
{
  vst1q_bf16_x2 (ptr, val);
}

void
test_vst1_bf16_x3 (bfloat16_t *ptr, bfloat16x4x3_t val)
{
  vst1_bf16_x3 (ptr, val);
}

void
test_vst1q_bf16_x3 (bfloat16_t *ptr, bfloat16x8x3_t val)
{
  vst1q_bf16_x3 (ptr, val);
}

void
test_vst1_bf16_x4 (bfloat16_t *ptr, bfloat16x4x4_t val)
{
  vst1_bf16_x4 (ptr, val);
}

void
test_vst1q_bf16_x4 (bfloat16_t *ptr, bfloat16x8x4_t val)
{
  vst1q_bf16_x4 (ptr, val);
}

void
test_vst1_lane_bf16 (bfloat16_t *ptr, bfloat16x4_t val)
{
  vst1_lane_bf16 (ptr, val, 3);
}

void
test_vst1q_lane_bf16 (bfloat16_t *ptr, bfloat16x8_t val)
{
  vst1q_lane_bf16 (ptr, val, 7);
}

void
test_vst1_bf16 (bfloat16_t *ptr, bfloat16x4_t val)
{
  vst1_bf16 (ptr, val);
}

void
test_vst1q_bf16 (bfloat16_t *ptr, bfloat16x8_t val)
{
  vst1q_bf16 (ptr, val);
}

void
test_vst2_bf16 (bfloat16_t *ptr, bfloat16x4x2_t val)
{
  vst2_bf16 (ptr, val);
}

void
test_vst2q_bf16 (bfloat16_t *ptr, bfloat16x8x2_t val)
{
  vst2q_bf16 (ptr, val);
}

void
test_vst3_bf16 (bfloat16_t *ptr, bfloat16x4x3_t val)
{
  vst3_bf16 (ptr, val);
}

void
test_vst3q_bf16 (bfloat16_t *ptr, bfloat16x8x3_t val)
{
  vst3q_bf16 (ptr, val);
}

void
test_vst4_bf16 (bfloat16_t *ptr, bfloat16x4x4_t val)
{
  vst4_bf16 (ptr, val);
}

void
test_vst4q_bf16 (bfloat16_t *ptr, bfloat16x8x4_t val)
{
  vst4q_bf16 (ptr, val);
}

int
main ()
{
  return 0;
}
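
Beyond the smoke tests above, the structured forms are the interesting ones
in practice: vldN de-interleaves N-way interleaved data into N registers and
vstN interleaves it back.  A usage sketch (hypothetical function, not from
the testsuite):

#include <arm_neon.h>

/* Swap the even/odd elements of a 16-element interleaved buffer:
   vld2q de-interleaves into two registers, vst2q re-interleaves.  */
void
swap_pairs (bfloat16_t *buf)
{
  bfloat16x8x2_t v = vld2q_bf16 (buf);
  bfloat16x8x2_t w;
  w.val[0] = v.val[1];
  w.val[1] = v.val[0];
  vst2q_bf16 (buf, w);
}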