Commit 17a13507 by Mihail Ionescu

[GCC][PATCH][ARM] Add vreinterpret, vdup, vget and vset bfloat16 intrinsics

This patch adds support for the bf16 vector create, get, set,
duplicate and reinterpret intrinsics.
ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest

gcc/ChangeLog:

2020-02-27  Mihail Ionescu  <mihail.ionescu@arm.com>

	* (__ARM_NUM_LANES, __arm_lane, __arm_lane_q): Move to the
	beginning of the file.
	(vcreate_bf16, vcombine_bf16): New.
	(vdup_n_bf16, vdupq_n_bf16): New.
	(vdup_lane_bf16, vdup_laneq_bf16): New.
	(vdupq_lane_bf16, vdupq_laneq_bf16): New.
	(vduph_lane_bf16, vduph_laneq_bf16): New.
	(vset_lane_bf16, vsetq_lane_bf16): New.
	(vget_lane_bf16, vgetq_lane_bf16): New.
	(vget_high_bf16, vget_low_bf16): New.
	(vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New.
	(vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New.
	(vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New.
	(vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New.
	(vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New.
	(vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New.
	(vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New.
	(vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New.
	(vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New.
	(vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New.
	(vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New.
	(vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New.
	(vreinterpret_bf16_f64, vreinterpretq_bf16_f64): New.
	(vreinterpretq_bf16_p128): New.
	(vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New.
	(vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New.
	(vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New.
	(vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New.
	(vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New.
	(vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New.
	(vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New.
	(vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New.
	(vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New.
	(vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New.
	(vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New.
	(vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New.
	(vreinterpretq_p128_bf16): New.
	* config/arm/arm_neon_builtins.def (VDX): Add V4BF.
	(V_elem): Likewise.
	(V_elem_l): Likewise.
	(VD_LANE): Likewise.
	(VQX) Add V8BF.
	(V_DOUBLE): Likewise.
	(VDQX): Add V4BF and V8BF.
	(V_two_elem, V_three_elem, V_four_elem): Likewise.
	(V_reg): Likewise.
	(V_HALF): Likewise.
	(V_double_vector_mode): Likewise.
	(V_cmp_result): Likewise.
	(V_uf_sclr): Likewise.
	(V_sz_elem): Likewise.
	(Is_d_reg): Likewise.
	(V_mode_nunits): Likewise.
	* config/arm/neon.md (neon_vdup_lane): Enable for BFloat.

gcc/testsuite/ChangeLog:

2020-02-27  Mihail Ionescu  <mihail.ionescu@arm.com>

	* gcc.target/arm/bf16_dup.c: New test.
	* gcc.target/arm/bf16_reinterpret.c: Likewise.
parent dc941ea9
2020-02-27 Mihail Ionescu <mihail.ionescu@arm.com>
* (__ARM_NUM_LANES, __arm_lane, __arm_lane_q): Move to the
beginning of the file.
(vcreate_bf16, vcombine_bf16): New.
(vdup_n_bf16, vdupq_n_bf16): New.
(vdup_lane_bf16, vdup_laneq_bf16): New.
(vdupq_lane_bf16, vdupq_laneq_bf16): New.
(vduph_lane_bf16, vduph_laneq_bf16): New.
(vset_lane_bf16, vsetq_lane_bf16): New.
(vget_lane_bf16, vgetq_lane_bf16): New.
(vget_high_bf16, vget_low_bf16): New.
(vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New.
(vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New.
(vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New.
(vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New.
(vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New.
(vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New.
(vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New.
(vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New.
(vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New.
(vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New.
(vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New.
(vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New.
(vreinterpret_bf16_f64, vreinterpretq_bf16_f64): New.
(vreinterpretq_bf16_p128): New.
(vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New.
(vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New.
(vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New.
(vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New.
(vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New.
(vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New.
(vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New.
(vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New.
(vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New.
(vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New.
(vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New.
(vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New.
(vreinterpretq_p128_bf16): New.
* config/arm/arm_neon_builtins.def (VDX): Add V4BF.
(V_elem): Likewise.
(V_elem_l): Likewise.
(VD_LANE): Likewise.
(VQX) Add V8BF.
(V_DOUBLE): Likewise.
(VDQX): Add V4BF and V8BF.
(V_two_elem, V_three_elem, V_four_elem): Likewise.
(V_reg): Likewise.
(V_HALF): Likewise.
(V_double_vector_mode): Likewise.
(V_cmp_result): Likewise.
(V_uf_sclr): Likewise.
(V_sz_elem): Likewise.
(Is_d_reg): Likewise.
(V_mode_nunits): Likewise.
* config/arm/neon.md (neon_vdup_lane): Enable for BFloat16.
2020-02-27 Andrew Stubbs <ams@codesourcery.com>
* config/gcn/gcn-valu.md (VEC_SUBDWORD_MODE): New mode iterator.
......
......@@ -221,13 +221,13 @@ VAR10 (SETLANE, vset_lane,
VAR5 (UNOP, vcreate, v8qi, v4hi, v2si, v2sf, di)
VAR10 (UNOP, vdup_n,
v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
VAR2 (UNOP, vdup_n, v8hf, v4hf)
VAR4 (UNOP, vdup_n, v8hf, v4hf, v8bf, v4bf)
VAR10 (GETLANE, vdup_lane,
v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
VAR2 (GETLANE, vdup_lane, v8hf, v4hf)
VAR6 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di)
VAR6 (UNOP, vget_high, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
VAR6 (UNOP, vget_low, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
VAR4 (GETLANE, vdup_lane, v8hf, v4hf, v8bf, v4bf)
VAR7 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf)
VAR7 (UNOP, vget_high, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di)
VAR7 (UNOP, vget_low, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di)
VAR3 (UNOP, vmovn, v8hi, v4si, v2di)
VAR3 (UNOP, vqmovns, v8hi, v4si, v2di)
VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di)
......
......@@ -82,14 +82,14 @@
(define_mode_iterator VD_RE [V8QI V4HI V2SI V2SF DI])
;; Double-width vector modes plus 64-bit elements.
(define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI])
(define_mode_iterator VDX [V8QI V4HI V4HF V4BF V2SI V2SF DI])
;; Double-width vector modes plus 64-bit elements,
;; with V4BFmode added, suitable for moves.
(define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI])
;; Double-width vector modes, with V4HF - for vldN_lane and vstN_lane.
(define_mode_iterator VD_LANE [V8QI V4HI V4HF V2SI V2SF])
(define_mode_iterator VD_LANE [V8QI V4HI V4HF V4BF V2SI V2SF])
;; Double-width vector modes without floating-point elements.
(define_mode_iterator VDI [V8QI V4HI V2SI])
......@@ -104,7 +104,7 @@
(define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF])
;; Quad-width vector modes plus 64-bit elements.
(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
(define_mode_iterator VQX [V16QI V8HI V8HF V8BF V4SI V4SF V2DI])
;; Quad-width vector modes without floating-point elements.
(define_mode_iterator VQI [V16QI V8HI V4SI])
......@@ -153,7 +153,7 @@
;; Vector modes, including 64-bit integer elements.
(define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI
V4HF V8HF V2SF V4SF DI V2DI])
V4HF V8HF V4BF V8BF V2SF V4SF DI V2DI])
;; Vector modes including 64-bit integer elements, but no floats.
(define_mode_iterator VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI])
......@@ -522,6 +522,7 @@
(define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
(V4HI "HI") (V8HI "HI")
(V4HF "HF") (V8HF "HF")
(V4BF "BF") (V8BF "BF")
(V2SI "SI") (V4SI "SI")
(V2SF "SF") (V4SF "SF")
(DI "DI") (V2DI "DI")])
......@@ -530,6 +531,7 @@
(define_mode_attr V_elem_l [(V8QI "qi") (V16QI "qi")
(V4HI "hi") (V8HI "hi")
(V4HF "hf") (V8HF "hf")
(V4BF "bf") (V8BF "bf")
(V2SI "si") (V4SI "si")
(V2SF "sf") (V4SF "sf")
(DI "di") (V2DI "di")])
......@@ -547,6 +549,7 @@
(define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI")
(V4HI "SI") (V8HI "SI")
(V4HF "SF") (V8HF "SF")
(V4BF "BF") (V8BF "BF")
(V2SI "V2SI") (V4SI "V2SI")
(V2SF "V2SF") (V4SF "V2SF")
(DI "V2DI") (V2DI "V2DI")])
......@@ -567,6 +570,7 @@
(define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
(V4HI "BLK") (V8HI "BLK")
(V4HF "BLK") (V8HF "BLK")
(V4BF "BLK") (V8BF "BLK")
(V2SI "BLK") (V4SI "BLK")
(V2SF "BLK") (V4SF "BLK")
(DI "EI") (V2DI "EI")])
......@@ -575,6 +579,7 @@
(define_mode_attr V_four_elem [(V8QI "SI") (V16QI "SI")
(V4HI "V4HI") (V8HI "V4HI")
(V4HF "V4HF") (V8HF "V4HF")
(V4BF "V4BF") (V8BF "V4BF")
(V2SI "V4SI") (V4SI "V4SI")
(V2SF "V4SF") (V4SF "V4SF")
(DI "OI") (V2DI "OI")])
......@@ -583,6 +588,7 @@
(define_mode_attr V_reg [(V8QI "P") (V16QI "q")
(V4HI "P") (V8HI "q")
(V4HF "P") (V8HF "q")
(V4BF "P") (V8BF "q")
(V2SI "P") (V4SI "q")
(V2SF "P") (V4SF "q")
(DI "P") (V2DI "q")
......@@ -613,7 +619,8 @@
(define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
(V8HF "V4HF") (V4SI "V2SI")
(V4SF "V2SF") (V2DF "DF")
(V2DI "DI") (V4HF "HF")])
(V2DI "DI") (V4HF "HF")
(V4BF "BF") (V8BF "V4BF")])
;; Same, but lower-case.
(define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
......@@ -624,7 +631,7 @@
(define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI")
(V2SI "V4SI") (V4HF "V8HF")
(V2SF "V4SF") (DF "V2DF")
(DI "V2DI")])
(DI "V2DI") (V4BF "V8BF")])
;; Same, but lower-case.
(define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi")
......@@ -643,6 +650,7 @@
(V4SI "V2SI") (V4SF "V2SF")
(V8QI "V8QI") (V4HI "V4HI")
(V2SI "V2SI") (V2SF "V2SF")
(V8BF "V4BF") (V4BF "V4BF")
(V8HF "V4HF") (V4HF "V4HF")])
;; Mode of result of comparison operations (and bit-select operand 1).
......@@ -650,6 +658,7 @@
(V4HI "V4HI") (V8HI "V8HI")
(V2SI "V2SI") (V4SI "V4SI")
(V4HF "V4HI") (V8HF "V8HI")
(V4BF "V4HI") (V8BF "V8HI")
(V2SF "V2SI") (V4SF "V4SI")
(DI "DI") (V2DI "V2DI")])
......@@ -691,6 +700,7 @@
(V4HI "u16") (V8HI "u16")
(V2SI "32") (V4SI "32")
(V4HF "u16") (V8HF "u16")
(V4BF "u16") (V8BF "u16")
(V2SF "32") (V4SF "32")])
(define_mode_attr V_sz_elem [(V8QI "8") (V16QI "8")
......@@ -698,6 +708,7 @@
(V2SI "32") (V4SI "32")
(DI "64") (V2DI "64")
(V4HF "16") (V8HF "16")
(V4BF "16") (V8BF "16")
(V2SF "32") (V4SF "32")])
(define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b")
......@@ -768,10 +779,12 @@
(V2SI "true") (V4SI "false")
(V2SF "true") (V4SF "false")
(DI "true") (V2DI "false")
(V4BF "true") (V8BF "false")
(V4HF "true") (V8HF "false")])
(define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16")
(V4HF "4") (V8HF "8")
(V4BF "4") (V8BF "8")
(V4HI "4") (V8HI "8")
(V2SI "2") (V4SI "4")
(V2SF "2") (V4SF "4")
......
......@@ -3737,6 +3737,22 @@ if (BYTES_BIG_ENDIAN)
[(set_attr "type" "neon_from_gp_q")]
)
(define_insn "neon_vdup_nv4bf"
[(set (match_operand:V4BF 0 "s_register_operand" "=w")
(vec_duplicate:V4BF (match_operand:BF 1 "s_register_operand" "r")))]
"TARGET_NEON"
"vdup.16\t%P0, %1"
[(set_attr "type" "neon_from_gp")]
)
(define_insn "neon_vdup_nv8bf"
[(set (match_operand:V8BF 0 "s_register_operand" "=w")
(vec_duplicate:V8BF (match_operand:BF 1 "s_register_operand" "r")))]
"TARGET_NEON"
"vdup.16\t%q0, %1"
[(set_attr "type" "neon_from_gp_q")]
)
(define_insn "neon_vdup_n<mode>"
[(set (match_operand:V32 0 "s_register_operand" "=w,w")
(vec_duplicate:V32 (match_operand:<V_elem> 1 "s_register_operand" "r,t")))]
......@@ -3791,12 +3807,12 @@ if (BYTES_BIG_ENDIAN)
)
(define_insn "neon_vdup_lane<mode>_internal"
[(set (match_operand:VH 0 "s_register_operand" "=w")
(vec_duplicate:VH
[(set (match_operand:VHFBF 0 "s_register_operand" "=w")
(vec_duplicate:VHFBF
(vec_select:<V_elem>
(match_operand:<V_double_vector_mode> 1 "s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
"TARGET_NEON && TARGET_FP16"
"TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
{
if (BYTES_BIG_ENDIAN)
{
......@@ -3832,10 +3848,10 @@ if (BYTES_BIG_ENDIAN)
})
(define_expand "neon_vdup_lane<mode>"
[(match_operand:VH 0 "s_register_operand")
[(match_operand:VHFBF 0 "s_register_operand")
(match_operand:<V_double_vector_mode> 1 "s_register_operand")
(match_operand:SI 2 "immediate_operand")]
"TARGET_NEON && TARGET_FP16"
"TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
{
if (BYTES_BIG_ENDIAN)
{
......
2020-02-27 Mihail Ionescu <mihail.ionescu@arm.com>
* gcc.target/arm/bf16_dup.c: New test.
* gcc.target/arm/bf16_reinterpret.c: Likewise.
2020-02-27 Will Schmidt <will_schmidt@vnet.ibm.com>
* lib/target_supports.exp (check_effective_target_has_arch_pwr5): New.
......
/* { dg-do assemble { target { arm*-*-* } } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
/* { dg-additional-options "-save-temps -march=armv8.2-a+bf16+fp16 -mfloat-abi=softfp" } */
#include "arm_neon.h"
float32x2_t
test_vbfdot_vcreate (float32x2_t r, uint64_t a, uint64_t b)
{
bfloat16x4_t _a = vcreate_bf16(a);
bfloat16x4_t _b = vcreate_bf16(b);
return vbfdot_f32 (r, _a, _b);
}
/* { dg-final { scan-assembler {vdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+} } } */
bfloat16x8_t test_vcombine_bf16 (bfloat16x4_t a, bfloat16x4_t b)
{
return vcombine_bf16 (a, b);
}
bfloat16x4_t test_vget_high_bf16 (bfloat16x8_t a)
{
return vget_high_bf16 (a);
}
bfloat16x4_t test_vget_low_bf16 (bfloat16x8_t a)
{
return vget_low_bf16 (a);
}
bfloat16_t test_vget_lane_bf16 (bfloat16x4_t a)
{
return vget_lane_bf16 (a, 1);
}
bfloat16_t test_vgetq_lane_bf16 (bfloat16x8_t a)
{
return vgetq_lane_bf16 (a, 7);
}
bfloat16x4_t test_vset_lane_bf16 (bfloat16_t a, bfloat16x4_t b)
{
return vset_lane_bf16 (a, b, 1);
}
bfloat16x8_t test_vsetq_lane_bf16 (bfloat16_t a, bfloat16x8_t b)
{
return vsetq_lane_bf16 (a, b, 7);
}
bfloat16x4_t vdup_test (bfloat16_t a)
{
return vdup_n_bf16 (a);
}
/* { dg-final { scan-assembler {vdup\.16\td[0-9]+, r[0-9]+} } } */
bfloat16x8_t vdupq_test (bfloat16_t a)
{
return vdupq_n_bf16 (a);
}
/* { dg-final { scan-assembler {vdup\.16\tq[0-9]+, r[0-9]+} } } */
bfloat16x4_t test_vdup_lane_bf16 (bfloat16x4_t a)
{
return vdup_lane_bf16 (a, 1);
}
/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, d[0-9]+\[1\]} 1 } } */
bfloat16x8_t test_vdupq_lane_bf16 (bfloat16x4_t a)
{
return vdupq_lane_bf16 (a, 1);
}
/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, d[0-9]+\[1\]} 1 } } */
bfloat16x4_t test_vdup_laneq_bf16 (bfloat16x8_t a)
{
return vdup_laneq_bf16 (a, 3);
}
bfloat16x8_t test_vdupq_laneq_bf16 (bfloat16x8_t a)
{
return vdupq_laneq_bf16 (a, 3);
}
bfloat16_t test_vduph_lane_bf16 (bfloat16x4_t a)
{
return vduph_lane_bf16 (a, 1);
}
bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a)
{
return vduph_laneq_bf16 (a, 7);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment