Commit eb637e76 by Delia Burduv Committed by Kyrylo Tkachov

ACLE intrinsics: BFloat16 load intrinsics for AArch32

2020-03-06  Delia Burduv  <delia.burduv@arm.com>

	* config/arm/arm_neon.h (vld2_bf16): New.
	(vld2q_bf16): New.
	(vld3_bf16): New.
	(vld3q_bf16): New.
	(vld4_bf16): New.
	(vld4q_bf16): New.
	(vld2_dup_bf16): New.
	(vld2q_dup_bf16): New.
	(vld3_dup_bf16): New.
	(vld3q_dup_bf16): New.
	(vld4_dup_bf16): New.
	(vld4q_dup_bf16): New.
	* config/arm/arm_neon_builtins.def
	(vld2): Changed to VAR13 and added v4bf, v8bf
	(vld2_dup): Changed to VAR8 and added v4bf, v8bf
	(vld3): Changed to VAR13 and added v4bf, v8bf
	(vld3_dup): Changed to VAR8 and added v4bf, v8bf
	(vld4): Changed to VAR13 and added v4bf, v8bf
	(vld4_dup): Changed to VAR8 and added v4bf, v8bf
	* config/arm/iterators.md (VDXBF2): New iterator.
	* config/arm/neon.md (neon_vld2<mode>): Use new iterators.
	(neon_vld2_dup<mode>): Use new iterators.
	(neon_vld3<mode>): Likewise.
	(neon_vld3qa<mode>): Likewise.
	(neon_vld3qb<mode>): Likewise.
	(neon_vld3_dup<mode>): Likewise.
	(neon_vld4<mode>): Likewise.
	(neon_vld4qa<mode>): Likewise.
	(neon_vld4qb<mode>): Likewise.
	(neon_vld4_dup<mode>): Likewise.
	(neon_vld2_dupv8bf): New.
	(neon_vld3_dupv8bf): Likewise.
	(neon_vld4_dupv8bf): Likewise.

	* gcc.target/arm/simd/bf16_vldn_1.c: New test.
parent ff229375
2020-03-06 Delia Burduv <delia.burduv@arm.com>
* config/arm/arm_neon.h (vld2_bf16): New.
(vld2q_bf16): New.
(vld3_bf16): New.
(vld3q_bf16): New.
(vld4_bf16): New.
(vld4q_bf16): New.
(vld2_dup_bf16): New.
(vld2q_dup_bf16): New.
(vld3_dup_bf16): New.
(vld3q_dup_bf16): New.
(vld4_dup_bf16): New.
(vld4q_dup_bf16): New.
* config/arm/arm_neon_builtins.def
(vld2): Changed to VAR13 and added v4bf, v8bf
(vld2_dup): Changed to VAR8 and added v4bf, v8bf
(vld3): Changed to VAR13 and added v4bf, v8bf
(vld3_dup): Changed to VAR8 and added v4bf, v8bf
(vld4): Changed to VAR13 and added v4bf, v8bf
(vld4_dup): Changed to VAR8 and added v4bf, v8bf
* config/arm/iterators.md (VDXBF2): New iterator.
* config/arm/neon.md (neon_vld2<mode>): Use new iterators.
(neon_vld2_dup<mode>): Use new iterators.
(neon_vld3<mode>): Likewise.
(neon_vld3qa<mode>): Likewise.
(neon_vld3qb<mode>): Likewise.
(neon_vld3_dup<mode>): Likewise.
(neon_vld4<mode>): Likewise.
(neon_vld4qa<mode>): Likewise.
(neon_vld4qb<mode>): Likewise.
(neon_vld4_dup<mode>): Likewise.
(neon_vld2_dupv8bf): New.
(neon_vld3_dupv8bf): Likewise.
(neon_vld4_dupv8bf): Likewise.
2020-03-06 Delia Burduv <delia.burduv@arm.com>
* config/arm/arm_neon.h (bfloat16x4x2_t): New typedef.
(bfloat16x8x2_t): New typedef.
(bfloat16x4x3_t): New typedef.
......
...@@ -19557,6 +19557,114 @@ vst4q_bf16 (bfloat16_t * __ptr, bfloat16x8x4_t __val) ...@@ -19557,6 +19557,114 @@ vst4q_bf16 (bfloat16_t * __ptr, bfloat16x8x4_t __val)
return __builtin_neon_vst4v8bf (__ptr, __bu.__o); return __builtin_neon_vst4v8bf (__ptr, __bu.__o);
} }
/* ACLE vld2_bf16: 2-way de-interleaving structure load of bfloat16.
   Calls __builtin_neon_vld2v4bf and reinterprets the opaque TI
   register-pair result as bfloat16x4x2_t through the union.
   Fix: spell the pointer parameter `const bfloat16_t *` (identical type
   to `bfloat16_t const *`) for consistency with every other vldN_bf16
   intrinsic in this group.  */
__extension__ extern __inline bfloat16x4x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld2_bf16 (const bfloat16_t * __ptr)
{
union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv;
__rv.__o = __builtin_neon_vld2v4bf ((const __builtin_neon_hi *) __ptr);
return __rv.__i;
}
/* ACLE vld2q_bf16: 2-way de-interleaving structure load of bfloat16
   (quad form).  The opaque OI register-tuple result of
   __builtin_neon_vld2v8bf is reinterpreted as bfloat16x8x2_t.  */
__extension__ extern __inline bfloat16x8x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld2q_bf16 (const bfloat16_t * __ptr)
{
union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv;
__rv.__o = __builtin_neon_vld2v8bf ((const __builtin_neon_hi *) __ptr);
return __rv.__i;
}
/* ACLE vld3_bf16: 3-way de-interleaving structure load of bfloat16.
   EI tuple from __builtin_neon_vld3v4bf reinterpreted as
   bfloat16x4x3_t.  */
__extension__ extern __inline bfloat16x4x3_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld3_bf16 (const bfloat16_t * __ptr)
{
union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv;
__rv.__o = __builtin_neon_vld3v4bf ((const __builtin_neon_hi *) __ptr);
return __rv.__i;
}
/* ACLE vld3q_bf16: 3-way de-interleaving structure load of bfloat16
   (quad form).  CI tuple from __builtin_neon_vld3v8bf reinterpreted as
   bfloat16x8x3_t.  */
__extension__ extern __inline bfloat16x8x3_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld3q_bf16 (const bfloat16_t * __ptr)
{
union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv;
__rv.__o = __builtin_neon_vld3v8bf ((const __builtin_neon_hi *) __ptr);
return __rv.__i;
}
/* ACLE vld4_bf16: 4-way de-interleaving structure load of bfloat16.
   OI tuple from __builtin_neon_vld4v4bf reinterpreted as
   bfloat16x4x4_t.  */
__extension__ extern __inline bfloat16x4x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld4_bf16 (const bfloat16_t * __ptr)
{
union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv;
__rv.__o = __builtin_neon_vld4v4bf ((const __builtin_neon_hi *) __ptr);
return __rv.__i;
}
/* ACLE vld4q_bf16: 4-way de-interleaving structure load of bfloat16
   (quad form).  XI tuple from __builtin_neon_vld4v8bf reinterpreted as
   bfloat16x8x4_t.  */
__extension__ extern __inline bfloat16x8x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld4q_bf16 (const bfloat16_t * __ptr)
{
union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv;
__rv.__o = __builtin_neon_vld4v8bf ((const __builtin_neon_hi *) __ptr);
return __rv.__i;
}
/* ACLE vld2_dup_bf16: load one 2-element bfloat16 structure and
   duplicate it to all lanes, via __builtin_neon_vld2_dupv4bf.  */
__extension__ extern __inline bfloat16x4x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld2_dup_bf16 (const bfloat16_t * __ptr)
{
union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv;
__rv.__o = __builtin_neon_vld2_dupv4bf ((const __builtin_neon_hi *) __ptr);
return __rv.__i;
}
/* ACLE vld2q_dup_bf16: quad-form all-lanes 2-element structure load,
   via __builtin_neon_vld2_dupv8bf.  */
__extension__ extern __inline bfloat16x8x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld2q_dup_bf16 (const bfloat16_t * __ptr)
{
union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv;
__rv.__o = __builtin_neon_vld2_dupv8bf ((const __builtin_neon_hi *) __ptr);
return __rv.__i;
}
/* ACLE vld3_dup_bf16: load one 3-element bfloat16 structure and
   duplicate it to all lanes, via __builtin_neon_vld3_dupv4bf.  */
__extension__ extern __inline bfloat16x4x3_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld3_dup_bf16 (const bfloat16_t * __ptr)
{
union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv;
__rv.__o = __builtin_neon_vld3_dupv4bf ((const __builtin_neon_hi *) __ptr);
return __rv.__i;
}
/* ACLE vld3q_dup_bf16: quad-form all-lanes 3-element structure load,
   via __builtin_neon_vld3_dupv8bf.  */
__extension__ extern __inline bfloat16x8x3_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld3q_dup_bf16 (const bfloat16_t * __ptr)
{
union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv;
__rv.__o = __builtin_neon_vld3_dupv8bf ((const __builtin_neon_hi *) __ptr);
return __rv.__i;
}
/* ACLE vld4_dup_bf16: load one 4-element bfloat16 structure and
   duplicate it to all lanes, via __builtin_neon_vld4_dupv4bf.  */
__extension__ extern __inline bfloat16x4x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld4_dup_bf16 (const bfloat16_t * __ptr)
{
union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv;
__rv.__o = __builtin_neon_vld4_dupv4bf ((const __builtin_neon_hi *) __ptr);
return __rv.__i;
}
/* ACLE vld4q_dup_bf16: quad-form all-lanes 4-element structure load,
   via __builtin_neon_vld4_dupv8bf.  */
__extension__ extern __inline bfloat16x8x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld4q_dup_bf16 (const bfloat16_t * __ptr)
{
union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv;
__rv.__o = __builtin_neon_vld4_dupv8bf ((const __builtin_neon_hi *) __ptr);
return __rv.__i;
}
#pragma GCC pop_options #pragma GCC pop_options
#ifdef __cplusplus #ifdef __cplusplus
......
...@@ -320,29 +320,29 @@ VAR12 (STORE1, vst1, ...@@ -320,29 +320,29 @@ VAR12 (STORE1, vst1,
v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
VAR12 (STORE1LANE, vst1_lane, VAR12 (STORE1LANE, vst1_lane,
v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
VAR11 (LOAD1, vld2, VAR13 (LOAD1, vld2,
v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf)
VAR9 (LOAD1LANE, vld2_lane, VAR9 (LOAD1LANE, vld2_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) VAR8 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf)
VAR13 (STORE1, vst2, VAR13 (STORE1, vst2,
v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf)
VAR9 (STORE1LANE, vst2_lane, VAR9 (STORE1LANE, vst2_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR11 (LOAD1, vld3, VAR13 (LOAD1, vld3,
v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf)
VAR9 (LOAD1LANE, vld3_lane, VAR9 (LOAD1LANE, vld3_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) VAR8 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf)
VAR13 (STORE1, vst3, VAR13 (STORE1, vst3,
v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf)
VAR9 (STORE1LANE, vst3_lane, VAR9 (STORE1LANE, vst3_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR11 (LOAD1, vld4, VAR13 (LOAD1, vld4,
v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf)
VAR9 (LOAD1LANE, vld4_lane, VAR9 (LOAD1LANE, vld4_lane,
v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) VAR8 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf)
VAR13 (STORE1, vst4, VAR13 (STORE1, vst4,
v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf)
VAR9 (STORE1LANE, vst4_lane, VAR9 (STORE1LANE, vst4_lane,
......
...@@ -87,6 +87,9 @@ ...@@ -87,6 +87,9 @@
;; Double-width vector modes plus 64-bit elements, including V4BF. ;; Double-width vector modes plus 64-bit elements, including V4BF.
(define_mode_iterator VDXBF [V8QI V4HI V4HF (V4BF "TARGET_BF16_SIMD") V2SI V2SF DI]) (define_mode_iterator VDXBF [V8QI V4HI V4HF (V4BF "TARGET_BF16_SIMD") V2SI V2SF DI])
;; Double-width vector modes plus 64-bit elements, V4BF and V8BF.
;; Fix: drop the stray parentheses around the V8BF condition so it is
;; written like the V4BF entry and like VDXBF above.
(define_mode_iterator VDXBF2 [V8QI V4HI V4HF V2SI V2SF DI (V4BF "TARGET_BF16_SIMD") (V8BF "TARGET_BF16_SIMD")])
;; Double-width vector modes plus 64-bit elements, ;; Double-width vector modes plus 64-bit elements,
;; with V4BFmode added, suitable for moves. ;; with V4BFmode added, suitable for moves.
(define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI]) (define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI])
......
...@@ -5428,7 +5428,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -5428,7 +5428,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vld2<mode>" (define_insn "neon_vld2<mode>"
[(set (match_operand:TI 0 "s_register_operand" "=w") [(set (match_operand:TI 0 "s_register_operand" "=w")
(unspec:TI [(match_operand:TI 1 "neon_struct_operand" "Um") (unspec:TI [(match_operand:TI 1 "neon_struct_operand" "Um")
(unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2))] UNSPEC_VLD2))]
"TARGET_NEON" "TARGET_NEON"
{ {
...@@ -5453,7 +5453,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -5453,7 +5453,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vld2<mode>" (define_insn "neon_vld2<mode>"
[(set (match_operand:OI 0 "s_register_operand" "=w") [(set (match_operand:OI 0 "s_register_operand" "=w")
(unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2))] UNSPEC_VLD2))]
"TARGET_NEON" "TARGET_NEON"
"vld2.<V_sz_elem>\t%h0, %A1" "vld2.<V_sz_elem>\t%h0, %A1"
...@@ -5516,7 +5516,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -5516,7 +5516,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vld2_dup<mode>" (define_insn "neon_vld2_dup<mode>"
[(set (match_operand:TI 0 "s_register_operand" "=w") [(set (match_operand:TI 0 "s_register_operand" "=w")
(unspec:TI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um") (unspec:TI [(match_operand:<V_two_elem> 1 "neon_struct_operand" "Um")
(unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2_DUP))] UNSPEC_VLD2_DUP))]
"TARGET_NEON" "TARGET_NEON"
{ {
...@@ -5531,6 +5531,27 @@ if (BYTES_BIG_ENDIAN) ...@@ -5531,6 +5531,27 @@ if (BYTES_BIG_ENDIAN)
(const_string "neon_load1_1reg<q>")))] (const_string "neon_load1_1reg<q>")))]
) )
;; vld2q_dup for V8BF: the OI destination (four D registers) is split
;; into V4BF hard registers (D regs step by 2 in GCC's numbering) so the
;; template can name each one explicitly.
;; NOTE(review): this prints a plain "vld2.16 {%P0, %P1, %P2, %P3}"
;; rather than an all-lanes "[]" form, while the memory operand is only
;; V2BF wide -- confirm the intended load width/semantics.
(define_insn "neon_vld2_dupv8bf"
[(set (match_operand:OI 0 "s_register_operand" "=w")
(unspec:OI [(match_operand:V2BF 1 "neon_struct_operand" "Um")
(unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD2_DUP))]
"TARGET_BF16_SIMD"
{
rtx ops[5];
int tabbase = REGNO (operands[0]);
ops[4] = operands[1];
ops[0] = gen_rtx_REG (V4BFmode, tabbase);
ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2);
ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4);
ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6);
output_asm_insn ("vld2.16\t{%P0, %P1, %P2, %P3}, %A4", ops);
return "";
}
[(set_attr "type" "neon_load2_all_lanes_q")]
)
(define_expand "vec_store_lanesti<mode>" (define_expand "vec_store_lanesti<mode>"
[(set (match_operand:TI 0 "neon_struct_operand") [(set (match_operand:TI 0 "neon_struct_operand")
(unspec:TI [(match_operand:TI 1 "s_register_operand") (unspec:TI [(match_operand:TI 1 "s_register_operand")
...@@ -5637,7 +5658,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -5637,7 +5658,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vld3<mode>" (define_insn "neon_vld3<mode>"
[(set (match_operand:EI 0 "s_register_operand" "=w") [(set (match_operand:EI 0 "s_register_operand" "=w")
(unspec:EI [(match_operand:EI 1 "neon_struct_operand" "Um") (unspec:EI [(match_operand:EI 1 "neon_struct_operand" "Um")
(unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3))] UNSPEC_VLD3))]
"TARGET_NEON" "TARGET_NEON"
{ {
...@@ -5665,7 +5686,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -5665,7 +5686,7 @@ if (BYTES_BIG_ENDIAN)
(define_expand "neon_vld3<mode>" (define_expand "neon_vld3<mode>"
[(match_operand:CI 0 "s_register_operand") [(match_operand:CI 0 "s_register_operand")
(match_operand:CI 1 "neon_struct_operand") (match_operand:CI 1 "neon_struct_operand")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON" "TARGET_NEON"
{ {
rtx mem; rtx mem;
...@@ -5680,7 +5701,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -5680,7 +5701,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vld3qa<mode>" (define_insn "neon_vld3qa<mode>"
[(set (match_operand:CI 0 "s_register_operand" "=w") [(set (match_operand:CI 0 "s_register_operand" "=w")
(unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3A))] UNSPEC_VLD3A))]
"TARGET_NEON" "TARGET_NEON"
{ {
...@@ -5700,7 +5721,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -5700,7 +5721,7 @@ if (BYTES_BIG_ENDIAN)
[(set (match_operand:CI 0 "s_register_operand" "=w") [(set (match_operand:CI 0 "s_register_operand" "=w")
(unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um")
(match_operand:CI 2 "s_register_operand" "0") (match_operand:CI 2 "s_register_operand" "0")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3B))] UNSPEC_VLD3B))]
"TARGET_NEON" "TARGET_NEON"
{ {
...@@ -5777,7 +5798,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -5777,7 +5798,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vld3_dup<mode>" (define_insn "neon_vld3_dup<mode>"
[(set (match_operand:EI 0 "s_register_operand" "=w") [(set (match_operand:EI 0 "s_register_operand" "=w")
(unspec:EI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um") (unspec:EI [(match_operand:<V_three_elem> 1 "neon_struct_operand" "Um")
(unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3_DUP))] UNSPEC_VLD3_DUP))]
"TARGET_NEON" "TARGET_NEON"
{ {
...@@ -5800,6 +5821,26 @@ if (BYTES_BIG_ENDIAN) ...@@ -5800,6 +5821,26 @@ if (BYTES_BIG_ENDIAN)
(const_string "neon_load3_all_lanes<q>") (const_string "neon_load3_all_lanes<q>")
(const_string "neon_load1_1reg<q>")))]) (const_string "neon_load1_1reg<q>")))])
;; vld3q_dup for V8BF: decompose the CI destination into V4BF D
;; registers for the all-lanes vld3.16.
;; Fix: use UNSPEC_VLD3_DUP instead of the copy-pasted UNSPEC_VLD2_DUP,
;; matching neon_vld3_dup<mode> and keeping the RTL of distinct
;; operations distinct.
;; NOTE(review): only D registers tabbase..tabbase+4 of the
;; six-D-register CI result are written here -- confirm the remaining
;; registers are handled as intended.
(define_insn "neon_vld3_dupv8bf"
[(set (match_operand:CI 0 "s_register_operand" "=w")
(unspec:CI [(match_operand:V2BF 1 "neon_struct_operand" "Um")
(unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3_DUP))]
"TARGET_BF16_SIMD"
{
rtx ops[4];
int tabbase = REGNO (operands[0]);
ops[3] = operands[1];
ops[0] = gen_rtx_REG (V4BFmode, tabbase);
ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2);
ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4);
output_asm_insn ("vld3.16\t{%P0[], %P1[], %P2[]}, %A3", ops);
return "";
}
[(set_attr "type" "neon_load3_all_lanes_q")]
)
(define_expand "vec_store_lanesei<mode>" (define_expand "vec_store_lanesei<mode>"
[(set (match_operand:EI 0 "neon_struct_operand") [(set (match_operand:EI 0 "neon_struct_operand")
(unspec:EI [(match_operand:EI 1 "s_register_operand") (unspec:EI [(match_operand:EI 1 "s_register_operand")
...@@ -5955,7 +5996,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -5955,7 +5996,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vld4<mode>" (define_insn "neon_vld4<mode>"
[(set (match_operand:OI 0 "s_register_operand" "=w") [(set (match_operand:OI 0 "s_register_operand" "=w")
(unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um")
(unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4))] UNSPEC_VLD4))]
"TARGET_NEON" "TARGET_NEON"
{ {
...@@ -5983,7 +6024,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -5983,7 +6024,7 @@ if (BYTES_BIG_ENDIAN)
(define_expand "neon_vld4<mode>" (define_expand "neon_vld4<mode>"
[(match_operand:XI 0 "s_register_operand") [(match_operand:XI 0 "s_register_operand")
(match_operand:XI 1 "neon_struct_operand") (match_operand:XI 1 "neon_struct_operand")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON" "TARGET_NEON"
{ {
rtx mem; rtx mem;
...@@ -5998,7 +6039,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -5998,7 +6039,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vld4qa<mode>" (define_insn "neon_vld4qa<mode>"
[(set (match_operand:XI 0 "s_register_operand" "=w") [(set (match_operand:XI 0 "s_register_operand" "=w")
(unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4A))] UNSPEC_VLD4A))]
"TARGET_NEON" "TARGET_NEON"
{ {
...@@ -6019,7 +6060,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -6019,7 +6060,7 @@ if (BYTES_BIG_ENDIAN)
[(set (match_operand:XI 0 "s_register_operand" "=w") [(set (match_operand:XI 0 "s_register_operand" "=w")
(unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um")
(match_operand:XI 2 "s_register_operand" "0") (match_operand:XI 2 "s_register_operand" "0")
(unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4B))] UNSPEC_VLD4B))]
"TARGET_NEON" "TARGET_NEON"
{ {
...@@ -6099,7 +6140,7 @@ if (BYTES_BIG_ENDIAN) ...@@ -6099,7 +6140,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vld4_dup<mode>" (define_insn "neon_vld4_dup<mode>"
[(set (match_operand:OI 0 "s_register_operand" "=w") [(set (match_operand:OI 0 "s_register_operand" "=w")
(unspec:OI [(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um") (unspec:OI [(match_operand:<V_four_elem> 1 "neon_struct_operand" "Um")
(unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4_DUP))] UNSPEC_VLD4_DUP))]
"TARGET_NEON" "TARGET_NEON"
{ {
...@@ -6125,6 +6166,27 @@ if (BYTES_BIG_ENDIAN) ...@@ -6125,6 +6166,27 @@ if (BYTES_BIG_ENDIAN)
(const_string "neon_load1_1reg<q>")))] (const_string "neon_load1_1reg<q>")))]
) )
;; vld4q_dup for V8BF: decompose the XI destination into V4BF D
;; registers for the all-lanes vld4.16.
;; Fix: use UNSPEC_VLD4_DUP instead of the copy-pasted UNSPEC_VLD2_DUP,
;; matching neon_vld4_dup<mode> and keeping the RTL of distinct
;; operations distinct.
;; NOTE(review): only D registers tabbase..tabbase+6 of the
;; eight-D-register XI result are written here -- confirm the remaining
;; registers are handled as intended.
(define_insn "neon_vld4_dupv8bf"
[(set (match_operand:XI 0 "s_register_operand" "=w")
(unspec:XI [(match_operand:V2BF 1 "neon_struct_operand" "Um")
(unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4_DUP))]
"TARGET_BF16_SIMD"
{
rtx ops[5];
int tabbase = REGNO (operands[0]);
ops[4] = operands[1];
ops[0] = gen_rtx_REG (V4BFmode, tabbase);
ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2);
ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4);
ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6);
output_asm_insn ("vld4.16\t{%P0[], %P1[], %P2[], %P3[]}, %A4", ops);
return "";
}
[(set_attr "type" "neon_load4_all_lanes_q")]
)
(define_expand "vec_store_lanesoi<mode>" (define_expand "vec_store_lanesoi<mode>"
[(set (match_operand:OI 0 "neon_struct_operand") [(set (match_operand:OI 0 "neon_struct_operand")
(unspec:OI [(match_operand:OI 1 "s_register_operand") (unspec:OI [(match_operand:OI 1 "s_register_operand")
......
2020-03-06 Delia Burduv <delia.burduv@arm.com>
* gcc.target/arm/simd/bf16_vldn_1.c: New test.
2020-03-06 Delia Burduv <delia.burduv@arm.com>
* gcc.target/arm/simd/bf16_vstn_1.c: New test. * gcc.target/arm/simd/bf16_vstn_1.c: New test.
2020-03-06 Kito Cheng <kito.cheng@sifive.com> 2020-03-06 Kito Cheng <kito.cheng@sifive.com>
......
/* { dg-do assemble } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
/* { dg-additional-options "-save-temps -O2 -mfloat-abi=hard" } */
/* { dg-final { check-function-bodies "**" "" } } */
#include "arm_neon.h"
/* vld2_bf16 must become a single de-interleaving vld2.16 into d0-d1
   (hard-float return; pointer arrives in r0).  */
/*
**test_vld2_bf16:
** ...
** vld2.16 {d0-d1}, \[r0\]
** bx lr
*/
bfloat16x4x2_t
test_vld2_bf16 (bfloat16_t * ptr)
{
return vld2_bf16 (ptr);
}
/* vld2q_bf16 must become a single vld2.16 spanning d0-d3.  */
/*
**test_vld2q_bf16:
** ...
** vld2.16 {d0-d3}, \[r0\]
** bx lr
*/
bfloat16x8x2_t
test_vld2q_bf16 (bfloat16_t * ptr)
{
return vld2q_bf16 (ptr);
}
/* vld2_dup_bf16 must use the all-lanes ([]) form of vld2.16.  */
/*
**test_vld2_dup_bf16:
** ...
** vld2.16 {d0\[\], d1\[\]}, \[r0\]
** bx lr
*/
bfloat16x4x2_t
test_vld2_dup_bf16 (bfloat16_t * ptr)
{
return vld2_dup_bf16 (ptr);
}
/* vld2q_dup_bf16: matches the non-[] vld2.16 emitted by
   neon_vld2_dupv8bf.  NOTE(review): unlike the other dup tests this is
   not an all-lanes form -- intentional per the insn pattern, but worth
   confirming.  */
/*
**test_vld2q_dup_bf16:
** ...
** vld2.16 {d0, d1, d2, d3}, \[r0\]
** bx lr
*/
bfloat16x8x2_t
test_vld2q_dup_bf16 (bfloat16_t * ptr)
{
return vld2q_dup_bf16 (ptr);
}
/* vld3_bf16 must become a single vld3.16 into d0-d2.  */
/*
**test_vld3_bf16:
** ...
** vld3.16 {d0-d2}, \[r0\]
** bx lr
*/
bfloat16x4x3_t
test_vld3_bf16 (bfloat16_t * ptr)
{
return vld3_bf16 (ptr);
}
/* vld3q_bf16: quad form uses the odd D registers for one half of the
   de-interleave (double-spaced register list).  */
/*
**test_vld3q_bf16:
** ...
** vld3.16 {d1, d3, d5}, \[r0\]
** bx lr
*/
bfloat16x8x3_t
test_vld3q_bf16 (bfloat16_t * ptr)
{
return vld3q_bf16 (ptr);
}
/* vld3_dup_bf16 must use the all-lanes ([]) form of vld3.16.  */
/*
**test_vld3_dup_bf16:
** ...
** vld3.16 {d0\[\], d1\[\], d2\[\]}, \[r0\]
** bx lr
*/
bfloat16x4x3_t
test_vld3_dup_bf16 (bfloat16_t * ptr)
{
return vld3_dup_bf16 (ptr);
}
/* vld3q_dup_bf16: same all-lanes vld3.16 as the non-q variant,
   matching what neon_vld3_dupv8bf emits.  */
/*
**test_vld3q_dup_bf16:
** ...
** vld3.16 {d0\[\], d1\[\], d2\[\]}, \[r0\]
** bx lr
*/
bfloat16x8x3_t
test_vld3q_dup_bf16 (bfloat16_t * ptr)
{
return vld3q_dup_bf16 (ptr);
}
/* vld4_bf16 must become a single vld4.16 into d0-d3.  */
/*
**test_vld4_bf16:
** ...
** vld4.16 {d0-d3}, \[r0\]
** bx lr
*/
bfloat16x4x4_t
test_vld4_bf16 (bfloat16_t * ptr)
{
return vld4_bf16 (ptr);
}
/* vld4q_bf16: quad form uses the odd D registers (double-spaced
   register list) for one half of the de-interleave.  */
/*
**test_vld4q_bf16:
** ...
** vld4.16 {d1, d3, d5, d7}, \[r0\]
** bx lr
*/
bfloat16x8x4_t
test_vld4q_bf16 (bfloat16_t * ptr)
{
return vld4q_bf16 (ptr);
}
/* vld4_dup_bf16 must use the all-lanes ([]) form of vld4.16.  */
/*
**test_vld4_dup_bf16:
** ...
** vld4.16 {d0\[\], d1\[\], d2\[\], d3\[\]}, \[r0\]
** bx lr
*/
bfloat16x4x4_t
test_vld4_dup_bf16 (bfloat16_t * ptr)
{
return vld4_dup_bf16 (ptr);
}
/* vld4q_dup_bf16: same all-lanes vld4.16 as the non-q variant,
   matching what neon_vld4_dupv8bf emits.  */
/*
**test_vld4q_dup_bf16:
** ...
** vld4.16 {d0\[\], d1\[\], d2\[\], d3\[\]}, \[r0\]
** bx lr
*/
bfloat16x8x4_t
test_vld4q_dup_bf16 (bfloat16_t * ptr)
{
return vld4q_dup_bf16 (ptr);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment