Commit f275d73a by Stam Markianos-Wright

[GCC][PATCH][AArch64] Add ACLE intrinsics for bfdot for the Armv8.6-A extension

2020-01-16  Stam Markianos-Wright  <stam.markianos-wright@arm.com>

	* config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot,
	aarch64_bfdot_lane, aarch64_bfdot_laneq): New.
	* config/aarch64/aarch64-simd.md (aarch64_bfdot, aarch64_bfdot_lane,
	aarch64_bfdot_laneq): New.
	* config/aarch64/arm_neon.h (vbfdot_f32, vbfdotq_f32,
	vbfdot_lane_f32, vbfdotq_lane_f32, vbfdot_laneq_f32,
	vbfdotq_laneq_f32): New.
	* config/aarch64/iterators.md (UNSPEC_BFDOT, Vbfdottype,
	VBFMLA_W, VBF): New.
	(isquadop): Add V4BF, V8BF.

2020-01-16  Stam Markianos-Wright  <stam.markianos-wright@arm.com>

	* gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c: New.
	* gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c: New.
	* gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c: New.
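
For reference, BFDOT computes, for each 32-bit lane of the accumulator, the dot product of a pair of adjacent bfloat16 elements from each input vector. A scalar model of these semantics, as a hedged sketch (the helper names are ours, not part of the patch, and the instruction's rounding and denormal handling are ignored; bfloat16 is the top half of an IEEE binary32):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Widen bfloat16 to float32: bfloat16 is the high 16 bits of a binary32.  */
static float bf16_to_f32 (uint16_t b)
{
  uint32_t bits = (uint32_t) b << 16;
  float f;
  memcpy (&f, &bits, sizeof f);
  return f;
}

/* Reference model: r has `lanes` float32 elements, a and b each have
   2 * lanes bfloat16 elements; lane i accumulates a two-element dot.  */
static void bfdot_ref (float *r, const uint16_t *a, const uint16_t *b,
		       int lanes)
{
  for (int i = 0; i < lanes; i++)
    r[i] += bf16_to_f32 (a[2 * i]) * bf16_to_f32 (b[2 * i])
	    + bf16_to_f32 (a[2 * i + 1]) * bf16_to_f32 (b[2 * i + 1]);
}

int main (void)
{
  /* 0x3f80 is 1.0 and 0x4000 is 2.0 in bfloat16.  */
  uint16_t a[4] = { 0x3f80, 0x3f80, 0x4000, 0x4000 };
  uint16_t b[4] = { 0x3f80, 0x4000, 0x4000, 0x4000 };
  float r[2] = { 0.0f, 0.0f };
  bfdot_ref (r, a, b, 2);
  printf ("%g %g\n", r[0], r[1]);  /* 1*1 + 1*2 = 3, 2*2 + 2*2 = 8 */
  return 0;
}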
parent d9165389
2020-01-16  Stam Markianos-Wright  <stam.markianos-wright@arm.com>
* config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot,
aarch64_bfdot_lane, aarch64_bfdot_laneq): New.
* config/aarch64/aarch64-simd.md (aarch64_bfdot, aarch64_bfdot_lane,
aarch64_bfdot_laneq): New.
* config/aarch64/arm_neon.h (vbfdot_f32, vbfdotq_f32,
vbfdot_lane_f32, vbfdotq_lane_f32, vbfdot_laneq_f32,
vbfdotq_laneq_f32): New.
* config/aarch64/iterators.md (UNSPEC_BFDOT, Vbfdottype,
VBFMLA_W, VBF): New.
(isquadop): Add V4BF, V8BF.
2020-01-16 Stam Markianos-Wright <stam.markianos-wright@arm.com>
* config/aarch64/aarch64-builtins.c: (enum aarch64_type_qualifiers):
New qualifier_lane_quadtup_index, TYPES_TERNOP_SSUS,
TYPES_QUADOPSSUS_LANE_QUADTUP, TYPES_QUADOPSSSU_LANE_QUADTUP.
gcc/config/aarch64/aarch64-simd-builtins.def
@@ -687,3 +687,8 @@
BUILTIN_VSFDF (UNOP, frint32x, 0)
BUILTIN_VSFDF (UNOP, frint64z, 0)
BUILTIN_VSFDF (UNOP, frint64x, 0)
/* Implemented by aarch64_bfdot{_lane}{q}<mode>. */
VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
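
Each VAR2 line instantiates the builtin for both the v2sf and v4sf return modes, so this block yields __builtin_aarch64_bfdotv2sf, __builtin_aarch64_bfdotv4sf and the corresponding _lane/_laneq variants that the arm_neon.h wrappers below call; the QUADOP_LANE_PAIR qualifier reflects that the lane argument indexes a pair of bfloat16 elements rather than a single element.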
gcc/config/aarch64/aarch64-simd.md
@@ -7059,3 +7059,35 @@
"xtn\t%0.<Vntype>, %1.<Vtype>" "xtn\t%0.<Vntype>, %1.<Vtype>"
[(set_attr "type" "neon_shift_imm_narrow_q")] [(set_attr "type" "neon_shift_imm_narrow_q")]
) )
(define_insn "aarch64_bfdot<mode>"
[(set (match_operand:VDQSF 0 "register_operand" "=w")
(plus:VDQSF
(unspec:VDQSF
[(match_operand:<VBFMLA_W> 2 "register_operand" "w")
(match_operand:<VBFMLA_W> 3 "register_operand" "w")]
UNSPEC_BFDOT)
(match_operand:VDQSF 1 "register_operand" "0")))]
"TARGET_BF16_SIMD"
"bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
[(set_attr "type" "neon_dot<q>")]
)
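
Note how the accumulation is modeled explicitly: the accumulator (operand 1) sits outside the unspec in a plus and is tied to the destination through the "0" constraint, matching the destructive destination register of the bfdot instruction, while only the two multiplicand vectors are wrapped in UNSPEC_BFDOT.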
(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
[(set (match_operand:VDQSF 0 "register_operand" "=w")
(plus:VDQSF
(unspec:VDQSF
[(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
(match_operand:VBF 3 "register_operand" "w")
(match_operand:SI 4 "const_int_operand" "n")]
UNSPEC_BFDOT)
(match_operand:VDQSF 1 "register_operand" "0")))]
"TARGET_BF16_SIMD"
{
int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
int lane = INTVAL (operands[4]);
operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
}
[(set_attr "type" "neon_dot<VDQSF:q>")]
)
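
The lane variant prints the index only after remapping it for big-endian targets, halving the element count because the index selects 2h pairs. A minimal sketch of the remapping, assuming ENDIAN_LANE_N has its usual aarch64 definition of reversing lane order on big-endian (the C function name here is illustrative):

#include <stdio.h>

/* Illustrative model of ENDIAN_LANE_N: big-endian targets number lanes
   in the reverse order, so lane N of NUNITS lanes becomes NUNITS-1-N.  */
static int endian_lane_n (int nunits, int n, int big_endian)
{
  return big_endian ? nunits - 1 - n : n;
}

int main (void)
{
  /* A V8BF operand has 8 bfloat16 elements, i.e. 4 selectable 2h pairs,
     hence the nunits / 2 in the pattern above.  */
  for (int lane = 0; lane < 4; lane++)
    printf ("pair %d -> LE %d, BE %d\n",
	    lane, endian_lane_n (4, lane, 0), endian_lane_n (4, lane, 1));
  return 0;
}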
gcc/config/aarch64/arm_neon.h
@@ -34611,6 +34611,57 @@ vrnd64xq_f64 (float64x2_t __a)
#include "arm_bf16.h" #include "arm_bf16.h"
#pragma GCC push_options
#pragma GCC target ("arch=armv8.2-a+bf16")
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
{
return __builtin_aarch64_bfdotv2sf (__r, __a, __b);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
{
return __builtin_aarch64_bfdotv4sf (__r, __a, __b);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b,
const int __index)
{
return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
const int __index)
{
return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b,
const int __index)
{
return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
const int __index)
{
return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index);
}
#pragma GCC pop_options
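
Together the wrappers expose the full ACLE surface: the plain forms, the _lane forms that index into a 64-bit bfloat16x4_t vector, and the _laneq forms that index into a 128-bit bfloat16x8_t one. A short usage sketch (the function names are ours; build for a bf16-capable target, e.g. -march=armv8.2-a+bf16):

#include <arm_neon.h>

/* r[i] += x[2i]*y[2i] + x[2i+1]*y[2i+1] for the two float32 lanes.  */
float32x2_t
dot_pairs (float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
{
  return vbfdot_f32 (r, x, y);
}

/* Every pair of x is multiplied by pair 1 (elements 2 and 3) of y.  */
float32x4_t
dot_broadcast_pair (float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
{
  return vbfdotq_lane_f32 (r, x, y, 1);
}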
/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. */

#pragma GCC push_options
gcc/config/aarch64/iterators.md
@@ -122,6 +122,9 @@
;; Quad vector with only 2 element modes.
(define_mode_iterator VQ_2E [V2DI V2DF])
;; BFmode vector modes.
(define_mode_iterator VBF [V4BF V8BF])
;; This mode iterator allows :P to be used for patterns that operate on
;; addresses in different modes.  In LP64, only DI will match, while in
;; ILP32, either can match.
@@ -801,6 +804,7 @@
UNSPEC_USUBWT ; Used in aarch64-sve2.md.
UNSPEC_USDOT ; Used in aarch64-simd.md.
UNSPEC_SUDOT ; Used in aarch64-simd.md.
UNSPEC_BFDOT ; Used in aarch64-simd.md.
])

;; ------------------------------------------------------------------
@@ -1451,6 +1455,9 @@
;; Register suffix for DOTPROD input types from the return type.
(define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")])
;; Register suffix for BFDOT input types from the return type.
(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")])
;; Sum of lengths of instructions needed to move vector registers of a mode.
(define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
@@ -1461,11 +1468,14 @@
;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub
(define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")])
;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub
(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")])
(define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")])

(define_mode_attr f16quad [(V2SF "") (V4SF "q")])

-(define_mode_attr isquadop [(V8QI "") (V16QI "q")])
+(define_mode_attr isquadop [(V8QI "") (V16QI "q") (V4BF "") (V8BF "q")])

(define_code_attr f16mac [(plus "a") (minus "s")])
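
Extending isquadop to the BF modes is what lets a single pattern serve both index-vector widths: in aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode> above, V4BF contributes an empty infix and V8BF contributes "q", producing the aarch64_bfdot_lane<mode> and aarch64_bfdot_laneq<mode> names that the builtin definitions reference.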
gcc/testsuite/ChangeLog
 2020-01-16  Stam Markianos-Wright  <stam.markianos-wright@arm.com>

-	* gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c: New test.
-	* gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c: New test.
-	* gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c: New test.
-	* gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c: New test.
-
-2020-01-16  Stam Markianos-Wright  <stam.markianos-wright@arm.com>
-
-	* gcc.target/aarch64/advsimd-intrinsics/vdot-3-1.c: New test.
-	* gcc.target/aarch64/advsimd-intrinsics/vdot-3-2.c: New test.
-	* gcc.target/aarch64/advsimd-intrinsics/vdot-3-3.c: New test.
-	* gcc.target/aarch64/advsimd-intrinsics/vdot-3-4.c: New test.
+	* gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c: New.
+	* gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c: New.
+	* gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c: New.

 2020-01-16  Andre Vieira  <andre.simoesdiasvieira@arm.com>
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c (new file)
/* { dg-do assemble { target { aarch64*-*-* } } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
/* { dg-additional-options "-save-temps" } */
/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
#include <arm_neon.h>
/*
**ufoo:
** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
** ret
*/
float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
{
return vbfdot_f32 (r, x, y);
}
/*
**ufooq:
** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
** ret
*/
float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
{
return vbfdotq_f32 (r, x, y);
}
/*
**ufoo_lane:
** bfdot v0.2s, v1.4h, v2.2h\[0\]
** ret
*/
float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
{
return vbfdot_lane_f32 (r, x, y, 0);
}
/*
**ufooq_laneq:
** bfdot v0.4s, v1.8h, v2.2h\[2\]
** ret
*/
float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
{
return vbfdotq_laneq_f32 (r, x, y, 2);
}
/*
**ufoo_laneq:
** bfdot v0.2s, v1.4h, v2.2h\[3\]
** ret
*/
float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
{
return vbfdot_laneq_f32 (r, x, y, 3);
}
/*
**ufooq_lane:
** bfdot v0.4s, v1.8h, v2.2h\[1\]
** ret
*/
float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
{
return vbfdotq_lane_f32 (r, x, y, 1);
}
/*
**ufoo_untied:
** mov v0.8b, v1.8b
** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
** ret
*/
float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
{
return vbfdot_f32 (r, x, y);
}
/*
**ufooq_lane_untied:
** mov v0.16b, v1.16b
** bfdot v0.4s, v2.8h, v3.2h\[1\]
** ret
*/
float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
{
return vbfdotq_lane_f32 (r, x, y, 1);
}
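
The second test, bfdot-2.c, repeats the same checks under -mbig-endian, where the expected assembly is unchanged: the lane flip applied when the builtin is expanded and the ENDIAN_LANE_N adjustment in the output pattern are intended to cancel out, so the architectural lane printed in the bfdot operand stays the one the user wrote.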
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c (new file)
/* { dg-do assemble { target { aarch64*-*-* } } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
/* { dg-additional-options "-mbig-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
#include <arm_neon.h>
/*
**ufoo:
** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
** ret
*/
float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
{
return vbfdot_f32 (r, x, y);
}
/*
**ufooq:
** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
** ret
*/
float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
{
return vbfdotq_f32 (r, x, y);
}
/*
**ufoo_lane:
** bfdot v0.2s, v1.4h, v2.2h\[0\]
** ret
*/
float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
{
return vbfdot_lane_f32 (r, x, y, 0);
}
/*
**ufooq_laneq:
** bfdot v0.4s, v1.8h, v2.2h\[2\]
** ret
*/
float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
{
return vbfdotq_laneq_f32 (r, x, y, 2);
}
/*
**ufoo_laneq:
** bfdot v0.2s, v1.4h, v2.2h\[3\]
** ret
*/
float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
{
return vbfdot_laneq_f32 (r, x, y, 3);
}
/*
**ufooq_lane:
** bfdot v0.4s, v1.8h, v2.2h\[1\]
** ret
*/
float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
{
return vbfdotq_lane_f32 (r, x, y, 1);
}
/*
**ufoo_untied:
** mov v0.8b, v1.8b
** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
** ret
*/
float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
{
return vbfdot_f32 (r, x, y);
}
/*
**ufooq_lane_untied:
** mov v0.16b, v1.16b
** bfdot v0.4s, v2.8h, v3.2h\[1\]
** ret
*/
float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
{
return vbfdotq_lane_f32 (r, x, y, 1);
}
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c (new file)
/* { dg-do assemble { target { aarch64*-*-* } } } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
/* { dg-additional-options "--save-temps" } */
#include <arm_neon.h>
float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
{
return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */
}
float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
{
return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */
}
float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
{
return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */
}
float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
{
return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */
}
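
The diagnosed ranges follow from the pairwise indexing: a bfloat16x4_t index vector holds two 2h pairs (valid lanes 0-1) and a bfloat16x8_t holds four (valid lanes 0-3), matching the nunits / 2 computation in the lane patterns.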