Commit eb7ba6c3 by Dennis Zhang

arm: ACLE intrinsics for bfloat16 dot product

This patch is part of a series adding support for Armv8.6-A features.
It adds intrinsics for brain half-precision float-point (BF16) dot
instructions with AdvSIMD support.

gcc/ChangeLog:

2020-02-25  Dennis Zhang  <dennis.zhang@arm.com>

	* config/arm/arm_neon.h (vbfdot_f32, vbfdotq_f32): New
	(vbfdot_lane_f32, vbfdotq_laneq_f32): New.
	(vbfdot_laneq_f32, vbfdotq_lane_f32): New.
	* config/arm/arm_neon_builtins.def (vbfdot): New entry.
	(vbfdot_lanev4bf, vbfdot_lanev8bf): Likewise.
	* config/arm/iterators.md (VSF2BF): New attribute.
	* config/arm/neon.md (neon_vbfdot<VCVTF:mode>): New entry.
	(neon_vbfdot_lanev4bf<VCVTF:mode>): Likewise.
	(neon_vbfdot_lanev8bf<VCVTF:mode>): Likewise.

gcc/testsuite/ChangeLog:

2020-02-25  Dennis Zhang  <dennis.zhang@arm.com>

	* gcc.target/arm/simd/bf16_dot_1.c: New test.
	* gcc.target/arm/simd/bf16_dot_2.c: New test.
	* gcc.target/arm/simd/bf16_dot_3.c: New test.
parent 490350a1
2020-02-25 Dennis Zhang <dennis.zhang@arm.com>
* config/arm/arm_neon.h (vbfdot_f32, vbfdotq_f32): New
(vbfdot_lane_f32, vbfdotq_laneq_f32): New.
(vbfdot_laneq_f32, vbfdotq_lane_f32): New.
* config/arm/arm_neon_builtins.def (vbfdot): New entry.
(vbfdot_lanev4bf, vbfdot_lanev8bf): Likewise.
* config/arm/iterators.md (VSF2BF): New attribute.
* config/arm/neon.md (neon_vbfdot<VCVTF:mode>): New entry.
(neon_vbfdot_lanev4bf<VCVTF:mode>): Likewise.
(neon_vbfdot_lanev8bf<VCVTF:mode>): Likewise.
2020-02-25 Christophe Lyon <christophe.lyon@linaro.org> 2020-02-25 Christophe Lyon <christophe.lyon@linaro.org>
* config/arm/arm.md (required_for_purecode): New attribute. * config/arm/arm.md (required_for_purecode): New attribute.
......
...@@ -18819,6 +18819,58 @@ vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) ...@@ -18819,6 +18819,58 @@ vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
#pragma GCC pop_options #pragma GCC pop_options
/* AdvSIMD Brain half-precision float-point (Bfloat16) intrinsics. */
#pragma GCC push_options
#pragma GCC target ("arch=armv8.2-a+bf16")
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
{
return __builtin_neon_vbfdotv2sf (__r, __a, __b);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
{
return __builtin_neon_vbfdotv4sf (__r, __a, __b);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b,
const int __index)
{
return __builtin_neon_vbfdot_lanev4bfv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
const int __index)
{
return __builtin_neon_vbfdot_lanev8bfv4sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b,
const int __index)
{
return __builtin_neon_vbfdot_lanev8bfv2sf (__r, __a, __b, __index);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
const int __index)
{
return __builtin_neon_vbfdot_lanev4bfv4sf (__r, __a, __b, __index);
}
#pragma GCC pop_options
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
......
...@@ -381,3 +381,7 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf) ...@@ -381,3 +381,7 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
VAR1 (TERNOP, smmla, v16qi) VAR1 (TERNOP, smmla, v16qi)
VAR1 (UTERNOP, ummla, v16qi) VAR1 (UTERNOP, ummla, v16qi)
VAR1 (USTERNOP, usmmla, v16qi) VAR1 (USTERNOP, usmmla, v16qi)
VAR2 (TERNOP, vbfdot, v2sf, v4sf)
VAR2 (MAC_LANE_PAIR, vbfdot_lanev4bf, v2sf, v4sf)
VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf)
...@@ -835,6 +835,8 @@ ...@@ -835,6 +835,8 @@
(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")]) (define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")]) (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
(define_mode_attr VSF2BF [(V2SF "V4BF") (V4SF "V8BF")])
;;---------------------------------------------------------------------------- ;;----------------------------------------------------------------------------
;; Code attributes ;; Code attributes
;;---------------------------------------------------------------------------- ;;----------------------------------------------------------------------------
......
...@@ -6596,3 +6596,51 @@ if (BYTES_BIG_ENDIAN) ...@@ -6596,3 +6596,51 @@ if (BYTES_BIG_ENDIAN)
"v<sup>mmla.<mmla_sfx>\t%q0, %q2, %q3" "v<sup>mmla.<mmla_sfx>\t%q0, %q2, %q3"
[(set_attr "type" "neon_mla_s_q")] [(set_attr "type" "neon_mla_s_q")]
) )
(define_insn "neon_vbfdot<VCVTF:mode>"
[(set (match_operand:VCVTF 0 "register_operand" "=w")
(plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
(unspec:VCVTF [
(match_operand:<VSF2BF> 2 "register_operand" "w")
(match_operand:<VSF2BF> 3 "register_operand" "w")]
UNSPEC_DOT_S)))]
"TARGET_BF16_SIMD"
"vdot.bf16\\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
[(set_attr "type" "neon_dot<q>")]
)
(define_insn "neon_vbfdot_lanev4bf<VCVTF:mode>"
[(set (match_operand:VCVTF 0 "register_operand" "=w")
(plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
(unspec:VCVTF [
(match_operand:<VSF2BF> 2 "register_operand" "w")
(match_operand:V4BF 3 "register_operand" "x")
(match_operand:SI 4 "immediate_operand" "i")]
UNSPEC_DOT_S)))]
"TARGET_BF16_SIMD"
"vdot.bf16\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"
[(set_attr "type" "neon_dot<q>")]
)
(define_insn "neon_vbfdot_lanev8bf<VCVTF:mode>"
[(set (match_operand:VCVTF 0 "register_operand" "=w")
(plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
(unspec:VCVTF [
(match_operand:<VSF2BF> 2 "register_operand" "w")
(match_operand:V8BF 3 "register_operand" "x")
(match_operand:SI 4 "immediate_operand" "i")]
UNSPEC_DOT_S)))]
"TARGET_BF16_SIMD"
{
int lane = INTVAL (operands[4]);
int half = GET_MODE_NUNITS (GET_MODE (operands[3])) / 4;
if (lane < half)
return "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
else
{
operands[4] = GEN_INT (lane - half);
return "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
}
}
[(set_attr "type" "neon_dot<q>")]
)
2020-02-25 Dennis Zhang <dennis.zhang@arm.com>
* gcc.target/arm/simd/bf16_dot_1.c: New test.
* gcc.target/arm/simd/bf16_dot_2.c: New test.
* gcc.target/arm/simd/bf16_dot_3.c: New test.
2020-02-25 Jakub Jelinek <jakub@redhat.com> 2020-02-25 Jakub Jelinek <jakub@redhat.com>
PR rtl-optimization/93908 PR rtl-optimization/93908
......
/* { dg-do assemble } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-options "-save-temps -O2" } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
#include "arm_neon.h"
/* BF16 DOT without lane. */
float32x2_t
test_vbfdot_f32 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
{
/* vdot.bf16 d, d, d */
return vbfdot_f32 (r, a, b);
}
float32x4_t
test_vbfdotq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
{
/* vdot.bf16 q, q, q */
return vbfdotq_f32 (r, a, b);
}
/* 64-bit BF16 DOT with lane. */
float32x2_t
test_vbfdot_lane_f32_0 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
{
/* vdot.bf16 d, d, d[0] */
return vbfdot_lane_f32 (r, a, b, 0);
}
float32x2_t
test_vbfdot_lane_f32_1 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
{
/* vdot.bf16 d, d, d[1] */
return vbfdot_lane_f32 (r, a, b, 1);
}
float32x2_t
test_vbfdot_laneq_f32_0 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
{
/* vdot.bf16 d, d, d[0] */
return vbfdot_laneq_f32 (r, a, b, 0);
}
float32x2_t
test_vbfdot_laneq_f32_1 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
{
/* vdot.bf16 d, d, d[1] */
return vbfdot_laneq_f32 (r, a, b, 1);
}
float32x2_t
test_vbfdot_laneq_f32_2 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
{
/* vdot.bf16 d, d, d[0] */
return vbfdot_laneq_f32 (r, a, b, 2);
}
float32x2_t
test_vbfdot_laneq_f32_3 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
{
/* vdot.bf16 d, d, d[1] */
return vbfdot_laneq_f32 (r, a, b, 3);
}
/* 128-bit BF16 DOT with lane. */
float32x4_t
test_vbfdotq_lane_f32_0 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
{
/* vdot.bf16 q, q, d[0] */
return vbfdotq_lane_f32 (r, a, b, 0);
}
float32x4_t
test_vbfdotq_lane_f32_1 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
{
/* vdot.bf16 q, q, d[1] */
return vbfdotq_lane_f32 (r, a, b, 1);
}
float32x4_t
test_vbfdotq_laneq_f32_0 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
{
/* vdot.bf16 q, q, d[0] */
return vbfdotq_laneq_f32 (r, a, b, 0);
}
float32x4_t
test_vbfdotq_laneq_f32_3 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
{
/* vdot.bf16 q, q, d[1] */
return vbfdotq_laneq_f32 (r, a, b, 3);
}
/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\n} 1 } } */
/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 1 } } */
/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]\n} 3 } } */
/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\[1\]\n} 3 } } */
/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[1\]\n} 2 } } */
/* { dg-do compile } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
#include "arm_neon.h"
float32x2_t
test_vbfdot_lane_f32_a (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
{
/* { dg-error "lane -1 out of range 0 - 1" "" {target *-*-*} 0 } */
return vbfdot_lane_f32 (r, a, b, -1);
}
float32x2_t
test_vbfdot_lane_f32_b (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
{
/* { dg-error "lane 2 out of range 0 - 1" "" {target *-*-*} 0 } */
return vbfdot_lane_f32 (r, a, b, 2);
}
float32x2_t
test_vbfdot_laneq_f32_a (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
{
/* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
return vbfdot_laneq_f32 (r, a, b, -1);
}
float32x2_t
test_vbfdot_laneq_f32_b (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
{
/* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
return vbfdot_laneq_f32 (r, a, b, 4);
}
/* { dg-do compile } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
#include "arm_neon.h"
float32x4_t
test_vbfdotq_lane_f32_a (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
{
/* { dg-error "lane -1 out of range 0 - 1" "" {target *-*-*} 0 } */
return vbfdotq_lane_f32 (r, a, b, -1);
}
float32x4_t
test_vbfdotq_lane_f32_b (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
{
/* { dg-error "lane 2 out of range 0 - 1" "" {target *-*-*} 0 } */
return vbfdotq_lane_f32 (r, a, b, 2);
}
float32x4_t
test_vbfdotq_laneq_f32_a (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
{
/* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
return vbfdotq_laneq_f32 (r, a, b, -1);
}
float32x4_t
test_vbfdotq_laneq_f32_b (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
{
/* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
return vbfdotq_laneq_f32 (r, a, b, 4);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment