Commit 17a13507 by Mihail Ionescu

[GCC][PATCH][ARM] Add vreinterpret, vdup, vget and vset bfloat16 intrinsics

This patch adds support for the bf16 vector create, get, set,
duplicate and reinterpret intrinsics.
ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest

gcc/ChangeLog:

2020-02-27  Mihail Ionescu  <mihail.ionescu@arm.com>

	* config/arm/arm_neon.h (__ARM_NUM_LANES, __arm_lane, __arm_laneq):
	Move to the beginning of the file.
	(vcreate_bf16, vcombine_bf16): New.
	(vdup_n_bf16, vdupq_n_bf16): New.
	(vdup_lane_bf16, vdup_laneq_bf16): New.
	(vdupq_lane_bf16, vdupq_laneq_bf16): New.
	(vduph_lane_bf16, vduph_laneq_bf16): New.
	(vset_lane_bf16, vsetq_lane_bf16): New.
	(vget_lane_bf16, vgetq_lane_bf16): New.
	(vget_high_bf16, vget_low_bf16): New.
	(vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New.
	(vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New.
	(vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New.
	(vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New.
	(vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New.
	(vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New.
	(vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New.
	(vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New.
	(vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New.
	(vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New.
	(vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New.
	(vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New.
	(vreinterpret_bf16_f16, vreinterpretq_bf16_f16): New.
	(vreinterpretq_bf16_p128): New.
	(vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New.
	(vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New.
	(vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New.
	(vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New.
	(vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New.
	(vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New.
	(vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New.
	(vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New.
	(vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New.
	(vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New.
	(vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New.
	(vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New.
	(vreinterpretq_p128_bf16): New.
	* config/arm/arm_neon_builtins.def (vdup_n, vdup_lane, vcombine,
	vget_high, vget_low): Add bfloat16 variants.
	* config/arm/iterators.md (VDX): Add V4BF.
	(V_elem): Likewise.
	(V_elem_l): Likewise.
	(VD_LANE): Likewise.
	(VQX): Add V8BF.
	(V_DOUBLE): Likewise.
	(VDQX): Add V4BF and V8BF.
	(V_two_elem, V_three_elem, V_four_elem): Likewise.
	(V_reg): Likewise.
	(V_HALF): Likewise.
	(V_double_vector_mode): Likewise.
	(V_cmp_result): Likewise.
	(V_uf_sclr): Likewise.
	(V_sz_elem): Likewise.
	(Is_d_reg): Likewise.
	(V_mode_nunits): Likewise.
	* config/arm/neon.md (neon_vdup_lane): Enable for BFloat16.

gcc/testsuite/ChangeLog:

2020-02-27  Mihail Ionescu  <mihail.ionescu@arm.com>

	* gcc.target/arm/bf16_dup.c: New test.
	* gcc.target/arm/bf16_reinterpret.c: Likewise.
parent dc941ea9
2020-02-27 Mihail Ionescu <mihail.ionescu@arm.com>
* (__ARM_NUM_LANES, __arm_lane, __arm_lane_q): Move to the
beginning of the file.
(vcreate_bf16, vcombine_bf16): New.
(vdup_n_bf16, vdupq_n_bf16): New.
(vdup_lane_bf16, vdup_laneq_bf16): New.
(vdupq_lane_bf16, vdupq_laneq_bf16): New.
(vduph_lane_bf16, vduph_laneq_bf16): New.
(vset_lane_bf16, vsetq_lane_bf16): New.
(vget_lane_bf16, vgetq_lane_bf16): New.
(vget_high_bf16, vget_low_bf16): New.
(vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New.
(vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New.
(vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New.
(vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New.
(vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New.
(vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New.
(vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New.
(vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New.
(vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New.
(vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New.
(vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New.
(vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New.
(vreinterpret_bf16_f64, vreinterpretq_bf16_f64): New.
(vreinterpretq_bf16_p128): New.
(vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New.
(vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New.
(vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New.
(vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New.
(vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New.
(vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New.
(vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New.
(vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New.
(vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New.
(vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New.
(vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New.
(vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New.
(vreinterpretq_p128_bf16): New.
* config/arm/arm_neon_builtins.def (VDX): Add V4BF.
(V_elem): Likewise.
(V_elem_l): Likewise.
(VD_LANE): Likewise.
(VQX): Add V8BF.
(V_DOUBLE): Likewise.
(VDQX): Add V4BF and V8BF.
(V_two_elem, V_three_elem, V_four_elem): Likewise.
(V_reg): Likewise.
(V_HALF): Likewise.
(V_double_vector_mode): Likewise.
(V_cmp_result): Likewise.
(V_uf_sclr): Likewise.
(V_sz_elem): Likewise.
(Is_d_reg): Likewise.
(V_mode_nunits): Likewise.
* config/arm/neon.md (neon_vdup_lane): Enable for BFloat16.
2020-02-27 Andrew Stubbs <ams@codesourcery.com> 2020-02-27 Andrew Stubbs <ams@codesourcery.com>
* config/gcn/gcn-valu.md (VEC_SUBDWORD_MODE): New mode iterator. * config/gcn/gcn-valu.md (VEC_SUBDWORD_MODE): New mode iterator.
......
...@@ -42,6 +42,18 @@ extern "C" { ...@@ -42,6 +42,18 @@ extern "C" {
#include <arm_bf16.h> #include <arm_bf16.h>
#include <stdint.h> #include <stdint.h>
/* For big-endian, GCC's vector indices are reversed within each 64
bits compared to the architectural lane indices used by Neon
intrinsics. */
#ifdef __ARM_BIG_ENDIAN
/* Number of lanes in vector __v, derived from its storage size.  */
#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))
/* Map an architectural lane number to GCC's vector index for a
   64-bit (D-register) vector: reverse across the whole vector.  */
#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) - 1))
/* Same for a 128-bit (Q-register) vector: the xor mask is half the
   lane count minus one, so indices are reversed within each 64-bit
   half only.  */
#define __arm_laneq(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec)/2 - 1))
#else
/* Little-endian: GCC vector indices match architectural lane numbers.  */
#define __arm_lane(__vec, __idx) __idx
#define __arm_laneq(__vec, __idx) __idx
#endif
typedef __simd64_int8_t int8x8_t; typedef __simd64_int8_t int8x8_t;
typedef __simd64_int16_t int16x4_t; typedef __simd64_int16_t int16x4_t;
typedef __simd64_int32_t int32x2_t; typedef __simd64_int32_t int32x2_t;
...@@ -6144,18 +6156,6 @@ vget_lane_s32 (int32x2_t __a, const int __b) ...@@ -6144,18 +6156,6 @@ vget_lane_s32 (int32x2_t __a, const int __b)
were marked always-inline so there were no call sites, the declaration were marked always-inline so there were no call sites, the declaration
would nonetheless raise an error. Hence, we must use a macro instead. */ would nonetheless raise an error. Hence, we must use a macro instead. */
/* For big-endian, GCC's vector indices are reversed within each 64
bits compared to the architectural lane indices used by Neon
intrinsics. */
#ifdef __ARM_BIG_ENDIAN
#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))
#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) - 1))
#define __arm_laneq(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec)/2 - 1))
#else
#define __arm_lane(__vec, __idx) __idx
#define __arm_laneq(__vec, __idx) __idx
#endif
#define vget_lane_f16(__v, __idx) \ #define vget_lane_f16(__v, __idx) \
__extension__ \ __extension__ \
({ \ ({ \
...@@ -14476,6 +14476,15 @@ vreinterpret_p16_u32 (uint32x2_t __a) ...@@ -14476,6 +14476,15 @@ vreinterpret_p16_u32 (uint32x2_t __a)
#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
__extension__ extern __inline float16x4_t __extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_f16_bf16 (bfloat16x4_t __a)
{
return (float16x4_t) __a;
}
#endif
#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
__extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_f16_p8 (poly8x8_t __a) vreinterpret_f16_p8 (poly8x8_t __a)
{ {
return (float16x4_t) __a; return (float16x4_t) __a;
...@@ -15688,6 +15697,15 @@ vreinterpretq_f16_p16 (poly16x8_t __a) ...@@ -15688,6 +15697,15 @@ vreinterpretq_f16_p16 (poly16x8_t __a)
#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
__extension__ extern __inline float16x8_t __extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_f16_bf16 (bfloat16x8_t __a)
{
return (float16x8_t) __a;
}
#endif
#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
__extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_f16_f32 (float32x4_t __a) vreinterpretq_f16_f32 (float32x4_t __a)
{ {
return (float16x8_t) __a; return (float16x8_t) __a;
...@@ -18823,6 +18841,496 @@ vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) ...@@ -18823,6 +18841,496 @@ vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
#pragma GCC push_options #pragma GCC push_options
#pragma GCC target ("arch=armv8.2-a+bf16") #pragma GCC target ("arch=armv8.2-a+bf16")
/* vcreate_bf16: reinterpret the 64-bit integer __a as a vector of
   four bfloat16 lanes.  Pure bit-pattern cast, no conversion.  */
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_bf16 (uint64_t __a)
{
return (bfloat16x4_t) __a;
}
/* vdup_n_bf16: broadcast the bfloat16 scalar __a to all four lanes
   of a 64-bit (D-register) vector.  */
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_n_bf16 (bfloat16_t __a)
{
return __builtin_neon_vdup_nv4bf (__a);
}
/* vdupq_n_bf16: broadcast the bfloat16 scalar __a to all eight lanes
   of a 128-bit (Q-register) vector.  */
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_bf16 (bfloat16_t __a)
{
return __builtin_neon_vdup_nv8bf (__a);
}
/* vdup_lane_bf16: broadcast lane __b (a constant index into the
   four-lane source) of __a to all lanes of a 64-bit result.  */
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_lane_bf16 (bfloat16x4_t __a, const int __b)
{
return __builtin_neon_vdup_lanev4bf (__a, __b);
}
/* vdupq_lane_bf16: broadcast lane __b of the 64-bit source __a to
   all eight lanes of a 128-bit result.  */
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_lane_bf16 (bfloat16x4_t __a, const int __b)
{
return __builtin_neon_vdup_lanev8bf (__a, __b);
}
/* vset_lane_bf16: return __v with lane __idx replaced by the scalar
   __e.  Must be a macro (not an inline function) so that
   __builtin_arm_lane_check can diagnose non-constant or
   out-of-range indices; __arm_lane remaps the architectural lane
   number to GCC's vector index on big-endian targets.  */
#define vset_lane_bf16(__e, __v, __idx) \
__extension__ \
({ \
bfloat16_t __elem = (__e); \
bfloat16x4_t __vec = (__v); \
__builtin_arm_lane_check (4, __idx); \
__vec[__arm_lane(__vec, __idx)] = __elem; \
__vec; \
})
/* vsetq_lane_bf16: 128-bit variant of vset_lane_bf16 — return __v
   with lane __idx (checked against the eight-lane range) replaced by
   __e.  Uses __arm_laneq, which reverses within 64-bit halves on
   big-endian.  */
#define vsetq_lane_bf16(__e, __v, __idx) \
__extension__ \
({ \
bfloat16_t __elem = (__e); \
bfloat16x8_t __vec = (__v); \
__builtin_arm_lane_check (8, __idx); \
__vec[__arm_laneq(__vec, __idx)] = __elem; \
__vec; \
})
/* vget_lane_bf16: extract lane __idx of the 64-bit vector __v as a
   bfloat16 scalar.  A macro so __builtin_arm_lane_check can verify
   the constant index at compile time.  */
#define vget_lane_bf16(__v, __idx) \
__extension__ \
({ \
bfloat16x4_t __vec = (__v); \
__builtin_arm_lane_check (4, __idx); \
bfloat16_t __res = __vec[__arm_lane(__vec, __idx)]; \
__res; \
})
/* vgetq_lane_bf16: extract lane __idx of the 128-bit vector __v as a
   bfloat16 scalar (eight-lane index range).  */
#define vgetq_lane_bf16(__v, __idx) \
__extension__ \
({ \
bfloat16x8_t __vec = (__v); \
__builtin_arm_lane_check (8, __idx); \
bfloat16_t __res = __vec[__arm_laneq(__vec, __idx)]; \
__res; \
})
/* vdup_laneq_bf16: broadcast lane __b of the 128-bit vector __a to
   all four lanes of a 64-bit bfloat16 result.  Implemented as an
   extract (vgetq_lane_bf16 checks the constant index) followed by a
   scalar broadcast.  */
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdup_laneq_bf16 (bfloat16x8_t __a, const int __b)
{
  /* Call spacing follows GNU style: space before the open paren,
     none after it.  */
  return vdup_n_bf16 (vgetq_lane_bf16 (__a, __b));
}
/* vdupq_laneq_bf16: broadcast lane __b of the 128-bit vector __a to
   all eight lanes of a 128-bit bfloat16 result.  Extract then
   broadcast, mirroring vdup_laneq_bf16.  */
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b)
{
  /* Call spacing follows GNU style: space before the open paren,
     none after it.  */
  return vdupq_n_bf16 (vgetq_lane_bf16 (__a, __b));
}
/* vduph_lane_bf16: return lane __b of the 64-bit vector __a as a
   bfloat16 scalar (scalar form of the lane duplicate; forwards to
   vget_lane_bf16).  */
__extension__ extern __inline bfloat16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vduph_lane_bf16 (bfloat16x4_t __a, const int __b)
{
return vget_lane_bf16 (__a, __b);
}
/* vduph_laneq_bf16: return lane __b of the 128-bit vector __a as a
   bfloat16 scalar (forwards to vgetq_lane_bf16).  */
__extension__ extern __inline bfloat16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vduph_laneq_bf16 (bfloat16x8_t __a, const int __b)
{
return vgetq_lane_bf16 (__a, __b);
}
/* vget_high_bf16: return the high 64-bit half (lanes 4-7) of the
   128-bit vector __a.  */
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_bf16 (bfloat16x8_t __a)
{
return __builtin_neon_vget_highv8bf (__a);
}
/* vget_low_bf16: return the low 64-bit half (lanes 0-3) of the
   128-bit vector __a.  */
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_bf16 (bfloat16x8_t __a)
{
return __builtin_neon_vget_lowv8bf (__a);
}
/* vcombine_bf16: concatenate the 64-bit vectors __a (low half) and
   __b (high half) into one 128-bit vector.  */
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b)
{
return __builtin_neon_vcombinev4bf (__a, __b);
}
/* vreinterpret_bf16_<type>: reinterpret the bits of a 64-bit vector
   of another element type as bfloat16x4_t.  These are pure
   bit-pattern casts; no value conversion is performed.  */
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_u8 (uint8x8_t __a)
{
return (bfloat16x4_t)__a;
}
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_u16 (uint16x4_t __a)
{
return (bfloat16x4_t)__a;
}
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_u32 (uint32x2_t __a)
{
return (bfloat16x4_t)__a;
}
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_u64 (uint64x1_t __a)
{
return (bfloat16x4_t)__a;
}
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_s8 (int8x8_t __a)
{
return (bfloat16x4_t)__a;
}
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_s16 (int16x4_t __a)
{
return (bfloat16x4_t)__a;
}
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_s32 (int32x2_t __a)
{
return (bfloat16x4_t)__a;
}
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_s64 (int64x1_t __a)
{
return (bfloat16x4_t)__a;
}
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_p8 (poly8x8_t __a)
{
return (bfloat16x4_t)__a;
}
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_p16 (poly16x4_t __a)
{
return (bfloat16x4_t)__a;
}
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_p64 (poly64x1_t __a)
{
return (bfloat16x4_t)__a;
}
/* The float16x4_t variant only exists when an __fp16 format is
   selected, hence the extra guard.  */
#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_f16 (float16x4_t __a)
{
return (bfloat16x4_t)__a;
}
#endif
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_bf16_f32 (float32x2_t __a)
{
return (bfloat16x4_t)__a;
}
/* vreinterpretq_bf16_<type>: reinterpret the bits of a 128-bit
   vector of another element type (including the 128-bit poly128_t
   scalar) as bfloat16x8_t.  Pure bit-pattern casts.  */
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_u8 (uint8x16_t __a)
{
return (bfloat16x8_t)__a;
}
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_u16 (uint16x8_t __a)
{
return (bfloat16x8_t)__a;
}
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_u32 (uint32x4_t __a)
{
return (bfloat16x8_t)__a;
}
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_u64 (uint64x2_t __a)
{
return (bfloat16x8_t)__a;
}
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_s8 (int8x16_t __a)
{
return (bfloat16x8_t)__a;
}
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_s16 (int16x8_t __a)
{
return (bfloat16x8_t)__a;
}
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_s32 (int32x4_t __a)
{
return (bfloat16x8_t)__a;
}
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_s64 (int64x2_t __a)
{
return (bfloat16x8_t)__a;
}
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_p8 (poly8x16_t __a)
{
return (bfloat16x8_t)__a;
}
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_p16 (poly16x8_t __a)
{
return (bfloat16x8_t)__a;
}
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_p64 (poly64x2_t __a)
{
return (bfloat16x8_t)__a;
}
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_p128 (poly128_t __a)
{
return (bfloat16x8_t)__a;
}
/* The float16x8_t variant only exists when an __fp16 format is
   selected, hence the extra guard.  */
#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_f16 (float16x8_t __a)
{
return (bfloat16x8_t)__a;
}
#endif
__extension__ extern __inline bfloat16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_bf16_f32 (float32x4_t __a)
{
return (bfloat16x8_t)__a;
}
/* vreinterpret_<type>_bf16: the reverse direction — reinterpret a
   bfloat16x4_t as a 64-bit vector of another element type.  Pure
   bit-pattern casts.  */
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_s8_bf16 (bfloat16x4_t __a)
{
return (int8x8_t)__a;
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_s16_bf16 (bfloat16x4_t __a)
{
return (int16x4_t)__a;
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_s32_bf16 (bfloat16x4_t __a)
{
return (int32x2_t)__a;
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_s64_bf16 (bfloat16x4_t __a)
{
return (int64x1_t)__a;
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_u8_bf16 (bfloat16x4_t __a)
{
return (uint8x8_t)__a;
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_u16_bf16 (bfloat16x4_t __a)
{
return (uint16x4_t)__a;
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_u32_bf16 (bfloat16x4_t __a)
{
return (uint32x2_t)__a;
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_u64_bf16 (bfloat16x4_t __a)
{
return (uint64x1_t)__a;
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_f32_bf16 (bfloat16x4_t __a)
{
return (float32x2_t)__a;
}
__extension__ extern __inline poly8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_p8_bf16 (bfloat16x4_t __a)
{
return (poly8x8_t)__a;
}
__extension__ extern __inline poly16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_p16_bf16 (bfloat16x4_t __a)
{
return (poly16x4_t)__a;
}
__extension__ extern __inline poly64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_p64_bf16 (bfloat16x4_t __a)
{
return (poly64x1_t)__a;
}
/* vreinterpretq_<type>_bf16: reinterpret a bfloat16x8_t as a
   128-bit vector of another element type (or as the poly128_t
   scalar).  Pure bit-pattern casts.  */
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_s8_bf16 (bfloat16x8_t __a)
{
return (int8x16_t)__a;
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_s16_bf16 (bfloat16x8_t __a)
{
return (int16x8_t)__a;
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_s32_bf16 (bfloat16x8_t __a)
{
return (int32x4_t)__a;
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_s64_bf16 (bfloat16x8_t __a)
{
return (int64x2_t)__a;
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_u8_bf16 (bfloat16x8_t __a)
{
return (uint8x16_t)__a;
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_u16_bf16 (bfloat16x8_t __a)
{
return (uint16x8_t)__a;
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_u32_bf16 (bfloat16x8_t __a)
{
return (uint32x4_t)__a;
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_u64_bf16 (bfloat16x8_t __a)
{
return (uint64x2_t)__a;
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_f32_bf16 (bfloat16x8_t __a)
{
return (float32x4_t)__a;
}
__extension__ extern __inline poly8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_p8_bf16 (bfloat16x8_t __a)
{
return (poly8x16_t)__a;
}
__extension__ extern __inline poly16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_p16_bf16 (bfloat16x8_t __a)
{
return (poly16x8_t)__a;
}
__extension__ extern __inline poly64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_p64_bf16 (bfloat16x8_t __a)
{
return (poly64x2_t)__a;
}
__extension__ extern __inline poly128_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_p128_bf16 (bfloat16x8_t __a)
{
return (poly128_t)__a;
}
__extension__ extern __inline float32x2_t __extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
......
...@@ -221,13 +221,13 @@ VAR10 (SETLANE, vset_lane, ...@@ -221,13 +221,13 @@ VAR10 (SETLANE, vset_lane,
VAR5 (UNOP, vcreate, v8qi, v4hi, v2si, v2sf, di) VAR5 (UNOP, vcreate, v8qi, v4hi, v2si, v2sf, di)
VAR10 (UNOP, vdup_n, VAR10 (UNOP, vdup_n,
v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
VAR2 (UNOP, vdup_n, v8hf, v4hf) VAR4 (UNOP, vdup_n, v8hf, v4hf, v8bf, v4bf)
VAR10 (GETLANE, vdup_lane, VAR10 (GETLANE, vdup_lane,
v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
VAR2 (GETLANE, vdup_lane, v8hf, v4hf) VAR4 (GETLANE, vdup_lane, v8hf, v4hf, v8bf, v4bf)
VAR6 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di) VAR7 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf)
VAR6 (UNOP, vget_high, v16qi, v8hi, v8hf, v4si, v4sf, v2di) VAR7 (UNOP, vget_high, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di)
VAR6 (UNOP, vget_low, v16qi, v8hi, v8hf, v4si, v4sf, v2di) VAR7 (UNOP, vget_low, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di)
VAR3 (UNOP, vmovn, v8hi, v4si, v2di) VAR3 (UNOP, vmovn, v8hi, v4si, v2di)
VAR3 (UNOP, vqmovns, v8hi, v4si, v2di) VAR3 (UNOP, vqmovns, v8hi, v4si, v2di)
VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di) VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di)
......
...@@ -82,14 +82,14 @@ ...@@ -82,14 +82,14 @@
(define_mode_iterator VD_RE [V8QI V4HI V2SI V2SF DI]) (define_mode_iterator VD_RE [V8QI V4HI V2SI V2SF DI])
;; Double-width vector modes plus 64-bit elements. ;; Double-width vector modes plus 64-bit elements.
(define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI]) (define_mode_iterator VDX [V8QI V4HI V4HF V4BF V2SI V2SF DI])
;; Double-width vector modes plus 64-bit elements, ;; Double-width vector modes plus 64-bit elements,
;; with V4BFmode added, suitable for moves. ;; with V4BFmode added, suitable for moves.
(define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI]) (define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI])
;; Double-width vector modes, with V4HF - for vldN_lane and vstN_lane. ;; Double-width vector modes, with V4HF - for vldN_lane and vstN_lane.
(define_mode_iterator VD_LANE [V8QI V4HI V4HF V2SI V2SF]) (define_mode_iterator VD_LANE [V8QI V4HI V4HF V4BF V2SI V2SF])
;; Double-width vector modes without floating-point elements. ;; Double-width vector modes without floating-point elements.
(define_mode_iterator VDI [V8QI V4HI V2SI]) (define_mode_iterator VDI [V8QI V4HI V2SI])
...@@ -104,7 +104,7 @@ ...@@ -104,7 +104,7 @@
(define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF]) (define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF])
;; Quad-width vector modes plus 64-bit elements. ;; Quad-width vector modes plus 64-bit elements.
(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI]) (define_mode_iterator VQX [V16QI V8HI V8HF V8BF V4SI V4SF V2DI])
;; Quad-width vector modes without floating-point elements. ;; Quad-width vector modes without floating-point elements.
(define_mode_iterator VQI [V16QI V8HI V4SI]) (define_mode_iterator VQI [V16QI V8HI V4SI])
...@@ -153,7 +153,7 @@ ...@@ -153,7 +153,7 @@
;; Vector modes, including 64-bit integer elements. ;; Vector modes, including 64-bit integer elements.
(define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI (define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI
V4HF V8HF V2SF V4SF DI V2DI]) V4HF V8HF V4BF V8BF V2SF V4SF DI V2DI])
;; Vector modes including 64-bit integer elements, but no floats. ;; Vector modes including 64-bit integer elements, but no floats.
(define_mode_iterator VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI]) (define_mode_iterator VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI])
...@@ -522,6 +522,7 @@ ...@@ -522,6 +522,7 @@
(define_mode_attr V_elem [(V8QI "QI") (V16QI "QI") (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
(V4HI "HI") (V8HI "HI") (V4HI "HI") (V8HI "HI")
(V4HF "HF") (V8HF "HF") (V4HF "HF") (V8HF "HF")
(V4BF "BF") (V8BF "BF")
(V2SI "SI") (V4SI "SI") (V2SI "SI") (V4SI "SI")
(V2SF "SF") (V4SF "SF") (V2SF "SF") (V4SF "SF")
(DI "DI") (V2DI "DI")]) (DI "DI") (V2DI "DI")])
...@@ -530,6 +531,7 @@ ...@@ -530,6 +531,7 @@
(define_mode_attr V_elem_l [(V8QI "qi") (V16QI "qi") (define_mode_attr V_elem_l [(V8QI "qi") (V16QI "qi")
(V4HI "hi") (V8HI "hi") (V4HI "hi") (V8HI "hi")
(V4HF "hf") (V8HF "hf") (V4HF "hf") (V8HF "hf")
(V4BF "bf") (V8BF "bf")
(V2SI "si") (V4SI "si") (V2SI "si") (V4SI "si")
(V2SF "sf") (V4SF "sf") (V2SF "sf") (V4SF "sf")
(DI "di") (V2DI "di")]) (DI "di") (V2DI "di")])
...@@ -547,6 +549,7 @@ ...@@ -547,6 +549,7 @@
(define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI") (define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI")
(V4HI "SI") (V8HI "SI") (V4HI "SI") (V8HI "SI")
(V4HF "SF") (V8HF "SF") (V4HF "SF") (V8HF "SF")
(V4BF "BF") (V8BF "BF")
(V2SI "V2SI") (V4SI "V2SI") (V2SI "V2SI") (V4SI "V2SI")
(V2SF "V2SF") (V4SF "V2SF") (V2SF "V2SF") (V4SF "V2SF")
(DI "V2DI") (V2DI "V2DI")]) (DI "V2DI") (V2DI "V2DI")])
...@@ -567,6 +570,7 @@ ...@@ -567,6 +570,7 @@
(define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK") (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
(V4HI "BLK") (V8HI "BLK") (V4HI "BLK") (V8HI "BLK")
(V4HF "BLK") (V8HF "BLK") (V4HF "BLK") (V8HF "BLK")
(V4BF "BLK") (V8BF "BLK")
(V2SI "BLK") (V4SI "BLK") (V2SI "BLK") (V4SI "BLK")
(V2SF "BLK") (V4SF "BLK") (V2SF "BLK") (V4SF "BLK")
(DI "EI") (V2DI "EI")]) (DI "EI") (V2DI "EI")])
...@@ -575,6 +579,7 @@ ...@@ -575,6 +579,7 @@
(define_mode_attr V_four_elem [(V8QI "SI") (V16QI "SI") (define_mode_attr V_four_elem [(V8QI "SI") (V16QI "SI")
(V4HI "V4HI") (V8HI "V4HI") (V4HI "V4HI") (V8HI "V4HI")
(V4HF "V4HF") (V8HF "V4HF") (V4HF "V4HF") (V8HF "V4HF")
(V4BF "V4BF") (V8BF "V4BF")
(V2SI "V4SI") (V4SI "V4SI") (V2SI "V4SI") (V4SI "V4SI")
(V2SF "V4SF") (V4SF "V4SF") (V2SF "V4SF") (V4SF "V4SF")
(DI "OI") (V2DI "OI")]) (DI "OI") (V2DI "OI")])
...@@ -583,6 +588,7 @@ ...@@ -583,6 +588,7 @@
(define_mode_attr V_reg [(V8QI "P") (V16QI "q") (define_mode_attr V_reg [(V8QI "P") (V16QI "q")
(V4HI "P") (V8HI "q") (V4HI "P") (V8HI "q")
(V4HF "P") (V8HF "q") (V4HF "P") (V8HF "q")
(V4BF "P") (V8BF "q")
(V2SI "P") (V4SI "q") (V2SI "P") (V4SI "q")
(V2SF "P") (V4SF "q") (V2SF "P") (V4SF "q")
(DI "P") (V2DI "q") (DI "P") (V2DI "q")
...@@ -613,7 +619,8 @@ ...@@ -613,7 +619,8 @@
(define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI") (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
(V8HF "V4HF") (V4SI "V2SI") (V8HF "V4HF") (V4SI "V2SI")
(V4SF "V2SF") (V2DF "DF") (V4SF "V2SF") (V2DF "DF")
(V2DI "DI") (V4HF "HF")]) (V2DI "DI") (V4HF "HF")
(V4BF "BF") (V8BF "V4BF")])
;; Same, but lower-case. ;; Same, but lower-case.
(define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi") (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
...@@ -624,7 +631,7 @@ ...@@ -624,7 +631,7 @@
(define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI") (define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI")
(V2SI "V4SI") (V4HF "V8HF") (V2SI "V4SI") (V4HF "V8HF")
(V2SF "V4SF") (DF "V2DF") (V2SF "V4SF") (DF "V2DF")
(DI "V2DI")]) (DI "V2DI") (V4BF "V8BF")])
;; Same, but lower-case. ;; Same, but lower-case.
(define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi") (define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi")
...@@ -643,6 +650,7 @@ ...@@ -643,6 +650,7 @@
(V4SI "V2SI") (V4SF "V2SF") (V4SI "V2SI") (V4SF "V2SF")
(V8QI "V8QI") (V4HI "V4HI") (V8QI "V8QI") (V4HI "V4HI")
(V2SI "V2SI") (V2SF "V2SF") (V2SI "V2SI") (V2SF "V2SF")
(V8BF "V4BF") (V4BF "V4BF")
(V8HF "V4HF") (V4HF "V4HF")]) (V8HF "V4HF") (V4HF "V4HF")])
;; Mode of result of comparison operations (and bit-select operand 1). ;; Mode of result of comparison operations (and bit-select operand 1).
...@@ -650,6 +658,7 @@ ...@@ -650,6 +658,7 @@
(V4HI "V4HI") (V8HI "V8HI") (V4HI "V4HI") (V8HI "V8HI")
(V2SI "V2SI") (V4SI "V4SI") (V2SI "V2SI") (V4SI "V4SI")
(V4HF "V4HI") (V8HF "V8HI") (V4HF "V4HI") (V8HF "V8HI")
(V4BF "V4HI") (V8BF "V8HI")
(V2SF "V2SI") (V4SF "V4SI") (V2SF "V2SI") (V4SF "V4SI")
(DI "DI") (V2DI "V2DI")]) (DI "DI") (V2DI "V2DI")])
...@@ -691,6 +700,7 @@ ...@@ -691,6 +700,7 @@
(V4HI "u16") (V8HI "u16") (V4HI "u16") (V8HI "u16")
(V2SI "32") (V4SI "32") (V2SI "32") (V4SI "32")
(V4HF "u16") (V8HF "u16") (V4HF "u16") (V8HF "u16")
(V4BF "u16") (V8BF "u16")
(V2SF "32") (V4SF "32")]) (V2SF "32") (V4SF "32")])
(define_mode_attr V_sz_elem [(V8QI "8") (V16QI "8") (define_mode_attr V_sz_elem [(V8QI "8") (V16QI "8")
...@@ -698,6 +708,7 @@ ...@@ -698,6 +708,7 @@
(V2SI "32") (V4SI "32") (V2SI "32") (V4SI "32")
(DI "64") (V2DI "64") (DI "64") (V2DI "64")
(V4HF "16") (V8HF "16") (V4HF "16") (V8HF "16")
(V4BF "16") (V8BF "16")
(V2SF "32") (V4SF "32")]) (V2SF "32") (V4SF "32")])
(define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b") (define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b")
...@@ -768,10 +779,12 @@ ...@@ -768,10 +779,12 @@
(V2SI "true") (V4SI "false") (V2SI "true") (V4SI "false")
(V2SF "true") (V4SF "false") (V2SF "true") (V4SF "false")
(DI "true") (V2DI "false") (DI "true") (V2DI "false")
(V4BF "true") (V8BF "false")
(V4HF "true") (V8HF "false")]) (V4HF "true") (V8HF "false")])
(define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16") (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16")
(V4HF "4") (V8HF "8") (V4HF "4") (V8HF "8")
(V4BF "4") (V8BF "8")
(V4HI "4") (V8HI "8") (V4HI "4") (V8HI "8")
(V2SI "2") (V4SI "4") (V2SI "2") (V4SI "4")
(V2SF "2") (V4SF "4") (V2SF "2") (V4SF "4")
......
...@@ -3737,6 +3737,22 @@ if (BYTES_BIG_ENDIAN) ...@@ -3737,6 +3737,22 @@ if (BYTES_BIG_ENDIAN)
[(set_attr "type" "neon_from_gp_q")] [(set_attr "type" "neon_from_gp_q")]
) )
;; Broadcast a BFmode scalar held in a core register into all four
;; lanes of a 64-bit (D-register) bfloat16 vector.  Named pattern
;; backing __builtin_neon_vdup_nv4bf (vdup_n_bf16 in arm_neon.h).
(define_insn "neon_vdup_nv4bf"
[(set (match_operand:V4BF 0 "s_register_operand" "=w")
(vec_duplicate:V4BF (match_operand:BF 1 "s_register_operand" "r")))]
"TARGET_NEON"
"vdup.16\t%P0, %1"
[(set_attr "type" "neon_from_gp")]
)
;; As above, but filling all eight lanes of a 128-bit (Q-register)
;; bfloat16 vector.  Backs __builtin_neon_vdup_nv8bf
;; (vdupq_n_bf16 in arm_neon.h).
(define_insn "neon_vdup_nv8bf"
[(set (match_operand:V8BF 0 "s_register_operand" "=w")
(vec_duplicate:V8BF (match_operand:BF 1 "s_register_operand" "r")))]
"TARGET_NEON"
"vdup.16\t%q0, %1"
[(set_attr "type" "neon_from_gp_q")]
)
(define_insn "neon_vdup_n<mode>" (define_insn "neon_vdup_n<mode>"
[(set (match_operand:V32 0 "s_register_operand" "=w,w") [(set (match_operand:V32 0 "s_register_operand" "=w,w")
(vec_duplicate:V32 (match_operand:<V_elem> 1 "s_register_operand" "r,t")))] (vec_duplicate:V32 (match_operand:<V_elem> 1 "s_register_operand" "r,t")))]
...@@ -3791,12 +3807,12 @@ if (BYTES_BIG_ENDIAN) ...@@ -3791,12 +3807,12 @@ if (BYTES_BIG_ENDIAN)
) )
(define_insn "neon_vdup_lane<mode>_internal" (define_insn "neon_vdup_lane<mode>_internal"
[(set (match_operand:VH 0 "s_register_operand" "=w") [(set (match_operand:VHFBF 0 "s_register_operand" "=w")
(vec_duplicate:VH (vec_duplicate:VHFBF
(vec_select:<V_elem> (vec_select:<V_elem>
(match_operand:<V_double_vector_mode> 1 "s_register_operand" "w") (match_operand:<V_double_vector_mode> 1 "s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
"TARGET_NEON && TARGET_FP16" "TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
{ {
if (BYTES_BIG_ENDIAN) if (BYTES_BIG_ENDIAN)
{ {
...@@ -3832,10 +3848,10 @@ if (BYTES_BIG_ENDIAN) ...@@ -3832,10 +3848,10 @@ if (BYTES_BIG_ENDIAN)
}) })
(define_expand "neon_vdup_lane<mode>" (define_expand "neon_vdup_lane<mode>"
[(match_operand:VH 0 "s_register_operand") [(match_operand:VHFBF 0 "s_register_operand")
(match_operand:<V_double_vector_mode> 1 "s_register_operand") (match_operand:<V_double_vector_mode> 1 "s_register_operand")
(match_operand:SI 2 "immediate_operand")] (match_operand:SI 2 "immediate_operand")]
"TARGET_NEON && TARGET_FP16" "TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
{ {
if (BYTES_BIG_ENDIAN) if (BYTES_BIG_ENDIAN)
{ {
......
2020-02-27 Mihail Ionescu <mihail.ionescu@arm.com>
* gcc.target/arm/bf16_dup.c: New test.
* gcc.target/arm/bf16_reinterpret.c: Likewise.
2020-02-27 Will Schmidt <will_schmidt@vnet.ibm.com> 2020-02-27 Will Schmidt <will_schmidt@vnet.ibm.com>
* lib/target_supports.exp (check_effective_target_has_arch_pwr5): New. * lib/target_supports.exp (check_effective_target_has_arch_pwr5): New.
......
/* { dg-do assemble { target { arm*-*-* } } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
/* { dg-additional-options "-save-temps -march=armv8.2-a+bf16+fp16 -mfloat-abi=softfp" } */
#include "arm_neon.h"
/* Build two bfloat16x4_t vectors from raw 64-bit lane images via
   vcreate_bf16 and feed them to a BFDOT.  The scan below confirms a
   D-register vdot.bf16 instruction is emitted, proving vcreate_bf16
   produced real bf16 vector values.  */
float32x2_t
test_vbfdot_vcreate (float32x2_t r, uint64_t a, uint64_t b)
{
bfloat16x4_t _a = vcreate_bf16(a);
bfloat16x4_t _b = vcreate_bf16(b);
return vbfdot_f32 (r, _a, _b);
}
/* { dg-final { scan-assembler {vdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+} } } */
/* Smoke tests for the bf16 vector create/get/set intrinsics added by this
   patch.  Each wrapper just forwards to one intrinsic; the test's purpose
   is that these all compile and assemble (dg-do assemble), not to check
   specific instruction output.  */

/* Concatenate two 64-bit bf16 vectors into one 128-bit vector.  */
bfloat16x8_t test_vcombine_bf16 (bfloat16x4_t a, bfloat16x4_t b)
{
return vcombine_bf16 (a, b);
}

/* Extract the high half (lanes 4-7) of a 128-bit bf16 vector.  */
bfloat16x4_t test_vget_high_bf16 (bfloat16x8_t a)
{
return vget_high_bf16 (a);
}

/* Extract the low half (lanes 0-3) of a 128-bit bf16 vector.  */
bfloat16x4_t test_vget_low_bf16 (bfloat16x8_t a)
{
return vget_low_bf16 (a);
}

/* Read a single lane; lane 1 of 4 (an interior lane).  */
bfloat16_t test_vget_lane_bf16 (bfloat16x4_t a)
{
return vget_lane_bf16 (a, 1);
}

/* Read the last lane (7) of a 128-bit vector — exercises the upper
   bound of the lane-index range check.  */
bfloat16_t test_vgetq_lane_bf16 (bfloat16x8_t a)
{
return vgetq_lane_bf16 (a, 7);
}

/* Insert a scalar into lane 1 of a 64-bit bf16 vector.  */
bfloat16x4_t test_vset_lane_bf16 (bfloat16_t a, bfloat16x4_t b)
{
return vset_lane_bf16 (a, b, 1);
}

/* Insert a scalar into the last lane (7) of a 128-bit bf16 vector.  */
bfloat16x8_t test_vsetq_lane_bf16 (bfloat16_t a, bfloat16x8_t b)
{
return vsetq_lane_bf16 (a, b, 7);
}
/* Tests for the bf16 duplicate intrinsics.  The scan-assembler directives
   pin the expected encodings: vdup_n_* broadcast from a core register
   (vdup.16 d/q, rN — bf16 reuses the 16-bit element form) and
   vdup_lane_* broadcast from a SIMD lane (vdup.16, d[lane]).  */

/* Broadcast a scalar to all 4 lanes of a D register.  */
bfloat16x4_t vdup_test (bfloat16_t a)
{
return vdup_n_bf16 (a);
}
/* { dg-final { scan-assembler {vdup\.16\td[0-9]+, r[0-9]+} } } */

/* Broadcast a scalar to all 8 lanes of a Q register.  */
bfloat16x8_t vdupq_test (bfloat16_t a)
{
return vdupq_n_bf16 (a);
}
/* { dg-final { scan-assembler {vdup\.16\tq[0-9]+, r[0-9]+} } } */

/* Broadcast lane 1 of a 64-bit vector to a 64-bit result.  */
bfloat16x4_t test_vdup_lane_bf16 (bfloat16x4_t a)
{
return vdup_lane_bf16 (a, 1);
}
/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, d[0-9]+\[1\]} 1 } } */

/* Broadcast lane 1 of a 64-bit vector to a 128-bit result.  */
bfloat16x8_t test_vdupq_lane_bf16 (bfloat16x4_t a)
{
return vdupq_lane_bf16 (a, 1);
}
/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, d[0-9]+\[1\]} 1 } } */

/* laneq variants take a 128-bit source; lane 3 lives in its low half.
   No scan here — compile/assemble coverage only.  */
bfloat16x4_t test_vdup_laneq_bf16 (bfloat16x8_t a)
{
return vdup_laneq_bf16 (a, 3);
}

bfloat16x8_t test_vdupq_laneq_bf16 (bfloat16x8_t a)
{
return vdupq_laneq_bf16 (a, 3);
}

/* vduph_* extract a single lane to a scalar.  */
bfloat16_t test_vduph_lane_bf16 (bfloat16x4_t a)
{
return vduph_lane_bf16 (a, 1);
}

bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a)
{
return vduph_laneq_bf16 (a, 7);
}
/* { dg-do assemble { target { arm*-*-* } } } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
/* { dg-additional-options "-save-temps -march=armv8.2-a+fp16+bf16 -mfloat-abi=hard -mfpu=crypto-neon-fp-armv8" } */
#include <arm_neon.h>
/* D-register (64-bit) vreinterpret-to-bf16 coverage: each function
   reinterprets a pair of vectors of some other element type as
   bfloat16x4_t and feeds them to vbfdot_f32, forcing the reinterpreted
   value into real use.  There are 13 such functions; the
   scan-assembler-times directive at the end of the file counts 13
   D-form vdot.bf16 instructions accordingly.  */

float32x2_t
test_vbfdot_f32_s8 (float32x2_t r, int8x8_t a, int8x8_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_s8(a);
bfloat16x4_t _b = vreinterpret_bf16_s8(b);
return vbfdot_f32 (r, _a, _b);
}

float32x2_t
test_vbfdot_f32_s16 (float32x2_t r, int16x4_t a, int16x4_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_s16(a);
bfloat16x4_t _b = vreinterpret_bf16_s16(b);
return vbfdot_f32 (r, _a, _b);
}

float32x2_t
test_vbfdot_f32_s32 (float32x2_t r, int32x2_t a, int32x2_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_s32(a);
bfloat16x4_t _b = vreinterpret_bf16_s32(b);
return vbfdot_f32 (r, _a, _b);
}

float32x2_t
test_vbfdot_f32_s64 (float32x2_t r, int64x1_t a, int64x1_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_s64(a);
bfloat16x4_t _b = vreinterpret_bf16_s64(b);
return vbfdot_f32 (r, _a, _b);
}

float32x2_t
test_vbfdot_f32_u8 (float32x2_t r, uint8x8_t a, uint8x8_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_u8(a);
bfloat16x4_t _b = vreinterpret_bf16_u8(b);
return vbfdot_f32 (r, _a, _b);
}

float32x2_t
test_vbfdot_f32_u16 (float32x2_t r, uint16x4_t a, uint16x4_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_u16(a);
bfloat16x4_t _b = vreinterpret_bf16_u16(b);
return vbfdot_f32 (r, _a, _b);
}

float32x2_t
test_vbfdot_f32_u32 (float32x2_t r, uint32x2_t a, uint32x2_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_u32(a);
bfloat16x4_t _b = vreinterpret_bf16_u32(b);
return vbfdot_f32 (r, _a, _b);
}

float32x2_t
test_vbfdot_f32_u64 (float32x2_t r, uint64x1_t a, uint64x1_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_u64(a);
bfloat16x4_t _b = vreinterpret_bf16_u64(b);
return vbfdot_f32 (r, _a, _b);
}

float32x2_t
test_vbfdot_f32_p8 (float32x2_t r, poly8x8_t a, poly8x8_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_p8(a);
bfloat16x4_t _b = vreinterpret_bf16_p8(b);
return vbfdot_f32 (r, _a, _b);
}

float32x2_t
test_vbfdot_f32_p16 (float32x2_t r, poly16x4_t a, poly16x4_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_p16(a);
bfloat16x4_t _b = vreinterpret_bf16_p16(b);
return vbfdot_f32 (r, _a, _b);
}

float32x2_t
test_vbfdot_f32_p64 (float32x2_t r, poly64x1_t a, poly64x1_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_p64(a);
bfloat16x4_t _b = vreinterpret_bf16_p64(b);
return vbfdot_f32 (r, _a, _b);
}

float32x2_t
test_vbfdot_f32_f16 (float32x2_t r, float16x4_t a, float16x4_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_f16(a);
bfloat16x4_t _b = vreinterpret_bf16_f16(b);
return vbfdot_f32 (r, _a, _b);
}

float32x2_t
test_vbfdot_f32_f32 (float32x2_t r, float32x2_t a, float32x2_t b)
{
bfloat16x4_t _a = vreinterpret_bf16_f32(a);
bfloat16x4_t _b = vreinterpret_bf16_f32(b);
return vbfdot_f32 (r, _a, _b);
}
/* Q-register (128-bit) vreinterpretq-to-bf16 coverage, mirroring the
   D-register group above but with one extra source type (poly128_t,
   which has no 64-bit counterpart): 14 functions, matched by the
   Q-form count of 14 in the directive below.  */

float32x4_t
test_vbfdotq_f32_s8 (float32x4_t r, int8x16_t a, int8x16_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_s8(a);
bfloat16x8_t _b = vreinterpretq_bf16_s8(b);
return vbfdotq_f32 (r, _a, _b);
}

float32x4_t
test_vbfdotq_f32_s16 (float32x4_t r, int16x8_t a, int16x8_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_s16(a);
bfloat16x8_t _b = vreinterpretq_bf16_s16(b);
return vbfdotq_f32 (r, _a, _b);
}

float32x4_t
test_vbfdotq_f32_s32 (float32x4_t r, int32x4_t a, int32x4_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_s32(a);
bfloat16x8_t _b = vreinterpretq_bf16_s32(b);
return vbfdotq_f32 (r, _a, _b);
}

float32x4_t
test_vbfdotq_f32_s64 (float32x4_t r, int64x2_t a, int64x2_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_s64(a);
bfloat16x8_t _b = vreinterpretq_bf16_s64(b);
return vbfdotq_f32 (r, _a, _b);
}

float32x4_t
test_vbfdotq_f32_u8 (float32x4_t r, uint8x16_t a, uint8x16_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_u8(a);
bfloat16x8_t _b = vreinterpretq_bf16_u8(b);
return vbfdotq_f32 (r, _a, _b);
}

float32x4_t
test_vbfdotq_f32_u16 (float32x4_t r, uint16x8_t a, uint16x8_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_u16(a);
bfloat16x8_t _b = vreinterpretq_bf16_u16(b);
return vbfdotq_f32 (r, _a, _b);
}

float32x4_t
test_vbfdotq_f32_u32 (float32x4_t r, uint32x4_t a, uint32x4_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_u32(a);
bfloat16x8_t _b = vreinterpretq_bf16_u32(b);
return vbfdotq_f32 (r, _a, _b);
}

float32x4_t
test_vbfdotq_f32_u64 (float32x4_t r, uint64x2_t a, uint64x2_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_u64(a);
bfloat16x8_t _b = vreinterpretq_bf16_u64(b);
return vbfdotq_f32 (r, _a, _b);
}

float32x4_t
test_vbfdotq_f32_p8 (float32x4_t r, poly8x16_t a, poly8x16_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_p8(a);
bfloat16x8_t _b = vreinterpretq_bf16_p8(b);
return vbfdotq_f32 (r, _a, _b);
}

float32x4_t
test_vbfdotq_f32_p16 (float32x4_t r, poly16x8_t a, poly16x8_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_p16(a);
bfloat16x8_t _b = vreinterpretq_bf16_p16(b);
return vbfdotq_f32 (r, _a, _b);
}

float32x4_t
test_vbfdotq_f32_p64 (float32x4_t r, poly64x2_t a, poly64x2_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_p64(a);
bfloat16x8_t _b = vreinterpretq_bf16_p64(b);
return vbfdotq_f32 (r, _a, _b);
}

/* poly128_t is a scalar 128-bit type; only the q-form reinterpret
   exists, hence no D-register counterpart above.  */
float32x4_t
test_vbfdotq_f32_p128 (float32x4_t r, poly128_t a, poly128_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_p128(a);
bfloat16x8_t _b = vreinterpretq_bf16_p128(b);
return vbfdotq_f32 (r, _a, _b);
}

float32x4_t
test_vbfdotq_f32_f16 (float32x4_t r, float16x8_t a, float16x8_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_f16(a);
bfloat16x8_t _b = vreinterpretq_bf16_f16(b);
return vbfdotq_f32 (r, _a, _b);
}

float32x4_t
test_vbfdotq_f32_f32 (float32x4_t r, float32x4_t a, float32x4_t b)
{
bfloat16x8_t _a = vreinterpretq_bf16_f32(a);
bfloat16x8_t _b = vreinterpretq_bf16_f32(b);
return vbfdotq_f32 (r, _a, _b);
}

/* 13 D-form calls above (first group) and 14 Q-form calls (this group).  */
/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\n} 13 } } */
/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 14 } } */
/* D-register vreinterpret-from-bf16 coverage.  Each reinterpreted value
   is consumed by a type-specific operation (vadd, vrshl, vzip, vsli,
   vsub) so the result is live and the scan directives at the end of the
   file can match the corresponding instruction in the output.  */

int8x8_t test_vreinterpret_s8_bf16 (bfloat16x4_t a, int8x8_t b)
{
int8x8_t _a = vreinterpret_s8_bf16 (a);
return vadd_s8 (_a, b);
}

int16x4_t test_vreinterpret_s16_bf16 (bfloat16x4_t a, int16x4_t b)
{
int16x4_t _a = vreinterpret_s16_bf16 (a);
return vadd_s16 (_a, b);
}

int32x2_t test_vreinterpret_s32_bf16 (bfloat16x4_t a, int32x2_t b)
{
int32x2_t _a = vreinterpret_s32_bf16 (a);
return vadd_s32 (_a, b);
}

/* 64-bit lanes use vrshl rather than vadd (no vadd.i64 D-form scan).  */
int64x1_t test_vreinterpret_s64_bf16 (bfloat16x4_t a, int64x1_t b)
{
int64x1_t _a = vreinterpret_s64_bf16 (a);
return vrshl_s64 (_a, b);
}

uint8x8_t test_vreinterpret_u8_bf16 (bfloat16x4_t a, uint8x8_t b)
{
uint8x8_t _a = vreinterpret_u8_bf16 (a);
return vadd_u8 (_a, b);
}

uint16x4_t test_vreinterpret_u16_bf16 (bfloat16x4_t a, uint16x4_t b)
{
uint16x4_t _a = vreinterpret_u16_bf16 (a);
return vadd_u16 (_a, b);
}

uint32x2_t test_vreinterpret_u32_bf16 (bfloat16x4_t a, uint32x2_t b)
{
uint32x2_t _a = vreinterpret_u32_bf16 (a);
return vadd_u32 (_a, b);
}

/* b is int64x1_t (not uint) because vrshl_u64's shift operand is signed.  */
uint64x1_t test_vreinterpret_u64_bf16 (bfloat16x4_t a, int64x1_t b)
{
uint64x1_t _a = vreinterpret_u64_bf16 (a);
return vrshl_u64 (_a, b);
}

/* Polynomial types are consumed via vzip (8/16-bit) and vsli (64-bit).  */
poly8x8x2_t test_vreinterpret_p8_bf16 (bfloat16x4_t a, poly8x8_t b)
{
poly8x8_t _a = vreinterpret_p8_bf16 (a);
return vzip_p8 (_a, b);
}

poly16x4x2_t test_vreinterpret_p16_bf16 (bfloat16x4_t a, poly16x4_t b)
{
poly16x4_t _a = vreinterpret_p16_bf16 (a);
return vzip_p16 (_a, b);
}

poly64x1_t test_vreinterpret_p64_bf16 (bfloat16x4_t a, poly64x1_t b)
{
poly64x1_t _a = vreinterpret_p64_bf16 (a);
return vsli_n_p64 (_a, b, 3);
}

float32x2_t test_vreinterpret_f32_bf16 (bfloat16x4_t a, float32x2_t b)
{
float32x2_t _a = vreinterpret_f32_bf16 (a);
return vsub_f32 (_a, b);
}
/* Q-register vreinterpretq-from-bf16 coverage, mirroring the D-register
   group with two additions: poly128_t and the f16 pass-through pair.  */

int8x16_t test_vreinterpretq_s8_bf16 (bfloat16x8_t a, int8x16_t b)
{
int8x16_t _a = vreinterpretq_s8_bf16 (a);
return vaddq_s8 (_a, b);
}

int16x8_t test_vreinterpretq_s16_bf16 (bfloat16x8_t a, int16x8_t b)
{
int16x8_t _a = vreinterpretq_s16_bf16 (a);
return vaddq_s16 (_a, b);
}

int32x4_t test_vreinterpretq_s32_bf16 (bfloat16x8_t a, int32x4_t b)
{
int32x4_t _a = vreinterpretq_s32_bf16 (a);
return vaddq_s32 (_a, b);
}

int64x2_t test_vreinterpretq_s64_bf16 (bfloat16x8_t a, int64x2_t b)
{
int64x2_t _a = vreinterpretq_s64_bf16 (a);
return vaddq_s64 (_a, b);
}

uint8x16_t test_vreinterpretq_u8_bf16 (bfloat16x8_t a, uint8x16_t b)
{
uint8x16_t _a = vreinterpretq_u8_bf16 (a);
return vaddq_u8 (_a, b);
}

uint16x8_t test_vreinterpretq_u16_bf16 (bfloat16x8_t a, uint16x8_t b)
{
uint16x8_t _a = vreinterpretq_u16_bf16 (a);
return vaddq_u16 (_a, b);
}

uint32x4_t test_vreinterpretq_u32_bf16 (bfloat16x8_t a, uint32x4_t b)
{
uint32x4_t _a = vreinterpretq_u32_bf16 (a);
return vaddq_u32 (_a, b);
}

uint64x2_t test_vreinterpretq_u64_bf16 (bfloat16x8_t a, uint64x2_t b)
{
uint64x2_t _a = vreinterpretq_u64_bf16 (a);
return vaddq_u64 (_a, b);
}

poly8x16x2_t test_vreinterpretq_p8_bf16 (bfloat16x8_t a, poly8x16_t b)
{
poly8x16_t _a = vreinterpretq_p8_bf16 (a);
return vzipq_p8 (_a, b);
}

poly16x8x2_t test_vreinterpretq_p16_bf16 (bfloat16x8_t a, poly16x8_t b)
{
poly16x8_t _a = vreinterpretq_p16_bf16 (a);
return vzipq_p16 (_a, b);
}

poly64x2_t test_vreinterpretq_p64_bf16 (bfloat16x8_t a, poly64x2_t b)
{
poly64x2_t _a = vreinterpretq_p64_bf16 (a);
return vsliq_n_p64 (_a, b, 3);
}

/* NOTE(review): parameter b is unused here — poly128_t is scalar, so the
   reinterpret result is simply returned; b appears to be kept only for
   signature symmetry with the other tests.  */
poly128_t test_vreinterpretq_p128_bf16 (bfloat16x8_t a, poly16x8_t b)
{
poly128_t _a = vreinterpretq_p128_bf16 (a);
return _a;
}

float32x4_t test_vreinterpretq_f32_bf16 (bfloat16x8_t a, float32x4_t b)
{
float32x4_t _a = vreinterpretq_f32_bf16 (a);
return vsubq_f32 (_a, b);
}

/* f16<->bf16 reinterprets are returned directly (no consuming op and no
   matching scan directive — assemble coverage only).  */
float16x4_t test_vreinterpret_f16_bf16 (bfloat16x4_t a)
{
return vreinterpret_f16_bf16 (a);
}

float16x8_t test_vreinterpretq_f16_bf16 (bfloat16x8_t a)
{
return vreinterpretq_f16_bf16 (a);
}
/* { dg-final { scan-assembler-times {\tvadd.i8\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i16\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i32\td[0-9]+, d[0-9]+, d[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i8\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i32\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler-times {\tvadd.i64\tq[0-9]+, q[0-9]+, q[0-9]+\n} 2 } } */
/* { dg-final { scan-assembler {\tvsub.f32\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvsub.f32\tq[0-9]+, q[0-9]+, q[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvzip.8\td[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvzip.16\td[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvzip.8\tq[0-9]+, q[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvzip.16\tq[0-9]+, q[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvrshl.s64\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvrshl.u64\td[0-9]+, d[0-9]+, d[0-9]+\n} } } */
/* { dg-final { scan-assembler {\tvsli.64\td[0-9]+, d[0-9]+, #3\n} } } */
/* { dg-final { scan-assembler {\tvsli.64\tq[0-9]+, q[0-9]+, #3\n} } } */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment