Commit 6da37857 by Matthew Wahab Committed by Tamar Christina

Committed on behalf of Matthew Wahab

gcc/
2017-05-16  Matthew Wahab  <matthew.wahab@arm.com>

	* config/arm/arm_neon.h (vadd_f16): Use standard arithmetic
	operations in fast-math mode.
	(vaddq_f16): Likewise.
	(vmul_f16): Likewise.
	(vmulq_f16): Likewise.
	(vsub_f16): Likewise.
	(vsubq_f16): Likewise.
	* config/arm/neon.md (add<mode>3): New.
	(sub<mode>3): New.
	(fma:<VH:mode>3): New.  Also remove outdated comment.
	(mul<mode>3): New.

testsuite/
2017-05-16  Matthew Wahab  <matthew.wahab@arm.com>

	* gcc.target/arm/armv8_2-fp16-arith-1.c: Expand comment.  Update
	expected output of vadd, vsub and vmul instructions.
	* gcc.target/arm/armv8_2-fp16-arith-2.c: New.
	* gcc.target/arm/armv8_2-fp16-neon-2.c: New.
	* gcc.target/arm/armv8_2-fp16-neon-3.c: New.

From-SVN: r248090
parent d8c9bc36
2017-05-16 Matthew Wahab <matthew.wahab@arm.com>
* config/arm/arm_neon.h (vadd_f16): Use standard arithmetic
operations in fast-math mode.
(vaddq_f16): Likewise.
(vmul_f16): Likewise.
(vmulq_f16): Likewise.
(vsub_f16): Likewise.
(vsubq_f16): Likewise.
* config/arm/neon.md (add<mode>3): New.
(sub<mode>3): New.
(fma<VH:mode>4): New. Also remove outdated comment.
(mul<mode>3): New.
2017-05-16 Martin Liska <mliska@suse.cz> 2017-05-16 Martin Liska <mliska@suse.cz>
PR ipa/79849. PR ipa/79849.
......
...@@ -17069,14 +17069,22 @@ __extension__ extern __inline float16x4_t ...@@ -17069,14 +17069,22 @@ __extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vadd_f16 (float16x4_t __a, float16x4_t __b) vadd_f16 (float16x4_t __a, float16x4_t __b)
{ {
#ifdef __FAST_MATH__
return __a + __b;
#else
return __builtin_neon_vaddv4hf (__a, __b); return __builtin_neon_vaddv4hf (__a, __b);
#endif
} }
__extension__ extern __inline float16x8_t __extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaddq_f16 (float16x8_t __a, float16x8_t __b) vaddq_f16 (float16x8_t __a, float16x8_t __b)
{ {
#ifdef __FAST_MATH__
return __a + __b;
#else
return __builtin_neon_vaddv8hf (__a, __b); return __builtin_neon_vaddv8hf (__a, __b);
#endif
} }
__extension__ extern __inline uint16x4_t __extension__ extern __inline uint16x4_t
...@@ -17587,7 +17595,11 @@ __extension__ extern __inline float16x4_t ...@@ -17587,7 +17595,11 @@ __extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_f16 (float16x4_t __a, float16x4_t __b) vmul_f16 (float16x4_t __a, float16x4_t __b)
{ {
#ifdef __FAST_MATH__
return __a * __b;
#else
return __builtin_neon_vmulfv4hf (__a, __b); return __builtin_neon_vmulfv4hf (__a, __b);
#endif
} }
__extension__ extern __inline float16x4_t __extension__ extern __inline float16x4_t
...@@ -17608,7 +17620,11 @@ __extension__ extern __inline float16x8_t ...@@ -17608,7 +17620,11 @@ __extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_f16 (float16x8_t __a, float16x8_t __b) vmulq_f16 (float16x8_t __a, float16x8_t __b)
{ {
#ifdef __FAST_MATH__
return __a * __b;
#else
return __builtin_neon_vmulfv8hf (__a, __b); return __builtin_neon_vmulfv8hf (__a, __b);
#endif
} }
__extension__ extern __inline float16x8_t __extension__ extern __inline float16x8_t
...@@ -17804,14 +17820,22 @@ __extension__ extern __inline float16x4_t ...@@ -17804,14 +17820,22 @@ __extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vsub_f16 (float16x4_t __a, float16x4_t __b) vsub_f16 (float16x4_t __a, float16x4_t __b)
{ {
#ifdef __FAST_MATH__
return __a - __b;
#else
return __builtin_neon_vsubv4hf (__a, __b); return __builtin_neon_vsubv4hf (__a, __b);
#endif
} }
__extension__ extern __inline float16x8_t __extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vsubq_f16 (float16x8_t __a, float16x8_t __b) vsubq_f16 (float16x8_t __a, float16x8_t __b)
{ {
#ifdef __FAST_MATH__
return __a - __b;
#else
return __builtin_neon_vsubv8hf (__a, __b); return __builtin_neon_vsubv8hf (__a, __b);
#endif
} }
#endif /* __ARM_FEATURE_VECTOR_FP16_ARITHMETIC. */ #endif /* __ARM_FEATURE_VECTOR_FP16_ARITHMETIC. */
......
...@@ -505,6 +505,23 @@ ...@@ -505,6 +505,23 @@
(const_string "neon_add<q>")))] (const_string "neon_add<q>")))]
) )
;; As with SFmode, full support for HFmode vector arithmetic is only available
;; when flag-unsafe-math-optimizations is enabled.
;; Standard-named "add<mode>3" pattern for HF vector modes, so the
;; vectorizer can use vadd.f16 directly.  Only enabled when
;; flag_unsafe_math_optimizations is set (see the insn condition).
(define_insn "add<mode>3"
[(set
(match_operand:VH 0 "s_register_operand" "=w")
(plus:VH
(match_operand:VH 1 "s_register_operand" "w")
(match_operand:VH 2 "s_register_operand" "w")))]
"TARGET_NEON_FP16INST && flag_unsafe_math_optimizations"
"vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set (attr "type")
(if_then_else (match_test "<Is_float_mode>")
(const_string "neon_fp_addsub_s<q>")
(const_string "neon_add<q>")))]
)
(define_insn "add<mode>3_fp16" (define_insn "add<mode>3_fp16"
[(set [(set
(match_operand:VH 0 "s_register_operand" "=w") (match_operand:VH 0 "s_register_operand" "=w")
...@@ -557,6 +574,17 @@ ...@@ -557,6 +574,17 @@
(const_string "neon_sub<q>")))] (const_string "neon_sub<q>")))]
) )
;; Standard-named "sub<mode>3" pattern for HF vector modes (vsub.f16).
;; As with add<mode>3, gated on flag_unsafe_math_optimizations.
(define_insn "sub<mode>3"
[(set
(match_operand:VH 0 "s_register_operand" "=w")
(minus:VH
(match_operand:VH 1 "s_register_operand" "w")
(match_operand:VH 2 "s_register_operand" "w")))]
"TARGET_NEON_FP16INST && flag_unsafe_math_optimizations"
"vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_sub<q>")]
)
(define_insn "sub<mode>3_fp16" (define_insn "sub<mode>3_fp16"
[(set [(set
(match_operand:VH 0 "s_register_operand" "=w") (match_operand:VH 0 "s_register_operand" "=w")
...@@ -664,8 +692,17 @@ ...@@ -664,8 +692,17 @@
[(set_attr "type" "neon_fp_mla_s<q>")] [(set_attr "type" "neon_fp_mla_s<q>")]
) )
;; There is limited support for unsafe-math optimizations using the NEON FP16 (define_insn "fma<VH:mode>4"
;; arithmetic instructions, so only the intrinsic is currently supported. [(set (match_operand:VH 0 "register_operand" "=w")
(fma:VH
(match_operand:VH 1 "register_operand" "w")
(match_operand:VH 2 "register_operand" "w")
(match_operand:VH 3 "register_operand" "0")))]
"TARGET_NEON_FP16INST && flag_unsafe_math_optimizations"
"vfma.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_fp_mla_s<q>")]
)
(define_insn "fma<VH:mode>4_intrinsic" (define_insn "fma<VH:mode>4_intrinsic"
[(set (match_operand:VH 0 "register_operand" "=w") [(set (match_operand:VH 0 "register_operand" "=w")
(fma:VH (fma:VH
...@@ -2175,6 +2212,17 @@ ...@@ -2175,6 +2212,17 @@
(const_string "neon_mul_<V_elem_ch><q>")))] (const_string "neon_mul_<V_elem_ch><q>")))]
) )
;; Standard-named "mul<mode>3" pattern for HF vector modes (vmul.f16).
;; As with add<mode>3, gated on flag_unsafe_math_optimizations.
(define_insn "mul<mode>3"
[(set
(match_operand:VH 0 "s_register_operand" "=w")
(mult:VH
(match_operand:VH 1 "s_register_operand" "w")
(match_operand:VH 2 "s_register_operand" "w")))]
"TARGET_NEON_FP16INST && flag_unsafe_math_optimizations"
"vmul.f16\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
[(set_attr "type" "neon_mul_<VH_elem_ch><q>")]
)
(define_insn "neon_vmulf<mode>" (define_insn "neon_vmulf<mode>"
[(set [(set
(match_operand:VH 0 "s_register_operand" "=w") (match_operand:VH 0 "s_register_operand" "=w")
......
2017-05-16 Matthew Wahab <matthew.wahab@arm.com>
* gcc.target/arm/armv8_2-fp16-arith-1.c: Expand comment. Update
expected output of vadd, vsub and vmul instructions.
* gcc.target/arm/armv8_2-fp16-arith-2.c: New.
* gcc.target/arm/armv8_2-fp16-neon-2.c: New.
* gcc.target/arm/armv8_2-fp16-neon-3.c: New.
2017-05-15 Jerry DeLisle <jvdelisle@gcc.gnu.org> 2017-05-15 Jerry DeLisle <jvdelisle@gcc.gnu.org>
PR libgfortran/80727 PR libgfortran/80727
......
...@@ -3,7 +3,8 @@ ...@@ -3,7 +3,8 @@
/* { dg-options "-O2 -ffast-math" } */ /* { dg-options "-O2 -ffast-math" } */
/* { dg-add-options arm_v8_2a_fp16_neon } */ /* { dg-add-options arm_v8_2a_fp16_neon } */
/* Test instructions generated for half-precision arithmetic. */ /* Test instructions generated for half-precision arithmetic with
unsafe-math-optimizations enabled. */
typedef __fp16 float16_t; typedef __fp16 float16_t;
typedef __simd64_float16_t float16x4_t; typedef __simd64_float16_t float16x4_t;
...@@ -90,9 +91,18 @@ TEST_CMP (greaterthanqual, >=, int16x8_t, float16x8_t) ...@@ -90,9 +91,18 @@ TEST_CMP (greaterthanqual, >=, int16x8_t, float16x8_t)
/* { dg-final { scan-assembler-times {vneg\.f16\tq[0-9]+, q[0-9]+} 1 } } */ /* { dg-final { scan-assembler-times {vneg\.f16\tq[0-9]+, q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vabs\.f16\ts[0-9]+, s[0-9]+} 2 } } */ /* { dg-final { scan-assembler-times {vabs\.f16\ts[0-9]+, s[0-9]+} 2 } } */
/* { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */ /* { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vsub\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */ /* { dg-final { scan-assembler-times {vadd\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmul\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */ /* { dg-final { scan-assembler-times {vadd\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vsub\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vsub\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vsub\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmul\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */ /* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
/* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 26 } } */ /* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 26 } } */
/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 52 } } */ /* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 52 } } */
......
/* { dg-do compile } */
/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
/* { dg-options "-O2 -fno-fast-math" } */
/* { dg-add-options arm_v8_2a_fp16_neon } */
/* Test instructions generated for half-precision arithmetic without
unsafe-math-optimizations. */
typedef __fp16 float16_t;
typedef __simd64_float16_t float16x4_t;
typedef __simd128_float16_t float16x8_t;
typedef short int16x4_t __attribute__ ((vector_size (8)));
typedef short int int16x8_t __attribute__ ((vector_size (16)));
/* Open-coded absolute value.  With -fno-fast-math this must remain a
   compare-and-negate; the scan-assembler-not below verifies no
   vabs.f16 is emitted for it.  */
float16_t
fp16_abs (float16_t a)
{
return (a < 0) ? -a : a;
}
/* Emit a function applying unary OPERATOR (an operator or a
   function-like name such as fp16_abs) to a TY operand.  */
#define TEST_UNOP(NAME, OPERATOR, TY) \
TY test_##NAME##_##TY (TY a) \
{ \
return OPERATOR (a); \
}
/* Emit a function applying binary OPERATOR to two TY operands.  */
#define TEST_BINOP(NAME, OPERATOR, TY) \
TY test_##NAME##_##TY (TY a, TY b) \
{ \
return a OPERATOR b; \
}
/* Emit a function comparing two TY operands with OPERATOR, returning
   the result as RTY.  */
#define TEST_CMP(NAME, OPERATOR, RTY, TY) \
RTY test_##NAME##_##TY (TY a, TY b) \
{ \
return a OPERATOR b; \
}
/* Instantiate the arithmetic and comparison tests for scalar and for
   64-bit and 128-bit vector half-precision types; the dg-final scans
   below count the instructions these expansions must produce.  */
/* Scalars. */
TEST_UNOP (neg, -, float16_t)
TEST_UNOP (abs, fp16_abs, float16_t)
TEST_BINOP (add, +, float16_t)
TEST_BINOP (sub, -, float16_t)
TEST_BINOP (mult, *, float16_t)
TEST_BINOP (div, /, float16_t)
TEST_CMP (equal, ==, int, float16_t)
TEST_CMP (unequal, !=, int, float16_t)
TEST_CMP (lessthan, <, int, float16_t)
TEST_CMP (greaterthan, >, int, float16_t)
TEST_CMP (lessthanequal, <=, int, float16_t)
TEST_CMP (greaterthanqual, >=, int, float16_t)
/* Vectors of size 4. */
TEST_UNOP (neg, -, float16x4_t)
TEST_BINOP (add, +, float16x4_t)
TEST_BINOP (sub, -, float16x4_t)
TEST_BINOP (mult, *, float16x4_t)
TEST_BINOP (div, /, float16x4_t)
TEST_CMP (equal, ==, int16x4_t, float16x4_t)
TEST_CMP (unequal, !=, int16x4_t, float16x4_t)
TEST_CMP (lessthan, <, int16x4_t, float16x4_t)
TEST_CMP (greaterthan, >, int16x4_t, float16x4_t)
TEST_CMP (lessthanequal, <=, int16x4_t, float16x4_t)
TEST_CMP (greaterthanqual, >=, int16x4_t, float16x4_t)
/* Vectors of size 8. */
TEST_UNOP (neg, -, float16x8_t)
TEST_BINOP (add, +, float16x8_t)
TEST_BINOP (sub, -, float16x8_t)
TEST_BINOP (mult, *, float16x8_t)
TEST_BINOP (div, /, float16x8_t)
TEST_CMP (equal, ==, int16x8_t, float16x8_t)
TEST_CMP (unequal, !=, int16x8_t, float16x8_t)
TEST_CMP (lessthan, <, int16x8_t, float16x8_t)
TEST_CMP (greaterthan, >, int16x8_t, float16x8_t)
TEST_CMP (lessthanequal, <=, int16x8_t, float16x8_t)
TEST_CMP (greaterthanqual, >=, int16x8_t, float16x8_t)
/* { dg-final { scan-assembler-times {vneg\.f16\ts[0-9]+, s[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vneg\.f16\td[0-9]+, d[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vneg\.f16\tq[0-9]+, q[0-9]+} 1 } } */
/* { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
/* { dg-final { scan-assembler-times {vsub\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
/* { dg-final { scan-assembler-times {vmul\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
/* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */
/* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 26 } } */
/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 52 } } */
/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, #0} 2 } } */
/* { dg-final { scan-assembler-not {vabs\.f16} } } */
/* { dg-final { scan-assembler-not {vadd\.f32} } } */
/* { dg-final { scan-assembler-not {vsub\.f32} } } */
/* { dg-final { scan-assembler-not {vmul\.f32} } } */
/* { dg-final { scan-assembler-not {vdiv\.f32} } } */
/* { dg-final { scan-assembler-not {vcmp\.f16} } } */
/* { dg-final { scan-assembler-not {vcmpe\.f16} } } */
/* { dg-do compile } */
/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
/* { dg-options "-O2 -ffast-math" } */
/* { dg-add-options arm_v8_2a_fp16_neon } */
/* Test compiler use of FP16 FMA/FMS instructions with -ffast-math. */
#include <arm_neon.h>
/* Multiply-add shapes (including negation variants that cancel) which
   should each contract to a single vfma.f16 on D registers under
   -ffast-math; the scan below counts four.  */
/* a * b + c.  */
float16x4_t
test_vfma_1 (float16x4_t a, float16x4_t b, float16x4_t c)
{
return vadd_f16 (vmul_f16 (a, b), c);
}
/* a * b - (-c) == a * b + c.  */
float16x4_t
test_vfma_2 (float16x4_t a, float16x4_t b, float16x4_t c)
{
return vsub_f16 (vmul_f16 (a, b), vneg_f16 (c));
}
/* (-a) * (-b) - (-c) == a * b + c.  */
float16x4_t
test_vfma_3 (float16x4_t a, float16x4_t b, float16x4_t c)
{
return vsub_f16 (vmul_f16 (vneg_f16 (a), vneg_f16 (b)), vneg_f16 (c));
}
/* Same shape as test_vfma_2.  */
float16x4_t
test_vfma_4 (float16x4_t a, float16x4_t b, float16x4_t c)
{
return vsub_f16 (vmul_f16 (a, b), vneg_f16 (c));
}
/* { dg-final { scan-assembler-times {vfma\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 4 } } */
/* Q-register versions of the vfma shapes above; each should become one
   vfma.f16 on Q registers (scan below counts four).  */
/* a * b + c.  */
float16x8_t
test_vfmaq_1 (float16x8_t a, float16x8_t b, float16x8_t c)
{
return vaddq_f16 (vmulq_f16 (a, b), c);
}
/* a * b - (-c) == a * b + c.  */
float16x8_t
test_vfmaq_2 (float16x8_t a, float16x8_t b, float16x8_t c)
{
return vsubq_f16 (vmulq_f16 (a, b), vnegq_f16 (c));
}
/* (-a) * (-b) - (-c) == a * b + c.  */
float16x8_t
test_vfmaq_3 (float16x8_t a, float16x8_t b, float16x8_t c)
{
return vsubq_f16 (vmulq_f16 (vnegq_f16 (a), vnegq_f16 (b)), vnegq_f16 (c));
}
/* Same shape as test_vfmaq_2.  */
float16x8_t
test_vfmaq_4 (float16x8_t a, float16x8_t b, float16x8_t c)
{
return vsubq_f16 (vmulq_f16 (a, b), vnegq_f16 (c));
}
/* { dg-final { scan-assembler-times {vfma\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 4 } } */
/* Multiply-subtract shapes which should each contract to a single
   vfms.f16 on D registers under -ffast-math (scan below counts
   four).  */
/* c - a * b.  */
float16x4_t
test_vfms_1 (float16x4_t a, float16x4_t b, float16x4_t c)
{
return vsub_f16 (c, vmul_f16 (a, b));
}
/* a - b * c.  */
float16x4_t
test_vfms_2 (float16x4_t a, float16x4_t b, float16x4_t c)
{
return vsub_f16 (a, vmul_f16 (b, c));
}
/* (-a) * b + c == c - a * b.  */
float16x4_t
test_vfms_3 (float16x4_t a, float16x4_t b, float16x4_t c)
{
return vadd_f16 (vmul_f16 (vneg_f16 (a), b), c);
}
/* a * (-b) + c == c - a * b.  */
float16x4_t
test_vfms_4 (float16x4_t a, float16x4_t b, float16x4_t c)
{
return vadd_f16 (vmul_f16 (a, vneg_f16 (b)), c);
}
/* { dg-final { scan-assembler-times {vfms\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 4 } } */
/* Q-register versions of the vfms shapes above; each should become one
   vfms.f16 on Q registers (scan below counts four).  */
/* c - a * b.  */
float16x8_t
test_vfmsq_1 (float16x8_t a, float16x8_t b, float16x8_t c)
{
return vsubq_f16 (c, vmulq_f16 (a, b));
}
/* a - b * c.  */
float16x8_t
test_vfmsq_2 (float16x8_t a, float16x8_t b, float16x8_t c)
{
return vsubq_f16 (a, vmulq_f16 (b, c));
}
/* (-a) * b + c == c - a * b.  */
float16x8_t
test_vfmsq_3 (float16x8_t a, float16x8_t b, float16x8_t c)
{
return vaddq_f16 (vmulq_f16 (vnegq_f16 (a), b), c);
}
/* a * (-b) + c == c - a * b.  */
float16x8_t
test_vfmsq_4 (float16x8_t a, float16x8_t b, float16x8_t c)
{
return vaddq_f16 (vmulq_f16 (a, vnegq_f16 (b)), c);
}
/* { dg-final { scan-assembler-times {vfms\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 4 } } */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment