Add support for fma intrinsics for ARM.

Correct dates in changelog from earlier commit. 2012-10-18 Matthew Gretton-Dann <matthew.gretton-dann@arm.com> Ramana Radhakrishnan <ramana.radhakrishnan@arm.com> * config/arm/arm.c (neon_builtin_data): Add vfma and vfms builtins. * config/arm/neon-docgen.ml (intrinsic_groups): Add fused-multiply-* groups. * config/neon-gen.ml (print_feature_test_start): New function. (print_feature_test_end): Likewise. (print_variant): Print feature test macros. * config/arm/neon-testgen.ml (emit_prologue): Allow different tests to require different effective targets. (effective_target): New function. (test_intrinsic): Specify correct effective targets. * gcc/config/arm/neon.md (fma<VCVTF:mode>4_intrinsic): New pattern. (fmsub<VCVTF:mode>4_intrinsic): Likewise. (neon_vfma<VCVFT:mode>): New expand. (neon_vfms<VCVFT:mode>): Likewise. * config/neon.ml (opcode): Add Vfma and Vfms. (features): Add Requires_feature. (ops): Add VFMA and VFMS intrinsics. * config/arm/arm_neon.h: Regenerate. * doc/arm-neon-intrinsics.texi: Likewise. 2012-10-18 Matthew Gretton-Dann <matthew.gretton-dann@arm.com> * gcc.target/arm/neon/vfmaQf32.c: New testcase. * gcc.target/arm/neon/vfmaf32.c: Likewise. * gcc.target/arm/neon/vfmsQf32.c: Likewise. * gcc.target/arm/neon/vfmsf32.c: Likewise. Co-Authored-By: Ramana Radhakrishnan <ramana.radhakrishnan@arm.com> From-SVN: r192560

Add support for fma intrinsics for ARM.
Correct dates in changelog from earlier commit. 2012-10-18 Matthew Gretton-Dann <matthew.gretton-dann@arm.com> Ramana Radhakrishnan <ramana.radhakrishnan@arm.com> * config/arm/arm.c (neon_builtin_data): Add vfma and vfms builtins. * config/arm/neon-docgen.ml (intrinsic_groups): Add fused-multiply-* groups. * config/neon-gen.ml (print_feature_test_start): New function. (print_feature_test_end): Likewise. (print_variant): Print feature test macros. * config/arm/neon-testgen.ml (emit_prologue): Allow different tests to require different effective targets. (effective_target): New function. (test_intrinsic): Specify correct effective targets. * gcc/config/arm/neon.md (fma<VCVTF:mode>4_intrinsic): New pattern. (fmsub<VCVTF:mode>4_intrinsic): Likewise. (neon_vfma<VCVFT:mode>): New expand. (neon_vfms<VCVFT:mode>): Likewise. * config/neon.ml (opcode): Add Vfma and Vfms. (features): Add Requires_feature. (ops): Add VFMA and VFMS intrinsics. * config/arm/arm_neon.h: Regenerate. * doc/arm-neon-intrinsics.texi: Likewise. 2012-10-18 Matthew Gretton-Dann <matthew.gretton-dann@arm.com> * gcc.target/arm/neon/vfmaQf32.c: New testcase. * gcc.target/arm/neon/vfmaf32.c: Likewise. * gcc.target/arm/neon/vfmsQf32.c: Likewise. * gcc.target/arm/neon/vfmsf32.c: Likewise. Co-Authored-By: Ramana Radhakrishnan <ramana.radhakrishnan@arm.com> From-SVN: r192560
c4216388 · Matthew Gretton-Dann · Ramana Radhakrishnan · c61f8c3b · c4216388 · c4216388
Commit c4216388 authored Oct 18, 2012 by Matthew Gretton-Dann Committed by Ramana Radhakrishnan Oct 18, 2012
13 changed files
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
+2012-10-18  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
+	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
+        * config/arm/arm.c (neon_builtin_data): Add vfma and vfms
+      	builtins.
+        * config/arm/neon-docgen.ml (intrinsic_groups): Add
+        fused-multiply-* groups.
+        * config/neon-gen.ml (print_feature_test_start): New function.
+        (print_feature_test_end): Likewise.
+        (print_variant): Print feature test macros.
+        * config/arm/neon-testgen.ml (emit_prologue): Allow different
+        tests to require different effective targets.
+        (effective_target): New function.
+        (test_intrinsic): Specify correct effective targets.
+        * gcc/config/arm/neon.md (fma<VCVTF:mode>4_intrinsic): New pattern.
+        (fmsub<VCVTF:mode>4_intrinsic): Likewise.
+        (neon_vfma<VCVFT:mode>): New expand.
+        (neon_vfms<VCVFT:mode>): Likewise.
+        * config/neon.ml (opcode): Add Vfma and Vfms.
+        (features): Add Requires_feature.
+        (ops): Add VFMA and VFMS intrinsics.
+        * config/arm/arm_neon.h: Regenerate.
+        * doc/arm-neon-intrinsics.texi: Likewise.
 2012-10-18  Richard Guenther  <rguenther@suse.de>
 	* lto-streamer.h (enum LTO_tags): Add LTO_integer_cst.
@@ -11,7 +35,7 @@
 	(streamer_pack_tree_bitfields): Call it.
 	(streamer_write_integer_cst): Adjust.
-2012-10-17  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
+2012-10-18  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
 	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>
        * config.gcc: Add support for ARMv8 for arm*-*-* targets.
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -18726,6 +18726,8 @@ static neon_builtin_datum neon_builtin_data[] =
  VAR8 (BINOP, vmul, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
  VAR8 (TERNOP, vmla, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
  VAR3 (TERNOP, vmlal, v8qi, v4hi, v2si),
+  VAR2 (TERNOP, vfma, v2sf, v4sf),
+  VAR2 (TERNOP, vfms, v2sf, v4sf),
  VAR8 (TERNOP, vmls, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
  VAR3 (TERNOP, vmlsl, v8qi, v4hi, v2si),
  VAR4 (BINOP, vqdmulh, v4hi, v2si, v8hi, v4si),

--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -1350,6 +1350,38 @@ vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
  return (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c, 1);
 }
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+  return (float32x2_t)__builtin_neon_vfmav2sf (__a, __b, __c, 3);
+}
+#endif
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+  return (float32x4_t)__builtin_neon_vfmav4sf (__a, __b, __c, 3);
+}
+#endif
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+  return (float32x2_t)__builtin_neon_vfmsv2sf (__a, __b, __c, 3);
+}
+#endif
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+  return (float32x4_t)__builtin_neon_vfmsv4sf (__a, __b, __c, 3);
+}
+#endif
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vsub_s8 (int8x8_t __a, int8x8_t __b)
 {

--- a/gcc/config/arm/neon-docgen.ml
+++ b/gcc/config/arm/neon-docgen.ml
@@ -103,6 +103,8 @@ let intrinsic_groups =
    "Multiplication", single_opcode Vmul;
    "Multiply-accumulate", single_opcode Vmla;
    "Multiply-subtract", single_opcode Vmls;
+    "Fused-multiply-accumulate", single_opcode Vfma;
+    "Fused-multiply-subtract", single_opcode Vfms;
    "Subtraction", single_opcode Vsub;
    "Comparison (equal-to)", single_opcode Vceq;
    "Comparison (greater-than-or-equal-to)", single_opcode Vcge;

--- a/gcc/config/arm/neon-gen.ml
+++ b/gcc/config/arm/neon-gen.ml
@@ -286,6 +286,24 @@ let get_shuffle features =
    | _ -> None
  with Not_found -> None
+let print_feature_test_start features =
+  try
+    match List.find (fun feature ->
+                       match feature with Requires_feature _ -> true
+                                        | _ -> false)
+                     features with
+      Requires_feature feature -> 
+        Format.printf "#ifdef __ARM_FEATURE_%s@\n" feature
+    | _ -> assert false
+  with Not_found -> assert true
+let print_feature_test_end features =
+  let feature =
+    List.exists (function Requires_feature x -> true
+                                        |  _ -> false) features in
+  if feature then Format.printf "#endif@\n"
 let print_variant opcode features shape name (ctype, asmtype, elttype) =
  let bits = infoword_value elttype features in
  let modesuf = mode_suffix elttype shape in
@@ -302,7 +320,11 @@ let print_variant opcode features shape name (ctype, asmtype, elttype) =
 	return ctype builtin in
  let body = pdecls @ rdecls @ stmts
  and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in
-  print_function ctype fnname body
+  begin
+    print_feature_test_start features;
+    print_function ctype fnname body;
+    print_feature_test_end features;
+  end
 (* When this function processes the element types in the ops table, it rewrites
   them in a list of tuples (a,b,c):

--- a/gcc/config/arm/neon-testgen.ml
+++ b/gcc/config/arm/neon-testgen.ml
@@ -46,13 +46,14 @@ let open_test_file dir name =
    failwith ("Could not create test source file " ^ name ^ ": " ^ str)
 (* Emit prologue code to a test source file.  *)
-let emit_prologue chan test_name =
+let emit_prologue chan test_name effective_target =
  Printf.fprintf chan "/* Test the `%s' ARM Neon intrinsic.  */\n" test_name;
  Printf.fprintf chan "/* This file was autogenerated by neon-testgen.  */\n\n";
  Printf.fprintf chan "/* { dg-do assemble } */\n";
-  Printf.fprintf chan "/* { dg-require-effective-target arm_neon_ok } */\n";
+  Printf.fprintf chan "/* { dg-require-effective-target %s_ok } */\n"
+                 effective_target;
  Printf.fprintf chan "/* { dg-options \"-save-temps -O0\" } */\n";
-  Printf.fprintf chan "/* { dg-add-options arm_neon } */\n";
+  Printf.fprintf chan "/* { dg-add-options %s } */\n" effective_target;
  Printf.fprintf chan "\n#include \"arm_neon.h\"\n\n";
  Printf.fprintf chan "void test_%s (void)\n{\n" test_name
@@ -156,6 +157,17 @@ let check_types tys =
                then (Const :: flags, String.sub ty 6 ((String.length ty) - 6))
                else (flags, ty)) tys'
+(* Work out what the effective target should be.  *)
+let effective_target features =
+  try
+    match List.find (fun feature ->
+                       match feature with Requires_feature _ -> true
+                                        | _ -> false)
+                     features with
+      Requires_feature "FMA" -> "arm_neonv2"
+    | _ -> assert false
+  with Not_found -> "arm_neon"
 (* Given an intrinsic shape, produce a regexp that will match
   the right-hand sides of instructions generated by an intrinsic of
   that shape.  *)
@@ -263,8 +275,10 @@ let test_intrinsic dir opcode features shape name munge elt_ty =
 			  "!?\\(\\[ \t\\]+@\\[a-zA-Z0-9 \\]+\\)?\\n")
                         (analyze_all_shapes features shape analyze_shape)
  in
+  let effective_target = effective_target features
+  in
    (* Emit file and function prologues.  *)
-    emit_prologue chan test_name;
+    emit_prologue chan test_name effective_target;
    (* Emit local variable declarations.  *)
    emit_automatics chan c_types features;
    Printf.fprintf chan "\n";

--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -722,6 +722,10 @@
 )
 ;; Fused multiply-accumulate
+;; We define each insn twice here:
+;;    1: with flag_unsafe_math_optimizations for the widening multiply phase
+;;       to be able to use when converting to FMA.
+;;    2: without flag_unsafe_math_optimizations for the intrinsics to use.
 (define_insn "fma<VCVTF:mode>4"
  [(set (match_operand:VCVTF 0 "register_operand" "=w")
        (fma:VCVTF (match_operand:VCVTF 1 "register_operand" "w")
@@ -735,6 +739,19 @@
 		      (const_string "neon_fp_vmla_qqq")))]
 )
+(define_insn "fma<VCVTF:mode>4_intrinsic"
+  [(set (match_operand:VCVTF 0 "register_operand" "=w")
+        (fma:VCVTF (match_operand:VCVTF 1 "register_operand" "w")
+		 (match_operand:VCVTF 2 "register_operand" "w")
+		 (match_operand:VCVTF 3 "register_operand" "0")))]
+  "TARGET_NEON && TARGET_FMA"
+  "vfma%?.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+	(if_then_else (match_test "<Is_d_reg>")
+		      (const_string "neon_fp_vmla_ddd")
+		      (const_string "neon_fp_vmla_qqq")))]
+)
 (define_insn "*fmsub<VCVTF:mode>4"
  [(set (match_operand:VCVTF 0 "register_operand" "=w")
        (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w"))
@@ -748,6 +765,19 @@
 		      (const_string "neon_fp_vmla_qqq")))]
 )
+(define_insn "fmsub<VCVTF:mode>4_intrinsic"
+  [(set (match_operand:VCVTF 0 "register_operand" "=w")
+        (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w"))
+		   (match_operand:VCVTF 2 "register_operand" "w")
+		   (match_operand:VCVTF 3 "register_operand" "0")))]
+  "TARGET_NEON && TARGET_FMA"
+  "vfms%?.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+  [(set (attr "neon_type")
+	(if_then_else (match_test "<Is_d_reg>")
+		      (const_string "neon_fp_vmla_ddd")
+		      (const_string "neon_fp_vmla_qqq")))]
+)
 (define_insn "ior<mode>3"
  [(set (match_operand:VDQ 0 "s_register_operand" "=w,w")
 	(ior:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0")
@@ -1925,6 +1955,32 @@
  DONE;
 })
+(define_expand "neon_vfma<VCVTF:mode>"
+  [(match_operand:VCVTF 0 "s_register_operand")
+   (match_operand:VCVTF 1 "s_register_operand")
+   (match_operand:VCVTF 2 "s_register_operand")
+   (match_operand:VCVTF 3 "s_register_operand")
+   (match_operand:SI 4 "immediate_operand")]
+  "TARGET_NEON && TARGET_FMA"
+{
+  emit_insn (gen_fma<mode>4_intrinsic (operands[0], operands[2], operands[3],
+				       operands[1]));
+  DONE;
+})
+(define_expand "neon_vfms<VCVTF:mode>"
+  [(match_operand:VCVTF 0 "s_register_operand")
+   (match_operand:VCVTF 1 "s_register_operand")
+   (match_operand:VCVTF 2 "s_register_operand")
+   (match_operand:VCVTF 3 "s_register_operand")
+   (match_operand:SI 4 "immediate_operand")]
+  "TARGET_NEON && TARGET_FMA"
+{
+  emit_insn (gen_fmsub<mode>4_intrinsic (operands[0], operands[2], operands[3],
+					 operands[1]));
+  DONE;
+})
 ; Used for intrinsics when flag_unsafe_math_optimizations is false.
 (define_insn "neon_vmla<mode>_unspec"

--- a/gcc/config/arm/neon.ml
+++ b/gcc/config/arm/neon.ml
@@ -102,6 +102,8 @@ type opcode =
  | Vmul
  | Vmla
  | Vmls
+  | Vfma
+  | Vfms
  | Vsub
  | Vceq
  | Vcge
@@ -275,6 +277,8 @@ type features =
  | Const_valuator of (int -> int)
  | Fixed_vector_reg
  | Fixed_core_reg
+    (* Mark that the intrinsic requires __ARM_FEATURE_string to be defined.  *)
+  | Requires_feature of string
 exception MixedMode of elts * elts
@@ -802,6 +806,12 @@ let ops =
    Vmls, [], Long, "vmlsl", elts_same_io, su_8_32;
    Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32];
+    (* Fused-multiply-accumulate. *)
+    Vfma, [Requires_feature "FMA"], All (3, Dreg), "vfma", elts_same_io, [F32];
+    Vfma, [Requires_feature "FMA"], All (3, Qreg), "vfmaQ", elts_same_io, [F32];
+    Vfms, [Requires_feature "FMA"], All (3, Dreg), "vfms", elts_same_io, [F32];
+    Vfms, [Requires_feature "FMA"], All (3, Qreg), "vfmsQ", elts_same_io, [F32];
    (* Subtraction.  *)
    Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_32;
    Vsub, [No_op], All (3, Dreg), "vsub", sign_invar_2,  [S64; U64];

--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
-2012-10-17  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
+2012-10-18  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
+        * gcc.target/arm/neon/vfmaQf32.c: New testcase.
+        * gcc.target/arm/neon/vfmaf32.c: Likewise.
+        * gcc.target/arm/neon/vfmsQf32.c: Likewise.
+        * gcc.target/arm/neon/vfmsf32.c: Likewise.
+2012-10-18  Matthew Gretton-Dann  <matthew.gretton-dann@arm.com>
        * gcc.target/arm/ftest-armv8a-arm.c: New testcase.
        * gcc.target/arm/ftest-armv8a-thumb.c: Likewise.

--- a/gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c
+++ b/gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c
+/* Test the `vfmaQf32' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+#include "arm_neon.h"
+void test_vfmaQf32 (void)
+{
+  float32x4_t out_float32x4_t;
+  float32x4_t arg0_float32x4_t;
+  float32x4_t arg1_float32x4_t;
+  float32x4_t arg2_float32x4_t;
+  out_float32x4_t = vfmaq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
+}
+/* { dg-final { scan-assembler "vfma\.f32\[ 	\]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
--- a/gcc/testsuite/gcc.target/arm/neon/vfmaf32.c
+++ b/gcc/testsuite/gcc.target/arm/neon/vfmaf32.c
+/* Test the `vfmaf32' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+#include "arm_neon.h"
+void test_vfmaf32 (void)
+{
+  float32x2_t out_float32x2_t;
+  float32x2_t arg0_float32x2_t;
+  float32x2_t arg1_float32x2_t;
+  float32x2_t arg2_float32x2_t;
+  out_float32x2_t = vfma_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
+}
+/* { dg-final { scan-assembler "vfma\.f32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
--- a/gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c
+++ b/gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c
+/* Test the `vfmsQf32' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+#include "arm_neon.h"
+void test_vfmsQf32 (void)
+{
+  float32x4_t out_float32x4_t;
+  float32x4_t arg0_float32x4_t;
+  float32x4_t arg1_float32x4_t;
+  float32x4_t arg2_float32x4_t;
+  out_float32x4_t = vfmsq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
+}
+/* { dg-final { scan-assembler "vfms\.f32\[ 	\]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
--- a/gcc/testsuite/gcc.target/arm/neon/vfmsf32.c
+++ b/gcc/testsuite/gcc.target/arm/neon/vfmsf32.c
+/* Test the `vfmsf32' ARM Neon intrinsic.  */
+/* This file was autogenerated by neon-testgen.  */
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+#include "arm_neon.h"
+void test_vfmsf32 (void)
+{
+  float32x2_t out_float32x2_t;
+  float32x2_t arg0_float32x2_t;
+  float32x2_t arg1_float32x2_t;
+  float32x2_t arg2_float32x2_t;
+  out_float32x2_t = vfms_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
+}
+/* { dg-final { scan-assembler "vfms\.f32\[ 	\]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */