Commit 91bd4114 by James Greenhalgh, committed by James Greenhalgh

[AArch64] Rewrite the vdup_lane intrinsics in C

gcc/
	* config/aarch64/aarch64-simd-builtins.def
	(dup_lane_scalar): Remove.
	* config/aarch64/aarch64-simd.md
	(aarch64_simd_dup): Add 'w->w' alternative.
	(aarch64_dup_lane<mode>): Allow for VALL.
	(aarch64_dup_lane_scalar<mode>): Remove.
	(aarch64_dup_lane_<vswap_width_name><mode>): New.
	(aarch64_get_lane_signed<mode>): Add w->w alternative.
	(aarch64_get_lane_unsigned<mode>): Likewise.
	(aarch64_get_lane<mode>): Likewise.
	* config/aarch64/aarch64.c (aarch64_evpc_dup): New.
	(aarch64_expand_vec_perm_const_1): Use aarch64_evpc_dup.
	* config/aarch64/iterators.md (VSWAP_WIDTH): New.
	(VCON): Change container of V2SF.
	(vswap_width_name): Likewise.
	* config/aarch64/arm_neon.h
	(__aarch64_vdup_lane_any): New.
	(__aarch64_vdup<q>_lane<q>_<fpsu><8,16,32,64>): Likewise.
	(vdup<q>_n_<psuf><8,16,32,64>): Convert to C implementation.
	(vdup<q>_lane<q>_<fpsu><8,16,32,64>): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/scalar_intrinsics.c
	(vdup<bhsd>_lane<su><8,16,32,64>): Force values to SIMD registers.

From-SVN: r202180
parent d617d2d8
2013-09-02 James Greenhalgh <james.greenhalgh@arm.com>
* config/aarch64/aarch64-simd-builtins.def
(dup_lane_scalar): Remove.
* config/aarch64/aarch64-simd.md
(aarch64_simd_dup): Add 'w->w' alternative.
(aarch64_dup_lane<mode>): Allow for VALL.
(aarch64_dup_lane_scalar<mode>): Remove.
(aarch64_dup_lane_<vswap_width_name><mode>): New.
(aarch64_get_lane_signed<mode>): Add w->w alternative.
(aarch64_get_lane_unsigned<mode>): Likewise.
(aarch64_get_lane<mode>): Likewise.
* config/aarch64/aarch64.c (aarch64_evpc_dup): New.
(aarch64_expand_vec_perm_const_1): Use aarch64_evpc_dup.
* config/aarch64/iterators.md (VSWAP_WIDTH): New.
(VCON): Change container of V2SF.
(vswap_width_name): Likewise.
* config/aarch64/arm_neon.h
(__aarch64_vdup_lane_any): New.
(__aarch64_vdup<q>_lane<q>_<fpsu><8,16,32,64>): Likewise.
(vdup<q>_n_<psuf><8,16,32,64>): Convert to C implementation.
(vdup<q>_lane<q>_<fpsu><8,16,32,64>): Likewise.
2013-09-02 Eric Botcazou <ebotcazou@adacore.com>
PR middle-end/56382
......
...@@ -336,32 +336,47 @@ ...@@ -336,32 +336,47 @@
}) })
(define_insn "aarch64_simd_dup<mode>" (define_insn "aarch64_simd_dup<mode>"
[(set (match_operand:VDQ 0 "register_operand" "=w") [(set (match_operand:VDQ 0 "register_operand" "=w, w")
(vec_duplicate:VDQ (match_operand:<VEL> 1 "register_operand" "r")))] (vec_duplicate:VDQ (match_operand:<VEL> 1 "register_operand" "r, w")))]
"TARGET_SIMD"
"@
dup\\t%0.<Vtype>, %<vw>1
dup\\t%0.<Vtype>, %1.<Vetype>[0]"
[(set_attr "simd_type" "simd_dupgp, simd_dup")
(set_attr "simd_mode" "<MODE>")]
)
(define_insn "aarch64_simd_dup<mode>"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(vec_duplicate:VDQF (match_operand:<VEL> 1 "register_operand" "w")))]
"TARGET_SIMD" "TARGET_SIMD"
"dup\\t%0.<Vtype>, %<vw>1" "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
[(set_attr "simd_type" "simd_dupgp") [(set_attr "simd_type" "simd_dup")
(set_attr "simd_mode" "<MODE>")] (set_attr "simd_mode" "<MODE>")]
) )
(define_insn "aarch64_dup_lane<mode>" (define_insn "aarch64_dup_lane<mode>"
[(set (match_operand:VDQ_I 0 "register_operand" "=w") [(set (match_operand:VALL 0 "register_operand" "=w")
(vec_duplicate:VDQ_I (vec_duplicate:VALL
(vec_select:<VEL> (vec_select:<VEL>
(match_operand:<VCON> 1 "register_operand" "w") (match_operand:VALL 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]) (parallel [(match_operand:SI 2 "immediate_operand" "i")])
)))] )))]
"TARGET_SIMD" "TARGET_SIMD"
"dup\\t%<v>0<Vmtype>, %1.<Vetype>[%2]" "dup\\t%0.<Vtype>, %1.<Vetype>[%2]"
[(set_attr "simd_type" "simd_dup") [(set_attr "simd_type" "simd_dup")
(set_attr "simd_mode" "<MODE>")] (set_attr "simd_mode" "<MODE>")]
) )
(define_insn "aarch64_simd_dup<mode>" (define_insn "aarch64_dup_lane_<vswap_width_name><mode>"
[(set (match_operand:VDQF 0 "register_operand" "=w") [(set (match_operand:VALL 0 "register_operand" "=w")
(vec_duplicate:VDQF (match_operand:<VEL> 1 "register_operand" "w")))] (vec_duplicate:VALL
(vec_select:<VEL>
(match_operand:<VSWAP_WIDTH> 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")])
)))]
"TARGET_SIMD" "TARGET_SIMD"
"dup\\t%0.<Vtype>, %1.<Vetype>[0]" "dup\\t%0.<Vtype>, %1.<Vetype>[%2]"
[(set_attr "simd_type" "simd_dup") [(set_attr "simd_type" "simd_dup")
(set_attr "simd_mode" "<MODE>")] (set_attr "simd_mode" "<MODE>")]
) )
......
...@@ -7932,6 +7932,55 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d) ...@@ -7932,6 +7932,55 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
} }
static bool static bool
/* Recognize a constant permutation in which every element of the result
   selects the same lane of the input vector, and expand it as a single
   DUP-lane instruction.  Returns true on success, false if the permute
   does not match or the mode is unsupported.  */
aarch64_evpc_dup (struct expand_vec_perm_d *d)
{
rtx (*gen) (rtx, rtx, rtx);
rtx out = d->target;
rtx in0;
enum machine_mode vmode = d->vmode;
unsigned int i, elt, nelt = d->nelt;
rtx lane;
/* TODO: This may not be big-endian safe. */
if (BYTES_BIG_ENDIAN)
return false;
/* All permute indices must be identical for this to be a DUP.  */
elt = d->perm[0];
for (i = 1; i < nelt; i++)
{
if (elt != d->perm[i])
return false;
}
/* The generic preparation in aarch64_expand_vec_perm_const_1
swaps the operand order and the permute indices if it finds
d->perm[0] to be in the second operand. Thus, we can always
use d->op0 and need not do any extra arithmetic to get the
correct lane number. */
in0 = d->op0;
/* Lane number is passed to the pattern as an immediate operand.  */
lane = GEN_INT (elt);
/* Select the mode-specific aarch64_dup_lane generator; any mode
   without a generator here cannot be expanded as a DUP.  */
switch (vmode)
{
case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
default:
return false;
}
emit_insn (gen (out, in0, lane));
return true;
}
static bool
aarch64_evpc_tbl (struct expand_vec_perm_d *d) aarch64_evpc_tbl (struct expand_vec_perm_d *d)
{ {
rtx rperm[MAX_VECT_LEN], sel; rtx rperm[MAX_VECT_LEN], sel;
...@@ -7988,6 +8037,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) ...@@ -7988,6 +8037,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true; return true;
else if (aarch64_evpc_trn (d)) else if (aarch64_evpc_trn (d))
return true; return true;
else if (aarch64_evpc_dup (d))
return true;
return aarch64_evpc_tbl (d); return aarch64_evpc_tbl (d);
} }
return false; return false;
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -383,7 +383,7 @@ ...@@ -383,7 +383,7 @@
(V4HI "V8HI") (V8HI "V8HI") (V4HI "V8HI") (V8HI "V8HI")
(V2SI "V4SI") (V4SI "V4SI") (V2SI "V4SI") (V4SI "V4SI")
(DI "V2DI") (V2DI "V2DI") (DI "V2DI") (V2DI "V2DI")
(V2SF "V2SF") (V4SF "V4SF") (V2SF "V4SF") (V4SF "V4SF")
(V2DF "V2DF") (SI "V4SI") (V2DF "V2DF") (SI "V4SI")
(HI "V8HI") (QI "V16QI")]) (HI "V8HI") (QI "V16QI")])
...@@ -527,6 +527,20 @@ ...@@ -527,6 +527,20 @@
(define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")]) (define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")])
(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")]) (define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")])
(define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI")
(V4HI "V8HI") (V8HI "V4HI")
(V2SI "V4SI") (V4SI "V2SI")
(DI "V2DI") (V2DI "DI")
(V2SF "V4SF") (V4SF "V2SF")
(DF "V2DF") (V2DF "DF")])
(define_mode_attr vswap_width_name [(V8QI "to_128") (V16QI "to_64")
(V4HI "to_128") (V8HI "to_64")
(V2SI "to_128") (V4SI "to_64")
(DI "to_128") (V2DI "to_64")
(V2SF "to_128") (V4SF "to_64")
(DF "to_128") (V2DF "to_64")])
;; ------------------------------------------------------------------- ;; -------------------------------------------------------------------
;; Code Iterators ;; Code Iterators
;; ------------------------------------------------------------------- ;; -------------------------------------------------------------------
......
2013-09-02 James Greenhalgh <james.greenhalgh@arm.com>
* gcc.target/aarch64/scalar_intrinsics.c
(vdup<bhsd>_lane<su><8,16,32,64>): Force values to SIMD registers.
2013-09-02 Richard Biener <rguenther@suse.de> 2013-09-02 Richard Biener <rguenther@suse.de>
PR middle-end/57511 PR middle-end/57511
......
...@@ -198,13 +198,21 @@ test_vcltzd_s64 (int64x1_t a) ...@@ -198,13 +198,21 @@ test_vcltzd_s64 (int64x1_t a)
int8x1_t int8x1_t
test_vdupb_lane_s8 (int8x16_t a) test_vdupb_lane_s8 (int8x16_t a)
{ {
return vdupb_lane_s8 (a, 2); int8x1_t res;
force_simd (a);
res = vdupb_laneq_s8 (a, 2);
force_simd (res);
return res;
} }
uint8x1_t uint8x1_t
test_vdupb_lane_u8 (uint8x16_t a) test_vdupb_lane_u8 (uint8x16_t a)
{ {
return vdupb_lane_u8 (a, 2); uint8x1_t res;
force_simd (a);
res = vdupb_laneq_u8 (a, 2);
force_simd (res);
return res;
} }
/* { dg-final { scan-assembler-times "aarch64_get_lanev8hi" 2 } } */ /* { dg-final { scan-assembler-times "aarch64_get_lanev8hi" 2 } } */
...@@ -212,13 +220,21 @@ test_vdupb_lane_u8 (uint8x16_t a) ...@@ -212,13 +220,21 @@ test_vdupb_lane_u8 (uint8x16_t a)
int16x1_t int16x1_t
test_vduph_lane_s16 (int16x8_t a) test_vduph_lane_s16 (int16x8_t a)
{ {
return vduph_lane_s16 (a, 2); int16x1_t res;
force_simd (a);
res = vduph_laneq_s16 (a, 2);
force_simd (res);
return res;
} }
uint16x1_t uint16x1_t
test_vduph_lane_u16 (uint16x8_t a) test_vduph_lane_u16 (uint16x8_t a)
{ {
return vduph_lane_u16 (a, 2); uint16x1_t res;
force_simd (a);
res = vduph_laneq_u16 (a, 2);
force_simd (res);
return res;
} }
/* { dg-final { scan-assembler-times "aarch64_get_lanev4si" 2 } } */ /* { dg-final { scan-assembler-times "aarch64_get_lanev4si" 2 } } */
...@@ -226,13 +242,21 @@ test_vduph_lane_u16 (uint16x8_t a) ...@@ -226,13 +242,21 @@ test_vduph_lane_u16 (uint16x8_t a)
int32x1_t int32x1_t
test_vdups_lane_s32 (int32x4_t a) test_vdups_lane_s32 (int32x4_t a)
{ {
return vdups_lane_s32 (a, 2); int32x1_t res;
force_simd (a);
res = vdups_laneq_s32 (a, 2);
force_simd (res);
return res;
} }
uint32x1_t uint32x1_t
test_vdups_lane_u32 (uint32x4_t a) test_vdups_lane_u32 (uint32x4_t a)
{ {
return vdups_lane_u32 (a, 2); uint32x1_t res;
force_simd (a);
res = vdups_laneq_u32 (a, 2);
force_simd (res);
return res;
} }
/* { dg-final { scan-assembler-times "aarch64_get_lanev2di" 2 } } */ /* { dg-final { scan-assembler-times "aarch64_get_lanev2di" 2 } } */
...@@ -240,13 +264,21 @@ test_vdups_lane_u32 (uint32x4_t a) ...@@ -240,13 +264,21 @@ test_vdups_lane_u32 (uint32x4_t a)
int64x1_t int64x1_t
test_vdupd_lane_s64 (int64x2_t a) test_vdupd_lane_s64 (int64x2_t a)
{ {
return vdupd_lane_s64 (a, 1); int64x1_t res;
force_simd (a);
res = vdupd_laneq_s64 (a, 1);
force_simd (res);
return res;
} }
uint64x1_t uint64x1_t
test_vdupd_lane_u64 (uint64x2_t a) test_vdupd_lane_u64 (uint64x2_t a)
{ {
return vdupd_lane_u64 (a, 1); uint64x1_t res;
force_simd (a);
res = vdupd_laneq_u64 (a, 1);
force_simd (res);
return res;
} }
/* { dg-final { scan-assembler-times "\\tcmtst\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 2 } } */ /* { dg-final { scan-assembler-times "\\tcmtst\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 2 } } */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment