Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
R
riscv-gcc-1
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
lvzhengyang
riscv-gcc-1
Commits
5e04b3b6
Commit
5e04b3b6
authored
Nov 30, 2009
by
Richard Henderson
Committed by
Richard Henderson
Nov 30, 2009
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Implement vec_perm broadcast, and tidy lots of patterns to help.
From-SVN: r154836
parent
9fda11a2
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
623 additions
and
294 deletions
+623
-294
gcc/ChangeLog
+47
-0
gcc/config/i386/i386-protos.h
+1
-0
gcc/config/i386/i386.c
+218
-128
gcc/config/i386/predicates.md
+17
-0
gcc/config/i386/sse.md
+340
-166
No files found.
gcc/ChangeLog
View file @
5e04b3b6
2009
-
11
-
30
Richard
Henderson
<
rth
@
redhat
.
com
>
*
config
/
i386
/
i386
.
c
(
ix86_vec_interleave_v2df_operator_ok
):
New
.
(
bdesc_special_args
):
Update
insn
codes
.
(
avx_vpermilp_parallel
):
Correct
range
check
.
(
ix86_rtx_costs
):
Handle
vector
permutation
rtx
codes
.
(
struct
expand_vec_perm_d
):
Move
earlier
.
(
get_mode_wider_vector
):
New
.
(
expand_vec_perm_broadcast_1
):
New
.
(
ix86_expand_vector_init_duplicate
):
Use
it
.
Tidy
AVX
modes
.
(
expand_vec_perm_broadcast
):
New
.
(
ix86_expand_vec_perm_builtin_1
):
Use
it
.
*
config
/
i386
/
i386
-
protos
.
h
:
Update
.
*
config
/
i386
/
predicates
.
md
(
avx_vbroadcast_operand
):
New
.
*
config
/
i386
/
sse
.
md
(
AVX256MODE24P
):
New
.
(
ssescalarmodesuffix2s
):
New
.
(
avxhalfvecmode
,
avxscalarmode
):
Fill
out
to
all
modes
.
(
avxmodesuffixf2c
):
Add
V8SI
,
V4DI
.
(
vec_dupv4sf
):
New
expander
.
(*
vec_dupv4sf_avx
):
Add
vbroadcastss
alternative
.
(*
vec_set
<
mode
>
_0_avx
,
**
vec_set
<
mode
>
_0_sse4_1
):
Macro
-
ize
for
V4SF
and
V4SI
.
Move
C
alternatives
to
front
.
Add
insertps
and
pinsrd
alternatives
.
(*
vec_set
<
mode
>
_0_sse2
):
Split
out
from
...
(
vec_set
<
mode
>
_0
):
Macro
-
ize
for
V4SF
and
V4SI
.
(
vec_interleave_highv2df
,
vec_interleave_lowv2df
):
Require
register
destination
;
use
ix86_vec_interleave_v2df_operator_ok
,
instead
of
ix86_fixup_binary_operands
.
(*
avx_interleave_highv2df
,
avx_interleave_lowv2df
):
Add
movddup
.
(*
sse3_interleave_highv2df
,
sse3_interleave_lowv2df
):
New
.
(*
avx_movddup
,
*
sse3_movddup
):
Remove
.
New
splitter
from
vec_select
form
to
vec_duplicate
form
.
(*
sse2_interleave_highv2df
,
sse2_interleave_lowv2df
):
Use
ix86_vec_interleave_v2df_operator_ok
.
(
avx_movddup256
,
avx_unpcklpd256
):
Change
to
expanders
,
merge
into
...
(*
avx_unpcklpd256
):
...
here
.
(*
vec_dupv4si_avx
):
New
.
(*
vec_dupv2di_avx
):
Add
movddup
alternative
.
(*
vec_dupv2di_sse3
):
New
.
(
vec_dup
<
AVX256MODE24P
>):
Replace
avx_vbroadcasts
<
AVXMODEF4P
>
and
avx_vbroadcastss256
;
represent
with
vec_duplicate
instead
of
nested
vec_concat
operations
.
(
avx_vbroadcastf128_
<
mode
>):
Rename
from
avx_vbroadcastf128_p
<
avxmodesuffixf2c
>
256.
(*
avx_vperm_broadcast_v4sf
):
New
.
(*
avx_vperm_broadcast_
<
AVX256MODEF2P
>):
New
.
2009
-
11
-
30
Martin
Jambor
<
mjambor
@
suse
.
cz
>
PR
middle
-
end
/
42196
gcc/config/i386/i386-protos.h
View file @
5e04b3b6
...
...
@@ -86,6 +86,7 @@ extern void ix86_expand_binary_operator (enum rtx_code,
enum
machine_mode
,
rtx
[]);
extern
int
ix86_binary_operator_ok
(
enum
rtx_code
,
enum
machine_mode
,
rtx
[]);
extern
bool
ix86_lea_for_add_ok
(
enum
rtx_code
,
rtx
,
rtx
[]);
extern
bool
ix86_vec_interleave_v2df_operator_ok
(
rtx
operands
[
3
],
bool
high
);
extern
bool
ix86_dep_by_shift_count
(
const_rtx
set_insn
,
const_rtx
use_insn
);
extern
bool
ix86_agi_dependent
(
rtx
set_insn
,
rtx
use_insn
);
extern
void
ix86_expand_unary_operator
(
enum
rtx_code
,
enum
machine_mode
,
...
...
gcc/config/i386/i386.c
View file @
5e04b3b6
...
...
@@ -13849,6 +13849,19 @@ ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
return
TRUE
;
}
/* Return TRUE if the operands to a vec_interleave_{high,low}v2df
are ok, keeping in mind the possible movddup alternative. */
bool
ix86_vec_interleave_v2df_operator_ok
(
rtx
operands
[
3
],
bool
high
)
{
if
(
MEM_P
(
operands
[
0
]))
return
rtx_equal_p
(
operands
[
0
],
operands
[
1
+
high
]);
if
(
MEM_P
(
operands
[
1
])
&&
MEM_P
(
operands
[
2
]))
return
TARGET_SSE3
&&
rtx_equal_p
(
operands
[
1
],
operands
[
2
]);
return
true
;
}
/* Post-reload splitter for converting an SF or DFmode value in an
SSE register into an unsigned SImode. */
...
...
@@ -21480,11 +21493,11 @@ static const struct builtin_description bdesc_special_args[] =
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_avx_vzeroall
,
"__builtin_ia32_vzeroall"
,
IX86_BUILTIN_VZEROALL
,
UNKNOWN
,
(
int
)
VOID_FTYPE_VOID
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_avx_vzeroupper
,
"__builtin_ia32_vzeroupper"
,
IX86_BUILTIN_VZEROUPPER
,
UNKNOWN
,
(
int
)
VOID_FTYPE_VOID
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_
avx_vbroadcastss
,
"__builtin_ia32_vbroadcastss"
,
IX86_BUILTIN_VBROADCASTSS
,
UNKNOWN
,
(
int
)
V4SF_FTYPE_PCFLOAT
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_
avx_vbroadcastsd256
,
"__builtin_ia32_vbroadcastsd256"
,
IX86_BUILTIN_VBROADCASTSD256
,
UNKNOWN
,
(
int
)
V4DF_FTYPE_PCDOUBLE
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_
avx_vbroadcastss256
,
"__builtin_ia32_vbroadcastss256"
,
IX86_BUILTIN_VBROADCASTSS256
,
UNKNOWN
,
(
int
)
V8SF_FTYPE_PCFLOAT
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_avx_vbroadcastf128_
pd256
,
"__builtin_ia32_vbroadcastf128_pd256"
,
IX86_BUILTIN_VBROADCASTPD256
,
UNKNOWN
,
(
int
)
V4DF_FTYPE_PCV2DF
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_avx_vbroadcastf128_
ps256
,
"__builtin_ia32_vbroadcastf128_ps256"
,
IX86_BUILTIN_VBROADCASTPS256
,
UNKNOWN
,
(
int
)
V8SF_FTYPE_PCV4SF
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_
vec_dupv4sf
,
"__builtin_ia32_vbroadcastss"
,
IX86_BUILTIN_VBROADCASTSS
,
UNKNOWN
,
(
int
)
V4SF_FTYPE_PCFLOAT
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_
vec_dupv4df
,
"__builtin_ia32_vbroadcastsd256"
,
IX86_BUILTIN_VBROADCASTSD256
,
UNKNOWN
,
(
int
)
V4DF_FTYPE_PCDOUBLE
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_
vec_dupv8sf
,
"__builtin_ia32_vbroadcastss256"
,
IX86_BUILTIN_VBROADCASTSS256
,
UNKNOWN
,
(
int
)
V8SF_FTYPE_PCFLOAT
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_avx_vbroadcastf128_
v4df
,
"__builtin_ia32_vbroadcastf128_pd256"
,
IX86_BUILTIN_VBROADCASTPD256
,
UNKNOWN
,
(
int
)
V4DF_FTYPE_PCV2DF
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_avx_vbroadcastf128_
v8sf
,
"__builtin_ia32_vbroadcastf128_ps256"
,
IX86_BUILTIN_VBROADCASTPS256
,
UNKNOWN
,
(
int
)
V8SF_FTYPE_PCV4SF
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_avx_movupd256
,
"__builtin_ia32_loadupd256"
,
IX86_BUILTIN_LOADUPD256
,
UNKNOWN
,
(
int
)
V4DF_FTYPE_PCDOUBLE
},
{
OPTION_MASK_ISA_AVX
,
CODE_FOR_avx_movups256
,
"__builtin_ia32_loadups256"
,
IX86_BUILTIN_LOADUPS256
,
UNKNOWN
,
(
int
)
V8SF_FTYPE_PCFLOAT
},
...
...
@@ -24597,7 +24610,7 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
if
(
!
CONST_INT_P
(
er
))
return
0
;
ei
=
INTVAL
(
er
);
if
(
ei
>=
nelt
)
if
(
ei
>=
2
*
nelt
)
return
0
;
ipar
[
i
]
=
ei
;
}
...
...
@@ -25713,6 +25726,16 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int *total, bool speed)
*
total
=
0
;
return
false
;
case
VEC_SELECT
:
case
VEC_CONCAT
:
case
VEC_MERGE
:
case
VEC_DUPLICATE
:
/* ??? Assume all of these vector manipulation patterns are
recognizable. In which case they all pretty much have the
same cost. */
*
total
=
COSTS_N_INSNS
(
1
);
return
true
;
default
:
return
false
;
}
...
...
@@ -26547,16 +26570,43 @@ x86_emit_floatuns (rtx operands[2])
emit_label
(
donelab
);
}
/* AVX does not support 32-byte integer vector operations,
thus the longest vector we are faced with is V16QImode. */
#define MAX_VECT_LEN 16
struct
expand_vec_perm_d
{
rtx
target
,
op0
,
op1
;
unsigned
char
perm
[
MAX_VECT_LEN
];
enum
machine_mode
vmode
;
unsigned
char
nelt
;
bool
testing_p
;
};
static
bool
expand_vec_perm_1
(
struct
expand_vec_perm_d
*
d
);
static
bool
expand_vec_perm_broadcast_1
(
struct
expand_vec_perm_d
*
d
);
/* Get a vector mode of the same size as the original but with elements
twice as wide. This is only guaranteed to apply to integral vectors. */
static
inline
enum
machine_mode
get_mode_wider_vector
(
enum
machine_mode
o
)
{
/* ??? Rely on the ordering that genmodes.c gives to vectors. */
enum
machine_mode
n
=
GET_MODE_WIDER_MODE
(
o
);
gcc_assert
(
GET_MODE_NUNITS
(
o
)
==
GET_MODE_NUNITS
(
n
)
*
2
);
gcc_assert
(
GET_MODE_SIZE
(
o
)
==
GET_MODE_SIZE
(
n
));
return
n
;
}
/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
with all elements equal to VAR. Return true if successful. */
/* ??? Call into the vec_perm support to implement the broadcast. */
static
bool
ix86_expand_vector_init_duplicate
(
bool
mmx_ok
,
enum
machine_mode
mode
,
rtx
target
,
rtx
val
)
{
enum
machine_mode
hmode
,
smode
,
wsmode
,
wvmode
;
rtx
x
;
bool
ok
;
switch
(
mode
)
{
...
...
@@ -26566,13 +26616,28 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
return
false
;
/* FALLTHRU */
case
V4DFmode
:
case
V4DImode
:
case
V8SFmode
:
case
V8SImode
:
case
V2DFmode
:
case
V2DImode
:
case
V4SFmode
:
case
V4SImode
:
val
=
force_reg
(
GET_MODE_INNER
(
mode
),
val
);
x
=
gen_rtx_VEC_DUPLICATE
(
mode
,
val
);
emit_insn
(
gen_rtx_SET
(
VOIDmode
,
target
,
x
));
{
rtx
insn
,
dup
;
/* First attempt to recognize VAL as-is. */
dup
=
gen_rtx_VEC_DUPLICATE
(
mode
,
val
);
insn
=
emit_insn
(
gen_rtx_SET
(
VOIDmode
,
target
,
dup
));
if
(
recog_memoized
(
insn
)
<
0
)
{
/* If that fails, force VAL into a register. */
XEXP
(
dup
,
0
)
=
force_reg
(
GET_MODE_INNER
(
mode
),
val
);
ok
=
recog_memoized
(
insn
)
>=
0
;
gcc_assert
(
ok
);
}
}
return
true
;
case
V4HImode
:
...
...
@@ -26580,130 +26645,87 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
return
false
;
if
(
TARGET_SSE
||
TARGET_3DNOW_A
)
{
rtx
x
;
val
=
gen_lowpart
(
SImode
,
val
);
x
=
gen_rtx_TRUNCATE
(
HImode
,
val
);
x
=
gen_rtx_VEC_DUPLICATE
(
mode
,
x
);
emit_insn
(
gen_rtx_SET
(
VOIDmode
,
target
,
x
));
return
true
;
}
else
{
smode
=
HImode
;
wsmode
=
SImode
;
wvmode
=
V2SImode
;
goto
widen
;
}
goto
widen
;
case
V8QImode
:
if
(
!
mmx_ok
)
return
false
;
smode
=
QImode
;
wsmode
=
HImode
;
wvmode
=
V4HImode
;
goto
widen
;
case
V8HImode
:
if
(
TARGET_SSE2
)
{
struct
expand_vec_perm_d
dperm
;
rtx
tmp1
,
tmp2
;
/* Extend HImode to SImode using a paradoxical SUBREG. */
permute:
memset
(
&
dperm
,
0
,
sizeof
(
dperm
));
dperm
.
target
=
target
;
dperm
.
vmode
=
mode
;
dperm
.
nelt
=
GET_MODE_NUNITS
(
mode
);
dperm
.
op0
=
dperm
.
op1
=
gen_reg_rtx
(
mode
);
/* Extend to SImode using a paradoxical SUBREG. */
tmp1
=
gen_reg_rtx
(
SImode
);
emit_move_insn
(
tmp1
,
gen_lowpart
(
SImode
,
val
));
/* Insert the SImode value as low element of V4SImode vector. */
tmp2
=
gen_reg_rtx
(
V4SImode
);
tmp1
=
gen_rtx_VEC_MERGE
(
V4SImode
,
gen_rtx_VEC_DUPLICATE
(
V4SImode
,
tmp1
),
CONST0_RTX
(
V4SImode
),
const1_rtx
);
emit_insn
(
gen_rtx_SET
(
VOIDmode
,
tmp2
,
tmp1
));
/* Cast the V4SImode vector back to a V8HImode vector. */
tmp1
=
gen_reg_rtx
(
V8HImode
);
emit_move_insn
(
tmp1
,
gen_lowpart
(
V8HImode
,
tmp2
));
/* Duplicate the low short through the whole low SImode word. */
emit_insn
(
gen_vec_interleave_lowv8hi
(
tmp1
,
tmp1
,
tmp1
));
/* Cast the V8HImode vector back to a V4SImode vector. */
tmp2
=
gen_reg_rtx
(
V4SImode
);
emit_move_insn
(
tmp2
,
gen_lowpart
(
V4SImode
,
tmp1
));
/* Replicate the low element of the V4SImode vector. */
emit_insn
(
gen_sse2_pshufd
(
tmp2
,
tmp2
,
const0_rtx
));
/* Cast the V2SImode back to V8HImode, and store in target. */
emit_move_insn
(
target
,
gen_lowpart
(
V8HImode
,
tmp2
));
return
true
;
/* Insert the SImode value as low element of a V4SImode vector. */
tmp2
=
gen_lowpart
(
V4SImode
,
dperm
.
op0
);
emit_insn
(
gen_vec_setv4si_0
(
tmp2
,
CONST0_RTX
(
V4SImode
),
tmp1
));
ok
=
(
expand_vec_perm_1
(
&
dperm
)
||
expand_vec_perm_broadcast_1
(
&
dperm
));
gcc_assert
(
ok
);
return
ok
;
}
smode
=
HImode
;
wsmode
=
SImode
;
wvmode
=
V4SImode
;
goto
widen
;
case
V16QImode
:
if
(
TARGET_SSE2
)
{
rtx
tmp1
,
tmp2
;
/* Extend QImode to SImode using a paradoxical SUBREG. */
tmp1
=
gen_reg_rtx
(
SImode
);
emit_move_insn
(
tmp1
,
gen_lowpart
(
SImode
,
val
));
/* Insert the SImode value as low element of V4SImode vector. */
tmp2
=
gen_reg_rtx
(
V4SImode
);
tmp1
=
gen_rtx_VEC_MERGE
(
V4SImode
,
gen_rtx_VEC_DUPLICATE
(
V4SImode
,
tmp1
),
CONST0_RTX
(
V4SImode
),
const1_rtx
);
emit_insn
(
gen_rtx_SET
(
VOIDmode
,
tmp2
,
tmp1
));
/* Cast the V4SImode vector back to a V16QImode vector. */
tmp1
=
gen_reg_rtx
(
V16QImode
);
emit_move_insn
(
tmp1
,
gen_lowpart
(
V16QImode
,
tmp2
));
/* Duplicate the low byte through the whole low SImode word. */
emit_insn
(
gen_vec_interleave_lowv16qi
(
tmp1
,
tmp1
,
tmp1
));
emit_insn
(
gen_vec_interleave_lowv16qi
(
tmp1
,
tmp1
,
tmp1
));
/* Cast the V16QImode vector back to a V4SImode vector. */
tmp2
=
gen_reg_rtx
(
V4SImode
);
emit_move_insn
(
tmp2
,
gen_lowpart
(
V4SImode
,
tmp1
));
/* Replicate the low element of the V4SImode vector. */
emit_insn
(
gen_sse2_pshufd
(
tmp2
,
tmp2
,
const0_rtx
));
/* Cast the V2SImode back to V16QImode, and store in target. */
emit_move_insn
(
target
,
gen_lowpart
(
V16QImode
,
tmp2
));
return
true
;
}
smode
=
QImode
;
wsmode
=
HImode
;
wvmode
=
V8HImode
;
goto
permute
;
goto
widen
;
widen:
/* Replicate the value once into the next wider mode and recurse. */
val
=
convert_modes
(
wsmode
,
smode
,
val
,
true
);
x
=
expand_simple_binop
(
wsmode
,
ASHIFT
,
val
,
GEN_INT
(
GET_MODE_BITSIZE
(
smode
)),
NULL_RTX
,
1
,
OPTAB_LIB_WIDEN
);
val
=
expand_simple_binop
(
wsmode
,
IOR
,
val
,
x
,
x
,
1
,
OPTAB_LIB_WIDEN
);
x
=
gen_reg_rtx
(
wvmode
);
if
(
!
ix86_expand_vector_init_duplicate
(
mmx_ok
,
wvmode
,
x
,
val
))
gcc_unreachable
();
emit_move_insn
(
target
,
gen_lowpart
(
mode
,
x
));
return
true
;
{
enum
machine_mode
smode
,
wsmode
,
wvmode
;
rtx
x
;
smode
=
GET_MODE_INNER
(
mode
);
wvmode
=
get_mode_wider_vector
(
mode
);
wsmode
=
GET_MODE_INNER
(
wvmode
);
val
=
convert_modes
(
wsmode
,
smode
,
val
,
true
);
x
=
expand_simple_binop
(
wsmode
,
ASHIFT
,
val
,
GEN_INT
(
GET_MODE_BITSIZE
(
smode
)),
NULL_RTX
,
1
,
OPTAB_LIB_WIDEN
);
val
=
expand_simple_binop
(
wsmode
,
IOR
,
val
,
x
,
x
,
1
,
OPTAB_LIB_WIDEN
);
x
=
gen_lowpart
(
wvmode
,
target
);
ok
=
ix86_expand_vector_init_duplicate
(
mmx_ok
,
wvmode
,
x
,
val
);
gcc_assert
(
ok
);
return
ok
;
}
case
V4DFmode
:
hmode
=
V2DFmode
;
goto
half
;
case
V4DImode
:
hmode
=
V2DImode
;
goto
half
;
case
V8SFmode
:
hmode
=
V4SFmode
;
goto
half
;
case
V8SImode
:
hmode
=
V4SImode
;
goto
half
;
case
V16HImode
:
hmode
=
V8HImode
;
goto
half
;
case
V32QImode
:
hmode
=
V16QImode
;
goto
half
;
half
:
{
rtx
tmp
=
gen_reg_rtx
(
hmode
);
ix86_expand_vector_init_duplicate
(
mmx_ok
,
hmode
,
tmp
,
val
);
emit_insn
(
gen_rtx_SET
(
VOIDmode
,
target
,
gen_rtx_VEC_CONCAT
(
mode
,
tmp
,
tmp
)));
enum
machine_mode
hvmode
=
(
mode
==
V16HImode
?
V8HImode
:
V16QImode
);
rtx
x
=
gen_reg_rtx
(
hvmode
);
ok
=
ix86_expand_vector_init_duplicate
(
false
,
hvmode
,
x
,
val
);
gcc_assert
(
ok
);
x
=
gen_rtx_VEC_CONCAT
(
mode
,
x
,
x
);
emit_insn
(
gen_rtx_SET
(
VOIDmode
,
target
,
x
));
}
return
true
;
...
...
@@ -29085,19 +29107,6 @@ ix86_vectorize_builtin_vec_perm (tree vec_type, tree *mask_type)
return
ix86_builtins
[(
int
)
fcode
];
}
/* AVX does not support 32-byte integer vector operations,
thus the longest vector we are faced with is V16QImode. */
#define MAX_VECT_LEN 16
struct
expand_vec_perm_d
{
rtx
target
,
op0
,
op1
;
unsigned
char
perm
[
MAX_VECT_LEN
];
enum
machine_mode
vmode
;
unsigned
char
nelt
;
bool
testing_p
;
};
/* Return a vector mode with twice as many elements as VMODE. */
/* ??? Consider moving this to a table generated by genmodes.c. */
...
...
@@ -29739,8 +29748,8 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
return
true
;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1.
Pattern match
extract-even
and extract-odd permutations. */
/* A subroutine of ix86_expand_vec_perm_builtin_1.
Implement extract-even
and extract-odd permutations. */
static
bool
expand_vec_perm_even_odd_1
(
struct
expand_vec_perm_d
*
d
,
unsigned
odd
)
...
...
@@ -29855,6 +29864,9 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
return
true
;
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
extract-even and extract-odd permutations. */
static
bool
expand_vec_perm_even_odd
(
struct
expand_vec_perm_d
*
d
)
{
...
...
@@ -29871,6 +29883,84 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
return
expand_vec_perm_even_odd_1
(
d
,
odd
);
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
permutations. We assume that expand_vec_perm_1 has already failed. */
static
bool
expand_vec_perm_broadcast_1
(
struct
expand_vec_perm_d
*
d
)
{
unsigned
elt
=
d
->
perm
[
0
],
nelt2
=
d
->
nelt
/
2
;
enum
machine_mode
vmode
=
d
->
vmode
;
unsigned
char
perm2
[
4
];
rtx
op0
=
d
->
op0
;
bool
ok
;
switch
(
vmode
)
{
case
V4DFmode
:
case
V8SFmode
:
/* These are special-cased in sse.md so that we can optionally
use the vbroadcast instruction. They expand to two insns
if the input happens to be in a register. */
gcc_unreachable
();
case
V2DFmode
:
case
V2DImode
:
case
V4SFmode
:
case
V4SImode
:
/* These are always implementable using standard shuffle patterns. */
gcc_unreachable
();
case
V8HImode
:
case
V16QImode
:
/* These can be implemented via interleave. We save one insn by
stopping once we have promoted to V4SImode and then use pshufd. */
do
{
optab
otab
=
vec_interleave_low_optab
;
if
(
elt
>=
nelt2
)
{
otab
=
vec_interleave_high_optab
;
elt
-=
nelt2
;
}
nelt2
/=
2
;
op0
=
expand_binop
(
vmode
,
otab
,
op0
,
op0
,
NULL
,
0
,
OPTAB_DIRECT
);
vmode
=
get_mode_wider_vector
(
vmode
);
op0
=
gen_lowpart
(
vmode
,
op0
);
}
while
(
vmode
!=
V4SImode
);
memset
(
perm2
,
elt
,
4
);
ok
=
expand_vselect
(
gen_lowpart
(
V4SImode
,
d
->
target
),
op0
,
perm2
,
4
);
gcc_assert
(
ok
);
return
true
;
default:
gcc_unreachable
();
}
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
broadcast permutations. */
static
bool
expand_vec_perm_broadcast
(
struct
expand_vec_perm_d
*
d
)
{
unsigned
i
,
elt
,
nelt
=
d
->
nelt
;
if
(
d
->
op0
!=
d
->
op1
)
return
false
;
elt
=
d
->
perm
[
0
];
for
(
i
=
1
;
i
<
nelt
;
++
i
)
if
(
d
->
perm
[
i
]
!=
elt
)
return
false
;
return
expand_vec_perm_broadcast_1
(
d
);
}
/* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
With all of the interface bits taken care of, perform the expansion
in D and return true on success. */
...
...
@@ -29878,8 +29968,7 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
static
bool
ix86_expand_vec_perm_builtin_1
(
struct
expand_vec_perm_d
*
d
)
{
/* First things first -- check if the instruction is implementable
with a single instruction. */
/* Try a single instruction expansion. */
if
(
expand_vec_perm_1
(
d
))
return
true
;
...
...
@@ -29894,13 +29983,16 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
if
(
expand_vec_perm_interleave2
(
d
))
return
true
;
if
(
expand_vec_perm_broadcast
(
d
))
return
true
;
/* Try sequences of three instructions. */
if
(
expand_vec_perm_pshufb2
(
d
))
return
true
;
/* ??? Look for narrow permutations whose element orderings would
allow the prom
i
tion to a wider mode. */
allow the prom
o
tion to a wider mode. */
/* ??? Look for sequences of interleave or a wider permute that place
the data into the correct lanes for a half-vector shuffle like
...
...
@@ -29912,8 +30004,6 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
if
(
expand_vec_perm_even_odd
(
d
))
return
true
;
/* ??? Pattern match broadcast. */
return
false
;
}
...
...
gcc/config/i386/predicates.md
View file @
5e04b3b6
...
...
@@ -1241,3 +1241,20 @@
(define_predicate "avx_vperm2f128_v4df_operand"
(and (match_code "parallel")
(match_test "avx_vperm2f128_parallel (op, V4DFmode)")))
;; Return 1 if OP is a parallel for a vbroadcast permute.
(define_predicate "avx_vbroadcast_operand"
(and (match_code "parallel")
(match_code "const_int" "a"))
{
rtx elt = XVECEXP (op, 0, 0);
int i, nelt = XVECLEN (op, 0);
/
*
Don't bother checking there are the right number of operands,
merely that they're all identical.
*
/
for (i = 1; i < nelt; ++i)
if (XVECEXP (op, 0, i) != elt)
return false;
return true;
})
gcc/config/i386/sse.md
View file @
5e04b3b6
...
...
@@ -54,6 +54,7 @@
(define_mode_iterator AVX256MODEF2P
[
V8SF V4DF
]
)
(define_mode_iterator AVX256MODE2P
[
V8SI V8SF V4DF
]
)
(define_mode_iterator AVX256MODE24P
[
V8SI V8SF V4DI V4DF
]
)
(define_mode_iterator AVX256MODE4P
[
V4DI V4DF
]
)
(define_mode_iterator AVX256MODE8P
[
V8SI V8SF
]
)
(define_mode_iterator AVXMODEF2P
[
V4SF V2DF V8SF V4DF
]
)
...
...
@@ -96,6 +97,8 @@
(define_mode_attr ssemodesuffixf2c
[
(V4SF "s") (V2DF "d")
]
)
(define_mode_attr ssescalarmodesuffix2s
[
(V4SF "ss") (V4SI "d")
]
)
;; Mapping of the max integer size for xop rotate immediate constraint
(define_mode_attr sserotatemax
[
(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")
]
)
...
...
@@ -125,17 +128,18 @@
[
(V16QI "V4SF") (V8HI "V4SF") (V4SI "V4SF") (V2DI "V4SF")
(V32QI "V8SF") (V16HI "V8SF") (V8SI "V8SF") (V4DI "V8SF")])
(define_mode_attr avxhalfvecmode
[
(V4SF "V2SF") (V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI")
(V4DI "V2DI") (V8SF "V4SF") (V4DF "V2DF")])
[
(V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") (V4DI "V2DI")
(V8SF "V4SF") (V4DF "V2DF")
(V16QI "V8QI") (V8HI "V4HI") (V4SI "V2SI") (V4SF "V2SF")])
(define_mode_attr avxscalarmode
[
(V16QI "QI") (V8HI
"HI") (V4SI "S
I") (V4SF "SF") (V2DF "DF")
(V8SF "SF") (V4DF "DF")])
[
(V16QI "QI") (V8HI
"HI") (V4SI "SI") (V2DI "D
I") (V4SF "SF") (V2DF "DF")
(V
32QI "QI") (V16HI "HI") (V8SI "SI") (V4DI "DI") (V
8SF "SF") (V4DF "DF")])
(define_mode_attr avxcvtvecmode
[
(V4SF "V4SI") (V8SF "V8SI") (V4SI "V4SF") (V8SI "V8SF")
]
)
(define_mode_attr avxpermvecmode
[
(V2DF "V2DI") (V4SF "V4SI") (V4DF "V4DI") (V8SF "V8SI")
]
)
(define_mode_attr avxmodesuffixf2c
[
(V4SF "s") (V2DF "d") (V8S
F "s
") (V4DF "d")
]
)
[
(V4SF "s") (V2DF "d") (V8S
I "s") (V8SF "s") (V4DI "d
") (V4DF "d")
]
)
(define_mode_attr avxmodesuffixp
[
(V2DF "pd") (V4SI "si") (V4SF "ps") (V8SF "ps") (V8SI "si")
(V4DF "pd")])
...
...
@@ -4012,14 +4016,27 @@
[
(set_attr "type" "ssemov")
(set_attr "mode" "SF")])
(define_expand "vec_dupv4sf"
[
(set (match_operand:V4SF 0 "register_operand" "")
(vec_duplicate:V4SF
(match_operand:SF 1 "nonimmediate_operand" "")))]
"TARGET_SSE"
{
if (!TARGET_AVX)
operands
[
1
]
= force_reg (V4SFmode, operands
[
1
]
);
})
(define_insn "
*
vec_dupv4sf_avx"
[
(set (match_operand:V4SF 0 "register_operand" "=x")
[
(set (match_operand:V4SF 0 "register_operand" "=x
,x
")
(vec_duplicate:V4SF
(match_operand:SF 1 "
register_operand" "x
")))]
(match_operand:SF 1 "
nonimmediate_operand" "x,m
")))]
"TARGET_AVX"
"vshufps
\t
{$0, %1, %1, %0|%0, %1, %1, 0}"
[
(set_attr "type" "sselog1")
(set_attr "length_immediate" "1")
"@
vshufps
\t
{$0, %1, %1, %0|%0, %1, %1, 0}
vbroadcastss
\t
{%1, %0|%0, %1}"
[
(set_attr "type" "sselog1,ssemov")
(set_attr "length_immediate" "1,0")
(set_attr "prefix_extra" "0,1")
(set_attr "prefix" "vex")
(set_attr "mode" "V4SF")])
...
...
@@ -4125,35 +4142,78 @@
DONE;
})
(define_insn "
*
vec_setv4sf_0_avx"
[
(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,x,m")
(vec_merge:V4SF
(vec_duplicate:V4SF
(match_operand:SF 2 "general_operand" " x,m,
*r,x*
rfF"))
(match_operand:V4SF 1 "vector_move_operand" " x,C,C ,0")
(define_insn "
*
vec_set
<mode>
_0_
avx"
[
(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x, x,x, x,m")
(vec_merge:SSEMODE4S
(vec_duplicate:SSEMODE4S
(match_operand:
<ssescalarmode>
2
"general_operand" " x,m,
*r,x,*
rm,x
*
rfF"))
(match_operand:SSEMODE4S 1 "vector_move_operand" " C,C, C,x, x,0")
(const_int 1)))]
"TARGET_AVX"
"@
v
movss
\t
{%2, %1, %0|%0, %1, %2
}
vmov
ss
\t
{%2, %0|%0, %2}
v
insertps
\t
{$0xe, %2, %2, %0|%0, %2, %2, 0xe
}
vmov
<ssescalarmodesuffix2s>
\t
{%2, %0|%0, %2}
vmovd
\t
{%2, %0|%0, %2}
vmovss
\t
{%2, %1, %0|%0, %1, %2}
vpinsrd
\t
{$0, %2, %1, %0|%0, %1, %2, 0}
#"
[
(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,
*
")
(set_attr "prefix_extra" "
*,*
,
*,*
,1,
*
")
(set_attr "length_immediate" "
*,*
,
*,*
,1,
*
")
(set_attr "prefix" "vex")
(set_attr "mode" "SF,
<ssescalarmode>
,SI,SF,TI,
*
")])
(define_insn "
*
vec_set
<mode>
_0_
sse4_1"
[
(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x, x,x, x,m")
(vec_merge:SSEMODE4S
(vec_duplicate:SSEMODE4S
(match_operand:
<ssescalarmode>
2
"general_operand" " x,m,
*r,x,*
rm,
*
rfF"))
(match_operand:SSEMODE4S 1 "vector_move_operand" " C,C, C,0, 0,0")
(const_int 1)))]
"TARGET_SSE4_1"
"@
insertps
\t
{$0xe, %2, %0|%0, %2, 0xe}
mov
<ssescalarmodesuffix2s>
\t
{%2, %0|%0, %2}
movd
\t
{%2, %0|%0, %2}
movss
\t
{%2, %0|%0, %2}
pinsrd
\t
{$0, %2, %0|%0, %2, 0}
#"
[
(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,
*
")
(set_attr "prefix_extra" "
*,*
,
*,*
,1,
*
")
(set_attr "length_immediate" "
*,*
,
*,*
,1,
*
")
(set_attr "mode" "SF,
<ssescalarmode>
,SI,SF,TI,
*
")])
(define_insn "
*
vec_set
<mode>
_0_
sse2"
[
(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x, x,x,m")
(vec_merge:SSEMODE4S
(vec_duplicate:SSEMODE4S
(match_operand:
<ssescalarmode>
2
"general_operand" " m,
*r,x,x*
rfF"))
(match_operand:SSEMODE4S 1 "vector_move_operand" " C, C,0,0")
(const_int 1)))]
"TARGET_SSE2"
"@
mov
<ssescalarmodesuffix2s>
\t
{%2, %0|%0, %2}
movd
\t
{%2, %0|%0, %2}
movss
\t
{%2, %0|%0, %2}
#"
[
(set_attr "type" "ssemov")
(set_attr "
prefix" "vex"
)
(set_attr "mode" "SF")])
(define_insn "vec_setv4sf_0"
[
(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,Y2,m")
(vec_merge:V4SF
(vec_duplicate:V4SF
(match_operand:SF 2 "general_operand" " x,m,
*r
,x*
rfF"))
(match_operand:
V4SF 1 "vector_move_operand" " 0,C,C
,0")
(set_attr "
mode" "
<ssescalarmode>
,SI,SF,
*
")]
)
(define_insn "vec_set
<mode>
_0"
[
(set (match_operand:SSEMODE4S 0 "nonimmediate_operand" "=x,x,m")
(vec_merge:SSEMODE4S
(vec_duplicate:SSEMODE4S
(match_operand:
<ssescalarmode>
2
"general_operand" " m,x
,x
*
rfF"))
(match_operand:
SSEMODE4S 1 "vector_move_operand" " C,0
,0")
(const_int 1)))]
"TARGET_SSE"
"@
movss
\t
{%2, %0|%0, %2}
movss
\t
{%2, %0|%0, %2}
movd
\t
{%2, %0|%0, %2}
#"
[
(set_attr "type" "ssemov")
(set_attr "mode" "SF")])
...
...
@@ -4484,7 +4544,7 @@
(set_attr "mode" "V4DF")])
(define_expand "vec_interleave_highv2df"
[
(set (match_operand:V2DF 0 "
nonimmediate
_operand" "")
[
(set (match_operand:V2DF 0 "
register
_operand" "")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" "")
...
...
@@ -4492,24 +4552,46 @@
(parallel
[
(const_int 1)
(const_int 3)])))]
"TARGET_SSE2"
"ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);")
{
if (!ix86_vec_interleave_v2df_operator_ok (operands, 1))
operands
[
2
]
= force_reg (V2DFmode, operands
[
2
]
);
})
(define_insn "
*
avx_interleave_highv2df"
[
(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m")
[
(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,
x,
m")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" " x,o,x")
(match_operand:V2DF 2 "nonimmediate_operand" " x,x,0"))
(match_operand:V2DF 1 "nonimmediate_operand" " x,o,
o,
x")
(match_operand:V2DF 2 "nonimmediate_operand" " x,
1,
x,0"))
(parallel
[
(const_int 1)
(const_int 3)])))]
"TARGET_AVX &&
!(MEM_P (operands
[
1
]
) && MEM_P (operands
[
2
]
)
)"
"TARGET_AVX &&
ix86_vec_interleave_v2df_operator_ok (operands, 1
)"
"@
vunpckhpd
\t
{%2, %1, %0|%0, %1, %2}
vmovddup
\t
{%H1, %0|%0, %H1}
vmovlpd
\t
{%H1, %2, %0|%0, %2, %H1}
vmovhpd
\t
{%1, %0|%0, %1}"
[
(set_attr "type" "sselog,ssemov,ssemov")
[
(set_attr "type" "sselog,sse
log,sse
mov,ssemov")
(set_attr "prefix" "vex")
(set_attr "mode" "V2DF,V1DF,V1DF")])
(set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
(define_insn "
*
sse3_interleave_highv2df"
[
(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,m")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" " 0,o,o,x")
(match_operand:V2DF 2 "nonimmediate_operand" " x,1,0,0"))
(parallel
[
(const_int 1)
(const_int 3)])))]
"TARGET_SSE3 && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
"@
unpckhpd
\t
{%2, %0|%0, %2}
movddup
\t
{%H1, %0|%0, %H1}
movlpd
\t
{%H1, %0|%0, %H1}
movhpd
\t
{%1, %0|%0, %1}"
[
(set_attr "type" "sselog,sselog,ssemov,ssemov")
(set_attr "prefix_data16" "
*,*
,1,1")
(set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
(define_insn "
*
sse2_interleave_highv2df"
[
(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m")
...
...
@@ -4519,7 +4601,7 @@
(match_operand:V2DF 2 "nonimmediate_operand" " x,0,0"))
(parallel
[
(const_int 1)
(const_int 3)])))]
"TARGET_SSE2 &&
!(MEM_P (operands
[
1
]
) && MEM_P (operands
[
2
]
)
)"
"TARGET_SSE2 &&
ix86_vec_interleave_v2df_operator_ok (operands, 1
)"
"@
unpckhpd
\t
{%2, %0|%0, %2}
movlpd
\t
{%H1, %0|%0, %H1}
...
...
@@ -4528,85 +4610,48 @@
(set_attr "prefix_data16" "
*
,1,1")
(set_attr "mode" "V2DF,V1DF,V1DF")])
(define_insn "avx_movddup256"
[
(set (match_operand:V4DF 0 "register_operand" "=x")
;; Recall that the 256-bit unpck insns only shuffle within their lanes.
(define_expand "avx_movddup256"
[
(set (match_operand:V4DF 0 "register_operand" "")
(vec_select:V4DF
(vec_concat:V8DF
(match_operand:V4DF 1 "nonimmediate_operand" "
xm
")
(match_operand:V4DF 1 "nonimmediate_operand" "")
(match_dup 1))
(parallel
[
(const_int 0) (const_int
2
)
(const_int
4
) (const_int 6)])))]
(parallel
[
(const_int 0) (const_int
4
)
(const_int
2
) (const_int 6)])))]
"TARGET_AVX"
"vmovddup
\t
{%1, %0|%0, %1}"
[
(set_attr "type" "sselog1")
(set_attr "prefix" "vex")
(set_attr "mode" "V4DF")])
(define_insn "
*
avx_movddup"
[
(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,o")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" "xm,x")
(match_dup 1))
(parallel
[
(const_int 0)
(const_int 2)])))]
"TARGET_AVX && !(MEM_P (operands
[
0
]
) && MEM_P (operands
[
1
]
))"
"@
vmovddup
\t
{%1, %0|%0, %1}
#"
[
(set_attr "type" "sselog1,ssemov")
(set_attr "prefix" "vex")
(set_attr "mode" "V2DF")])
(define_insn "
*
sse3_movddup"
[
(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,o")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" "xm,x")
(match_dup 1))
(parallel
[
(const_int 0)
(const_int 2)])))]
"TARGET_SSE3 && !(MEM_P (operands
[
0
]
) && MEM_P (operands
[
1
]
))"
"@
movddup
\t
{%1, %0|%0, %1}
#"
[
(set_attr "type" "sselog1,ssemov")
(set_attr "mode" "V2DF")])
(define_split
[
(set (match_operand:V2DF 0 "memory_operand" "")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "register_operand" "")
(match_dup 1))
(parallel
[
(const_int 0)
(const_int 2)])))]
"TARGET_SSE3 && reload_completed"
[
(const_int 0)
]
{
rtx low = gen_rtx_REG (DFmode, REGNO (operands
[
1
]
));
emit_move_insn (adjust_address (operands
[
0
]
, DFmode, 0), low);
emit_move_insn (adjust_address (operands
[
0
]
, DFmode, 8), low);
DONE;
})
"")
;; Recall that the 256-bit unpck insns only shuffle within their lanes.
(define_insn "avx_unpcklpd256"
[
(set (match_operand:V4DF 0 "register_operand" "=x")
(define_expand "avx_unpcklpd256"
[
(set (match_operand:V4DF 0 "register_operand" "")
(vec_select:V4DF
(vec_concat:V8DF
(match_operand:V4DF 1 "register_operand" "
x
")
(match_operand:V4DF 2 "nonimmediate_operand" "
xm
"))
(match_operand:V4DF 1 "register_operand" "")
(match_operand:V4DF 2 "nonimmediate_operand" ""))
(parallel
[
(const_int 0) (const_int 4)
(const_int 2) (const_int 6)])))]
"TARGET_AVX"
"vunpcklpd
\t
{%2, %1, %0|%0, %1, %2}"
"")
(define_insn "
*
avx_unpcklpd256"
[
(set (match_operand:V4DF 0 "register_operand" "=x,x")
(vec_select:V4DF
(vec_concat:V8DF
(match_operand:V4DF 1 "nonimmediate_operand" "xm,x")
(match_operand:V4DF 2 "nonimmediate_operand" " 1,xm"))
(parallel
[
(const_int 0) (const_int 4)
(const_int 2) (const_int 6)])))]
"TARGET_AVX
&& (!MEM_P (operands
[
1
]
) || rtx_equal_p (operands
[
1
]
, operands
[
2
]
))"
"@
vmovddup
\t
{%1, %0|%0, %1}
vunpcklpd
\t
{%2, %1, %0|%0, %1, %2}"
[
(set_attr "type" "sselog")
(set_attr "prefix" "vex")
(set_attr "mode" "V4DF")])
(define_expand "vec_interleave_lowv2df"
[
(set (match_operand:V2DF 0 "
nonimmediate
_operand" "")
[
(set (match_operand:V2DF 0 "
register
_operand" "")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" "")
...
...
@@ -4614,24 +4659,46 @@
(parallel
[
(const_int 0)
(const_int 2)])))]
"TARGET_SSE2"
"ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);")
{
if (!ix86_vec_interleave_v2df_operator_ok (operands, 0))
operands
[
1
]
= force_reg (V2DFmode, operands
[
1
]
);
})
(define_insn "
*
avx_interleave_lowv2df"
[
(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o")
[
(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,
x,
o")
(vec_select:V2DF
(vec_concat:V4DF
(match_operand:V2DF 1 "nonimmediate_operand" " x,x,0")
(match_operand:V2DF 2 "nonimmediate_operand" " x,m,x"))
(match_operand:V2DF 1 "nonimmediate_operand" " x,
m,
x,0")
(match_operand:V2DF 2 "nonimmediate_operand" " x,
1,
m,x"))
(parallel
[
(const_int 0)
(const_int 2)])))]
"TARGET_AVX &&
!(MEM_P (operands
[
1
]
) && MEM_P (operands
[
2
]
)
)"
"TARGET_AVX &&
ix86_vec_interleave_v2df_operator_ok (operands, 0
)"
"@
vunpcklpd
\t
{%2, %1, %0|%0, %1, %2}
vmovddup
\t
{%1, %0|%0, %1}
vmovhpd
\t
{%2, %1, %0|%0, %1, %2}
vmovlpd
\t
{%2, %H0|%H0, %2}"
[
(set_attr "type" "sselog,ssemov,ssemov")
[
(set_attr "type" "sselog,sse
log,sse
mov,ssemov")
(set_attr "prefix" "vex")
(set_attr "mode" "V2DF,V1DF,V1DF")])
(set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
;; SSE3 interleave of the low DF elements of operands 1 and 2.
;; Four alternatives:
;;   0: unpcklpd  reg,reg               (general case, op0 == op1)
;;   1: movddup   mem,reg               (op1 is memory and op2 == op1,
;;                                       i.e. a broadcast of a memory scalar)
;;   2: movhpd    mem,reg               (high half loaded from memory)
;;   3: movlpd    reg,mem-high-half     (%H0 addresses the high 8 bytes
;;                                       of the memory destination)
;; ix86_vec_interleave_v2df_operator_ok validates the operand/alternative
;; combination.
(define_insn "*sse3_interleave_lowv2df"
  [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,o")
	(vec_select:V2DF
	  (vec_concat:V4DF
	    (match_operand:V2DF 1 "nonimmediate_operand" " 0,m,0,0")
	    (match_operand:V2DF 2 "nonimmediate_operand" " x,1,m,x"))
	  (parallel [(const_int 0)
		     (const_int 2)])))]
  "TARGET_SSE3 && ix86_vec_interleave_v2df_operator_ok (operands, 0)"
  "@
   unpcklpd\t{%2, %0|%0, %2}
   movddup\t{%1, %0|%0, %1}
   movhpd\t{%2, %0|%0, %2}
   movlpd\t{%2, %H0|%H0, %2}"
  [(set_attr "type" "sselog,sselog,ssemov,ssemov")
   ;; data16 prefix applies only to the movhpd/movlpd alternatives.
   (set_attr "prefix_data16" "*,*,1,1")
   (set_attr "mode" "V2DF,V2DF,V1DF,V1DF")])
(define_insn "
*
sse2_interleave_lowv2df"
[
(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o")
...
...
@@ -4641,7 +4708,7 @@
(match_operand:V2DF 2 "nonimmediate_operand" " x,m,x"))
(parallel
[
(const_int 0)
(const_int 2)])))]
"TARGET_SSE2 &&
!(MEM_P (operands
[
1
]
) && MEM_P (operands
[
2
]
)
)"
"TARGET_SSE2 &&
ix86_vec_interleave_v2df_operator_ok (operands, 0
)"
"@
unpcklpd
\t
{%2, %0|%0, %2}
movhpd
\t
{%2, %0|%0, %2}
...
...
@@ -4650,6 +4717,37 @@
(set_attr "prefix_data16" "
*
,1,1")
(set_attr "mode" "V2DF,V1DF,V1DF")])
;; Split a store of a register V2DF whose two elements are equal
;; (a duplicate of element 0, expressed as vec_select 0/2 of
;; (vec_concat x x)) into two scalar DFmode stores of the low half.
;; Runs after reload only, when operand 1 is a hard register.
(define_split
  [(set (match_operand:V2DF 0 "memory_operand" "")
	(vec_select:V2DF
	  (vec_concat:V4DF
	    (match_operand:V2DF 1 "register_operand" "")
	    (match_dup 1))
	  (parallel [(const_int 0)
		     (const_int 2)])))]
  "TARGET_SSE3 && reload_completed"
  [(const_int 0)]
{
  /* View the low 64 bits of the vector register as a DFmode scalar.  */
  rtx low = gen_rtx_REG (DFmode, REGNO (operands[1]));
  /* Write it to both 8-byte halves of the memory destination.  */
  emit_move_insn (adjust_address (operands[0], DFmode, 0), low);
  emit_move_insn (adjust_address (operands[0], DFmode, 8), low);
  DONE;
})
;; Recognize a register destination whose two elements both come from the
;; same element of a memory operand (indices N and N+2 into the concat of
;; the memory operand with itself select the same element twice) and turn
;; it into a vec_duplicate of the scalar at the right memory offset, which
;; the SSE3 movddup patterns can then match.
(define_split
  [(set (match_operand:V2DF 0 "register_operand" "")
	(vec_select:V2DF
	  (vec_concat:V4DF
	    (match_operand:V2DF 1 "memory_operand" "")
	    (match_dup 1))
	  (parallel [(match_operand:SI 2 "const_0_to_1_operand" "")
		     (match_operand:SI 3 "const_int_operand" "")])))]
  ;; Indices must be N and N+2, i.e. the same element of both copies.
  "TARGET_SSE3 && INTVAL (operands[2]) + 2 == INTVAL (operands[3])"
  [(set (match_dup 0) (vec_duplicate:V2DF (match_dup 1)))]
{
  /* Narrow the memory reference to the selected 8-byte element.  */
  operands[1] = adjust_address (operands[1], DFmode, INTVAL (operands[2]) * 8);
})
(define_expand "avx_shufpd256"
[
(match_operand:V4DF 0 "register_operand" "")
(match_operand:V4DF 1 "register_operand" "")
...
...
@@ -7408,6 +7506,20 @@
[
(set_attr "type" "ssemov")
(set_attr "mode" "V2SF,V4SF,V2SF")])
;; AVX broadcast of a 32-bit scalar into all four V4SI elements.
;; Alternative 0 shuffles a register with vpshufd imm 0; alternative 1
;; broadcasts from memory with vbroadcastss.
;; NOTE(review): alternative 1 pairs the "m" constraint with the
;; register_operand predicate — verify against upstream whether
;; nonimmediate_operand was intended.
(define_insn "*vec_dupv4si_avx"
  [(set (match_operand:V4SI 0 "register_operand" "=x,x")
	(vec_duplicate:V4SI
	  (match_operand:SI 1 "register_operand" "x,m")))]
  "TARGET_AVX"
  "@
   vpshufd\t{$0, %1, %0|%0, %1, 0}
   vbroadcastss\t{%1, %0|%0, %1}"
  [(set_attr "type" "sselog1,ssemov")
   ;; Only vpshufd carries an immediate; only vbroadcastss needs the
   ;; extra opcode prefix byte.
   (set_attr "length_immediate" "1,0")
   (set_attr "prefix_extra" "0,1")
   (set_attr "prefix" "vex")
   (set_attr "mode" "TI,V4SF")])
(define_insn "
*
vec_dupv4si"
[
(set (match_operand:V4SI 0 "register_operand" "=Y2,x")
(vec_duplicate:V4SI
...
...
@@ -7417,19 +7529,31 @@
%vpshufd
\t
{$0, %1, %0|%0, %1, 0}
shufps
\t
{$0, %0, %0|%0, %0, 0}"
[
(set_attr "type" "sselog1")
(set_attr "prefix" "maybe_vex,orig")
(set_attr "length_immediate" "1")
(set_attr "mode" "TI,V4SF")])
(define_insn "
*
vec_dupv2di_avx"
[
(set (match_operand:V2DI 0 "register_operand"
"=
x")
[
(set (match_operand:V2DI 0 "register_operand"
"=x,
x")
(vec_duplicate:V2DI
(match_operand:DI 1 "
register_operand" "x
")))]
(match_operand:DI 1 "
nonimmediate_operand" " x,m
")))]
"TARGET_AVX"
"vpunpcklqdq
\t
{%1, %1, %0|%0, %1, %1}"
"@
vpunpcklqdq
\t
{%1, %1, %0|%0, %1, %1}
vmovddup
\t
{%1, %0|%0, %1}"
[
(set_attr "type" "sselog1")
(set_attr "prefix" "vex")
(set_attr "mode" "TI")])
(set_attr "mode" "TI,DF")])
;; SSE3 broadcast of a 64-bit scalar into both V2DI elements.
;; From a register (matching the destination): punpcklqdq %0,%0.
;; From memory: movddup, which duplicates a 64-bit memory scalar.
(define_insn "*vec_dupv2di_sse3"
  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
	(vec_duplicate:V2DI
	  (match_operand:DI 1 "nonimmediate_operand" " 0,m")))]
  "TARGET_SSE3"
  "@
   punpcklqdq\t%0, %0
   movddup\t{%1, %0|%0, %1}"
  [(set_attr "type" "sselog1")
   (set_attr "mode" "TI,DF")])
(define_insn "
*
vec_dupv2di"
[
(set (match_operand:V2DI 0 "register_operand" "=Y2,x")
...
...
@@ -11838,6 +11962,108 @@
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
;; 256-bit broadcast of a scalar.  If the scalar is in memory, emit
;; vbroadcasts[sd] directly.  If it is in a register ("?x", discouraged),
;; emit "#" and split after reload into: duplicate into the low 128-bit
;; half of the destination, then vec_concat that half with itself to
;; fill the full 256-bit register.
(define_insn_and_split "vec_dup<mode>"
  [(set (match_operand:AVX256MODE24P 0 "register_operand" "=x,x")
	(vec_duplicate:AVX256MODE24P
	  (match_operand:<avxscalarmode> 1 "nonimmediate_operand" "m,?x")))]
  "TARGET_AVX"
  "@
   vbroadcasts<avxmodesuffixf2c>\t{%1, %0|%0, %1}
   #"
  ;; Only the register alternative needs splitting.
  "&& reload_completed && REG_P (operands[1])"
  [(set (match_dup 2) (vec_duplicate:<avxhalfvecmode> (match_dup 1)))
   (set (match_dup 0) (vec_concat:AVX256MODE24P (match_dup 2) (match_dup 2)))]
{
  /* Use the low half of the destination register as the 128-bit
     intermediate.  */
  operands[2] = gen_rtx_REG (<avxhalfvecmode>mode, REGNO (operands[0]));
}
  [(set_attr "type" "ssemov")
   (set_attr "prefix_extra" "1")
   (set_attr "prefix" "vex")
   (set_attr "mode" "V8SF")])
;; Duplicate a 128-bit operand into both halves of a 256-bit register.
;; From memory: vbroadcastf128.  From a register matching the
;; destination's low half ("0"): vinsertf128 into the high half.
;; From any other register ("?x", discouraged): vperm2f128 selecting the
;; low lane (%t1 prints the 256-bit name of the 128-bit operand) twice.
(define_insn "avx_vbroadcastf128_<mode>"
  [(set (match_operand:AVX256MODE 0 "register_operand" "=x,x,x")
	(vec_concat:AVX256MODE
	  (match_operand:<avxhalfvecmode> 1 "nonimmediate_operand" "m,0,?x")
	  (match_dup 1)))]
  "TARGET_AVX"
  "@
   vbroadcastf128\t{%1, %0|%0, %1}
   vinsertf128\t{$1, %1, %0, %0|%0, %0, %1, 1}
   vperm2f128\t{$0, %t1, %t1, %0|%0, %t1, %t1, 0}"
  [(set_attr "type" "ssemov,sselog1,sselog1")
   (set_attr "prefix_extra" "1")
   ;; vbroadcastf128 takes no immediate; the other two take one byte.
   (set_attr "length_immediate" "0,1,1")
   (set_attr "prefix" "vex")
   (set_attr "mode" "V4SF,V8SF,V8SF")])
;; Recognize broadcast as a vec_select as produced by builtin_vec_perm.
;; If it so happens that the input is in memory, use vbroadcast.
;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128).
;; Broadcast of element ELT of a V4SF operand, written as a vec_select
;; with a parallel of four identical indices (avx_vbroadcast_operand).
;; The "C" constraint on operand 3 restricts alternative 0 to index 0
;; (a plain memory reference); alternative 1 handles nonzero indices
;; from offsettable memory; alternative 2 handles a register source.
(define_insn "*avx_vperm_broadcast_v4sf"
  [(set (match_operand:V4SF 0 "register_operand" "=x,x,x")
	(vec_select:V4SF
	  (match_operand:V4SF 1 "nonimmediate_operand" "m,o,x")
	  (match_parallel 2 "avx_vbroadcast_operand"
	    [(match_operand 3 "const_int_operand" "C,n,n")])))]
  "TARGET_AVX"
{
  int elt = INTVAL (operands[3]);
  switch (which_alternative)
    {
    case 0:
    case 1:
      /* Memory source: point at the selected 4-byte element and
	 broadcast it directly.  */
      operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4);
      return "vbroadcastss\t{%1, %0|%0, %1}";
    case 2:
      /* Register source: replicate the 2-bit element index into all
	 four immediate fields (0x55 = 0b01010101 per index unit).  */
      operands[2] = GEN_INT (elt * 0x55);
      return "vpermilps\t{%2, %1, %0|%0, %1, %2}";
    default:
      gcc_unreachable ();
    }
}
  [(set_attr "type" "ssemov,ssemov,sselog1")
   (set_attr "prefix_extra" "1")
   (set_attr "length_immediate" "0,0,1")
   (set_attr "prefix" "vex")
   (set_attr "mode" "SF,SF,V4SF")])
;; 256-bit version of the broadcast-as-vec_select pattern.  Always emits
;; "#" and splits after reload.  A memory source is narrowed to the
;; selected scalar and becomes a vec_duplicate (matched by vbroadcast
;; patterns).  A register source is handled by an explicit in-lane
;; vpermilp shuffle followed by a cross-lane vperm2f128.
(define_insn_and_split "*avx_vperm_broadcast_<mode>"
  [(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x,x,x")
	(vec_select:AVX256MODEF2P
	  (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "m,o,?x")
	  (match_parallel 2 "avx_vbroadcast_operand"
	    [(match_operand 3 "const_int_operand" "C,n,n")])))]
  "TARGET_AVX"
  "#"
  "&& reload_completed"
  [(set (match_dup 0) (vec_duplicate:AVX256MODEF2P (match_dup 1)))]
{
  rtx op0 = operands[0], op1 = operands[1];
  int elt = INTVAL (operands[3]);

  if (REG_P (op1))
    {
      int mask;

      /* Shuffle element we care about into all elements of the 128-bit lane.
	 The other lane gets shuffled too, but we don't care.  */
      if (<MODE>mode == V4DFmode)
	mask = (elt & 1 ? 15 : 0);
      else
	mask = (elt & 3) * 0x55;
      emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask)));

      /* Shuffle the lane we care about into both lanes of the dest.  */
      mask = (elt / (<ssescalarnum> / 2)) * 0x11;
      emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
      DONE;
    }

  /* Memory source: narrow to the selected scalar element so the
     vec_duplicate in the split pattern matches a vbroadcast insn.  */
  operands[1] = adjust_address_nv (op1, <avxscalarmode>mode,
				   elt * GET_MODE_SIZE (<avxscalarmode>mode));
})
(define_expand "avx_vpermil
<mode>
"
[
(set (match_operand:AVXMODEFDP 0 "register_operand" "")
(vec_select:AVXMODEFDP
...
...
@@ -11989,58 +12215,6 @@
(set_attr "prefix" "vex")
(set_attr "mode" "V8SF")])
;; vbroadcasts[sd]: duplicate a scalar memory operand into all four
;; elements of a 4-element AVX vector, expressed as nested vec_concats
;; of the same memory scalar.
(define_insn "avx_vbroadcasts<avxmodesuffixf2c><avxmodesuffix>"
  [(set (match_operand:AVXMODEF4P 0 "register_operand" "=x")
	(vec_concat:AVXMODEF4P
	  (vec_concat:<avxhalfvecmode>
	    (match_operand:<avxscalarmode> 1 "memory_operand" "m")
	    (match_dup 1))
	  (vec_concat:<avxhalfvecmode>
	    (match_dup 1)
	    (match_dup 1))))]
  "TARGET_AVX"
  "vbroadcasts<avxmodesuffixf2c>\t{%1, %0|%0, %1}"
  [(set_attr "type" "ssemov")
   (set_attr "prefix_extra" "1")
   (set_attr "prefix" "vex")
   (set_attr "mode" "<avxscalarmode>")])
;; vbroadcastss (256-bit form): duplicate one SFmode memory scalar into
;; all eight V8SF elements, spelled out as a tree of vec_concats of the
;; same operand.
(define_insn "avx_vbroadcastss256"
  [(set (match_operand:V8SF 0 "register_operand" "=x")
	(vec_concat:V8SF
	  (vec_concat:V4SF
	    (vec_concat:V2SF
	      (match_operand:SF 1 "memory_operand" "m")
	      (match_dup 1))
	    (vec_concat:V2SF
	      (match_dup 1)
	      (match_dup 1)))
	  (vec_concat:V4SF
	    (vec_concat:V2SF
	      (match_dup 1)
	      (match_dup 1))
	    (vec_concat:V2SF
	      (match_dup 1)
	      (match_dup 1)))))]
  "TARGET_AVX"
  "vbroadcastss\t{%1, %0|%0, %1}"
  [(set_attr "type" "ssemov")
   (set_attr "prefix_extra" "1")
   (set_attr "prefix" "vex")
   (set_attr "mode" "SF")])
;; vbroadcastf128: duplicate a 128-bit memory operand into both halves
;; of a 256-bit float register.
(define_insn "avx_vbroadcastf128_p<avxmodesuffixf2c>256"
  [(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x")
	(vec_concat:AVX256MODEF2P
	  (match_operand:<avxhalfvecmode> 1 "memory_operand" "m")
	  (match_dup 1)))]
  "TARGET_AVX"
  "vbroadcastf128\t{%1, %0|%0, %1}"
  [(set_attr "type" "ssemov")
   (set_attr "prefix_extra" "1")
   (set_attr "prefix" "vex")
   (set_attr "mode" "V4SF")])
(define_expand "avx_vinsertf128
<mode>
"
[
(match_operand:AVX256MODE 0 "register_operand" "")
(match_operand:AVX256MODE 1 "register_operand" "")
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment