Commit 1877be45 by Jan Hubicka (committed by Jan Hubicka)

re PR c/7344 (performance regression on huge case statements)

	* i386.md (movv2di_internal): New pattern.
	(movv2df_internal, movv8hi_internal, movv16qi_internal): Fix predicate.
	(movv2di): New expander.
	* i386.c (ix86_preferred_reload_class): Return NO_REGS for vector operands.

	* i386.c (ix86_expand_timode_binop_builtin): Delete.
	(builtin_description): Add SSE1 logicals; rename SSE2 logicals.
	(ix86_init_mmx_sse_builtins): Kill SSE1 logicals.
	(ix86_expand_builtin): Likewise.
	* i386.md (sse_andti3_df_1, sse_andti3_df_2, sse_andti3_sf_1,
	sse_andti3_sf_2, sse_andti3, sse_andnti3_df_1, sse_andnti3_df_2,
	sse_andnti3_sf_1, sse_andnti3_sf_2, sse_andnti3, sse_orti3_df_1,
	sse_orti3_df_2, sse_orti3_sf_1, sse_orti3_sf_2, sse_orti3,
	sse_xorti3_df_1, sse_xorti3_df_2, sse_xorti3_sf_1, sse_xorti3_sf_2,
	sse_xorti3): Kill.
	(sse_andv4sf3, sse_andnv4sf3, sse_orv4sf3, sse_xorv4sf3,
	sse_andv2df3, sse_andnv2df3, sse_orv2df3, sse_xorv2df3): New
	expanders.
	(*sse_andv4sf3, *sse_andnv4sf3, *sse_orv4sf3, *sse_xorv4sf3,
	*sse_andv2df3, *sse_andnv2df3, *sse_orv2df3, *sse_xorv2df3): New
	patterns.
	(*sse_andsf3, *sse_andnsf3, *sse_orsf3, *sse_xorsf3, *sse_anddf3,
	*sse_andndf3, *sse_ordf3, *sse_xordf3): New patterns.

	* xmmintrin.h (__m128i): Define as __v2di.

	PR c/7344
	* predict.c (can_predict_insn_p): New function.
	(estimate_probability): Avoid unnecessary work.
	(process_note_prediction): Likewise.
	* toplev.c (rest_of_compilation): Account early branch prediction pass
	as TV_BRANCH_PROB.

	PR c++/6419
	* expr.c (expand_expr): Use DECL_RTL_SET_P.

From-SVN: r58156
parent 0aab899b
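For context, PR c/7344 is a compile-time regression on huge case statements: a dense switch is expanded as a single tablejump whose thousands of outgoing edges the branch predictor then walked one by one. A hypothetical reproducer sketch (function name and case count are illustrative, not taken from the PR):

int
classify (int code)
{
  switch (code)
    {
    case 0: return 1;
    case 1: return 2;
    /* ... thousands more dense, typically machine-generated cases ... */
    case 9999: return 10000;
    default: return -1;
    }
}

With this patch, such jumps are filtered out by can_predict_insn_p before any per-edge work is done.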
Mon Oct 14 20:33:12 CEST 2002 Jan Hubicka <jh@suse.cz>
* i386.md (movv2di_internal): New pattern.
(movv2df_internal, movv8hi_internal, movv16qi_internal): Fix predicate.
(movv2di): New expander.
* i386.c (ix86_preferred_reload_class): Return NO_REGS for vector operands.
* i386.c (ix86_expand_timode_binop_builtin): Delete.
(builtin_description): Add SSE1 logicals; rename SSE2 logicals.
(ix86_init_mmx_sse_builtins): Kill SSE1 logicals.
(ix86_expand_builtin): Likewise.
* i386.md (sse_andti3_df_1, sse_andti3_df_2, sse_andti3_sf_1,
sse_andti3_sf_2, sse_andti3, sse_andnti3_df_1, sse_andnti3_df_2,
sse_andnti3_sf_1, sse_andnti3_sf_2, sse_andnti3, sse_orti3_df_1,
sse_orti3_df_2, sse_orti3_sf_1, sse_orti3_sf_2, sse_orti3,
sse_xorti3_df_1, sse_xorti3_df_2, sse_xorti3_sf_1, sse_xorti3_sf_2,
sse_xorti3): Kill.
(sse_andv4sf3, sse_andnv4sf3, sse_orv4sf3, sse_xorv4sf3,
sse_andv2df3, sse_andnv2df3, sse_orv2df3, sse_xorv2df3): New expanders.
(*sse_andv4sf3, *sse_andnv4sf3, *sse_orv4sf3, *sse_xorv4sf3,
*sse_andv2df3, *sse_andnv2df3, *sse_orv2df3, *sse_xorv2df3): New patterns.
(*sse_andsf3, *sse_andnsf3, *sse_orsf3, *sse_xorsf3, *sse_anddf3,
*sse_andndf3, *sse_ordf3, *sse_xordf3): New patterns.
* xmmintrin.h (__m128i): Define as __v2di.
PR c/7344
* predict.c (can_predict_insn_p): New function.
(estimate_probability): Avoid unnecessary work.
(process_note_prediction): Likewise.
* toplev.c (rest_of_compilation): Account early branch prediction pass
as TV_BRANCH_PROB.
PR c++/6419
* expr.c (expand_expr): Use DECL_RTL_SET_P.
2002-10-14 Roger Sayle <roger@eyesopen.com>
* combine.c (simplify_set): Treat MODE_CC registers like cc0.
@@ -136,7 +174,7 @@ Fri Oct 11 22:22:38 CEST 2002 Jan Hubicka <jh@suse.cz>
PR c/7344
* cfgbuild.c (make_edges): Create edge cache when we do have
large jumptable.
* expr.c (do_tablejump): Note size of maximal jumptable.
* function.c (prepare_function_start): Zero out size.
* function.h (function): Add max_jumptable_ents.
......
@@ -771,8 +771,6 @@ static rtx ix86_expand_sse_compare PARAMS ((const struct builtin_description *,
static rtx ix86_expand_unop1_builtin PARAMS ((enum insn_code, tree, rtx));
static rtx ix86_expand_unop_builtin PARAMS ((enum insn_code, tree, rtx, int));
static rtx ix86_expand_binop_builtin PARAMS ((enum insn_code, tree, rtx));
-static rtx ix86_expand_timode_binop_builtin PARAMS ((enum insn_code,
-                                                     tree, rtx));
static rtx ix86_expand_store_builtin PARAMS ((enum insn_code, tree));
static rtx safe_vector_operand PARAMS ((rtx, enum machine_mode));
static enum rtx_code ix86_fp_compare_code_to_integer PARAMS ((enum rtx_code));
@@ -11811,6 +11809,11 @@ static const struct builtin_description bdesc_2arg[] =
{ MASK_SSE1, CODE_FOR_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, 0, 0 },
{ MASK_SSE1, CODE_FOR_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, 0, 0 },
+  { MASK_SSE1, CODE_FOR_sse_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, 0, 0 },
+  { MASK_SSE1, CODE_FOR_sse_nandv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, 0, 0 },
+  { MASK_SSE1, CODE_FOR_sse_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, 0, 0 },
+  { MASK_SSE1, CODE_FOR_sse_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, 0, 0 },
{ MASK_SSE1, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, 0, 0 },
{ MASK_SSE1, CODE_FOR_sse_movhlps, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, 0, 0 },
{ MASK_SSE1, CODE_FOR_sse_movlhps, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, 0, 0 },
@@ -11935,10 +11938,10 @@ static const struct builtin_description bdesc_2arg[] =
{ MASK_SSE2, CODE_FOR_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, 0, 0 },
{ MASK_SSE2, CODE_FOR_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, 0, 0 },
-  { MASK_SSE2, CODE_FOR_sse2_anddf3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, 0, 0 },
-  { MASK_SSE2, CODE_FOR_sse2_nanddf3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, 0, 0 },
-  { MASK_SSE2, CODE_FOR_sse2_iordf3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, 0, 0 },
-  { MASK_SSE2, CODE_FOR_sse2_xordf3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, 0, 0 },
+  { MASK_SSE2, CODE_FOR_sse2_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, 0, 0 },
+  { MASK_SSE2, CODE_FOR_sse2_nandv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, 0, 0 },
+  { MASK_SSE2, CODE_FOR_sse2_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, 0, 0 },
+  { MASK_SSE2, CODE_FOR_sse2_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, 0, 0 },
{ MASK_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, 0, 0 },
{ MASK_SSE2, CODE_FOR_sse2_unpckhpd, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, 0, 0 },
@@ -12443,11 +12446,6 @@ ix86_init_mmx_sse_builtins ()
def_builtin (MASK_SSE1, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI);
def_builtin (MASK_SSE1, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI);
-  def_builtin (MASK_SSE1, "__builtin_ia32_andps", v4sf_ftype_v4sf_v4sf, IX86_BUILTIN_ANDPS);
-  def_builtin (MASK_SSE1, "__builtin_ia32_andnps", v4sf_ftype_v4sf_v4sf, IX86_BUILTIN_ANDNPS);
-  def_builtin (MASK_SSE1, "__builtin_ia32_orps", v4sf_ftype_v4sf_v4sf, IX86_BUILTIN_ORPS);
-  def_builtin (MASK_SSE1, "__builtin_ia32_xorps", v4sf_ftype_v4sf_v4sf, IX86_BUILTIN_XORPS);
def_builtin (MASK_SSE1 | MASK_3DNOW_A, "__builtin_ia32_pextrw", int_ftype_v4hi_int, IX86_BUILTIN_PEXTRW);
def_builtin (MASK_SSE1 | MASK_3DNOW_A, "__builtin_ia32_pinsrw", v4hi_ftype_v4hi_int_int, IX86_BUILTIN_PINSRW);
@@ -12680,45 +12678,6 @@ ix86_expand_binop_builtin (icode, arglist, target)
return target;
}
-/* In type_for_mode we restrict the ability to create TImode types
-   to hosts with 64-bit H_W_I.  So we've defined the SSE logicals
-   to have a V4SFmode signature.  Convert them in-place to TImode.  */
-
-static rtx
-ix86_expand_timode_binop_builtin (icode, arglist, target)
-     enum insn_code icode;
-     tree arglist;
-     rtx target;
-{
-  rtx pat;
-  tree arg0 = TREE_VALUE (arglist);
-  tree arg1 = TREE_VALUE (TREE_CHAIN (arglist));
-  rtx op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
-  rtx op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0);
-
-  op0 = gen_lowpart (TImode, op0);
-  op1 = gen_lowpart (TImode, op1);
-  target = gen_reg_rtx (TImode);
-
-  if (! (*insn_data[icode].operand[1].predicate) (op0, TImode))
-    op0 = copy_to_mode_reg (TImode, op0);
-  if (! (*insn_data[icode].operand[2].predicate) (op1, TImode))
-    op1 = copy_to_mode_reg (TImode, op1);
-
-  /* In the commutative cases, both op0 and op1 are nonimmediate_operand,
-     yet one of the two must not be a memory.  This is normally enforced
-     by expanders, but we didn't bother to create one here.  */
-  if (GET_CODE (op0) == MEM && GET_CODE (op1) == MEM)
-    op0 = copy_to_mode_reg (TImode, op0);
-
-  pat = GEN_FCN (icode) (target, op0, op1);
-  if (! pat)
-    return 0;
-  emit_insn (pat);
-
-  return gen_lowpart (V4SFmode, target);
-}
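With the bdesc_2arg entries above in place, this deleted TImode shim is unnecessary: each SSE logical builtin is now an ordinary two-operand entry in its natural vector mode, expanded by the generic ix86_expand_binop_builtin. For example, the ANDPS entry added above reads:

{ MASK_SSE1, CODE_FOR_sse_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, 0, 0 },

so no gen_lowpart round-trip through TImode is needed.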
/* Subroutine of ix86_expand_builtin to take care of stores. */
static rtx
@@ -13064,19 +13023,6 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore)
case IX86_BUILTIN_RCPSS:
return ix86_expand_unop1_builtin (CODE_FOR_vmrcpv4sf2, arglist, target);
-    case IX86_BUILTIN_ANDPS:
-      return ix86_expand_timode_binop_builtin (CODE_FOR_sse_andti3,
-                                               arglist, target);
-    case IX86_BUILTIN_ANDNPS:
-      return ix86_expand_timode_binop_builtin (CODE_FOR_sse_nandti3,
-                                               arglist, target);
-    case IX86_BUILTIN_ORPS:
-      return ix86_expand_timode_binop_builtin (CODE_FOR_sse_iorti3,
-                                               arglist, target);
-    case IX86_BUILTIN_XORPS:
-      return ix86_expand_timode_binop_builtin (CODE_FOR_sse_xorti3,
-                                               arglist, target);
case IX86_BUILTIN_LOADAPS:
return ix86_expand_unop_builtin (CODE_FOR_sse_movaps, arglist, target, 1);
@@ -13553,6 +13499,8 @@ ix86_preferred_reload_class (x, class)
rtx x;
enum reg_class class;
{
+  if (GET_CODE (x) == CONST_VECTOR && x != CONST0_RTX (GET_MODE (x)))
+    return NO_REGS;
if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
{
/* SSE can't load any constant directly yet. */
......
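A sketch of what the CONST_VECTOR check buys (illustrative example, not from the patch): returning NO_REGS for a non-zero vector constant makes reload spill it to the constant pool and load it as a vector, rather than trying to build it in registers.

#include <xmmintrin.h>

/* The sign-bit mask below is a non-zero CONST_VECTOR; with the change
   above it is emitted into the constant pool and loaded with movaps
   instead of being materialized through integer registers.  */
__m128
negate (__m128 x)
{
  return _mm_xor_ps (x, _mm_set1_ps (-0.0f));
}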
@@ -1066,7 +1066,7 @@ typedef int __v4si __attribute__ ((mode (V4SI)));
typedef int __v8hi __attribute__ ((mode (V8HI)));
typedef int __v16qi __attribute__ ((mode (V16QI)));
-#define __m128i __m128
+#define __m128i __v2di
#define __m128d __v2df
static __inline __m128d
......
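The visible effect of the __m128i change (checked by i386-ssetype-5.c below): integer intrinsics now carry V2DImode, so the compiler keeps __m128i values in the integer SSE domain and emits movdqa/pand rather than routing them through the float patterns. An illustrative example, assuming the 2002-era xmmintrin.h that still declared the SSE2 intrinsics:

#include <xmmintrin.h>

/* With __m128i defined as __v2di, this expands through the V2DI
   logical pattern and should assemble to pand plus movdqa moves.  */
__m128i
mask_and (__m128i a, __m128i b)
{
  return _mm_and_si128 (a, b);
}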
@@ -6532,7 +6532,7 @@ expand_expr (exp, target, tmode, modifier)
}
case PARM_DECL:
-      if (DECL_RTL (exp) == 0)
+      if (!DECL_RTL_SET_P (exp))
{
error_with_decl (exp, "prior parameter's size depends on `%s'");
return CONST0_RTX (mode);
@@ -10942,6 +10942,9 @@ do_tablejump (index, mode, range, table_label, default_label)
{
rtx temp, vector;
+  if (range > cfun->max_jumptable_ents)
+    cfun->max_jumptable_ents = range;
/* Do an unsigned comparison (in the proper mode) between the index
expression and the value which represents the length of the range.
Since we just finished subtracting the lower bound of the range
......
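The recorded cfun->max_jumptable_ents is what lets the predictor bail out early. The predict.c hunk is not shown in this view; a hedged reconstruction of the idea behind can_predict_insn_p (the name is from the ChangeLog, the body is illustrative, not the committed code):

/* Illustrative reconstruction: only ordinary conditional jumps are
   worth predicting, so computed jumps and huge tablejumps are skipped
   and estimate_probability does no per-edge work on them.  */
static bool
can_predict_insn_p (insn)
     rtx insn;
{
  return (GET_CODE (insn) == JUMP_INSN
          && any_condjump_p (insn)
          && BLOCK_FOR_INSN (insn)->succ->succ_next != NULL);
}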
@@ -997,7 +997,7 @@ compute_alignments ()
align it. It is most likely the first block of a loop.
if (has_fallthru
&& branch_frequency + fallthru_frequency > BB_FREQ_MAX / 10
-      && branch_frequency > fallthru_frequency * 5)
+      && branch_frequency > fallthru_frequency * 2)
{
log = LOOP_ALIGN (label);
if (max_log < log)
......
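A quick numeric reading of the relaxed heuristic above, with illustrative frequencies and BB_FREQ_MAX = 10000: a label reached with branch_frequency = 2100 and fallthru_frequency = 1000 now passes both tests (2100 + 1000 > 1000, and 2100 > 2 * 1000), so it gets loop alignment; under the old * 5 factor it would have needed branch_frequency above 5000.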
Mon Oct 14 20:37:51 CEST 2002 Jan Hubicka <jh@suse.cz>
* gcc.dg/i386-ssetype-[1-5].c: New tests.
2002-10-14 Richard Henderson <rth@redhat.com>
* gcc.dg/20020219-1.c: Disable for 16-bit targets.
......
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -msse2 -march=athlon" } */
/* { dg-final { scan-assembler "andpd.*\[bs\]p" } } */
/* { dg-final { scan-assembler "andnpd.*\[bs\]p" } } */
/* { dg-final { scan-assembler "xorpd.*\[bs\]p" } } */
/* { dg-final { scan-assembler "iorpd.*\[bs\]p" } } */
/* { dg-final { scan-assembler-not "movdqa" } } */
/* { dg-final { scan-assembler "movapd.*\[bs\]p" } } */
/* Verify that we generate proper instruction with memory operand. */
#include <xmmintrin.h>
__m128d
t1(__m128d a, __m128d b)
{
return _mm_and_pd (a,b);
}
__m128d
t2(__m128d a, __m128d b)
{
return _mm_andnot_pd (a,b);
}
__m128d
t3(__m128d a, __m128d b)
{
return _mm_or_pd (a,b);
}
__m128d
t4(__m128d a, __m128d b)
{
return _mm_xor_pd (a,b);
}
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -msse2 -march=athlon" } */
/* { dg-final { scan-assembler "andpd" } } */
/* { dg-final { scan-assembler "andnpd" } } */
/* { dg-final { scan-assembler "xorpd" } } */
/* { dg-final { scan-assembler "iorpd" } } */
/* { dg-final { scan-assembler-not "movdqa" } } */
/* { dg-final { scan-assembler "movapd" } } */
/* Verify that we generate proper instruction without memory operand. */
#include <xmmintrin.h>
__m128d
t1(__m128d a, __m128d b)
{
a=_mm_sqrt_pd(a);
b=_mm_sqrt_pd(b);
return _mm_and_pd (a,b);
}
__m128d
t2(__m128d a, __m128d b)
{
a=_mm_sqrt_pd(a);
b=_mm_sqrt_pd(b);
return _mm_andnot_pd (a,b);
}
__m128d
t3(__m128d a, __m128d b)
{
a=_mm_sqrt_pd(a);
b=_mm_sqrt_pd(b);
return _mm_or_pd (a,b);
}
__m128d
t4(__m128d a, __m128d b)
{
a=_mm_sqrt_pd(a);
b=_mm_sqrt_pd(b);
return _mm_xor_pd (a,b);
}
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -msse2 -march=athlon" } */
/* { dg-final { scan-assembler "andps.*\[bs]p" } } */
/* { dg-final { scan-assembler "andnps.*\[bs]p" } } */
/* { dg-final { scan-assembler "xorps.*\[bs]p" } } */
/* { dg-final { scan-assembler "orps.\[b*s]p" } } */
/* { dg-final { scan-assembler-not "movdqa" } } */
/* { dg-final { scan-assembler "movaps.*\[bs]p" } } */
/* Verify that we generate proper instruction with memory operand. */
#include <xmmintrin.h>
__m128
t1(__m128 a, __m128 b)
{
return _mm_and_ps (a,b);
}
__m128
t2(__m128 a, __m128 b)
{
return _mm_andnot_ps (a,b);
}
__m128
t3(__m128 a, __m128 b)
{
return _mm_or_ps (a,b);
}
__m128
t4(__m128 a, __m128 b)
{
return _mm_xor_ps (a,b);
}
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -msse2 -march=athlon" } */
/* { dg-final { scan-assembler "andps" } } */
/* { dg-final { scan-assembler "andnps" } } */
/* { dg-final { scan-assembler "xorps" } } */
/* { dg-final { scan-assembler "orps" } } */
/* Verify that we generate proper instruction without memory operand. */
#include <xmmintrin.h>
__m128
t1(__m128 a, __m128 b)
{
a=_mm_sqrt_ps(a);
b=_mm_sqrt_ps(b);
return _mm_and_ps (a,b);
}
__m128
t2(__m128 a, __m128 b)
{
a=_mm_sqrt_ps(a);
b=_mm_sqrt_ps(b);
return _mm_andnot_ps (a,b);
}
__m128
t3(__m128 a, __m128 b)
{
a=_mm_sqrt_ps(a);
b=_mm_sqrt_ps(b);
return _mm_or_ps (a,b);
}
__m128
t4(__m128 a, __m128 b)
{
a=_mm_sqrt_ps(a);
b=_mm_sqrt_ps(b);
return _mm_xor_ps (a,b);
}
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -msse2 -march=athlon" } */
/* { dg-final { scan-assembler "pand.*\[bs\]p" } } */
/* { dg-final { scan-assembler "pandn.*\[bs\]p" } } */
/* { dg-final { scan-assembler "pxor.*\[bs\]p" } } */
/* { dg-final { scan-assembler "por.*\[bs\]p" } } */
/* { dg-final { scan-assembler "movdqa" } } */
/* { dg-final { scan-assembler-not "movaps.*\[bs\]p" } } */
/* Verify that we generate proper instruction with memory operand. */
#include <xmmintrin.h>
__m128i
t1(__m128i a, __m128i b)
{
return _mm_and_si128 (a,b);
}
__m128i
t2(__m128i a, __m128i b)
{
return _mm_andnot_si128 (a,b);
}
__m128i
t3(__m128i a, __m128i b)
{
return _mm_or_si128 (a,b);
}
__m128i
t4(__m128i a, __m128i b)
{
return _mm_xor_si128 (a,b);
}