Commit 1877be45 by Jan Hubicka (committed by Jan Hubicka)

re PR c/7344 (performance regression on huge case statements)

	* i386.md (movv2di_internal): New pattern.
	(movv2df_internal, movv8hi_internal, movv16qi_internal): Fix predicate.
	(movv2di): New expander.
	* i386.c (ix86_preferred_reload_class): Return NO_REGS for vector operands.

	* i386.c (ix86_expand_timode_binop_builtin): Delete.
	(builtin_description): Add SSE1 logicals; rename SSE2 logicals.
	(ix86_init_mmx_sse_builtins): Kill SSE1 logicals.
	(ix86_expand_builtin): Likewise.
	* i386.md (sse_andti3_df_1, sse_andti3_df_2, sse_andti3_sf_1,
	sse_andti3_sf_2, sse_andti3, sse_andnti3_df_1, sse_andnti3_df_2,
	sse_andnti3_sf_1, sse_andnti3_sf_2, sse_andnti3, sse_orti3_df_1,
	sse_orti3_df_2, sse_orti3_sf_1, sse_orti3_sf_2, sse_orti3,
	sse_xorti3_df_1, sse_xorti3_df_2, sse_xorti3_sf_1, sse_xorti3_sf_2,
	sse_xorti3): Kill.
	(sse_andv4sf3, sse_andnv4sf3, sse_orv4sf3, sse_xorv4sf3,
	sse_andv2df3, sse_andnv2df3, sse_orv2df3, sse_xorv2df3): New
	expanders.
	(*sse_andv4sf3, *sse_andnv4sf3, *sse_orv4sf3, *sse_xorv4sf3,
	*sse_andv2df3, *sse_andnv2df3, *sse_orv2df3, *sse_xorv2df3): New
	patterns.
	(*sse_andsf3, *sse_andnsf3, *sse_orsf3, *sse_xorsf3, *sse_anddf3,
	*sse_andndf3, *sse_ordf3, *sse_xordf3): New patterns.

	* xmmintrin.h (__m128i): Define as __v2di.

	PR c/7344
	* predict.c (can_predict_insn_p): New function.
	(estimate_probability): Avoid unnecessary work.
	(process_note_prediction): Likewise.
	* toplev.c (rest_of_compilation): Account early branch prediction pass
	as TV_BRANCH_PROB.

	PR c++/6419
	* expr.c (expand_expr): Use DECL_RTL_SET_P.

From-SVN: r58156
parent 0aab899b
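For context, PR c/7344 is a compile-time regression on huge case statements: a dense switch is expanded as a single tablejump whose thousands of outgoing edges the branch predictor then walked one by one. A hypothetical reproducer sketch (function name and case count are illustrative, not taken from the PR):

int
classify (int code)
{
  switch (code)
    {
    case 0: return 1;
    case 1: return 2;
    /* ... thousands more dense, typically machine-generated cases ... */
    case 9999: return 10000;
    default: return -1;
    }
}

With this patch, such jumps are filtered out by can_predict_insn_p before any per-edge work is done.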
Mon Oct 14 20:33:12 CEST 2002 Jan Hubicka <jh@suse.cz>
* i386.md (movv2di_internal): New pattern.
(movv2df_internal, movv8hi_internal, movv16qi_internal): Fix predicate.
(movv2di): New expander.
* i386.c (ix86_preferred_reload_class): Return NO_REGS for vector operands.
* i386.c (ix86_expand_timode_binop_builtin): Delete.
(builtin_description): Add SSE1 logicals; rename SSE2 logicals.
(ix86_init_mmx_sse_builtins): Kill SSE1 logicals.
(ix86_expand_builtin): Likewise.
* i386.md (sse_andti3_df_1, sse_andti3_df_2, sse_andti3_sf_1,
sse_andti3_sf_2, sse_andti3, sse_andnti3_df_1, sse_andnti3_df_2,
sse_andnti3_sf_1, sse_andnti3_sf_2, sse_andnti3, sse_orti3_df_1,
sse_orti3_df_2, sse_orti3_sf_1, sse_orti3_sf_2, sse_orti3,
sse_xorti3_df_1, sse_xorti3_df_2, sse_xorti3_sf_1, sse_xorti3_sf_2,
sse_xorti3): Kill.
(sse_andv4sf3, sse_andnv4sf3, sse_orv4sf3, sse_xorv4sf3,
sse_andv2df3, sse_andnv2df3, sse_orv2df3, sse_xorv2df3): New expanders.
(*sse_andv4sf3, *sse_andnv4sf3, *sse_orv4sf3, *sse_xorv4sf3,
*sse_andv2df3, *sse_andnv2df3, *sse_orv2df3, *sse_xorv2df3): New patterns.
(*sse_andsf3, *sse_andnsf3, *sse_orsf3, *sse_xorsf3, *sse_anddf3,
*sse_andndf3, *sse_ordf3, *sse_xordf3): New patterns.
* xmmintrin.h (__m128i): Define as __v2di.
PR c/7344
* predict.c (can_predict_insn_p): New function.
(estimate_probability): Avoid unnecessary work.
(process_note_prediction): Likewise.
* toplev.c (rest_of_compilation): Account early branch prediction pass
as TV_BRANCH_PROB.
PR c++/6419
* expr.c (expand_expr): Use DECL_RTL_SET_P.
2002-10-14 Roger Sayle <roger@eyesopen.com>
* combine.c (simplify_set): Treat MODE_CC registers like cc0.
@@ -136,7 +174,7 @@ Fri Oct 11 22:22:38 CEST 2002 Jan Hubicka <jh@suse.cz>
PR c/7344
* cfgbuild.c (make_edges): Create edge cache when we do have
large jumptable.
* expr.c (do_tablejump): Note size of maximal jumptable.
* function.c (prepare_function_start): Zero out size.
* function.h (function): Add max_jumptable_ents.
......
@@ -771,8 +771,6 @@ static rtx ix86_expand_sse_compare PARAMS ((const struct builtin_description *,
static rtx ix86_expand_unop1_builtin PARAMS ((enum insn_code, tree, rtx));
static rtx ix86_expand_unop_builtin PARAMS ((enum insn_code, tree, rtx, int));
static rtx ix86_expand_binop_builtin PARAMS ((enum insn_code, tree, rtx));
-static rtx ix86_expand_timode_binop_builtin PARAMS ((enum insn_code,
-                                                     tree, rtx));
static rtx ix86_expand_store_builtin PARAMS ((enum insn_code, tree));
static rtx safe_vector_operand PARAMS ((rtx, enum machine_mode));
static enum rtx_code ix86_fp_compare_code_to_integer PARAMS ((enum rtx_code));
@@ -11811,6 +11809,11 @@ static const struct builtin_description bdesc_2arg[] =
{ MASK_SSE1, CODE_FOR_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, 0, 0 },
{ MASK_SSE1, CODE_FOR_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, 0, 0 },
+  { MASK_SSE1, CODE_FOR_sse_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, 0, 0 },
+  { MASK_SSE1, CODE_FOR_sse_nandv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, 0, 0 },
+  { MASK_SSE1, CODE_FOR_sse_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, 0, 0 },
+  { MASK_SSE1, CODE_FOR_sse_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, 0, 0 },
{ MASK_SSE1, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, 0, 0 },
{ MASK_SSE1, CODE_FOR_sse_movhlps, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, 0, 0 },
{ MASK_SSE1, CODE_FOR_sse_movlhps, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, 0, 0 },
@@ -11935,10 +11938,10 @@ static const struct builtin_description bdesc_2arg[] =
{ MASK_SSE2, CODE_FOR_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, 0, 0 },
{ MASK_SSE2, CODE_FOR_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, 0, 0 },
-  { MASK_SSE2, CODE_FOR_sse2_anddf3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, 0, 0 },
-  { MASK_SSE2, CODE_FOR_sse2_nanddf3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, 0, 0 },
-  { MASK_SSE2, CODE_FOR_sse2_iordf3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, 0, 0 },
-  { MASK_SSE2, CODE_FOR_sse2_xordf3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, 0, 0 },
+  { MASK_SSE2, CODE_FOR_sse2_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, 0, 0 },
+  { MASK_SSE2, CODE_FOR_sse2_nandv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, 0, 0 },
+  { MASK_SSE2, CODE_FOR_sse2_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, 0, 0 },
+  { MASK_SSE2, CODE_FOR_sse2_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, 0, 0 },
{ MASK_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, 0, 0 },
{ MASK_SSE2, CODE_FOR_sse2_unpckhpd, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, 0, 0 },
@@ -12443,11 +12446,6 @@ ix86_init_mmx_sse_builtins ()
def_builtin (MASK_SSE1, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI);
def_builtin (MASK_SSE1, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI);
-  def_builtin (MASK_SSE1, "__builtin_ia32_andps", v4sf_ftype_v4sf_v4sf, IX86_BUILTIN_ANDPS);
-  def_builtin (MASK_SSE1, "__builtin_ia32_andnps", v4sf_ftype_v4sf_v4sf, IX86_BUILTIN_ANDNPS);
-  def_builtin (MASK_SSE1, "__builtin_ia32_orps", v4sf_ftype_v4sf_v4sf, IX86_BUILTIN_ORPS);
-  def_builtin (MASK_SSE1, "__builtin_ia32_xorps", v4sf_ftype_v4sf_v4sf, IX86_BUILTIN_XORPS);
def_builtin (MASK_SSE1 | MASK_3DNOW_A, "__builtin_ia32_pextrw", int_ftype_v4hi_int, IX86_BUILTIN_PEXTRW);
def_builtin (MASK_SSE1 | MASK_3DNOW_A, "__builtin_ia32_pinsrw", v4hi_ftype_v4hi_int_int, IX86_BUILTIN_PINSRW);
@@ -12680,45 +12678,6 @@ ix86_expand_binop_builtin (icode, arglist, target)
return target;
}
-/* In type_for_mode we restrict the ability to create TImode types
-   to hosts with 64-bit H_W_I.  So we've defined the SSE logicals
-   to have a V4SFmode signature.  Convert them in-place to TImode.  */
-
-static rtx
-ix86_expand_timode_binop_builtin (icode, arglist, target)
-     enum insn_code icode;
-     tree arglist;
-     rtx target;
-{
-  rtx pat;
-  tree arg0 = TREE_VALUE (arglist);
-  tree arg1 = TREE_VALUE (TREE_CHAIN (arglist));
-  rtx op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
-  rtx op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0);
-
-  op0 = gen_lowpart (TImode, op0);
-  op1 = gen_lowpart (TImode, op1);
-  target = gen_reg_rtx (TImode);
-
-  if (! (*insn_data[icode].operand[1].predicate) (op0, TImode))
-    op0 = copy_to_mode_reg (TImode, op0);
-  if (! (*insn_data[icode].operand[2].predicate) (op1, TImode))
-    op1 = copy_to_mode_reg (TImode, op1);
-
-  /* In the commutative cases, both op0 and op1 are nonimmediate_operand,
-     yet one of the two must not be a memory.  This is normally enforced
-     by expanders, but we didn't bother to create one here.  */
-  if (GET_CODE (op0) == MEM && GET_CODE (op1) == MEM)
-    op0 = copy_to_mode_reg (TImode, op0);
-
-  pat = GEN_FCN (icode) (target, op0, op1);
-  if (! pat)
-    return 0;
-  emit_insn (pat);
-
-  return gen_lowpart (V4SFmode, target);
-}
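With the bdesc_2arg entries above in place, this deleted TImode shim is unnecessary: each SSE logical builtin is now an ordinary two-operand entry in its natural vector mode, expanded by the generic ix86_expand_binop_builtin. For example, the ANDPS entry added above reads:

{ MASK_SSE1, CODE_FOR_sse_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, 0, 0 },

so no gen_lowpart round-trip through TImode is needed.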
/* Subroutine of ix86_expand_builtin to take care of stores. */
static rtx
@@ -13064,19 +13023,6 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore)
case IX86_BUILTIN_RCPSS:
return ix86_expand_unop1_builtin (CODE_FOR_vmrcpv4sf2, arglist, target);
-    case IX86_BUILTIN_ANDPS:
-      return ix86_expand_timode_binop_builtin (CODE_FOR_sse_andti3,
-                                               arglist, target);
-    case IX86_BUILTIN_ANDNPS:
-      return ix86_expand_timode_binop_builtin (CODE_FOR_sse_nandti3,
-                                               arglist, target);
-    case IX86_BUILTIN_ORPS:
-      return ix86_expand_timode_binop_builtin (CODE_FOR_sse_iorti3,
-                                               arglist, target);
-    case IX86_BUILTIN_XORPS:
-      return ix86_expand_timode_binop_builtin (CODE_FOR_sse_xorti3,
-                                               arglist, target);
case IX86_BUILTIN_LOADAPS:
return ix86_expand_unop_builtin (CODE_FOR_sse_movaps, arglist, target, 1);
@@ -13553,6 +13499,8 @@ ix86_preferred_reload_class (x, class)
rtx x;
enum reg_class class;
{
+  if (GET_CODE (x) == CONST_VECTOR && x != CONST0_RTX (GET_MODE (x)))
+    return NO_REGS;
if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
{
/* SSE can't load any constant directly yet. */
......
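A sketch of what the CONST_VECTOR check buys (illustrative example, not from the patch): returning NO_REGS for a non-zero vector constant makes reload spill it to the constant pool and load it as a vector, rather than trying to build it in registers.

#include <xmmintrin.h>

/* The sign-bit mask below is a non-zero CONST_VECTOR; with the change
   above it is emitted into the constant pool and loaded with movaps
   instead of being materialized through integer registers.  */
__m128
negate (__m128 x)
{
  return _mm_xor_ps (x, _mm_set1_ps (-0.0f));
}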
@@ -1066,7 +1066,7 @@ typedef int __v4si __attribute__ ((mode (V4SI)));
typedef int __v8hi __attribute__ ((mode (V8HI)));
typedef int __v16qi __attribute__ ((mode (V16QI)));
-#define __m128i __m128
+#define __m128i __v2di
#define __m128d __v2df
static __inline __m128d
......
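The visible effect of the __m128i change (checked by i386-ssetype-5.c below): integer intrinsics now carry V2DImode, so the compiler keeps __m128i values in the integer SSE domain and emits movdqa/pand rather than routing them through the float patterns. An illustrative example, assuming the 2002-era xmmintrin.h that still declared the SSE2 intrinsics:

#include <xmmintrin.h>

/* With __m128i defined as __v2di, this expands through the V2DI
   logical pattern and should assemble to pand plus movdqa moves.  */
__m128i
mask_and (__m128i a, __m128i b)
{
  return _mm_and_si128 (a, b);
}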
@@ -6532,7 +6532,7 @@ expand_expr (exp, target, tmode, modifier)
}
case PARM_DECL:
-      if (DECL_RTL (exp) == 0)
+      if (!DECL_RTL_SET_P (exp))
{
error_with_decl (exp, "prior parameter's size depends on `%s'");
return CONST0_RTX (mode);
@@ -10942,6 +10942,9 @@ do_tablejump (index, mode, range, table_label, default_label)
{
rtx temp, vector;
+  if (range > cfun->max_jumptable_ents)
+    cfun->max_jumptable_ents = range;
/* Do an unsigned comparison (in the proper mode) between the index
expression and the value which represents the length of the range.
Since we just finished subtracting the lower bound of the range
......
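The recorded cfun->max_jumptable_ents is what lets the predictor bail out early. The predict.c hunk is not shown in this view; a hedged reconstruction of the idea behind can_predict_insn_p (the name is from the ChangeLog, the body is illustrative, not the committed code):

/* Illustrative reconstruction: only ordinary conditional jumps are
   worth predicting, so computed jumps and huge tablejumps are skipped
   and estimate_probability does no per-edge work on them.  */
static bool
can_predict_insn_p (insn)
     rtx insn;
{
  return (GET_CODE (insn) == JUMP_INSN
          && any_condjump_p (insn)
          && BLOCK_FOR_INSN (insn)->succ->succ_next != NULL);
}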
@@ -997,7 +997,7 @@ compute_alignments ()
align it. It is most likely the first block of a loop.
if (has_fallthru
&& branch_frequency + fallthru_frequency > BB_FREQ_MAX / 10
-      && branch_frequency > fallthru_frequency * 5)
+      && branch_frequency > fallthru_frequency * 2)
{
log = LOOP_ALIGN (label);
if (max_log < log)
......
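A quick numeric reading of the relaxed heuristic above, with illustrative frequencies and BB_FREQ_MAX = 10000: a label reached with branch_frequency = 2100 and fallthru_frequency = 1000 now passes both tests (2100 + 1000 > 1000, and 2100 > 2 * 1000), so it gets loop alignment; under the old * 5 factor it would have needed branch_frequency above 5000.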
Mon Oct 14 20:37:51 CEST 2002 Jan Hubicka <jh@suse.cz>
* gcc.dg/i386-ssetype-[1-5].c: New tests.
2002-10-14 Richard Henderson <rth@redhat.com>
* gcc.dg/20020219-1.c: Disable for 16-bit targets.
......
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -msse2 -march=athlon" } */
/* { dg-final { scan-assembler "andpd.*\[bs\]p" } } */
/* { dg-final { scan-assembler "andnpd.*\[bs\]p" } } */
/* { dg-final { scan-assembler "xorpd.*\[bs\]p" } } */
/* { dg-final { scan-assembler "iorpd.*\[bs\]p" } } */
/* { dg-final { scan-assembler-not "movdqa" } } */
/* { dg-final { scan-assembler "movapd.*\[bs\]p" } } */
/* Verify that we generate proper instruction with memory operand. */
#include <xmmintrin.h>
__m128d
t1(__m128d a, __m128d b)
{
return _mm_and_pd (a,b);
}
__m128d
t2(__m128d a, __m128d b)
{
return _mm_andnot_pd (a,b);
}
__m128d
t3(__m128d a, __m128d b)
{
return _mm_or_pd (a,b);
}
__m128d
t4(__m128d a, __m128d b)
{
return _mm_xor_pd (a,b);
}
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -msse2 -march=athlon" } */
/* { dg-final { scan-assembler "andpd" } } */
/* { dg-final { scan-assembler "andnpd" } } */
/* { dg-final { scan-assembler "xorpd" } } */
/* { dg-final { scan-assembler "iorpd" } } */
/* { dg-final { scan-assembler-not "movdqa" } } */
/* { dg-final { scan-assembler "movapd" } } */
/* Verify that we generate proper instruction without memory operand. */
#include <xmmintrin.h>
__m128d
t1(__m128d a, __m128d b)
{
a=_mm_sqrt_pd(a);
b=_mm_sqrt_pd(b);
return _mm_and_pd (a,b);
}
__m128d
t2(__m128d a, __m128d b)
{
a=_mm_sqrt_pd(a);
b=_mm_sqrt_pd(b);
return _mm_andnot_pd (a,b);
}
__m128d
t3(__m128d a, __m128d b)
{
a=_mm_sqrt_pd(a);
b=_mm_sqrt_pd(b);
return _mm_or_pd (a,b);
}
__m128d
t4(__m128d a, __m128d b)
{
a=_mm_sqrt_pd(a);
b=_mm_sqrt_pd(b);
return _mm_xor_pd (a,b);
}
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -msse2 -march=athlon" } */
/* { dg-final { scan-assembler "andps.*\[bs]p" } } */
/* { dg-final { scan-assembler "andnps.*\[bs]p" } } */
/* { dg-final { scan-assembler "xorps.*\[bs]p" } } */
/* { dg-final { scan-assembler "orps.\[b*s]p" } } */
/* { dg-final { scan-assembler-not "movdqa" } } */
/* { dg-final { scan-assembler "movaps.*\[bs]p" } } */
/* Verify that we generate proper instruction with memory operand. */
#include <xmmintrin.h>
__m128
t1(__m128 a, __m128 b)
{
return _mm_and_ps (a,b);
}
__m128
t2(__m128 a, __m128 b)
{
return _mm_andnot_ps (a,b);
}
__m128
t3(__m128 a, __m128 b)
{
return _mm_or_ps (a,b);
}
__m128
t4(__m128 a, __m128 b)
{
return _mm_xor_ps (a,b);
}
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -msse2 -march=athlon" } */
/* { dg-final { scan-assembler "andps" } } */
/* { dg-final { scan-assembler "andnps" } } */
/* { dg-final { scan-assembler "xorps" } } */
/* { dg-final { scan-assembler "orps" } } */
/* Verify that we generate proper instruction without memory operand. */
#include <xmmintrin.h>
__m128
t1(__m128 a, __m128 b)
{
a=_mm_sqrt_ps(a);
b=_mm_sqrt_ps(b);
return _mm_and_ps (a,b);
}
__m128
t2(__m128 a, __m128 b)
{
a=_mm_sqrt_ps(a);
b=_mm_sqrt_ps(b);
return _mm_andnot_ps (a,b);
}
__m128
t3(__m128 a, __m128 b)
{
a=_mm_sqrt_ps(a);
b=_mm_sqrt_ps(b);
return _mm_or_ps (a,b);
}
__m128
t4(__m128 a, __m128 b)
{
a=_mm_sqrt_ps(a);
b=_mm_sqrt_ps(b);
return _mm_xor_ps (a,b);
}
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -msse2 -march=athlon" } */
/* { dg-final { scan-assembler "pand.*\[bs\]p" } } */
/* { dg-final { scan-assembler "pandn.*\[bs\]p" } } */
/* { dg-final { scan-assembler "pxor.*\[bs\]p" } } */
/* { dg-final { scan-assembler "por.*\[bs\]p" } } */
/* { dg-final { scan-assembler "movdqa" } } */
/* { dg-final { scan-assembler-not "movaps.*\[bs\]p" } } */
/* Verify that we generate proper instruction with memory operand. */
#include <xmmintrin.h>
__m128i
t1(__m128i a, __m128i b)
{
return _mm_and_si128 (a,b);
}
__m128i
t2(__m128i a, __m128i b)
{
return _mm_andnot_si128 (a,b);
}
__m128i
t3(__m128i a, __m128i b)
{
return _mm_or_si128 (a,b);
}
__m128i
t4(__m128i a, __m128i b)
{
return _mm_xor_si128 (a,b);
}