Commit 4845dbb5 by Jan Hubicka, committed by Jan Hubicka

i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.

	* i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.
	(TARGET_USE_VECTOR_CONVERTS): New.
	* i386.md: New post-reload splitters for converting SF to DF and DF to
	SF.
	(floatsi* expander): Special case vector conversions.
	(floatsisf2_mixed_vector, floatsisf2_sse_vector_nointernunit,
	floatsisf2_sse_vector_internunit, floatsisf2_sse_vector,
	floatsidf2_mixed_vector, floatsidf2_sse_vector): New.
	(floatsisf2_mixed, floatsisf2_sse, floatsidf2_mixed, floatsidf2_sse):
	Disable when doing vector converts.
	(floatsi<mode>2_i387): Disable when doing SSE math.
	* sse.md (vec_dupv2df): Export.
	* i386.c (ix86_tune_features): Enable SSE conversions.

Co-Authored-By: Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>

From-SVN: r128301
parent e9f91f62
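
For orientation, here is the effect of the new tuning at the source level. A minimal C sketch; the function name and the exact registers are illustrative assumptions, not taken from the patch:

    /* Compile for x86-64 with -O2 -mtune=amdfam10 (illustrative).  */
    double int_to_double (int i)
    {
      /* Default tuning emits a scalar convert:
             cvtsi2sd %edi, %xmm0     ; merges into the old xmm0 value
         With X86_USE_VECTOR_CONVERTS (AMD Family 10h), the patterns
         added below prefer:
             movd     %edi, %xmm0     ; zeroes bits 32..127 of xmm0
             cvtdq2pd %xmm0, %xmm0    ; packed convert; low lane is used
         avoiding the dependency of cvtsi2sd on the destination's old
         contents; the amdfam10_decode attributes below rate the packed
         form "double" where scalar cvtsi2sd from a register is
         "vector".  */
      return i;
    }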
2007-09-09 Jan Hubicka <jh@suse.cz>
Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>
* i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.
(TARGET_USE_VECTOR_CONVERTS): New.
* i386.md: New post-reload splitters for converting SF to DF and DF to
SF.
(floatsi* expander): Special case vector conversions.
(floatsisf2_mixed_vector, floatsisf2_sse_vector_nointernunit,
floatsisf2_sse_vector_internunit, floatsisf2_sse_vector,
floatsidf2_mixed_vector, floatsidf2_sse_vector): New.
(floatsisf2_mixed, floatsisf2_sse, floatsidf2_mixed, floatsidf2_sse):
Disable when doing vector converts.
(floatsi<mode>2_i387): Disable when doing SSE math.
* sse.md (vec_dupv2df): Export.
* i386.c (ix86_tune_features): Enable SSE conversions.
2007-09-09 Richard Guenther <rguenther@suse.de>
* tree-ssa-operands.c (add_virtual_operand): Only mark
@@ -1258,6 +1258,10 @@ unsigned int ix86_tune_features[X86_TUNE_LAST] = {
operand that cannot be represented using a modRM byte. The XOR
replacement is long decoded, so this split helps here as well. */
m_K6,
+  /* X86_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversions
+     from integer to FP.  */
+  m_AMDFAM10,
};
/* Feature tests against the various architecture variations. */
......
@@ -257,6 +257,7 @@ enum ix86_tune_indices {
X86_TUNE_MOVE_M1_VIA_OR,
X86_TUNE_NOT_UNPAIRABLE,
X86_TUNE_NOT_VECTORMODE,
+  X86_USE_VECTOR_CONVERTS,
X86_TUNE_LAST
};
@@ -337,6 +338,7 @@ extern unsigned int ix86_tune_features[X86_TUNE_LAST];
#define TARGET_MOVE_M1_VIA_OR ix86_tune_features[X86_TUNE_MOVE_M1_VIA_OR]
#define TARGET_NOT_UNPAIRABLE ix86_tune_features[X86_TUNE_NOT_UNPAIRABLE]
#define TARGET_NOT_VECTORMODE ix86_tune_features[X86_TUNE_NOT_VECTORMODE]
+#define TARGET_USE_VECTOR_CONVERTS ix86_tune_features[X86_USE_VECTOR_CONVERTS]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
......
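
The i386.c and i386.h hunks above follow the standard ix86 tuning-flag pattern: an enum index, a matching entry in the ix86_tune_features table, and a TARGET_* accessor macro. A stripped-down, self-contained C sketch of that mechanism, with simplified names (an assumption-laden model, not the actual GCC code):

    #include <stdio.h>

    enum tune_index { TUNE_USE_VECTOR_CONVERTS, TUNE_LAST };

    /* One mask per flag, recording which CPUs enable it;
       here only AMDFAM10 and Core 2 bits are modeled.  */
    #define M_AMDFAM10 (1u << 0)
    #define M_CORE2    (1u << 1)

    static const unsigned tune_features[TUNE_LAST] = {
      /* TUNE_USE_VECTOR_CONVERTS */ M_AMDFAM10,
    };

    #define TARGET_USE_VECTOR_CONVERTS(cpu) \
      ((tune_features[TUNE_USE_VECTOR_CONVERTS] & (cpu)) != 0)

    int main (void)
    {
      printf ("%d %d\n",
              TARGET_USE_VECTOR_CONVERTS (M_AMDFAM10),   /* 1 */
              TARGET_USE_VECTOR_CONVERTS (M_CORE2));     /* 0 */
      return 0;
    }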
@@ -3916,6 +3916,49 @@
}
})
+/* For converting SF(xmm2) to DF(xmm1), use the following code instead of
+   cvtss2sd:
+      unpcklps xmm2,xmm2  ; packed conversion could otherwise fault on a
+                          ; signaling NaN left in the upper elements
+      cvtps2pd xmm2,xmm1
+   We do the conversion post reload to avoid producing 128-bit spills,
+   which might lead to an ICE on a 32-bit target.  The sequence is
+   unlikely to be combined anyway.  */
+(define_split
+  [(set (match_operand:DF 0 "register_operand" "")
+        (float_extend:DF
+          (match_operand:SF 1 "nonimmediate_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && reload_completed && SSE_REG_P (operands[0])"
+  [(set (match_dup 2)
+        (float_extend:V2DF
+          (vec_select:V2SF
+            (match_dup 3)
+            (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[2] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+  operands[3] = simplify_gen_subreg (V4SFmode, operands[0], DFmode, 0);
+  /* Use movss for loading from memory, unpcklps reg, reg for registers.
+     Try to avoid the move when the unpack can be done in the source.  */
+  if (REG_P (operands[1]))
+    {
+      /* If it is unsafe to overwrite the upper half of the source, move
+         the value to the destination and unpack there.  */
+      if ((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+           || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
+          && true_regnum (operands[0]) != true_regnum (operands[1]))
+        {
+          rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+          emit_move_insn (tmp, operands[1]);
+        }
+      else
+        operands[3] = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0);
+      emit_insn (gen_sse_unpcklps (operands[3], operands[3], operands[3]));
+    }
+  else
+    emit_insn (gen_vec_setv4sf_0 (operands[3],
+                                  CONST0_RTX (V4SFmode), operands[1]));
+})
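
The split above encodes the unpcklps/cvtps2pd trick in RTL. A rough intrinsics analogue of the computation it produces (a sketch assuming SSE2; GCC emits the instructions directly and never calls intrinsics):

    #include <emmintrin.h>

    /* Widen the low float of X to double via the packed path.  */
    static inline double widen_low_float (__m128 x)
    {
      /* unpcklps x,x duplicates the low elements, so the packed
         cvtps2pd only ever converts copies of the live value and can
         never fault on a signaling NaN in an uninitialized lane.  */
      __m128d d = _mm_cvtps_pd (_mm_unpacklo_ps (x, x));
      return _mm_cvtsd_f64 (d);   /* the low lane holds the result */
    }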
(define_insn "*extendsfdf2_mixed"
[(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
(float_extend:DF
@@ -4009,6 +4052,51 @@
}
})
+/* For converting DF(xmm2) to SF(xmm1), use the following code instead of
+   cvtsd2ss:
+      unpcklpd xmm2,xmm2  ; packed conversion could otherwise fault on a
+                          ; signaling NaN left in the upper element
+      cvtpd2ps xmm2,xmm1
+   We do the conversion post reload to avoid producing 128-bit spills,
+   which might lead to an ICE on a 32-bit target.  The sequence is
+   unlikely to be combined anyway.  */
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+        (float_truncate:SF
+          (match_operand:DF 1 "nonimmediate_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && reload_completed && SSE_REG_P (operands[0])"
+  [(set (match_dup 2)
+        (vec_concat:V4SF
+          (float_truncate:V2SF
+            (match_dup 4))
+          (match_dup 3)))]
+{
+  operands[2] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+  operands[3] = CONST0_RTX (V2SFmode);
+  operands[4] = simplify_gen_subreg (V2DFmode, operands[0], SFmode, 0);
+  /* Use movsd for loading from memory, unpcklpd for registers.
+     Try to avoid the move when the unpack can be done in the source,
+     or when SSE3 movddup is available.  */
+  if (REG_P (operands[1]))
+    {
+      if (!TARGET_SSE3
+          && true_regnum (operands[0]) != true_regnum (operands[1])
+          && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+              || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+        {
+          rtx tmp = simplify_gen_subreg (DFmode, operands[0], SFmode, 0);
+          emit_move_insn (tmp, operands[1]);
+          operands[1] = tmp;
+        }
+      else if (!TARGET_SSE3)
+        operands[4] = simplify_gen_subreg (V2DFmode, operands[1], DFmode, 0);
+      emit_insn (gen_vec_dupv2df (operands[4], operands[1]));
+    }
+  else
+    emit_insn (gen_sse2_loadlpd (operands[4],
+                                 CONST0_RTX (V2DFmode), operands[1]));
+})
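
Its DF-to-SF counterpart, again as a hedged intrinsics sketch (assuming SSE2; with SSE3 the duplication would be a single movddup):

    #include <emmintrin.h>

    /* Narrow the low double of X to float via the packed path.  */
    static inline float narrow_low_double (__m128d x)
    {
      /* unpcklpd x,x duplicates the low double before the packed
         cvtpd2ps, so no junk upper lane is converted; cvtpd2ps zeroes
         the two upper result floats, matching the (vec_concat ...)
         with a zero vector in the split above.  */
      __m128 f = _mm_cvtpd_ps (_mm_unpacklo_pd (x, x));
      return _mm_cvtss_f32 (f);
    }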
(define_expand "truncdfsf2_with_temp"
[(parallel [(set (match_operand:SF 0 "" "")
(float_truncate:SF (match_operand:DF 1 "" "")))
@@ -4685,12 +4773,67 @@
[(set (match_operand:MODEF 0 "register_operand" "")
(float:MODEF (match_operand:SI 1 "nonimmediate_operand" "")))]
"TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
"")
"
/* When we use vector converts, we can't have input in memory. */
if (GET_MODE (operands[0]) == DFmode && GET_MODE (operands[1]) == SImode
&& TARGET_USE_VECTOR_CONVERTS && !optimize_size && TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (DFmode))
operands[1] = force_reg (SImode, operands[1]);
if (GET_MODE (operands[0]) == SFmode && GET_MODE (operands[1]) == SImode
&& !optimize_size && TARGET_USE_VECTOR_CONVERTS && TARGET_SSE_MATH
&& SSE_FLOAT_MODE_P (SFmode))
{
/* When !flag_trapping_math, we handle SImode->SFmode vector
conversions same way as SImode->DFmode.
For flat_trapping_math we can't safely use vector conversion without
clearing upper half, otherwise precision exception might occur.
However we can still generate the common sequence converting value
from general register to XMM register as:
mov reg32, mem32
movd mem32, xmm
cvtdq2pd xmm,xmm
because we know that movd clears the upper half.
Sadly in this case we can't rely on reload moving the value to XMM
register, since we need to know if upper half is OK, so we need
to do reloading by hand. We force operand to memory unless target
supports inter unit moves. */
if (!flag_trapping_math)
operands[1] = force_reg (SImode, operands[1]);
else if (!MEM_P (operands[1]))
{
rtx tmp = assign_386_stack_local (SImode, SLOT_VIRTUAL);
emit_move_insn (tmp, operands[1]);
operands[1] = tmp;
}
}
")
(define_insn "*floatsisf2_mixed_vector"
[(set (match_operand:SF 0 "register_operand" "=x,f,?f")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
"TARGET_MIX_SSE_I387 && !flag_trapping_math
&& TARGET_USE_VECTOR_CONVERTS && !optimize_size"
"@
cvtpq2ps\t{%1, %0|%0, %1}
fild%z1\t%1
#"
[(set_attr "type" "sseicvt,fmov,multi")
(set_attr "mode" "SF")
(set_attr "unit" "*,i387,*")
(set_attr "athlon_decode" "double,*,*")
(set_attr "amdfam10_decode" "double,*,*")
(set_attr "fp_int_src" "false,true,true")])
(define_insn "*floatsisf2_mixed"
[(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
"TARGET_MIX_SSE_I387"
"TARGET_MIX_SSE_I387
&& (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
"@
fild%z1\t%1
#
@@ -4703,10 +4846,68 @@
(set_attr "amdfam10_decode" "*,*,vector,double")
(set_attr "fp_int_src" "true")])
(define_insn "*floatsisf2_sse_vector_nointernunit"
[(set (match_operand:SF 0 "register_operand" "=x")
(float:SF (match_operand:SI 1 "memory_operand" "m")))]
"flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
&& !TARGET_INTER_UNIT_MOVES"
"#"
[(set_attr "type" "multi")])
(define_insn "*floatsisf2_sse_vector_internunit"
[(set (match_operand:SF 0 "register_operand" "=x,x")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "rm,x")))]
"flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
&& TARGET_INTER_UNIT_MOVES"
"#"
[(set_attr "type" "multi")])
(define_split
[(set (match_operand:SF 0 "register_operand" "")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "")))]
"flag_trapping_math
&& TARGET_USE_VECTOR_CONVERTS && reload_completed
&& (TARGET_INTER_UNIT_MOVES || MEM_P (operands[1]))
&& !SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
[(set (match_dup 0)
(float:V4SF (match_dup 2)))]
{
operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
})
(define_split
[(set (match_operand:SF 0 "register_operand" "")
(float:SF (match_operand:SI 1 "register_operand" "")))]
"flag_trapping_math
&& TARGET_USE_VECTOR_CONVERTS && reload_completed
&& SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
[(set (match_dup 2) (vec_duplicate:V4SI (match_dup 1)))
(set (match_dup 0)
(float:V4SF (match_dup 2)))]
{
operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
})
(define_insn "*floatsisf2_sse_vector"
[(set (match_operand:SF 0 "register_operand" "=x")
(float:SF (match_operand:SI 1 "register_operand" "x")))]
"!flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
&& !TARGET_INTER_UNIT_MOVES"
"cvtpq2ps\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
(set_attr "mode" "SF")
(set_attr "athlon_decode" "double")
(set_attr "amdfam10_decode" "double")
(set_attr "fp_int_src" "true")])
(define_insn "*floatsisf2_sse"
[(set (match_operand:SF 0 "register_operand" "=x,x")
(float:SF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
"TARGET_SSE_MATH"
"TARGET_SSE_MATH
&& (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
"cvtsi2ss\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
(set_attr "mode" "SF")
@@ -4714,38 +4915,89 @@
(set_attr "amdfam10_decode" "vector,double")
(set_attr "fp_int_src" "true")])
(define_insn "*floatsidf2_mixed_vector"
[(set (match_operand:DF 0 "register_operand" "=x,f,f")
(float:DF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
"TARGET_SSE2 && TARGET_MIX_SSE_I387
&& TARGET_USE_VECTOR_CONVERTS && !optimize_size"
"@
cvtdq2pd\t{%1, %0|%0, %1}
fild%z1\t%1
#"
[(set_attr "type" "sseicvt,fmov,multi")
(set_attr "mode" "V2DF,DF,DF")
(set_attr "unit" "*,*,i387")
(set_attr "athlon_decode" "double,*,*")
(set_attr "amdfam10_decode" "double,*,*")
(set_attr "fp_int_src" "false,true,true")])
(define_insn "*floatsidf2_mixed"
-  [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x")
-        (float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
-  "TARGET_SSE2 && TARGET_MIX_SSE_I387"
+  [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x,!x")
+        (float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m,x")))]
+  "TARGET_SSE2 && TARGET_MIX_SSE_I387
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
   "@
    fild%z1\t%1
    #
    cvtsi2sd\t{%1, %0|%0, %1}
-   cvtsi2sd\t{%1, %0|%0, %1}"
-  [(set_attr "type" "fmov,multi,sseicvt,sseicvt")
-   (set_attr "mode" "DF")
-   (set_attr "unit" "*,i387,*,*")
-   (set_attr "athlon_decode" "*,*,double,direct")
-   (set_attr "amdfam10_decode" "*,*,vector,double")
+   cvtsi2sd\t{%1, %0|%0, %1}
+   cvtdq2pd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "fmov,multi,sseicvt,sseicvt,sseicvt")
+   (set_attr "mode" "DF,DF,DF,DF,V2DF")
+   (set_attr "unit" "*,i387,*,*,*")
+   (set_attr "athlon_decode" "*,*,double,direct,double")
+   (set_attr "amdfam10_decode" "*,*,vector,double,double")
+   (set_attr "fp_int_src" "true,true,true,true,false")])
(define_insn "*floatsidf2_sse_vector"
[(set (match_operand:DF 0 "register_operand" "=x")
(float:DF (match_operand:SI 1 "register_operand" "x")))]
"TARGET_SSE2 && TARGET_SSE_MATH
&& TARGET_USE_VECTOR_CONVERTS && !optimize_size"
"cvtdq2pd\t{%1, %0|%0, %1}"
[(set_attr "type" "sseicvt")
(set_attr "mode" "V2DF")
(set_attr "athlon_decode" "double")
(set_attr "amdfam10_decode" "double")
(set_attr "fp_int_src" "true")])
(define_split
[(set (match_operand:DF 0 "register_operand" "")
(float:DF (match_operand:SI 1 "memory_operand" "")))]
"TARGET_USE_VECTOR_CONVERTS && reload_completed
&& SSE_REG_P (operands[0])"
[(set (match_dup 0)
(float:V2DF
(vec_select:V2SI
(match_dup 2)
(parallel [(const_int 0) (const_int 1)]))))]
{
operands[2] = simplify_gen_subreg (V4SImode, operands[0], DFmode, 0);
operands[0] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
})
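
The memory-input split above (an sse2_loadld followed by a packed cvtdq2pd) corresponds roughly to this sketch (hypothetical helper, assuming SSE2):

    #include <emmintrin.h>

    static inline double int_mem_to_double (const int *p)
    {
      /* The movd load zeroes the upper lanes; cvtdq2pd then converts
         the two low dwords {*p, 0}, and only the low double is used.  */
      __m128d v = _mm_cvtepi32_pd (_mm_cvtsi32_si128 (*p));
      return _mm_cvtsd_f64 (v);
    }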
(define_insn "*floatsidf2_sse"
-  [(set (match_operand:DF 0 "register_operand" "=x,x")
-        (float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH"
-  "cvtsi2sd\t{%1, %0|%0, %1}"
+  [(set (match_operand:DF 0 "register_operand" "=x,x,!x")
+        (float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m,x")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
+  "@
+   cvtsi2sd\t{%1, %0|%0, %1}
+   cvtsi2sd\t{%1, %0|%0, %1}
+   cvtdq2pd\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
-   (set_attr "mode" "DF")
-   (set_attr "athlon_decode" "double,direct")
-   (set_attr "amdfam10_decode" "vector,double")
+   (set_attr "mode" "DF,DF,V2DF")
+   (set_attr "athlon_decode" "double,direct,double")
+   (set_attr "amdfam10_decode" "vector,double,double")
    (set_attr "fp_int_src" "true")])
(define_insn "*floatsi<mode>2_i387"
[(set (match_operand:MODEF 0 "register_operand" "=f,f")
(float:MODEF
(match_operand:SI 1 "nonimmediate_operand" "m,?r")))]
"TARGET_80387"
"TARGET_80387
&& (!TARGET_SSE_MATH || !SSE_FLOAT_MODE_P (GET_MODE (operands[0])))"
"@
fild%z1\t%1
#"
......
@@ -2740,7 +2740,7 @@
[(set_attr "type" "sselog1")
(set_attr "mode" "DF")])
(define_insn "*vec_dupv2df"
(define_insn "vec_dupv2df"
[(set (match_operand:V2DF 0 "register_operand" "=x")
(vec_duplicate:V2DF
(match_operand:DF 1 "register_operand" "0")))]
......