Commit fdab73dc by Uros Bizjak

re PR target/70873 ([7 Regression] 20% performance regression at 482.sphinx3…

re PR target/70873 ([7 Regression] 20% performance regression at 482.sphinx3 after r235442 with -O2 -m32 on Haswell.)

	PR target/70873
	* config/i386/i386.md
	(TARGET_SSE_PARTIAL_REG_DEPENDENCY float_extend sf->df peephole2):
	Change to post-epilogue_completed late splitter.  Use sse_reg_operand
	as operand 0 predicate.
	(TARGET_SSE_PARTIAL_REG_DEPENDENCY float_truncate df->sf peephole2):
	Ditto.
	(TARGET_SSE_PARTIAL_REG_DEPENDENCY float {si,di}->{sf,df} peephole2):
	Ditto.  Emit the pattern using RTX.

	(TARGET_USE_VECTOR_FP_CONVERTS float_extend sf->df splitter):
	Use sse_reg_operand as operand 0 predicate.  Do not use true_regnum in
	the post-reload splitter.  Use lowpart_subreg instead of gen_rtx_REG.
	(TARGET_USE_VECTOR_FP_CONVERTS float_truncate df->sf splitter):
	Ditto.
	(TARGET_USE_VECTOR_CONVERTS float si->{sf,df} splitter): Use
	sse_reg_operand as operand 0 predicate.

	(TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_extend sf->df peephole2):
	Use sse_reg_operand as operand 0 predicate.  Use lowpart_subreg
	instead of gen_rtx_REG.
	(TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_truncate df->sf peephole2):
	Ditto.

From-SVN: r235906
parent d07d2177
2016-05-04 Uros Bizjak <ubizjak@gmail.com>
PR target/70873
* config/i386/i386.md
(TARGET_SSE_PARTIAL_REG_DEPENDENCY float_extend sf->df peephole2):
Change to post-epilogue_completed late splitter. Use sse_reg_operand
as operand 0 predicate.
(TARGET_SSE_PARTIAL_REG_DEPENDENCY float_truncate df->sf peephole2):
Ditto.
(TARGET_SSE_PARTIAL_REG_DEPENDENCY float {si,di}->{sf,df} peephole2):
Ditto. Emit the pattern using RTX.
(TARGET_USE_VECTOR_FP_CONVERTS float_extend sf->df splitter):
Use sse_reg_operand as operand 0 predicate. Do not use true_regnum in
the post-reload splitter. Use lowpart_subreg instead of gen_rtx_REG.
(TARGET_USE_VECTOR_FP_CONVERTS float_truncate df->sf splitter):
Ditto.
(TARGET_USE_VECTOR_CONVERTS float si->{sf,df} splitter): Use
sse_reg_operand as operand 0 predicate.
(TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_extend sf->df peephole2):
Use sse_reg_operand as operand 0 predicate. Use lowpart_subreg
instead of gen_rtx_REG.
(TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_truncate df->sf peephole2):
Ditto.
2016-05-04 Segher Boessenkool <segher@kernel.crashing.org>
* function.c (emit_use_return_register_into_block): Delete.
......@@ -94,8 +120,7 @@
* match.pd: Add BIT_FIELD_REF canonicalizations and vector
constructor simplifications.
* fold-const.c (fold_ternary_loc): Remove duplicate functionality
here.
* fold-const.c (fold_ternary_loc): Remove duplicate functionality here.
2016-05-04 Oleg Endo <olegendo@gcc.gnu.org>
......@@ -219,8 +244,7 @@
2016-05-03 Jakub Jelinek <jakub@redhat.com>
* config/i386/i386.md (*truncdfsf_mixed, *truncdfsf_i387,
*truncxfsf2_mixed, *truncxfdf2_mixed): Use v constraint instead
of x.
*truncxfsf2_mixed, *truncxfdf2_mixed): Use v constraint instead of x.
2016-05-03 Richard Biener <rguenther@suse.de>
......
......@@ -4231,12 +4231,12 @@
that might lead to ICE on 32bit target. The sequence unlikely combine
anyway. */
(define_split
[(set (match_operand:DF 0 "register_operand")
[(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "nonimmediate_operand")))]
"TARGET_USE_VECTOR_FP_CONVERTS
&& optimize_insn_for_speed_p ()
&& reload_completed && SSE_REG_P (operands[0])
&& reload_completed
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
[(set (match_dup 2)
......@@ -4253,13 +4253,11 @@
{
/* If it is unsafe to overwrite upper half of source, we need
to move to destination and unpack there. */
if (((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
|| PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
&& true_regnum (operands[0]) != true_regnum (operands[1]))
if (REGNO (operands[0]) != REGNO (operands[1])
|| (EXT_REX_SSE_REG_P (operands[1])
&& !TARGET_AVX512VL))
{
rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
rtx tmp = lowpart_subreg (SFmode, operands[0], DFmode);
emit_move_insn (tmp, operands[1]);
}
else
......@@ -4267,7 +4265,7 @@
/* FIXME: vec_interleave_lowv4sf for AVX512VL should allow
=v, v, then vbroadcastss will be only needed for AVX512F without
AVX512VL. */
if (!EXT_REX_SSE_REGNO_P (true_regnum (operands[3])))
if (!EXT_REX_SSE_REGNO_P (REGNO (operands[3])))
emit_insn (gen_vec_interleave_lowv4sf (operands[3], operands[3],
operands[3]));
else
......@@ -4283,15 +4281,14 @@
;; It's more profitable to split and then extend in the same register.
(define_peephole2
[(set (match_operand:DF 0 "register_operand")
[(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "memory_operand")))]
"TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
&& optimize_insn_for_speed_p ()
&& SSE_REG_P (operands[0])"
&& optimize_insn_for_speed_p ()"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (float_extend:DF (match_dup 2)))]
"operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));")
"operands[2] = lowpart_subreg (SFmode, operands[0], DFmode);")
(define_insn "*extendsfdf2"
[(set (match_operand:DF 0 "nonimm_ssenomem_operand" "=f,m,v")
......@@ -4390,12 +4387,12 @@
that might lead to ICE on 32bit target. The sequence unlikely combine
anyway. */
(define_split
[(set (match_operand:SF 0 "register_operand")
[(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "nonimmediate_operand")))]
"TARGET_USE_VECTOR_FP_CONVERTS
&& optimize_insn_for_speed_p ()
&& reload_completed && SSE_REG_P (operands[0])
&& reload_completed
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
[(set (match_dup 2)
......@@ -4413,9 +4410,7 @@
if (REG_P (operands[1]))
{
if (!TARGET_SSE3
&& true_regnum (operands[0]) != true_regnum (operands[1])
&& (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
|| PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
&& REGNO (operands[0]) != REGNO (operands[1]))
{
rtx tmp = lowpart_subreg (DFmode, operands[0], SFmode);
emit_move_insn (tmp, operands[1]);
......@@ -4432,15 +4427,14 @@
;; It's more profitable to split and then extend in the same register.
(define_peephole2
[(set (match_operand:SF 0 "register_operand")
[(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "memory_operand")))]
"TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
&& optimize_insn_for_speed_p ()
&& SSE_REG_P (operands[0])"
&& optimize_insn_for_speed_p ()"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (float_truncate:SF (match_dup 2)))]
"operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));")
"operands[2] = lowpart_subreg (DFmode, operands[0], SFmode);")
(define_expand "truncdfsf2_with_temp"
[(parallel [(set (match_operand:SF 0)
......@@ -4547,7 +4541,7 @@
"reload_completed"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (match_dup 2))]
"operands[1] = gen_rtx_REG (SFmode, true_regnum (operands[1]));")
"operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));")
;; Conversion from XFmode to {SF,DF}mode
......@@ -5153,11 +5147,11 @@
;; slots when !TARGET_INTER_UNIT_MOVES_TO_VEC disables the general_regs
;; alternative in sse2_loadld.
(define_split
[(set (match_operand:MODEF 0 "register_operand")
[(set (match_operand:MODEF 0 "sse_reg_operand")
(float:MODEF (match_operand:SI 1 "nonimmediate_operand")))]
"TARGET_SSE2 && TARGET_SSE_MATH
&& TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
&& reload_completed && SSE_REG_P (operands[0])
"TARGET_USE_VECTOR_CONVERTS
&& optimize_function_for_speed_p (cfun)
&& reload_completed
&& (MEM_P (operands[1]) || TARGET_INTER_UNIT_MOVES_TO_VEC)
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
......@@ -5176,41 +5170,43 @@
DONE;
})
;; Avoid partial SSE register dependency stalls
;; Avoid partial SSE register dependency stalls. This splitter should split
;; late in the pass sequence (after register rename pass), so allocated
;; registers won't change anymore.
(define_split
[(set (match_operand:MODEF 0 "register_operand")
[(set (match_operand:MODEF 0 "sse_reg_operand")
(float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))]
"TARGET_SSE2 && TARGET_SSE_MATH
&& TARGET_SSE_PARTIAL_REG_DEPENDENCY
"TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& optimize_function_for_speed_p (cfun)
&& reload_completed && SSE_REG_P (operands[0])
&& epilogue_completed
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
[(const_int 0)]
[(set (match_dup 0)
(vec_merge:<MODEF:ssevecmode>
(vec_duplicate:<MODEF:ssevecmode>
(float:MODEF
(match_dup 1)))
(match_dup 0)
(const_int 1)))]
{
const machine_mode vmode = <MODEF:ssevecmode>mode;
const machine_mode mode = <MODEF:MODE>mode;
rtx t, op0 = lowpart_subreg (vmode, operands[0], mode);
emit_move_insn (op0, CONST0_RTX (vmode));
t = gen_rtx_FLOAT (mode, operands[1]);
t = gen_rtx_VEC_DUPLICATE (vmode, t);
t = gen_rtx_VEC_MERGE (vmode, t, op0, const1_rtx);
emit_insn (gen_rtx_SET (op0, t));
DONE;
operands[0] = lowpart_subreg (vmode, operands[0], <MODEF:MODE>mode);
emit_move_insn (operands[0], CONST0_RTX (vmode));
})
;; Break partial reg stall for cvtsd2ss.
;; Break partial reg stall for cvtsd2ss. This splitter should split
;; late in the pass sequence (after register rename pass),
;; so allocated registers won't change anymore.
(define_peephole2
[(set (match_operand:SF 0 "register_operand")
(define_split
[(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "nonimmediate_operand")))]
"TARGET_SSE2 && TARGET_SSE_MATH
&& TARGET_SSE_PARTIAL_REG_DEPENDENCY
"TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& optimize_function_for_speed_p (cfun)
&& SSE_REG_P (operands[0])
&& epilogue_completed
&& (!SSE_REG_P (operands[1])
|| REGNO (operands[0]) != REGNO (operands[1]))
&& (!EXT_REX_SSE_REG_P (operands[0])
......@@ -5228,16 +5224,17 @@
emit_move_insn (operands[0], CONST0_RTX (V4SFmode));
})
;; Break partial reg stall for cvtss2sd.
;; Break partial reg stall for cvtss2sd. This splitter should split
;; late in the pass sequence (after register rename pass),
;; so allocated registers won't change anymore.
(define_peephole2
[(set (match_operand:DF 0 "register_operand")
(define_split
[(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "nonimmediate_operand")))]
"TARGET_SSE2 && TARGET_SSE_MATH
&& TARGET_SSE_PARTIAL_REG_DEPENDENCY
"TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& optimize_function_for_speed_p (cfun)
&& SSE_REG_P (operands[0])
&& epilogue_completed
&& (!SSE_REG_P (operands[1])
|| REGNO (operands[0]) != REGNO (operands[1]))
&& (!EXT_REX_SSE_REG_P (operands[0])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment