Commit 55805e54 by Yuri Rumyantsev, committed by H.J. Lu

Silvermont (SLM) architecture performance tuning

2013-05-31  Yuri Rumyantsev  <yuri.s.rumyantsev@intel.com>
	    Igor Zamyatin  <igor.zamyatin@intel.com>

	* config/i386/i386.h (enum ix86_tune_indices): Add
	X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS.
	(TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS): New define.

	* config/i386/i386.c (initial_ix86_tune_features)
	<X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS>: Initialize.
	(ix86_lea_outperforms): Handle Silvermont tuning.
	(ix86_avoid_lea_for_add): Add new argument to ix86_lea_outperforms
	call.
	(ix86_use_lea_for_mov): Likewise.
	(ix86_avoid_lea_for_addr): Likewise.
	(ix86_lea_for_add_ok): Likewise.
	(exact_dependency_1): New function.
	(exact_store_load_dependency): Likewise.
	(ix86_adjust_cost): Handle Silvermont tuning.
	(do_reoder_for_imul): Likewise.
	(swap_top_of_ready_list): New function.
	(ix86_sched_reorder): Change to handle Silvermont tuning.

	* config/i386/i386.md (peepholes that split memory operand in fp
	converts): New.

From-SVN: r199546
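For illustration (an example of this note's author, not part of the commit): the new i386.md peepholes split a float conversion that reads straight from memory into a plain load followed by an in-register convert. A minimal C function that exercises the pattern, with the expected before/after assembly sketched in comments (SSE2 mnemonics; the exact output depends on compiler flags):

/* Compile with something like "gcc -O2 -mtune=slm -S" (flags illustrative).  */
double
load_and_widen (const float *p)
{
  /* Without the peephole the convert may keep its memory operand:
         cvtss2sd  (%rdi), %xmm0
     With TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS it is split into a load
     plus an in-register extend that reuses the destination register:
         movss     (%rdi), %xmm0
         cvtss2sd  %xmm0, %xmm0  */
  return (double) *p;
}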
gcc/config/i386/i386.c
@@ -2108,7 +2108,12 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
  /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
     a conditional move.  */
  m_ATOM,

  /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
     fp converts to destination register.  */
  m_SLM
};

/* Feature tests against the various architecture variations.  */
@@ -17392,10 +17397,24 @@ distance_agu_use (unsigned int regno0, rtx insn)

static bool
ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
		      unsigned int regno2, int split_cost, bool has_scale)
{
  int dist_define, dist_use;

  /* For Silvermont if using a 2-source or 3-source LEA for
     non-destructive destination purposes, or due to wanting
     ability to use SCALE, the use of LEA is justified.  */
  if (ix86_tune == PROCESSOR_SLM)
    {
      if (has_scale)
	return true;
      if (split_cost < 1)
	return false;
      if (regno0 == regno1 || regno0 == regno2)
	return false;
      return true;
    }

  dist_define = distance_non_agu_define (regno1, regno2, insn);
  dist_use = distance_agu_use (regno0, insn);
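Restated outside the compiler (a hypothetical helper, not code from the commit), the PROCESSOR_SLM branch above boils down to a small predicate: keep the LEA when it uses a scaled index, or when splitting it is not free and the destination does not alias either source.

#include <stdbool.h>

/* Hypothetical distillation of the SLM branch of ix86_lea_outperforms;
   regno0 is the destination, regno1/regno2 the sources.  */
static bool
slm_lea_wins (bool has_scale, int split_cost,
              unsigned regno0, unsigned regno1, unsigned regno2)
{
  if (has_scale)
    return true;    /* e.g. lea 8(%rax,%rbx,4), %rcx: the scale comes free.  */
  if (split_cost < 1)
    return false;   /* splitting costs nothing, so prefer plain ALU ops.  */
  /* Otherwise the LEA pays off as a non-destructive 3-operand add.  */
  return regno0 != regno1 && regno0 != regno2;
}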
@@ -17484,7 +17503,7 @@ ix86_avoid_lea_for_add (rtx insn, rtx operands[])
  if (regno0 == regno1 || regno0 == regno2)
    return false;
  else
    return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
}

/* Return true if we should emit lea instruction instead of mov
@@ -17506,7 +17525,7 @@ ix86_use_lea_for_mov (rtx insn, rtx operands[])
  regno0 = true_regnum (operands[0]);
  regno1 = true_regnum (operands[1]);

  return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
}

/* Return true if we need to split lea into a sequence of
@@ -17585,7 +17604,8 @@ ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
      split_cost -= 1;
    }

  return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
				parts.scale > 1);
}

/* Emit x86 binary operand CODE in mode MODE, where the first operand
@@ -17770,7 +17790,7 @@ ix86_lea_for_add_ok (rtx insn, rtx operands[])
  if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
    return false;

  return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
}
/* Return true if destination reg of SET_BODY is shift count of
@@ -24368,6 +24388,73 @@ ix86_agi_dependent (rtx set_insn, rtx use_insn)
  return false;
}

/* Helper function for exact_store_load_dependency.
   Return true if addr is found in insn.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
	return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case CC0:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
	{
	case 'e':
	  if (exact_dependency_1 (addr, XEXP (insn, i)))
	    return true;
	  break;
	case 'E':
	  for (j = 0; j < XVECLEN (insn, i); j++)
	    if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
	      return true;
	  break;
	}
    }
  return false;
}

/* Return true if there exists exact dependency for store & load, i.e.
   the same memory address is used in them.  */
static bool
exact_store_load_dependency (rtx store, rtx load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}
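For intuition (simplified RTL sketched for this note, not from the commit): the predicate answers true exactly when the store's MEM occurs verbatim inside the load's source, so rtx_equal_p matches the two addresses. For example:

  store: (set (mem:HI (plus:DI (reg:DI 7) (const_int -2))) (reg:HI 0))
  load:  (set (reg:HI 1) (mem:HI (plus:DI (reg:DI 7) (const_int -2))))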
static int
ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
{
@@ -24519,6 +24606,39 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_SLM:
      if (!reload_completed)
	return cost;

      /* Increase cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	{
	  enum attr_unit unit = get_attr_unit (dep_insn);
	  if (unit == UNIT_INTEGER && cost == 1)
	    {
	      if (memory == MEMORY_LOAD)
		cost = 3;
	      else
		{
		  /* Increase cost of ld/st for short int types only
		     because of store forwarding issue.  */
		  rtx set = single_set (dep_insn);
		  if (set && (GET_MODE (SET_DEST (set)) == QImode
			      || GET_MODE (SET_DEST (set)) == HImode))
		    {
		      /* Increase cost of store/load insn if exact
			 dependence exists and it is load insn.  */
		      enum attr_memory insn_memory = get_attr_memory (insn);
		      if (insn_memory == MEMORY_LOAD
			  && exact_store_load_dependency (dep_insn, insn))
			cost = 3;
		    }
		}
	    }
	}

    default:
      break;
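The QImode/HImode special case exists because a narrow store that is reloaded from the same address shortly afterwards can stall in store-to-load forwarding. A C fragment that can lower to such a pair (a hypothetical example; whether GCC actually emits the read-modify-write plus reload shape depends on optimization level and surrounding code):

short
bump_then_reload (short *p)
{
  *p += 1;      /* can be emitted as a read-modify-write, e.g.
                   "addw $1, (%rdi)", which is MEMORY_BOTH  */
  return *p;    /* a reload from the exact same address; with this
                   change that load's dependence cost becomes 3  */
}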
@@ -24565,47 +24685,32 @@ ia32_multipass_dfa_lookahead (void)
   execution.  It is applied if
   (1) IMUL instruction is on the top of the list;
   (2) there exists exactly one producer of an independent IMUL
       instruction in the ready list.
   Return index of the IMUL producer if it was found and -1 otherwise.  */
static int
do_reoder_for_imul (rtx *ready, int n_ready)
{
  rtx insn, set, insn1, insn2;
  sd_iterator_def sd_it;
  dep_t dep;
  int index = -1;
  int i;

  if (ix86_tune != PROCESSOR_ATOM)
    return index;

  /* Do not perform ready list reordering for pre-reload schedule pass.  */
  if (!reload_completed)
    return index;

  /* Check that IMUL instruction is on the top of ready list.  */
  insn = ready[n_ready - 1];
  set = single_set (insn);
  if (!set)
    return index;
  if (!(GET_CODE (SET_SRC (set)) == MULT
      && GET_MODE (SET_SRC (set)) == SImode))
    return index;

  /* Search for producer of independent IMUL instruction.  */
  for (i = n_ready - 2; i >= 0; i--)
@@ -24656,19 +24761,134 @@ ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
      if (index >= 0)
	break;
    }

  return index;
}
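The payoff on Atom comes from its pipelined 32-bit IMUL unit: back-to-back multiplies keep the unit busy, so hoisting the producer lets a second, independent IMUL issue right behind the one already on top. A minimal C example with two independent SImode multiplies of the kind this reordering targets (illustrative; actual scheduling depends on the surrounding code):

int
two_muls (int a, int b, int c, int d)
{
  /* Two independent 32-bit multiplies feeding one add; the post-reload
     scheduler tries to issue the two imuls back to back.  */
  return a * b + c * d;
}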
/* Try to find the best candidate for the top of the ready list if two
   insns have the same priority - the candidate is best if the insns it
   depends on were scheduled earlier.  Applied for Silvermont only.
   Return true if the top 2 insns must be interchanged.  */
static bool
swap_top_of_ready_list (rtx *ready, int n_ready)
{
  rtx top = ready[n_ready - 1];
  rtx next = ready[n_ready - 2];
  rtx set;
  sd_iterator_def sd_it;
  dep_t dep;
  int clock1 = -1;
  int clock2 = -1;
  #define INSN_TICK(INSN) (HID (INSN)->tick)

  if (ix86_tune != PROCESSOR_SLM)
    return false;

  if (!reload_completed)
    return false;

  if (!NONDEBUG_INSN_P (top))
    return false;
  if (!NONJUMP_INSN_P (top))
    return false;
  if (!NONDEBUG_INSN_P (next))
    return false;
  if (!NONJUMP_INSN_P (next))
    return false;
  set = single_set (top);
  if (!set)
    return false;
  set = single_set (next);
  if (!set)
    return false;

  if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
    {
      if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
	return false;
      /* Determine winner more precisely.  */
      FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
	{
	  rtx pro;
	  pro = DEP_PRO (dep);
	  if (!NONDEBUG_INSN_P (pro))
	    continue;
	  if (INSN_TICK (pro) > clock1)
	    clock1 = INSN_TICK (pro);
	}
      FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
	{
	  rtx pro;
	  pro = DEP_PRO (dep);
	  if (!NONDEBUG_INSN_P (pro))
	    continue;
	  if (INSN_TICK (pro) > clock2)
	    clock2 = INSN_TICK (pro);
	}
      if (clock1 == clock2)
	{
	  /* Determine winner - load must win.  */
	  enum attr_memory memory1, memory2;
	  memory1 = get_attr_memory (top);
	  memory2 = get_attr_memory (next);
	  if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
	    return true;
	}
      return (bool) (clock2 < clock1);
    }
  return false;
#undef INSN_TICK
}
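As a plain-C restatement (a hypothetical helper, not code from the commit): with equal priorities, the insn whose producers all completed earlier wins the top slot, and on a complete tie a load beats a non-load.

#include <stdbool.h>

/* Hypothetical distillation of the tie-break in swap_top_of_ready_list;
   last_tick_* is the latest completion tick among an insn's producers.  */
static bool
should_swap (int prio_top, int prio_next,
             int last_tick_top, int last_tick_next,
             bool top_is_load, bool next_is_load)
{
  if (prio_top != prio_next)
    return false;                         /* priorities differ: keep order */
  if (last_tick_next == last_tick_top)
    return next_is_load && !top_is_load;  /* full tie: load must win */
  return last_tick_next < last_tick_top;  /* earlier-ready insn wins */
}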
/* Perform possible reordering of ready list for Atom/Silvermont only.
   Return issue rate.  */
static int
ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
		    int clock_var)
{
  int issue_rate = -1;
  int n_ready = *pn_ready;
  int i;
  rtx insn;
  int index = -1;

  /* Set up issue rate.  */
  issue_rate = ix86_issue_rate ();

  /* Do reordering for Atom/SLM only.  */
  if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
    return issue_rate;

  /* Nothing to do if ready list contains only 1 instruction.  */
  if (n_ready <= 1)
    return issue_rate;

  /* Do reordering for post-reload scheduler only.  */
  if (!reload_completed)
    return issue_rate;

  if ((index = do_reoder_for_imul (ready, n_ready)) >= 0)
    {
      if (sched_verbose > 1)
	fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
		 INSN_UID (ready[index]));

      /* Put IMUL producer (ready[index]) at the top of ready list.  */
      insn = ready[index];
      for (i = index; i < n_ready - 1; i++)
	ready[i] = ready[i + 1];
      ready[n_ready - 1] = insn;
      return issue_rate;
    }

  if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
    {
      if (sched_verbose > 1)
	fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
		 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));

      /* Swap 2 top elements of ready list.  */
      insn = ready[n_ready - 1];
      ready[n_ready - 1] = ready[n_ready - 2];
      ready[n_ready - 2] = insn;
    }

  return issue_rate;
}
gcc/config/i386/i386.h
@@ -333,6 +333,7 @@ enum ix86_tune_indices {
  X86_TUNE_REASSOC_FP_TO_PARALLEL,
  X86_TUNE_GENERAL_REGS_SSE_SPILL,
  X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE,
  X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS,

  X86_TUNE_LAST
};

@@ -443,6 +444,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
	ix86_tune_features[X86_TUNE_GENERAL_REGS_SSE_SPILL]
#define TARGET_AVOID_MEM_OPND_FOR_CMOVE \
	ix86_tune_features[X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE]
#define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \
	ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS]

/* Feature tests against the various architecture variations.  */
enum ix86_arch_indices {
gcc/config/i386/i386.md
@@ -3625,6 +3625,18 @@
      CONST0_RTX (V4SFmode), operands[1]));
})

;; It's more profitable to split and then extend in the same register.
(define_peephole2
  [(set (match_operand:DF 0 "register_operand")
	(float_extend:DF
	  (match_operand:SF 1 "memory_operand")))]
  "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
   && optimize_insn_for_speed_p ()
   && SSE_REG_P (operands[0])"
  [(set (match_dup 2) (match_dup 1))
   (set (match_dup 0) (float_extend:DF (match_dup 2)))]
  "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));")
(define_insn "*extendsfdf2_mixed" (define_insn "*extendsfdf2_mixed"
[(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x") [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
(float_extend:DF (float_extend:DF
...@@ -3766,6 +3778,18 @@ ...@@ -3766,6 +3778,18 @@
CONST0_RTX (V2DFmode), operands[1])); CONST0_RTX (V2DFmode), operands[1]));
}) })
;; It's more profitable to split and then extend in the same register.
(define_peephole2
[(set (match_operand:SF 0 "register_operand")
(float_truncate:SF
(match_operand:DF 1 "memory_operand")))]
"TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
&& optimize_insn_for_speed_p ()
&& SSE_REG_P (operands[0])"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (float_truncate:SF (match_dup 2)))]
"operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));")
(define_expand "truncdfsf2_with_temp" (define_expand "truncdfsf2_with_temp"
[(parallel [(set (match_operand:SF 0) [(parallel [(set (match_operand:SF 0)
(float_truncate:SF (match_operand:DF 1))) (float_truncate:SF (match_operand:DF 1)))
......