Commit 340c7904 by Maxim Kuvyrkov Committed by Maxim Kuvyrkov

Model cache auto-prefetcher in scheduler

	* config/arm/arm-protos.h (struct tune_params): New field
	sched_autopref_queue_depth.
	* config/arm/arm.c (sched-int.h): Include header.
	(arm_first_cycle_multipass_dfa_lookahead_guard,)
	(TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define hook.
	(arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,)
	(arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,)
	(arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,)
	(arm_cortex_a53_tune, arm_cortex_a57_tune, arm_xgene1_tune,)
	(arm_cortex_a5_tune, arm_cortex_a9_tune, arm_cortex_a12_tune,)
	(arm_v7m_tune, arm_cortex_m7_tune, arm_v6m_tune, arm_fa726te_tune):
	Specify sched_autopref_queue_depth value.  Enabled for A15 and A57.
	* config/arm/t-arm (arm.o): Update.
	* haifa-sched.c (update_insn_after_change): Update.
	(rank_for_schedule): Use auto-prefetcher model, if requested.
	(autopref_multipass_init): New static function.
	(autopref_rank_for_schedule): New rank_for_schedule heuristic.
	(autopref_multipass_dfa_lookahead_guard_started_dump_p): New static
	variable for debug dumps.
	(autopref_multipass_dfa_lookahead_guard_1): New static helper function.
	(autopref_multipass_dfa_lookahead_guard): New global function that
	implements TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD hook.
	(init_h_i_d): Update.
	* params.def (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH): New tuning knob.
	* sched-int.h (enum autopref_multipass_data_status): New const enum.
	(autopref_multipass_data_): Structure for auto-prefetcher data.
	(autopref_multipass_data_def, autopref_multipass_data_t): New typedefs.
	(struct _haifa_insn_data:autopref_multipass_data): New field.
	(INSN_AUTOPREF_MULTIPASS_DATA): New access macro.
	(autopref_multipass_dfa_lookahead_guard): Declare.

From-SVN: r219789
parent 71acd477
2015-01-17 Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>
* config/arm/arm-protos.h (struct tune_params): New field
sched_autopref_queue_depth.
* config/arm/arm.c (sched-int.h): Include header.
(arm_first_cycle_multipass_dfa_lookahead_guard,)
(TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define hook.
(arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,)
(arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,)
(arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,)
(arm_cortex_a53_tune, arm_cortex_a57_tune, arm_xgene1_tune,)
(arm_cortex_a5_tune, arm_cortex_a9_tune, arm_cortex_a12_tune,)
(arm_v7m_tune, arm_cortex_m7_tune, arm_v6m_tune, arm_fa726te_tune):
Specify sched_autopref_queue_depth value. Enabled for A15 and A57.
* config/arm/t-arm (arm.o): Update.
* haifa-sched.c (update_insn_after_change): Update.
(rank_for_schedule): Use auto-prefetcher model, if requested.
(autopref_multipass_init): New static function.
(autopref_rank_for_schedule): New rank_for_schedule heuristic.
(autopref_multipass_dfa_lookahead_guard_started_dump_p): New static
variable for debug dumps.
(autopref_multipass_dfa_lookahead_guard_1): New static helper function.
(autopref_multipass_dfa_lookahead_guard): New global function that
implements TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD hook.
(init_h_i_d): Update.
* params.def (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH): New tuning knob.
* sched-int.h (enum autopref_multipass_data_status): New const enum.
(autopref_multipass_data_): Structure for auto-prefetcher data.
(autopref_multipass_data_def, autopref_multipass_data_t): New typedefs.
(struct _haifa_insn_data:autopref_multipass_data): New field.
(INSN_AUTOPREF_MULTIPASS_DATA): New access macro.
(autopref_multipass_dfa_lookahead_guard): Declare.
2015-01-17 Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>
* rtlanal.c (get_base_term): Handle SCRATCH.
2015-01-17 Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>
......
......@@ -291,6 +291,8 @@ struct tune_params
int max_insns_inline_memset;
/* Bitfield encoding the fuseable pairs of instructions. */
unsigned int fuseable_ops;
/* Depth of scheduling queue to check for L2 autoprefetcher. */
int sched_autopref_queue_depth;
};
extern const struct tune_params *current_tune;
......
......@@ -91,7 +91,8 @@ arm.o: $(srcdir)/config/arm/arm.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
$(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
$(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
$(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
intl.h libfuncs.h $(PARAMS_H) $(OPTS_H) $(srcdir)/config/arm/arm-cores.def \
intl.h libfuncs.h $(PARAMS_H) $(OPTS_H) sched-int.h \
$(srcdir)/config/arm/arm-cores.def \
$(srcdir)/config/arm/arm-arches.def $(srcdir)/config/arm/arm-fpus.def \
$(srcdir)/config/arm/arm-protos.h \
$(srcdir)/config/arm/arm_neon_builtins.def
......
......@@ -841,6 +841,7 @@ add_delay_dependencies (rtx_insn *insn)
/* Forward declarations. */
static int priority (rtx_insn *);
static int autopref_rank_for_schedule (const rtx_insn *, const rtx_insn *);
static int rank_for_schedule (const void *, const void *);
static void swap_sort (rtx_insn **, int);
static void queue_insn (rtx_insn *, int, const char *);
......@@ -1184,6 +1185,12 @@ update_insn_after_change (rtx_insn *insn)
INSN_COST (insn) = -1;
/* Invalidate INSN_TICK, so it'll be recalculated. */
INSN_TICK (insn) = INVALID_TICK;
/* Invalidate autoprefetch data entry. */
INSN_AUTOPREF_MULTIPASS_DATA (insn)[0].status
= AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
INSN_AUTOPREF_MULTIPASS_DATA (insn)[1].status
= AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
}
......@@ -2724,6 +2731,13 @@ rank_for_schedule (const void *x, const void *y)
if (flag_sched_critical_path_heuristic && priority_val)
return rfs_result (RFS_PRIORITY, priority_val, tmp, tmp2);
if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) >= 0)
{
int autopref = autopref_rank_for_schedule (tmp, tmp2);
if (autopref != 0)
return autopref;
}
/* Prefer speculative insn with greater dependencies weakness. */
if (flag_sched_spec_insn_heuristic && spec_info)
{
......@@ -5500,6 +5514,241 @@ insn_finishes_cycle_p (rtx_insn *insn)
return false;
}
/* Functions to model cache auto-prefetcher.
Some CPUs have a cache auto-prefetcher, which /seems/ to initiate
memory prefetches if it sees instructions with consecutive memory accesses
in the instruction stream.  Details of such hardware units are not published,
so we can only guess what exactly is going on there.
In the scheduler, we model abstract auto-prefetcher. If there are memory
insns in the ready list (or the queue) that have same memory base, but
different offsets, then we delay the insns with larger offsets until insns
with smaller offsets get scheduled. If PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
is "1", then we look at the ready list; if it is N>1, then we also look
through N-1 queue entries.
If the param is N>=0, then rank_for_schedule will consider auto-prefetching
among its heuristics.
Param value of "-1" disables modelling of the auto-prefetcher. */
/* Initialize autoprefetcher model data for INSN. */
static void
autopref_multipass_init (const rtx_insn *insn, int write)
{
autopref_multipass_data_t data = &INSN_AUTOPREF_MULTIPASS_DATA (insn)[write];
gcc_assert (data->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED);
data->base = NULL_RTX;
data->offset = 0;
/* Set insn entry initialized, but not relevant for auto-prefetcher. */
data->status = AUTOPREF_MULTIPASS_DATA_IRRELEVANT;
rtx set = single_set (insn);
if (set == NULL_RTX)
return;
rtx mem = write ? SET_DEST (set) : SET_SRC (set);
if (!MEM_P (mem))
return;
struct address_info info;
decompose_mem_address (&info, mem);
/* TODO: Currently only (base+const) addressing is supported. */
if (info.base == NULL || !REG_P (*info.base)
|| (info.disp != NULL && !CONST_INT_P (*info.disp)))
return;
/* This insn is relevant for auto-prefetcher. */
data->base = *info.base;
data->offset = info.disp ? INTVAL (*info.disp) : 0;
data->status = AUTOPREF_MULTIPASS_DATA_NORMAL;
}
/* Helper function for rank_for_schedule sorting.
   Compare INSN1 and INSN2 according to the auto-prefetcher model.  The
   read entries are examined first, then the write entries; entries are
   initialized on demand.  For the first kind of access where both insns
   are relevant and use the same base, return a negative value if INSN1
   has the smaller offset, a positive value if it has the larger offset,
   and zero if the offsets are equal.  Return zero as well when no such
   pair of entries exists.  Only the sign of the result is meaningful.  */
static int
autopref_rank_for_schedule (const rtx_insn *insn1, const rtx_insn *insn2)
{
  for (int write = 0; write < 2; ++write)
    {
      autopref_multipass_data_t data1
	= &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write];
      autopref_multipass_data_t data2
	= &INSN_AUTOPREF_MULTIPASS_DATA (insn2)[write];

      if (data1->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
	autopref_multipass_init (insn1, write);
      if (data1->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
	continue;

      if (data2->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
	autopref_multipass_init (insn2, write);
      if (data2->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
	continue;

      if (!rtx_equal_p (data1->base, data2->base))
	continue;

      /* Compare instead of subtracting: the difference of two int offsets
	 can overflow, which is undefined behavior and could hand the sort
	 a result with the wrong sign.  */
      return ((data1->offset > data2->offset)
	      - (data1->offset < data2->offset));
    }

  return 0;
}
/* True if header of debug dump was printed, i.e. the
   "not trying in max_issue due to autoprefetch model" prefix has already
   been written to sched_dump.  Cleared by
   autopref_multipass_dfa_lookahead_guard when it is called with
   READY_INDEX 0 and sched_verbose >= 2.  */
static bool autopref_multipass_dfa_lookahead_guard_started_dump_p;
/* Helper for autopref_multipass_dfa_lookahead_guard.
   Compare the WRITE entries of INSN1 and INSN2.  INSN2's entry is
   initialized on demand; INSN1's entry is read as it stands.
   Return "1" if INSN1 should be delayed in favor of INSN2, which is the
   case when INSN2's entry is relevant, both entries have the same base and
   INSN1 has the larger offset.  Return "0" otherwise.
   When sched_verbose >= 2, a positive answer is also logged to
   sched_dump.  */
static int
autopref_multipass_dfa_lookahead_guard_1 (const rtx_insn *insn1,
					  const rtx_insn *insn2, int write)
{
  autopref_multipass_data_t mine
    = &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write];
  autopref_multipass_data_t other
    = &INSN_AUTOPREF_MULTIPASS_DATA (insn2)[write];

  if (other->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
    autopref_multipass_init (insn2, write);
  if (other->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
    return 0;

  /* INSN1 is only held back by an access off the same base at a
     strictly smaller offset.  */
  if (!rtx_equal_p (mine->base, other->base)
      || mine->offset <= other->offset)
    return 0;

  if (sched_verbose >= 2)
    {
      /* Emit the line header once, then append this pair of insns.  */
      if (!autopref_multipass_dfa_lookahead_guard_started_dump_p)
	{
	  fprintf (sched_dump,
		   ";;\t\tnot trying in max_issue due to autoprefetch "
		   "model: ");
	  autopref_multipass_dfa_lookahead_guard_started_dump_p = true;
	}

      fprintf (sched_dump, " %d(%d)", INSN_UID (insn1), INSN_UID (insn2));
    }

  return 1;
}
/* General note:
We could have also hooked autoprefetcher model into
first_cycle_multipass_backtrack / first_cycle_multipass_issue hooks
to enable intelligent selection of "[r1+0]=r2; [r1+4]=r3" on the same cycle
(e.g., once "[r1+0]=r2" is issued in max_issue(), "[r1+4]=r3" gets
unblocked). We don't bother about this yet because target of interest
(ARM Cortex-A15) can issue only 1 memory operation per cycle. */
/* Implementation of first_cycle_multipass_dfa_lookahead_guard hook.
   INSN1 is the candidate insn and READY_INDEX its position in the ready
   list.  Return "0" if the auto-prefetcher model has no objection, which
   includes the case where PARAM_SCHED_AUTOPREF_QUEUE_DEPTH is not
   positive.  Return "1" if INSN1 should not be considered in max_issue
   because another ready insn should go first.  Return "-1" in that same
   situation when READY_INDEX is 0, and "-STALLS" when the insn that
   should go first sits STALLS slots ahead in the insn queue.
   An entry of INSN1 that causes a nonzero result while READY_INDEX is 0
   is marked DONT_DELAY, so it is not delayed a second time.  */
int
autopref_multipass_dfa_lookahead_guard (rtx_insn *insn1, int ready_index)
{
  const int depth = PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH);
  const bool at_head = (ready_index == 0);
  int verdict = 0;

  if (depth <= 0)
    return 0;

  /* A new pass over the ready list starts a new dump line.  */
  if (at_head && sched_verbose >= 2)
    autopref_multipass_dfa_lookahead_guard_started_dump_p = false;

  for (int rw = 0; rw < 2; ++rw)
    {
      autopref_multipass_data_t entry
	= &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[rw];

      if (entry->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
	autopref_multipass_init (insn1, rw);
      if (entry->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
	continue;

      if (at_head && entry->status == AUTOPREF_MULTIPASS_DATA_DONT_DELAY)
	{
	  /* We allow only a single delay on privileged instructions.
	     Doing otherwise would cause infinite loop.  */
	  if (sched_verbose >= 2)
	    {
	      if (!autopref_multipass_dfa_lookahead_guard_started_dump_p)
		{
		  fprintf (sched_dump,
			   ";;\t\tnot trying in max_issue due to autoprefetch "
			   "model: ");
		  autopref_multipass_dfa_lookahead_guard_started_dump_p = true;
		}

	      fprintf (sched_dump, " *%d*", INSN_UID (insn1));
	    }
	  continue;
	}

      /* First check INSN1 against every other insn in the ready list.  */
      for (int pos = 0; pos < ready.n_ready; ++pos)
	{
	  rtx_insn *other = get_ready_element (pos);

	  if (other == insn1)
	    continue;

	  verdict = autopref_multipass_dfa_lookahead_guard_1 (insn1, other,
							      rw);
	  if (verdict == 0)
	    continue;

	  if (at_head)
	    {
	      entry->status = AUTOPREF_MULTIPASS_DATA_DONT_DELAY;
	      verdict = -1;
	    }
	  goto finish;
	}

      /* A depth of 1 restricts the model to the ready list.  */
      if (depth == 1)
	continue;

      /* Everything from the current queue slot should have been moved to
	 the ready list.  */
      gcc_assert (insn_queue[NEXT_Q_AFTER (q_ptr, 0)] == NULL_RTX);

      /* Look through DEPTH - 1 queue slots, but never more than the
	 queue holds.  */
      int last_slot = depth - 1;
      if (last_slot > max_insn_queue_index)
	last_slot = max_insn_queue_index;

      for (int stalls = 1; stalls <= last_slot; ++stalls)
	{
	  rtx_insn_list *link = insn_queue[NEXT_Q_AFTER (q_ptr, stalls)];

	  while (link != NULL_RTX)
	    {
	      verdict = autopref_multipass_dfa_lookahead_guard_1 (insn1,
								  link->insn (),
								  rw);
	      if (verdict != 0)
		{
		  if (at_head)
		    entry->status = AUTOPREF_MULTIPASS_DATA_DONT_DELAY;

		  /* Queue INSN1 until INSN2 can issue.  */
		  verdict = -stalls;
		  goto finish;
		}

	      link = link->next ();
	    }
	}
    }

 finish:
  /* This does not /always/ trigger.  We don't output EOL if the last
     insn is not recognized (INSN_CODE < 0) and lookahead_guard is not
     called.  We can live with this.  */
  if (sched_verbose >= 2
      && autopref_multipass_dfa_lookahead_guard_started_dump_p
      && (verdict < 0 || ready_index == ready.n_ready - 1))
    fprintf (sched_dump, "\n");

  return verdict;
}
/* Define type for target data used in multipass scheduling. */
#ifndef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T
# define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T int
......@@ -8710,6 +8959,10 @@ init_h_i_d (rtx_insn *insn)
INSN_EXACT_TICK (insn) = INVALID_TICK;
INTER_TICK (insn) = INVALID_TICK;
TODO_SPEC (insn) = HARD_DEP;
INSN_AUTOPREF_MULTIPASS_DATA (insn)[0].status
= AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
INSN_AUTOPREF_MULTIPASS_DATA (insn)[1].status
= AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
}
}
......
......@@ -668,6 +668,11 @@ DEFPARAM (PARAM_SCHED_MEM_TRUE_DEP_COST,
"Minimal distance between possibly conflicting store and load",
1, 0, 0)
/* Controls the scheduler's model of the hardware cache auto-prefetcher.
   The value given here is -1, and the help text describes the model as
   disabled by default.  */
DEFPARAM (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
"sched-autopref-queue-depth",
"Hardware autoprefetcher scheduler model control flag. Number of lookahead cycles the model looks into; at '0' only enable instruction sorting heuristic. Disabled by default.",
-1, 0, 0)
DEFPARAM(PARAM_MAX_LAST_VALUE_RTL,
"max-last-value-rtl",
"The maximum number of RTL nodes that can be recorded as combiner's last value",
......
......@@ -793,6 +793,32 @@ struct reg_set_data
struct reg_set_data *next_insn_set;
};
/* Status of one auto-prefetcher model entry of an insn.  Entries are set
   to UNINITIALIZED by init_h_i_d and update_insn_after_change, become
   IRRELEVANT or NORMAL in autopref_multipass_init, and may be switched to
   DONT_DELAY by autopref_multipass_dfa_lookahead_guard.  */
enum autopref_multipass_data_status {
/* Entry is irrelevant for auto-prefetcher. */
AUTOPREF_MULTIPASS_DATA_IRRELEVANT = -2,
/* Entry is uninitialized. */
AUTOPREF_MULTIPASS_DATA_UNINITIALIZED = -1,
/* Entry is relevant for auto-prefetcher and insn can be delayed
to allow another insn through. */
AUTOPREF_MULTIPASS_DATA_NORMAL = 0,
/* Entry is relevant for auto-prefetcher, but insn should not be
delayed as that will break scheduling. */
AUTOPREF_MULTIPASS_DATA_DONT_DELAY = 1
};
/* Data for modeling cache auto-prefetcher.  One such entry describes
   either the memory read or the memory write of an insn.  */
struct autopref_multipass_data_
{
/* Base part of memory address.  autopref_multipass_init stores the base
   register here for relevant entries and NULL_RTX otherwise.  */
rtx base;
/* Memory offset.  Taken from the constant displacement of the address,
   or 0 when there is none.  */
int offset;
/* Entry status. */
enum autopref_multipass_data_status status;
};
/* Value type of an auto-prefetcher model entry.  */
typedef struct autopref_multipass_data_ autopref_multipass_data_def;
/* Pointer to an auto-prefetcher model entry.  */
typedef autopref_multipass_data_def *autopref_multipass_data_t;
struct _haifa_insn_data
{
/* We can't place 'struct _deps_list' into h_i_d instead of deps_list_t
......@@ -893,6 +919,10 @@ struct _haifa_insn_data
/* The deciding reason for INSN's place in the ready list. */
int last_rfs_win;
/* Two entries for cache auto-prefetcher model: one for mem reads,
and one for mem writes. */
autopref_multipass_data_def autopref_multipass_data[2];
};
typedef struct _haifa_insn_data haifa_insn_data_def;
......@@ -915,6 +945,8 @@ extern vec<haifa_insn_data_def> h_i_d;
(HID (INSN)->reg_pressure_excess_cost_change)
#define INSN_PRIORITY_STATUS(INSN) (HID (INSN)->priority_status)
#define INSN_MODEL_INDEX(INSN) (HID (INSN)->model_index)
/* The two-element array of auto-prefetcher model entries of INSN.  The
   scheduler uses index 0 for the memory read and index 1 for the memory
   write.  */
#define INSN_AUTOPREF_MULTIPASS_DATA(INSN) \
(HID (INSN)->autopref_multipass_data)
typedef struct _haifa_deps_insn_data haifa_deps_insn_data_def;
typedef haifa_deps_insn_data_def *haifa_deps_insn_data_t;
......@@ -1363,6 +1395,8 @@ extern int cycle_issued_insns;
extern int issue_rate;
extern int dfa_lookahead;
/* Auto-prefetcher model implementation of the
   first_cycle_multipass_dfa_lookahead_guard hook; takes the candidate insn
   and its index in the ready list.  */
extern int autopref_multipass_dfa_lookahead_guard (rtx_insn *, int);
extern void ready_sort (struct ready_list *);
extern rtx_insn *ready_element (struct ready_list *, int);
extern rtx_insn **ready_lastpos (struct ready_list *);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment