Commit c856f536 by Vladimir Makarov, committed by Vladimir Makarov

ia64.c: Add more comments about insn bundling.

2003-12-17  Vladimir Makarov  <vmakarov@redhat.com>

	* config/ia64/ia64.c: Add more comments about insn bundling.

From-SVN: r74751
parent 4d450e9a
@@ -6557,13 +6557,44 @@ get_next_important_insn (rtx insn, rtx tail)
return NULL_RTX;
}
/* The following function does insn bundling.  Bundling means
   inserting templates and nop insns to fit insn groups into permitted
   templates.  Instruction scheduling uses an NDFA (non-deterministic
   finite automaton) encoding information about the templates and the
   inserted nops.  The nondeterminism of the automaton permits following
   all possible insn sequences very quickly.

   Unfortunately it is not possible to get information about inserted
   nop insns and used templates from the automaton states.  The
   automaton only says that we can issue an insn, possibly inserting
   some nops before it and using some template.  Therefore insn
   bundling in this function is implemented by using a DFA
   (deterministic finite automaton).  We follow all possible insn
   sequences by inserting 0-2 nops (that is what the NDFA describes for
   insn scheduling) before/after each insn being bundled.  We know the
   start of a simulated processor cycle from insn scheduling (an insn
   starting a new cycle has TImode).

   A simple implementation of insn bundling would create an enormous
   number of possible insn sequences satisfying the information about
   new cycle ticks taken from insn scheduling.  To make the algorithm
   practical, we use dynamic programming.  Each decision (about
   inserting nops, and implicitly about previous decisions) is
   described by the structure bundle_state (see above).  If we generate
   the same bundle state (the key is the automaton state after issuing
   the insns and nops for it), we reuse the already generated one.  As
   a consequence, we reject some decisions which cannot improve the
   solution and we reduce the memory needed by the algorithm.

   When we reach the end of the EBB (extended basic block), we choose
   the best insn sequence and then, moving back through the EBB, insert
   templates for the best alternative.  The templates are taken by
   querying the automaton state for each insn in the chosen bundle
   states.

   So the algorithm makes two (forward and backward) passes through
   the EBB.  There is an additional forward pass through the EBB for
   the Itanium1 processor.  This pass inserts more nops to make the
   dependency between a producer insn and MMMUL/MMSHF at least 4
   cycles long.  */
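As an aside, the dynamic-programming idea described above can be
illustrated by a minimal standalone C sketch: decisions are memoized by
the automaton state they reach, so equivalent partial solutions collapse
into one.  The types, the hash function, and the cost handling below are
simplified stand-ins, not the real bundle_state or DFA state from ia64.c.

    #include <stdio.h>
    #include <stdlib.h>

    /* Simplified stand-in for struct bundle_state: the key is the
       automaton state reached after issuing an insn and its nops.  */
    struct toy_state
    {
      int dfa_state;              /* key: automaton state id */
      int cost;                   /* accumulated cost of the decisions */
      struct toy_state *next;     /* hash bucket chain */
    };

    #define TABLE_SIZE 1021
    static struct toy_state *table[TABLE_SIZE];

    /* Record a decision reaching DFA_STATE with COST.  If an equivalent
       state already exists, reuse it and keep the cheaper cost; this is
       how equal partial solutions collapse and memory stays bounded.  */
    static struct toy_state *
    insert_state (int dfa_state, int cost)
    {
      unsigned slot = (unsigned) dfa_state % TABLE_SIZE;
      struct toy_state *s;

      for (s = table[slot]; s != NULL; s = s->next)
        if (s->dfa_state == dfa_state)
          {
            if (cost < s->cost)
              s->cost = cost;
            return s;             /* reuse the already generated state */
          }
      s = (struct toy_state *) malloc (sizeof *s);
      s->dfa_state = dfa_state;
      s->cost = cost;
      s->next = table[slot];
      table[slot] = s;
      return s;
    }

    int
    main (void)
    {
      insert_state (42, 3);
      insert_state (42, 1);       /* collapses into the state above */
      printf ("best cost for state 42: %d\n", insert_state (42, 9)->cost);
      return 0;
    }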
static void
bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
@@ -6578,6 +6609,7 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
enum attr_type type;
insn_num = 0;
/* Count insns in the EBB. */
for (insn = NEXT_INSN (prev_head_insn);
insn && insn != tail;
insn = NEXT_INSN (insn))
@@ -6590,7 +6622,7 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
initiate_bundle_state_table ();
index_to_bundle_states = xmalloc ((insn_num + 2)
* sizeof (struct bundle_state *));
/* First (forward) pass -- generation of bundle states.  */
curr_state = get_free_bundle_state ();
curr_state->insn = NULL;
curr_state->before_nops_num = 0;
@@ -6604,6 +6636,7 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
state_reset (curr_state->dfa_state);
index_to_bundle_states [0] = curr_state;
insn_num = 0;
/* Shift the cycle mark if it is put on an insn which could be ignored.  */
for (insn = NEXT_INSN (prev_head_insn);
insn != tail;
insn = NEXT_INSN (insn))
@@ -6626,6 +6659,7 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
break;
}
}
/* Forward pass: generation of bundle states.  */
for (insn = get_next_important_insn (NEXT_INSN (prev_head_insn), tail);
insn != NULL_RTX;
insn = next_insn)
@@ -6645,19 +6679,25 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
{
pos = curr_state->accumulated_insns_num % 3;
next_state = curr_state->next;
/* We must fill up the current bundle in order to start a
   subsequent asm insn in a new bundle.  An asm insn is always
   placed in a separate bundle.  */
only_bundle_end_p
= (next_insn != NULL_RTX
&& INSN_CODE (insn) == CODE_FOR_insn_group_barrier
&& ia64_safe_type (next_insn) == TYPE_UNKNOWN);
/* We may fill up the current bundle if it is at the end of a
   cycle and there is no group barrier.  */
bundle_end_p
= (only_bundle_end_p || next_insn == NULL_RTX
|| (GET_MODE (next_insn) == TImode
&& INSN_CODE (insn) != CODE_FOR_insn_group_barrier));
if (type == TYPE_F || type == TYPE_B || type == TYPE_L
|| type == TYPE_S
/* We need to insert 2 nops for cases like M_MII.  To
   guarantee issuing all insns on the same cycle for
   Itanium 1, we need to issue 2 nops after the first M
   insn (MnnMII, where n is a nop insn).  */
|| (type == TYPE_M && ia64_tune == PROCESSOR_ITANIUM
&& !bundle_end_p && pos == 1))
issue_nops_and_insn (curr_state, 2, insn, bundle_end_p,
@@ -6674,6 +6714,10 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
curr_state = curr_state->next)
if (verbose >= 2 && dump)
{
/* This structure is taken from the generated code of the
   pipeline hazard recognizer (see file insn-attrtab.c).
   Please don't forget to change the structure if a new
   automaton is added to the .md file.  */
struct DFA_chip
{
unsigned short one_automaton_state;
@@ -6698,12 +6742,18 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
}
}
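The definition of struct DFA_chip is cut off by the hunk above.  Judging
from the one_automaton_state field shown here and the
twob_automaton_state field used in the backward pass below, the
generated structure presumably looks like the following; the two middle
field names are assumptions about the generated insn-attrtab.c, not
confirmed by this diff.

    /* Hypothetical reconstruction -- the real definition is emitted
       into insn-attrtab.c by the automaton generator and may differ.  */
    struct DFA_chip
    {
      unsigned short one_automaton_state;
      unsigned short oneb_automaton_state;  /* assumed field name */
      unsigned short two_automaton_state;   /* assumed field name */
      unsigned short twob_automaton_state;  /* referenced below */
    };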
if (index_to_bundle_states [insn_num] == NULL)
/* We should find a solution because the 2nd insn scheduling has
found one. */
abort ();
/* Find a state corresponding to the best insn sequence.  */
best_state = NULL;
for (curr_state = index_to_bundle_states [insn_num];
curr_state != NULL;
curr_state = curr_state->next)
/* We consider only the states with a fully filled-up last
   bundle.  First we prefer insn sequences with minimal cost,
   then with a minimal number of inserted nops, and finally
   with branch insns placed in the 3rd slots.  */
if (curr_state->accumulated_insns_num % 3 == 0
&& (best_state == NULL || best_state->cost > curr_state->cost
|| (best_state->cost == curr_state->cost
@@ -6714,7 +6764,7 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
&& curr_state->branch_deviation
< best_state->branch_deviation)))))
best_state = curr_state;
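Part of the selection condition is elided by the hunk marker above.
Read together with the comment, it amounts to a lexicographic
preference.  A minimal standalone sketch, where the struct and the
middle criterion (a plain nop count) are simplifying assumptions:

    #include <stdio.h>

    /* Simplified view of the fields the selection loop compares.  */
    struct cand
    {
      int cost;
      int nops;                  /* number of inserted nop insns */
      int branch_deviation;      /* branches not placed in 3rd slots */
    };

    /* Nonzero if A is strictly better than B: lower cost first, then
       fewer nops, then smaller branch deviation.  */
    static int
    better_p (const struct cand *a, const struct cand *b)
    {
      if (a->cost != b->cost)
        return a->cost < b->cost;
      if (a->nops != b->nops)
        return a->nops < b->nops;
      return a->branch_deviation < b->branch_deviation;
    }

    int
    main (void)
    {
      struct cand a = { 2, 1, 0 }, b = { 2, 3, 0 };
      printf ("a better than b: %d\n", better_p (&a, &b));
      return 0;
    }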
/* Second (backward) pass: adding nops and templates.  */
insn_num = best_state->before_nops_num;
template0 = template1 = -1;
for (curr_state = best_state;
@@ -6749,9 +6799,17 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
: ((struct DFA_chip *) curr_state->dfa_state)->twob_automaton_state),
INSN_UID (insn));
}
/* Find the position in the current bundle window.  The window can
   contain at most two bundles.  A two-bundle window means that
   the processor will make two bundle rotations.  */
max_pos = get_max_pos (curr_state->dfa_state);
if (max_pos == 6
/* The following (negative template number) means that the
   processor did one bundle rotation.  */
|| (max_pos == 3 && template0 < 0))
{
/* We are at the end of the window -- find template(s) for
   its bundle(s).  */
pos = max_pos;
if (max_pos == 3)
template0 = get_template (curr_state->dfa_state, 3);
@@ -6762,6 +6820,7 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
}
}
if (max_pos > 3 && template1 < 0)
/* This may happen when we have a stop inside a bundle.  */
{
if (pos > 3)
abort ();
@@ -6769,6 +6828,7 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
pos += 3;
}
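One way to picture the window arithmetic: pos counts the not-yet-filled
slots and runs from max_pos down to 1 as insns and nops are placed
backward, so pos % 3 == 0 marks a bundle boundary.  A small standalone
sketch of the implied slot numbering, which the real code never computes
explicitly (the mapping is an inference, not taken from ia64.c):

    #include <stdio.h>

    /* For a two-bundle window, pos runs 6..1: pos 6..4 lies in the
       first bundle, pos 3..1 in the second.  */
    static int
    bundle_index (int pos)       /* 1 <= pos <= 6 */
    {
      return pos > 3 ? 0 : 1;
    }

    static int
    slot_in_bundle (int pos)     /* pos 6 -> slot 0, pos 4 -> slot 2 */
    {
      return 3 - ((pos - 1) % 3 + 1);
    }

    int
    main (void)
    {
      int pos;

      for (pos = 6; pos >= 1; pos--)
        printf ("pos %d -> bundle %d, slot %d\n",
                pos, bundle_index (pos), slot_in_bundle (pos));
      return 0;
    }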
if (!asm_p)
/* Emit nops after the current insn.  */
for (i = 0; i < curr_state->after_nops_num; i++)
{
nop = gen_nop ();
@@ -6778,18 +6838,26 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
abort ();
if (pos % 3 == 0)
{
/* We are at the start of a bundle: emit the template
(it should be defined). */
if (template0 < 0)
abort ();
b = gen_bundle_selector (GEN_INT (template0));
ia64_emit_insn_before (b, nop);
/* If we have a two-bundle window, we make one bundle
   rotation.  Otherwise template0 will be undefined
   (a negative value).  */
template0 = template1;
template1 = -1;
}
}
/* Move the position backward in the window.  A group barrier
   takes no slot.  An asm insn takes the whole bundle.  */
if (INSN_CODE (insn) != CODE_FOR_insn_group_barrier
&& GET_CODE (PATTERN (insn)) != ASM_INPUT
&& asm_noperands (PATTERN (insn)) < 0)
pos--;
/* A long (L-type) insn takes 2 slots.  */
if (ia64_safe_type (insn) == TYPE_L)
pos--;
if (pos < 0)
@@ -6799,15 +6867,20 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
&& GET_CODE (PATTERN (insn)) != ASM_INPUT
&& asm_noperands (PATTERN (insn)) < 0)
{
/* The current insn is at the bundle start: emit the
template. */
if (template0 < 0)
abort ();
b = gen_bundle_selector (GEN_INT (template0));
ia64_emit_insn_before (b, insn);
b = PREV_INSN (insn);
insn = b;
/* See the comment above in the analogous place for emitting
   nops after the insn.  */
template0 = template1;
template1 = -1;
}
/* Emit nops before the current insn.  */
for (i = 0; i < curr_state->before_nops_num; i++)
{
nop = gen_nop ();
@@ -6819,6 +6892,8 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
abort ();
if (pos % 3 == 0)
{
/* See the comment above in the analogous place for emitting
   nops after the insn.  */
if (template0 < 0)
abort ();
b = gen_bundle_selector (GEN_INT (template0));
@@ -6831,7 +6906,11 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
}
}
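The slot bookkeeping of the backward pass above boils down to how many
window slots each kind of insn consumes.  A standalone sketch, assuming
the asm case has already forced its own bundle as described earlier
(the enum and function are illustrative, not from ia64.c):

    #include <stdio.h>

    enum toy_kind { TOY_GROUP_BARRIER, TOY_ASM, TOY_L_TYPE, TOY_OTHER };

    /* Window slots consumed when moving backward: a group barrier has
       no slot, an asm has already taken a whole bundle elsewhere, an
       L-type insn takes two slots, anything else takes one.  */
    static int
    slots_taken (enum toy_kind k)
    {
      switch (k)
        {
        case TOY_GROUP_BARRIER:
        case TOY_ASM:
          return 0;
        case TOY_L_TYPE:
          return 2;
        default:
          return 1;
        }
    }

    int
    main (void)
    {
      printf ("an L-type insn takes %d slots\n", slots_taken (TOY_L_TYPE));
      return 0;
    }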
if (ia64_tune == PROCESSOR_ITANIUM)
/* Insert additional cycles for MM-insns (MMMUL and MMSHF).
   Itanium1 has a strange design: if the distance between an
   insn and a dependent MM-insn is less than 4 cycles, we get
   an additional 6-cycle stall.  So we make the distance equal
   to 4 cycles if it is less.  */
for (insn = get_next_important_insn (NEXT_INSN (prev_head_insn), tail);
insn != NULL_RTX;
insn = next_insn)
@@ -6843,11 +6922,16 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
abort ();
next_insn = get_next_important_insn (NEXT_INSN (insn), tail);
if (INSN_UID (insn) < clocks_length && add_cycles [INSN_UID (insn)])
/* We found an MM-insn which needs additional cycles.  */
{
rtx last;
int i, j, n;
int pred_stop_p;
/* Now we are searching for the template of the bundle in
   which the MM-insn is placed and for the position of the
   insn in the bundle (0, 1, 2).  We also check whether
   there is a stop before the insn.  */
last = prev_active_insn (insn);
pred_stop_p = recog_memoized (last) == CODE_FOR_insn_group_barrier;
if (pred_stop_p)
@@ -6858,17 +6942,27 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
{
template0 = XINT (XVECEXP (PATTERN (last), 0, 0), 0);
if (template0 == 9)
/* The insn is in an MLX bundle.  Change the template to
   MFI because we will add nops before the insn.  This
   simplifies the subsequent code a lot.  */
PATTERN (last)
= gen_bundle_selector (GEN_INT (2)); /* -> MFI */
break;
}
else if (recog_memoized (last) != CODE_FOR_insn_group_barrier)
n++;
/* Some correctness checks: the stop is not at the bundle
   start, there are no more than 3 insns in the bundle,
   and the MM-insn is not at the start of a bundle with
   template MLX.  */
if ((pred_stop_p && n == 0) || n > 2
|| (template0 == 9 && n != 0))
abort ();
/* Put nops into the bundle in place of the insn's slot and
   the slots after it; the insn itself is re-bundled below.  */
for (j = 3 - n; j > 0; j --)
ia64_emit_insn_before (gen_nop (), insn);
/* This takes into account that we will add N more nops
   before the insn later -- see the code below.  */
add_cycles [INSN_UID (insn)]--;
if (!pred_stop_p || add_cycles [INSN_UID (insn)])
ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
@@ -6877,13 +6971,15 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
add_cycles [INSN_UID (insn)]--;
for (i = add_cycles [INSN_UID (insn)]; i > 0; i--)
{
/* Insert the "MII;" template.  */
ia64_emit_insn_before (gen_bundle_selector (GEN_INT (0)),
insn);
ia64_emit_insn_before (gen_nop (), insn);
ia64_emit_insn_before (gen_nop (), insn);
if (i > 1)
{
/* To decrease code size, we use the "MI;I;" template,
   which lets one bundle account for two cycles.  */
ia64_emit_insn_before
(gen_insn_group_barrier (GEN_INT (3)), insn);
i--;
@@ -6892,10 +6988,15 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
insn);
}
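Since a bundle with a stop in the middle ("MI;I;") accounts for two
stall cycles while a plain "MII;" accounts for one, the loop above needs
about half as many padding bundles as remaining cycles.  A tiny
standalone sketch of that arithmetic (illustrative only):

    #include <stdio.h>

    /* Padding bundles needed for N stall cycles when a mid-bundle stop
       lets one bundle cover two cycles.  */
    static int
    padding_bundles (int n)
    {
      return (n + 1) / 2;
    }

    int
    main (void)
    {
      /* 5 cycles -> 3 bundles: MI;I;  MI;I;  MII;  */
      printf ("%d\n", padding_bundles (5));
      return 0;
    }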
/* Put the MM-insn in the same slot of a bundle with the
same template as the original one. */
ia64_emit_insn_before (gen_bundle_selector (GEN_INT (template0)),
insn);
/* To put the insn in the same slot, add the necessary
   number of nops.  */
for (j = n; j > 0; j --)
ia64_emit_insn_before (gen_nop (), insn);
/* Put back the stop if the original bundle had one.  */
if (pred_stop_p)
ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
insn);
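Putting the pieces of this fixup together: a hypothetical trace
(pseudo-assembly, not exact IA-64 syntax) for an MM-insn originally in
slot 1 of an MFI bundle, with no preceding stop and add_cycles == 2,
would be:

    nop          ; fill slots 1-2 of the original MFI bundle
    nop
    ;;           ; stop ending the original cycle
    .mii         ; one padding bundle (add_cycles was decremented to 1)
    nop
    nop
    nop
    ;;
    .mfi         ; re-emit the original template
    nop          ; filler for slot 0
    mm-insn      ; the MM-insn back in its original slot 1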