Commit 74835ed8 by Richard Henderson Committed by Richard Henderson

alpha.h (ISSUE_RATE): Define.

        * alpha.h (ISSUE_RATE): Define.
        * alpha.c (alpha_adjust_cost): Handle EV5 mult delay; don't apply
        EV4 adjustments to EV5.
        * alpha.md: Remove all scaling from function unit delays.  Rework
        EV5 function units to match the CPU.
        (umuldi3_highpart): EV5 added the IMULH insn class.

From-SVN: r15916
parent ee80685a
Wed Oct 15 18:16:05 1997 Richard Henderson <rth@cygnus.com>
Tune Haifa scheduler for Alpha:
* alpha.h (ISSUE_RATE): Define.
* alpha.c (alpha_adjust_cost): Handle EV5 mult delay; don't apply
EV4 adjustments to EV5.
* alpha.md: Remove all scaling from function unit delays. Rework
EV5 function units to match the CPU.
(umuldi3_highpart): EV5 added the IMULH insn class.
Wed Oct 15 17:42:41 1997 Jeffrey A Law (law@cygnus.com)
* pa.c (following_call): Fail if the CALL_INSN is an indirect
......
......@@ -1150,7 +1150,7 @@ alpha_adjust_cost (insn, link, dep_insn, cost)
rtx dep_insn;
int cost;
{
rtx set;
rtx set, set_src;
/* If the dependence is an anti-dependence, there is no cost. For an
output dependence, there is sometimes a cost, but it doesn't seem
......@@ -1159,12 +1159,12 @@ alpha_adjust_cost (insn, link, dep_insn, cost)
if (REG_NOTE_KIND (link) != 0)
return 0;
/* EV5 costs are as given in alpha.md; exceptions are given here. */
if (alpha_cpu == PROCESSOR_EV5)
{
/* And the lord DEC sayeth: "A special bypass provides an effective
latency of 0 cycles for an ICMP or ILOG insn producing the test
operand of an IBR or CMOV insn." */
/* On EV5, "A special bypass provides an effective latency of 0
cycles for an ICMP or ILOG insn producing the test operand of an
IBR or CMOV insn." */
if (recog_memoized (dep_insn) >= 0
&& (get_attr_type (dep_insn) == TYPE_ICMP
|| get_attr_type (dep_insn) == TYPE_ILOG)
......@@ -1173,67 +1173,104 @@ alpha_adjust_cost (insn, link, dep_insn, cost)
|| (get_attr_type (insn) == TYPE_CMOV
&& !((set = single_set (dep_insn)) != 0
&& GET_CODE (PATTERN (insn)) == SET
&& GET_CODE (SET_SRC (PATTERN (insn))) == IF_THEN_ELSE
&& (rtx_equal_p (SET_DEST (set),
XEXP (SET_SRC (PATTERN (insn)), 1))
|| rtx_equal_p (SET_DEST (set),
XEXP (SET_SRC (PATTERN (insn)), 2)))))))
return 1;
return cost;
}
/* If INSN is a store insn and DEP_INSN is setting the data being stored,
we can sometimes lower the cost. */
if (recog_memoized (insn) >= 0 && get_attr_type (insn) == TYPE_ST
&& (set = single_set (dep_insn)) != 0
&& GET_CODE (PATTERN (insn)) == SET
&& rtx_equal_p (SET_DEST (set), SET_SRC (PATTERN (insn))))
switch (get_attr_type (dep_insn))
{
case TYPE_LD:
/* No savings here. */
return cost;
case TYPE_IMULL:
case TYPE_IMULQ:
/* In these cases, we save one cycle. */
return cost - 2;
&& (set_src = SET_SRC (PATTERN (insn)),
GET_CODE (set_src) == IF_THEN_ELSE)
&& (set = SET_DEST (set),
rtx_equal_p (set, XEXP (set_src, 1))
|| rtx_equal_p (set, XEXP (set_src, 2)))))))
return 0;
default:
/* In all other cases, we save two cycles. */
return MAX (0, cost - 4);
}
/* On EV5 it takes longer to get data to the multiplier than to
anywhere else, so increase costs. */
if (recog_memoized (insn) >= 0
&& recog_memoized (dep_insn) >= 0
&& (get_attr_type (insn) == TYPE_IMULL
|| get_attr_type (insn) == TYPE_IMULQ
|| get_attr_type (insn) == TYPE_IMULH)
&& (set = single_set (dep_insn)) != 0
&& GET_CODE (PATTERN (insn)) == SET
&& (set_src = SET_SRC (PATTERN (insn)),
GET_CODE (set_src) == MULT)
&& (set = SET_DEST (set),
rtx_equal_p (set, XEXP (set_src, 0))
|| rtx_equal_p (set, XEXP (set_src, 1))))
{
switch (get_attr_type (insn))
{
case TYPE_LD:
case TYPE_CMOV:
case TYPE_IMULL:
case TYPE_IMULQ:
case TYPE_IMULH:
return cost + 1;
case TYPE_JSR:
case TYPE_IADD:
case TYPE_ILOG:
case TYPE_SHIFT:
case TYPE_ICMP:
return cost + 2;
}
}
}
else
{
/* On EV4, if INSN is a store insn and DEP_INSN is setting the data
being stored, we can sometimes lower the cost. */
/* Another case that needs adjustment is an arithmetic or logical
operation. Its cost is usually one cycle, but we default it to
two in the MD file. The only case that it is actually two is
for the address in loads and stores. */
if (recog_memoized (insn) >= 0 && get_attr_type (insn) == TYPE_ST
&& (set = single_set (dep_insn)) != 0
&& GET_CODE (PATTERN (insn)) == SET
&& rtx_equal_p (SET_DEST (set), SET_SRC (PATTERN (insn))))
{
switch (get_attr_type (dep_insn))
{
case TYPE_LD:
/* No savings here. */
return cost;
case TYPE_IMULL:
case TYPE_IMULQ:
case TYPE_IMULH:
/* In these cases, we save one cycle. */
return cost - 1;
default:
/* In all other cases, we save two cycles. */
return MAX (0, cost - 2);
}
}
if (recog_memoized (dep_insn) >= 0
&& (get_attr_type (dep_insn) == TYPE_IADD
|| get_attr_type (dep_insn) == TYPE_ILOG))
switch (get_attr_type (insn))
{
case TYPE_LD:
case TYPE_ST:
return cost;
/* Another case that needs adjustment is an arithmetic or logical
operation. Its cost is usually one cycle, but we default it to
two in the MD file. The only case that it is actually two is
for the address in loads and stores. */
default:
return 2;
}
if (recog_memoized (dep_insn) >= 0
&& (get_attr_type (dep_insn) == TYPE_IADD
|| get_attr_type (dep_insn) == TYPE_ILOG))
{
switch (get_attr_type (insn))
{
case TYPE_LD:
case TYPE_ST:
return cost;
default:
return 1;
}
}
/* The final case is when a compare feeds into an integer branch. The cost
is only one cycle in that case. */
/* The final case is when a compare feeds into an integer branch;
the cost is only one cycle in that case. */
if (recog_memoized (dep_insn) >= 0
&& get_attr_type (dep_insn) == TYPE_ICMP
&& recog_memoized (insn) >= 0
&& get_attr_type (insn) == TYPE_IBR)
return 2;
if (recog_memoized (dep_insn) >= 0
&& get_attr_type (dep_insn) == TYPE_ICMP
&& recog_memoized (insn) >= 0
&& get_attr_type (insn) == TYPE_IBR)
return 1;
}
/* Otherwise, return the default cost. */
return cost;
}
......
......@@ -1595,6 +1595,9 @@ extern void final_prescan_insn ();
our own exit function. */
#define HAVE_ATEXIT
/* The EV4 is dual issue; EV5 is quad issue. */
#define ISSUE_RATE (alpha_cpu == PROCESSOR_EV4 ? 2 : 4)
/* Compute the cost of computing a constant rtl expression RTX
whose rtx-code is CODE. The body of this macro is a portion
of a switch statement. If the code is computed here,
......
......@@ -33,7 +33,7 @@
;; separately.
(define_attr "type"
"ld,st,ibr,fbr,jsr,iadd,ilog,shift,cmov,icmp,imull,imulq,fadd,fmul,fcpys,fdivs,fdivt,ldsym,isubr"
"ld,st,ibr,fbr,jsr,iadd,ilog,shift,cmov,icmp,imull,imulq,imulh,fadd,fmul,fcpys,fdivs,fdivt,ldsym,isubr,misc"
(const_string "iadd"))
;; The TRAP_TYPE attribute marks instructions that may generate traps
......@@ -41,35 +41,30 @@
;; is desired).
(define_attr "trap" "yes,no" (const_string "no"))
;; For the EV4 we include four function units: ABOX, which computes the address,
;; BBOX, used for branches, EBOX, used for integer operations, and FBOX,
;; used for FP operations.
;;
;; We assume that we have been successful in getting double issues and
;; hence multiply all costs by two insns per cycle. The minimum time in
;; a function unit is 2 cycle, which will tend to produce the double
;; issues.
;; For the EV4 we include four function units: ABOX, which computes
;; the address, BBOX, used for branches, EBOX, used for integer
;; operations, and FBOX, used for FP operations.
;; Memory delivers its result in three cycles.
(define_function_unit "ev4_abox" 1 0
(and (eq_attr "cpu" "ev4")
(eq_attr "type" "ld,st"))
6 2)
(eq_attr "type" "ld,ldsym,st"))
3 1)
;; Branches have no delay cost, but do tie up the unit for two cycles.
(define_function_unit "ev4_bbox" 1 1
(and (eq_attr "cpu" "ev4")
(eq_attr "type" "ibr,fbr,jsr"))
4 4)
2 2)
;; Arithmetic insns normally have their results available after two
;; cycles. There are a number of exceptions. They are encoded in
;; Arithmetic insns normally have their results available after
;; two cycles. There are a number of exceptions. They are encoded in
;; ADJUST_COST. Some of the other insns have similar exceptions.
(define_function_unit "ev4_ebox" 1 0
(and (eq_attr "cpu" "ev4")
(eq_attr "type" "iadd,ilog,ldsym,shift,cmov,icmp"))
4 2)
(eq_attr "type" "iadd,ilog,shift,cmov,icmp"))
2 1)
;; These really don't take up the integer pipeline, but they do occupy
;; IBOX1; we approximate here.
......@@ -77,135 +72,145 @@
(define_function_unit "ev4_ebox" 1 0
(and (eq_attr "cpu" "ev4")
(eq_attr "type" "imull"))
42 2)
21 1)
(define_function_unit "ev4_ebox" 1 0
(and (eq_attr "cpu" "ev4")
(eq_attr "type" "imulq"))
46 2)
(eq_attr "type" "imulq,imulh"))
23 1)
(define_function_unit "ev4_imult" 1 0
(and (eq_attr "cpu" "ev4")
(eq_attr "type" "imull"))
42 38)
21 19)
(define_function_unit "ev4_imult" 1 0
(and (eq_attr "cpu" "ev4")
(eq_attr "type" "imulq"))
46 42)
(eq_attr "type" "imulq,imulh"))
23 21)
(define_function_unit "ev4_fbox" 1 0
(and (eq_attr "cpu" "ev4")
(eq_attr "type" "fadd,fmul,fcpys"))
12 2)
6 1)
(define_function_unit "ev4_fbox" 1 0
(and (eq_attr "cpu" "ev4")
(eq_attr "type" "fdivs"))
68 0)
34 0)
(define_function_unit "ev4_fbox" 1 0
(and (eq_attr "cpu" "ev4")
(eq_attr "type" "fdivt"))
126 0)
63 0)
(define_function_unit "ev4_divider" 1 0
(and (eq_attr "cpu" "ev4")
(eq_attr "type" "fdivs"))
68 60)
34 30)
(define_function_unit "ev4_divider" 1 0
(and (eq_attr "cpu" "ev4")
(eq_attr "type" "fdivt"))
126 118)
64 59)
;; EV5 scheduling. EV5 can issue 4 insns per clock.
;; Multiply all costs by 4.
;; EV5 has two integer units.
;; EV5 has two asymmetric integer units. Model this with ebox,e0,e1.
;; Everything uses ebox, and those that require particular pipes grab
;; those as well.
(define_function_unit "ev5_ebox" 2 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "iadd,ilog,icmp,ldsym"))
4 4)
(eq_attr "type" "iadd,ilog,icmp,st,shift,imull,imulq,imulh"))
1 1)
;; Memory takes at least 2 clocks.
;; Conditional moves always take 2 ticks.
;; Memory takes at least 2 clocks, and load cannot dual issue with stores.
(define_function_unit "ev5_ebox" 2 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "ld,cmov"))
8 4)
(eq_attr "type" "ld,ldsym"))
2 1)
(define_function_unit "ev5_e0" 1 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "ld,ldsym"))
0 1
[(eq_attr "type" "st")])
;; Loads can dual issue. Store cannot; nor can loads + stores.
;; Model this with a mythical load/store unit.
(define_function_unit "ev5_ldst" 1 0
;; Conditional moves always take 2 ticks.
(define_function_unit "ev5_ebox" 2 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "ld"))
8 4 [(eq_attr "type" "st")])
(eq_attr "type" "cmov"))
2 1)
(define_function_unit "ev5_ldst" 1 0
;; Stores, shifts, and multiplies can only issue to E0
(define_function_unit "ev5_e0" 1 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "st"))
4 4)
1 1)
(define_function_unit "ev5_ebox" 2 0
;; But shifts and multiplies don't conflict with loads.
(define_function_unit "ev5_e0" 1 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "imull"))
32 4)
(eq_attr "type" "shift,imull,imulq,imulh"))
1 1
[(eq_attr "type" "st,shift,imull,imulq,imulh")])
(define_function_unit "ev5_ebox" 2 0
;; Branches can only issue to E1
(define_function_unit "ev5_e1" 1 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "imulq"))
48 4)
(eq_attr "type" "ibr,jsr"))
1 1)
;; Multiplies also use the integer multiplier.
(define_function_unit "ev5_imult" 1 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "imull"))
16 8)
8 4)
(define_function_unit "ev5_imult" 1 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "imulq"))
48 32)
12 8)
;; There is only 1 shifter/zapper.
(define_function_unit "ev5_shift" 1 0
(define_function_unit "ev5_imult" 1 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "shift"))
4 4)
(eq_attr "type" "imulh"))
14 8)
;; Similarly for the FPU we have two asymmetric units. But fcpys can issue
;; on either so we have to play the game again.
;; We pretend EV5 has symmetrical 2 fpus,
;; even though cpys is the only insn that can issue on either unit.
(define_function_unit "ev5_fpu" 2 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "fadd,fmul,fcpys"))
16 4)
(eq_attr "type" "fadd,fmul,fcpys,fbr,fdivs,fdivt"))
4 1)
;; Multiplies (resp. adds) also use the fmul (resp. fadd) units.
(define_function_unit "ev5_fpmul" 1 0
(define_function_unit "ev5_fm" 1 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "fmul"))
16 4)
4 1)
(define_function_unit "ev5_fpadd" 1 0
(define_function_unit "ev5_fa" 1 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "fadd"))
16 4)
4 1)
(define_function_unit "ev5_fpadd" 1 0
(define_function_unit "ev5_fa" 1 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "fbr"))
4 4)
1 1)
(define_function_unit "ev5_fpadd" 1 0
(define_function_unit "ev5_fa" 1 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "fdivs"))
60 4)
15 1)
(define_function_unit "ev5_fpadd" 1 0
(define_function_unit "ev5_fa" 1 0
(and (eq_attr "cpu" "ev5")
(eq_attr "type" "fdivt"))
88 4)
22 1)
;; First define the arithmetic insns. Note that the 32-bit forms also
;; sign-extend.
......@@ -607,7 +612,7 @@
(const_int 64))))]
""
"umulh %1,%2,%0"
[(set_attr "type" "imulq")])
[(set_attr "type" "imulh")])
(define_insn ""
[(set (match_operand:DI 0 "register_operand" "=r")
......@@ -618,7 +623,7 @@
(const_int 64))))]
""
"umulh %1,%2,%0"
[(set_attr "type" "imulq")])
[(set_attr "type" "imulh")])
;; The divide and remainder operations always take their inputs from
;; r24 and r25, put their output in r27, and clobber r23 and r28.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment