Commit 5d50fab3 by Jeff Law Committed by Jeff Law

pa-protos.h (hppa_fpstore_bypass_p): Declare.

        * pa-protos.h (hppa_fpstore_bypass_p): Declare.
        * pa.c (pa_adjust_cost): Remove all true dependency cost
        adjustments.  Also remove support for non-DFA scheduling.
        * pa.md (700, 7100, 7100lc, 7200, 7300): Use bypass mechanism
        to adjust true dependency costs.  Update various comments.
        (7100lc, 7200, 7300 scheduling): Simplify by combining the
        FP ALU & MPY units into a single unit.

From-SVN: r53227
parent a17a104c
2002-05-06 Jeff Law <law@redhat.com>
* pa-protos.h (hppa_fpstore_bypass_p): Declare.
* pa.c (pa_adjust_cost): Remove all true dependency cost
adjustments. Also remove support for non-DFA scheduling.
* pa.md (700, 7100, 7100lc, 7200, 7300): Use bypass mechanism
to adjust true dependency costs. Update various comments.
(7100lc, 7200, 7300 scheduling): Simplify by combining the
FP ALU & MPY units into a single unit.
2002-05-06 Catherine Moore <clm@redhat.com> 2002-05-06 Catherine Moore <clm@redhat.com>
* config/v850/v850.c (compute_register_save_size): Make sure * config/v850/v850.c (compute_register_save_size): Make sure
......
...@@ -103,6 +103,7 @@ extern int is_function_label_plus_const PARAMS ((rtx)); ...@@ -103,6 +103,7 @@ extern int is_function_label_plus_const PARAMS ((rtx));
extern int jump_in_call_delay PARAMS ((rtx)); extern int jump_in_call_delay PARAMS ((rtx));
extern enum reg_class secondary_reload_class PARAMS ((enum reg_class, extern enum reg_class secondary_reload_class PARAMS ((enum reg_class,
enum machine_mode, rtx)); enum machine_mode, rtx));
extern int hppa_fpstore_bypass_p PARAMS ((rtx, rtx));
/* Declare functions defined in pa.c and used in templates. */ /* Declare functions defined in pa.c and used in templates. */
......
...@@ -60,6 +60,33 @@ hppa_use_dfa_pipeline_interface () ...@@ -60,6 +60,33 @@ hppa_use_dfa_pipeline_interface ()
return 1; return 1;
} }
/* Return nonzero if there is a bypass for the output of
OUT_INSN and the fp store IN_INSN. */
int
hppa_fpstore_bypass_p (out_insn, in_insn)
rtx out_insn, in_insn;
{
enum machine_mode store_mode;
enum machine_mode other_mode;
rtx set;
if (recog_memoized (in_insn) < 0
|| get_attr_type (in_insn) != TYPE_FPSTORE
|| recog_memoized (out_insn) < 0)
return 0;
store_mode = GET_MODE (SET_SRC (PATTERN (in_insn)));
set = single_set (out_insn);
if (!set)
return 0;
other_mode = GET_MODE (SET_SRC (set));
return (GET_MODE_SIZE (store_mode) == GET_MODE_SIZE (other_mode));
}
#ifndef DO_FRAME_NOTES #ifndef DO_FRAME_NOTES
#ifdef INCOMING_RETURN_ADDR_RTX #ifdef INCOMING_RETURN_ADDR_RTX
#define DO_FRAME_NOTES 1 #define DO_FRAME_NOTES 1
...@@ -3907,8 +3934,9 @@ pa_adjust_cost (insn, link, dep_insn, cost) ...@@ -3907,8 +3934,9 @@ pa_adjust_cost (insn, link, dep_insn, cost)
{ {
enum attr_type attr_type; enum attr_type attr_type;
/* Don't adjust costs for a pa8000 chip. */ /* Don't adjust costs for a pa8000 chip, also do not adjust any
if (pa_cpu >= PROCESSOR_8000) true dependencies as they are described with bypasses now. */
if (pa_cpu >= PROCESSOR_8000 || REG_NOTE_KIND (link) == 0)
return cost; return cost;
if (! recog_memoized (insn)) if (! recog_memoized (insn))
...@@ -3916,65 +3944,7 @@ pa_adjust_cost (insn, link, dep_insn, cost) ...@@ -3916,65 +3944,7 @@ pa_adjust_cost (insn, link, dep_insn, cost)
attr_type = get_attr_type (insn); attr_type = get_attr_type (insn);
if (REG_NOTE_KIND (link) == 0) if (REG_NOTE_KIND (link) == REG_DEP_ANTI)
{
/* Data dependency; DEP_INSN writes a register that INSN reads some
cycles later. */
if (attr_type == TYPE_FPSTORE)
{
rtx pat = PATTERN (insn);
rtx dep_pat = PATTERN (dep_insn);
if (GET_CODE (pat) == PARALLEL)
{
/* This happens for the fstXs,mb patterns. */
pat = XVECEXP (pat, 0, 0);
}
if (GET_CODE (pat) != SET || GET_CODE (dep_pat) != SET)
/* If this happens, we have to extend this to schedule
optimally. Return 0 for now. */
return 0;
if (rtx_equal_p (SET_DEST (dep_pat), SET_SRC (pat)))
{
if (! recog_memoized (dep_insn))
return 0;
/* DEP_INSN is writing its result to the register
being stored in the fpstore INSN. */
switch (get_attr_type (dep_insn))
{
case TYPE_FPLOAD:
/* This cost 3 cycles, not 2 as the md says for the
700 and 7100, 7100lc, 7200 and 7300. */
return cost + 1;
case TYPE_FPALU:
case TYPE_FPMULSGL:
case TYPE_FPMULDBL:
case TYPE_FPDIVSGL:
case TYPE_FPDIVDBL:
case TYPE_FPSQRTSGL:
case TYPE_FPSQRTDBL:
/* In these important cases, we save one cycle compared to
when flop instruction feed each other. */
return cost - 1;
default:
return cost;
}
}
/* A flop-flop true depenendency where the sizes of the operand
carrying the dependency is difference causes an additional
cycle stall on the 7100lc, 7200, and 7300. Similarly for
a fpload-flop true dependency. */
}
/* For other data dependencies, the default cost specified in the
md is correct. */
return cost;
}
else if (REG_NOTE_KIND (link) == REG_DEP_ANTI)
{ {
/* Anti dependency; DEP_INSN reads a register that INSN writes some /* Anti dependency; DEP_INSN reads a register that INSN writes some
cycles later. */ cycles later. */
...@@ -4010,10 +3980,7 @@ pa_adjust_cost (insn, link, dep_insn, cost) ...@@ -4010,10 +3980,7 @@ pa_adjust_cost (insn, link, dep_insn, cost)
preceding arithmetic operation has finished if preceding arithmetic operation has finished if
the target of the fpload is any of the sources the target of the fpload is any of the sources
(or destination) of the arithmetic operation. */ (or destination) of the arithmetic operation. */
if (hppa_use_dfa_pipeline_interface ()) return insn_default_latency (dep_insn) - 1;
return insn_default_latency (dep_insn) - 1;
else
return cost - 1;
default: default:
return 0; return 0;
...@@ -4048,10 +4015,7 @@ pa_adjust_cost (insn, link, dep_insn, cost) ...@@ -4048,10 +4015,7 @@ pa_adjust_cost (insn, link, dep_insn, cost)
preceding divide or sqrt operation has finished if preceding divide or sqrt operation has finished if
the target of the ALU flop is any of the sources the target of the ALU flop is any of the sources
(or destination) of the divide or sqrt operation. */ (or destination) of the divide or sqrt operation. */
if (hppa_use_dfa_pipeline_interface ()) return insn_default_latency (dep_insn) - 2;
return insn_default_latency (dep_insn) - 2;
else
return cost - 2;
default: default:
return 0; return 0;
...@@ -4101,10 +4065,7 @@ pa_adjust_cost (insn, link, dep_insn, cost) ...@@ -4101,10 +4065,7 @@ pa_adjust_cost (insn, link, dep_insn, cost)
Exception: For PA7100LC, PA7200 and PA7300, the cost Exception: For PA7100LC, PA7200 and PA7300, the cost
is 3 cycles, unless they bundle together. We also is 3 cycles, unless they bundle together. We also
pay the penalty if the second insn is a fpload. */ pay the penalty if the second insn is a fpload. */
if (hppa_use_dfa_pipeline_interface ()) return insn_default_latency (dep_insn) - 1;
return insn_default_latency (dep_insn) - 1;
else
return cost - 1;
default: default:
return 0; return 0;
...@@ -4139,10 +4100,7 @@ pa_adjust_cost (insn, link, dep_insn, cost) ...@@ -4139,10 +4100,7 @@ pa_adjust_cost (insn, link, dep_insn, cost)
preceding divide or sqrt operation has finished if preceding divide or sqrt operation has finished if
the target of the ALU flop is also the target of the target of the ALU flop is also the target of
the divide or sqrt operation. */ the divide or sqrt operation. */
if (hppa_use_dfa_pipeline_interface ()) return insn_default_latency (dep_insn) - 2;
return insn_default_latency (dep_insn) - 2;
else
return cost - 2;
default: default:
return 0; return 0;
......
...@@ -206,20 +206,41 @@ ...@@ -206,20 +206,41 @@
"fpmpy_700*18") "fpmpy_700*18")
(define_insn_reservation "W7" 2 (define_insn_reservation "W7" 2
(and (eq_attr "type" "load,fpload") (and (eq_attr "type" "load")
(eq_attr "cpu" "700")) (eq_attr "cpu" "700"))
"mem_700") "mem_700")
(define_insn_reservation "W8" 3 (define_insn_reservation "W8" 2
(and (eq_attr "type" "store,fpstore") (and (eq_attr "type" "fpload")
(eq_attr "cpu" "700"))
"mem_700")
(define_insn_reservation "W9" 3
(and (eq_attr "type" "store")
(eq_attr "cpu" "700"))
"mem_700*3")
(define_insn_reservation "W10" 3
(and (eq_attr "type" "fpstore")
(eq_attr "cpu" "700")) (eq_attr "cpu" "700"))
"mem_700*3") "mem_700*3")
(define_insn_reservation "W9" 1 (define_insn_reservation "W11" 1
(and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpdivdbl,fpsqrtsgl,fpsqrtdbl,load,fpload,store,fpstore") (and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpdivdbl,fpsqrtsgl,fpsqrtdbl,load,fpload,store,fpstore")
(eq_attr "cpu" "700")) (eq_attr "cpu" "700"))
"dummy_700") "dummy_700")
;; We have a bypass for all computations in the FP unit which feed an
;; FP store as long as the sizes are the same.
(define_bypass 2 "W1,W2" "W10" "hppa_fpstore_bypass_p")
(define_bypass 9 "W3" "W10" "hppa_fpstore_bypass_p")
(define_bypass 11 "W4" "W10" "hppa_fpstore_bypass_p")
(define_bypass 13 "W5" "W10" "hppa_fpstore_bypass_p")
(define_bypass 17 "W6" "W10" "hppa_fpstore_bypass_p")
;; We have an "anti-bypass" for FP loads which feed an FP store.
(define_bypass 4 "W8" "W10" "hppa_fpstore_bypass_p")
;; Function units for the 7100 and 7150. The 7100/7150 can dual-issue ;; Function units for the 7100 and 7150. The 7100/7150 can dual-issue
;; floating point computations with non-floating point computations (fp loads ;; floating point computations with non-floating point computations (fp loads
;; and stores are not fp computations). ;; and stores are not fp computations).
...@@ -228,8 +249,12 @@ ...@@ -228,8 +249,12 @@
;; take two cycles, during which no Dcache operations should be scheduled. ;; take two cycles, during which no Dcache operations should be scheduled.
;; Any special cases are handled in pa_adjust_cost. The 7100, 7150 and 7100LC ;; Any special cases are handled in pa_adjust_cost. The 7100, 7150 and 7100LC
;; all have the same memory characteristics if one disregards cache misses. ;; all have the same memory characteristics if one disregards cache misses.
;;
;; The 7100/7150 has three floating-point units: ALU, MUL, and DIV. ;; The 7100/7150 has three floating-point units: ALU, MUL, and DIV.
;; There's no value in modeling the ALU and MUL separately though
;; since there can never be a functional unit conflict given the
;; latency and issue rates for those units.
;;
;; Timings: ;; Timings:
;; Instruction Time Unit Minimum Distance (unit contention) ;; Instruction Time Unit Minimum Distance (unit contention)
;; fcpy 2 ALU 1 ;; fcpy 2 ALU 1
...@@ -247,11 +272,6 @@ ...@@ -247,11 +272,6 @@
;; fdiv,dbl 15 DIV 15 ;; fdiv,dbl 15 DIV 15
;; fsqrt,sgl 8 DIV 8 ;; fsqrt,sgl 8 DIV 8
;; fsqrt,dbl 15 DIV 15 ;; fsqrt,dbl 15 DIV 15
;;
;; We don't really model the FP ALU/MPY units properly (they are
;; distinct subunits in the FP unit). However, there can never be
;; a functional unit; conflict given the latency and issue rates
;; for those units.
(define_automaton "pa7100") (define_automaton "pa7100")
(define_cpu_unit "i_7100, f_7100,fpmac_7100,fpdivsqrt_7100,mem_7100" "pa7100") (define_cpu_unit "i_7100, f_7100,fpmac_7100,fpdivsqrt_7100,mem_7100" "pa7100")
...@@ -272,21 +292,45 @@ ...@@ -272,21 +292,45 @@
"f_7100+fpdivsqrt_7100,fpdivsqrt_7100*14") "f_7100+fpdivsqrt_7100,fpdivsqrt_7100*14")
(define_insn_reservation "X3" 2 (define_insn_reservation "X3" 2
(and (eq_attr "type" "load,fpload") (and (eq_attr "type" "load")
(eq_attr "cpu" "7100")) (eq_attr "cpu" "7100"))
"i_7100+mem_7100") "i_7100+mem_7100")
(define_insn_reservation "X4" 2 (define_insn_reservation "X4" 2
(and (eq_attr "type" "store,fpstore") (and (eq_attr "type" "fpload")
(eq_attr "cpu" "7100"))
"i_7100+mem_7100")
(define_insn_reservation "X5" 2
(and (eq_attr "type" "store")
(eq_attr "cpu" "7100"))
"i_7100+mem_7100,mem_7100")
(define_insn_reservation "X6" 2
(and (eq_attr "type" "fpstore")
(eq_attr "cpu" "7100")) (eq_attr "cpu" "7100"))
"i_7100+mem_7100,mem_7100") "i_7100+mem_7100,mem_7100")
(define_insn_reservation "X5" 1 (define_insn_reservation "X7" 1
(and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore") (and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore")
(eq_attr "cpu" "7100")) (eq_attr "cpu" "7100"))
"i_7100") "i_7100")
;; We have a bypass for all computations in the FP unit which feed an
;; FP store as long as the sizes are the same.
(define_bypass 1 "X0" "X6" "hppa_fpstore_bypass_p")
(define_bypass 7 "X1" "X6" "hppa_fpstore_bypass_p")
(define_bypass 14 "X2" "X6" "hppa_fpstore_bypass_p")
;; We have an "anti-bypass" for FP loads which feed an FP store.
(define_bypass 3 "X4" "X6" "hppa_fpstore_bypass_p")
;; The 7100LC has three floating-point units: ALU, MUL, and DIV. ;; The 7100LC has three floating-point units: ALU, MUL, and DIV.
;; There's no value in modeling the ALU and MUL separately though
;; since there can never be a functional unit conflict that
;; can be avoided given the latency, issue rates and mandatory
;; one cycle cpu-wide lock for a double precision fp multiply.
;;
;; Timings: ;; Timings:
;; Instruction Time Unit Minimum Distance (unit contention) ;; Instruction Time Unit Minimum Distance (unit contention)
;; fcpy 2 ALU 1 ;; fcpy 2 ALU 1
...@@ -321,29 +365,25 @@ ...@@ -321,29 +365,25 @@
;; ;;
;; load-load pairs ;; load-load pairs
;; store-store pairs ;; store-store pairs
;; fmpyadd,dbl
;; fmpysub,dbl
;; other issue modeling ;; other issue modeling
(define_automaton "pa7100lc") (define_automaton "pa7100lc")
(define_cpu_unit "i0_7100lc, i1_7100lc, f_7100lc" "pa7100lc") (define_cpu_unit "i0_7100lc, i1_7100lc, f_7100lc" "pa7100lc")
(define_cpu_unit "fpalu_7100lc,fpmul_7100lc" "pa7100lc") (define_cpu_unit "fpmac_7100lc" "pa7100lc")
(define_cpu_unit "mem_7100lc" "pa7100lc") (define_cpu_unit "mem_7100lc" "pa7100lc")
(define_insn_reservation "Y0" 2
(and (eq_attr "type" "fpcc,fpalu")
(eq_attr "cpu" "7100LC,7200,7300"))
"f_7100lc,fpalu_7100lc")
;; Double precision multiplies lock the entire CPU for one ;; Double precision multiplies lock the entire CPU for one
;; cycle. There is no way to avoid this lock and trying to ;; cycle. There is no way to avoid this lock and trying to
;; schedule around the lock is pointless and thus there is no ;; schedule around the lock is pointless and thus there is no
;; value in trying to model this lock. Not modeling the lock ;; value in trying to model this lock.
;; allows for a smaller DFA and may reduce register pressure. ;;
(define_insn_reservation "Y1" 2 ;; Not modeling the lock allows us to treat fp multiplies just
(and (eq_attr "type" "fpmulsgl,fpmuldbl") ;; like any other FP alu instruction. It allows for a smaller
;; DFA and may reduce register pressure.
(define_insn_reservation "Y0" 2
(and (eq_attr "type" "fpcc,fpalu,fpmulsgl,fpmuldbl")
(eq_attr "cpu" "7100LC,7200,7300")) (eq_attr "cpu" "7100LC,7200,7300"))
"f_7100lc,fpmul_7100lc") "f_7100lc,fpmac_7100lc")
;; fp division and sqrt instructions lock the entire CPU for ;; fp division and sqrt instructions lock the entire CPU for
;; 7 cycles (single precision) or 14 cycles (double precision). ;; 7 cycles (single precision) or 14 cycles (double precision).
...@@ -351,43 +391,66 @@ ...@@ -351,43 +391,66 @@
;; around the lock is pointless and thus there is no value in ;; around the lock is pointless and thus there is no value in
;; trying to model this lock. Not modeling the lock allows ;; trying to model this lock. Not modeling the lock allows
;; for a smaller DFA and may reduce register pressure. ;; for a smaller DFA and may reduce register pressure.
(define_insn_reservation "Y2" 1 (define_insn_reservation "Y1" 1
(and (eq_attr "type" "fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl") (and (eq_attr "type" "fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl")
(eq_attr "cpu" "7100LC,7200,7300")) (eq_attr "cpu" "7100LC,7200,7300"))
"f_7100lc") "f_7100lc")
(define_insn_reservation "Y2" 2
(and (eq_attr "type" "load")
(eq_attr "cpu" "7100LC,7200,7300"))
"i1_7100lc+mem_7100lc")
(define_insn_reservation "Y3" 2 (define_insn_reservation "Y3" 2
(and (eq_attr "type" "load,fpload") (and (eq_attr "type" "fpload")
(eq_attr "cpu" "7100LC,7200,7300")) (eq_attr "cpu" "7100LC,7200,7300"))
"i1_7100lc+mem_7100lc") "i1_7100lc+mem_7100lc")
(define_insn_reservation "Y4" 2 (define_insn_reservation "Y4" 2
(and (eq_attr "type" "store,fpstore") (and (eq_attr "type" "store")
(eq_attr "cpu" "7100LC"))
"i1_7100lc+mem_7100lc,mem_7100lc")
(define_insn_reservation "Y5" 2
(and (eq_attr "type" "fpstore")
(eq_attr "cpu" "7100LC")) (eq_attr "cpu" "7100LC"))
"i1_7100lc+mem_7100lc,mem_7100lc") "i1_7100lc+mem_7100lc,mem_7100lc")
(define_insn_reservation "Y5" 1 (define_insn_reservation "Y6" 1
(and (eq_attr "type" "shift,nullshift") (and (eq_attr "type" "shift,nullshift")
(eq_attr "cpu" "7100LC,7200,7300")) (eq_attr "cpu" "7100LC,7200,7300"))
"i1_7100lc") "i1_7100lc")
(define_insn_reservation "Y6" 1 (define_insn_reservation "Y7" 1
(and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore,shift,nullshift") (and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore,shift,nullshift")
(eq_attr "cpu" "7100LC,7200,7300")) (eq_attr "cpu" "7100LC,7200,7300"))
"(i0_7100lc|i1_7100lc)") "(i0_7100lc|i1_7100lc)")
;; The 7200 has a store-load penalty ;; The 7200 has a store-load penalty
(define_insn_reservation "Y7" 2 (define_insn_reservation "Y8" 2
(and (eq_attr "type" "store,fpstore") (and (eq_attr "type" "store")
(eq_attr "cpu" "7200"))
"i1_7100lc,mem_7100lc")
(define_insn_reservation "Y9" 2
(and (eq_attr "type" "fpstore")
(eq_attr "cpu" "7200")) (eq_attr "cpu" "7200"))
"i1_7100lc,mem_7100lc") "i1_7100lc,mem_7100lc")
;; The 7300 has no penalty for store-store or store-load ;; The 7300 has no penalty for store-store or store-load
(define_insn_reservation "Y8" 2 (define_insn_reservation "Y10" 2
(and (eq_attr "type" "store,fpstore") (and (eq_attr "type" "store")
(eq_attr "cpu" "7300")) (eq_attr "cpu" "7300"))
"i1_7100lc") "i1_7100lc")
(define_insn_reservation "Y11" 2
(and (eq_attr "type" "fpstore")
(eq_attr "cpu" "7300"))
"i1_7100lc")
;; We have an "anti-bypass" for FP loads which feed an FP store.
(define_bypass 3 "Y3" "Y5,Y9,Y11" "hppa_fpstore_bypass_p")
;; Scheduling for the PA8000 is somewhat different than scheduling for a ;; Scheduling for the PA8000 is somewhat different than scheduling for a
;; traditional architecture. ;; traditional architecture.
;; ;;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment