Commit cdc1afa3 by James Greenhalgh

[Patch 2/2 ARM/AArch64] Add a new Cortex-A53 scheduling model

	* config/arm/aarch-common-protos.h
	(aarch_accumulator_forwarding): New.
	(aarch_forward_to_shift_is_not_shifted_reg): Likewise.
	* config/arm/aarch-common.c (aarch_accumulator_forwarding): New.
	(aarch_forward_to_shift_is_not_shifted_reg): Likewise.
	* config/arm/cortex-a53.md: Rewrite.

From-SVN: r228324
gcc/ChangeLog
2015-10-01  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/arm/aarch-common-protos.h
	(aarch_accumulator_forwarding): New.
	(aarch_forward_to_shift_is_not_shifted_reg): Likewise.
	* config/arm/aarch-common.c (aarch_accumulator_forwarding): New.
	(aarch_forward_to_shift_is_not_shifted_reg): Likewise.
	* config/arm/cortex-a53.md: Rewrite.

2015-10-01  Richard Biener  <rguenther@suse.de>

	* gimple-match.h (mprts_hook): Declare.
gcc/config/arm/aarch-common-protos.h
@@ -23,7 +23,9 @@
#ifndef GCC_AARCH_COMMON_PROTOS_H
#define GCC_AARCH_COMMON_PROTOS_H

extern int aarch_accumulator_forwarding (rtx_insn *, rtx_insn *);
extern int aarch_crypto_can_dual_issue (rtx_insn *, rtx_insn *);
extern int aarch_forward_to_shift_is_not_shifted_reg (rtx_insn *, rtx_insn *);
extern bool aarch_rev16_p (rtx);
extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode);
extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode);
gcc/config/arm/aarch-common.c
@@ -394,6 +394,112 @@ arm_mac_accumulator_is_result (rtx producer, rtx consumer)
          && !reg_overlap_mentioned_p (result, op1));
}
/* Return non-zero if the destination of PRODUCER feeds the accumulator
operand of an MLA-like operation. */
int
aarch_accumulator_forwarding (rtx_insn *producer, rtx_insn *consumer)
{
rtx producer_set = single_set (producer);
rtx consumer_set = single_set (consumer);
/* We are looking for a SET feeding a SET. */
if (!producer_set || !consumer_set)
return 0;
rtx dest = SET_DEST (producer_set);
rtx mla = SET_SRC (consumer_set);
/* We're looking for a register SET. */
if (!REG_P (dest))
return 0;
rtx accumulator;
/* Strip a zero_extend. */
if (GET_CODE (mla) == ZERO_EXTEND)
mla = XEXP (mla, 0);
switch (GET_CODE (mla))
{
case PLUS:
/* Possibly an MADD. */
if (GET_CODE (XEXP (mla, 0)) == MULT)
accumulator = XEXP (mla, 1);
else
return 0;
break;
case MINUS:
/* Possibly an MSUB. */
if (GET_CODE (XEXP (mla, 1)) == MULT)
accumulator = XEXP (mla, 0);
else
return 0;
break;
case FMA:
{
/* Possibly an FMADD/FMSUB/FNMADD/FNMSUB. */
if (REG_P (XEXP (mla, 1))
&& REG_P (XEXP (mla, 2))
&& (REG_P (XEXP (mla, 0))
|| GET_CODE (XEXP (mla, 0)) == NEG))
{
/* FMADD/FMSUB. */
accumulator = XEXP (mla, 2);
}
else if (REG_P (XEXP (mla, 1))
&& GET_CODE (XEXP (mla, 2)) == NEG
&& (REG_P (XEXP (mla, 0))
|| GET_CODE (XEXP (mla, 0)) == NEG))
{
/* FNMADD/FNMSUB. */
accumulator = XEXP (XEXP (mla, 2), 0);
}
else
return 0;
break;
}
default:
/* Not an MLA-like operation. */
return 0;
}
return (REGNO (dest) == REGNO (accumulator));
}
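For illustration, here is a minimal standalone sketch of the matching logic above. This is not GCC code: the `expr` type and its fields are hypothetical stand-ins for GCC's rtx, used only to show how the accumulator operand is selected for the PLUS (MADD), MINUS (MSUB) and FMA shapes.

#include <stdio.h>

enum code { REG, MULT, PLUS, MINUS, FMA, NEG };

struct expr
{
  enum code code;
  int regno;                 /* Valid when code == REG.  */
  const struct expr *op[3];  /* Operands; unused slots are NULL.  */
};

/* Select the accumulator operand, mirroring the switch in
   aarch_accumulator_forwarding (simplified: the real code also checks
   that the other FMA operands are registers or negations).  */
static const struct expr *
accumulator_operand (const struct expr *mla)
{
  switch (mla->code)
    {
    case PLUS:   /* MADD: (plus (mult a b) acc).  */
      return mla->op[0]->code == MULT ? mla->op[1] : NULL;
    case MINUS:  /* MSUB: (minus acc (mult a b)).  */
      return mla->op[1]->code == MULT ? mla->op[0] : NULL;
    case FMA:    /* FMADD and friends: (fma a b acc), acc possibly negated.  */
      return mla->op[2]->code == NEG ? mla->op[2]->op[0] : mla->op[2];
    default:
      return NULL;  /* Not an MLA-like operation.  */
    }
}

int
main (void)
{
  struct expr a = { REG, 1, { NULL, NULL, NULL } };
  struct expr b = { REG, 2, { NULL, NULL, NULL } };
  struct expr acc = { REG, 3, { NULL, NULL, NULL } };
  struct expr mul = { MULT, 0, { &a, &b, NULL } };
  struct expr madd = { PLUS, 0, { &mul, &acc, NULL } };
  const struct expr *found = accumulator_operand (&madd);

  /* If the producer's destination were register 3, the bypass applies.  */
  printf ("accumulator regno: %d\n", found ? found->regno : -1);
  return 0;
}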
/* Return nonzero if the CONSUMER instruction is some sort of
arithmetic or logic + shift operation, and the register we are
writing in PRODUCER is not used in a register shift by register
operation. */
int
aarch_forward_to_shift_is_not_shifted_reg (rtx_insn *producer,
rtx_insn *consumer)
{
rtx value, op;
rtx early_op;
if (!arm_get_set_operands (producer, consumer, &value, &op))
return 0;
if ((early_op = arm_find_shift_sub_rtx (op)))
{
if (REG_P (early_op))
early_op = op;
/* Any other canonicalisation of a shift is a shift-by-constant
so we don't care. */
if (GET_CODE (early_op) == ASHIFT)
return (!REG_P (XEXP (early_op, 0))
|| !REG_P (XEXP (early_op, 1)));
else
return 1;
}
return 0;
}
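Again a toy illustration, not GCC code: the point of the function above is that an ALU-plus-shift consumer such as ADD r0, r1, r2, LSL #2 shifts by a constant, so the forwarded value is safe to use early, whereas ADD r0, r1, r2, LSL r3 shifts by a register and must wait the full latency. The types below are hypothetical.

#include <stdbool.h>
#include <stdio.h>

enum operand_kind { OP_REG, OP_CONST };

struct shift
{
  enum operand_kind shifted;  /* The value being shifted.  */
  enum operand_kind amount;   /* The shift count: register or immediate.  */
};

/* Mirror of the ASHIFT test above: the early forwarding path is usable
   unless both the shifted value and the shift amount are registers.  */
static bool
forward_is_safe (const struct shift *s)
{
  return !(s->shifted == OP_REG && s->amount == OP_REG);
}

int
main (void)
{
  struct shift lsl_imm = { OP_REG, OP_CONST };  /* r2, LSL #2  */
  struct shift lsl_reg = { OP_REG, OP_REG };    /* r2, LSL r3  */

  printf ("LSL #2: %s\n", forward_is_safe (&lsl_imm) ? "bypass" : "full latency");
  printf ("LSL r3: %s\n", forward_is_safe (&lsl_reg) ? "bypass" : "full latency");
  return 0;
}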
/* Return non-zero if the consumer (a multiply-accumulate instruction)
   has an accumulator dependency on the result of the producer (a
   multiplication instruction) and no other dependency on that result.  */
gcc/config/arm/cortex-a53.md
@@ -22,345 +22,700 @@
(define_automaton "cortex_a53")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; General-purpose functional units.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; We use slot0 and slot1 to model constraints on which instructions may
;; dual-issue.

(define_cpu_unit "cortex_a53_slot0" "cortex_a53")
(define_cpu_unit "cortex_a53_slot1" "cortex_a53")

(define_reservation "cortex_a53_slot_any"
                    "cortex_a53_slot0\
                     |cortex_a53_slot1")

(define_reservation "cortex_a53_single_issue"
                    "cortex_a53_slot0\
                     +cortex_a53_slot1")

;; Used to model load and store pipelines.  Load/store instructions
;; can dual-issue with other instructions, but two load/stores cannot
;; simultaneously issue.

(define_cpu_unit "cortex_a53_store" "cortex_a53")
(define_cpu_unit "cortex_a53_load" "cortex_a53")
(define_cpu_unit "cortex_a53_ls_agen" "cortex_a53")

;; Used to model a branch pipeline.  Branches can dual-issue with other
;; instructions (except when those instructions take multiple cycles
;; to issue).

(define_cpu_unit "cortex_a53_branch" "cortex_a53")

;; Used to model an integer divide pipeline.

(define_cpu_unit "cortex_a53_idiv" "cortex_a53")

;; Used to model an integer multiply/multiply-accumulate pipeline.

(define_cpu_unit "cortex_a53_imul" "cortex_a53")

;; Model general structural hazards, for wherever we need them.

(define_cpu_unit "cortex_a53_hazard" "cortex_a53")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_shift" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "adr,shift_imm,shift_reg,mov_imm,mvn_imm"))
  "cortex_a53_slot_any")

(define_insn_reservation "cortex_a53_alu_rotate_imm" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "rotate_imm"))
  "(cortex_a53_slot1)
   | (cortex_a53_single_issue)")

(define_insn_reservation "cortex_a53_alu" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,
                        alu_sreg,alus_sreg,logic_reg,logics_reg,
                        adc_imm,adcs_imm,adc_reg,adcs_reg,
                        bfm,csel,clz,rbit,rev,alu_dsp_reg,
                        mov_reg,mvn_reg,
                        mrs,multiple,no_insn"))
  "cortex_a53_slot_any")

(define_insn_reservation "cortex_a53_alu_shift" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "alu_shift_imm,alus_shift_imm,
                        crc,logic_shift_imm,logics_shift_imm,
                        alu_ext,alus_ext,
                        extend,mov_shift,mvn_shift"))
  "cortex_a53_slot_any")

(define_insn_reservation "cortex_a53_alu_shift_reg" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "alu_shift_reg,alus_shift_reg,
                        logic_shift_reg,logics_shift_reg,
                        mov_shift_reg,mvn_shift_reg"))
  "cortex_a53_slot_any+cortex_a53_hazard")

(define_insn_reservation "cortex_a53_mul" 3
  (and (eq_attr "tune" "cortexa53")
       (ior (eq_attr "mul32" "yes")
            (eq_attr "mul64" "yes")))
  "cortex_a53_slot_any+cortex_a53_imul")

;; From the perspective of the GCC scheduling state machine, if we wish to
;; model an instruction as serialising other instructions, we are best to do
;; so by modelling it as taking very few cycles.  Scheduling many other
;; instructions underneath it at the cost of freedom to pick from the
;; ready list is likely to hurt us more than it helps.  However, we do
;; want to model some resource and latency cost for divide instructions in
;; order to avoid divides ending up too lumpy.

(define_insn_reservation "cortex_a53_div" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "udiv,sdiv"))
  "cortex_a53_slot0,cortex_a53_idiv*2")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/store instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO: load<n> is not prescriptive about how much data is to be loaded.
;; This is most obvious for LDRD from AArch32 and LDP (X register) from
;; AArch64, both are tagged load2 but LDP will load 128-bits compared to
;; LDRD which is 64-bits.
;;
;; For the below, we assume AArch64 X-registers for load2, and AArch32
;; registers for load3/load4.

(define_insn_reservation "cortex_a53_load1" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "load_byte,load1,load_acq"))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_load")

(define_insn_reservation "cortex_a53_store1" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "store1,store_rel"))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_store")

;; Model AArch64-sized LDP Xm, Xn, [Xa]

(define_insn_reservation "cortex_a53_load2" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "load2"))
  "cortex_a53_single_issue+cortex_a53_ls_agen,
   cortex_a53_load+cortex_a53_slot0,
   cortex_a53_load")

(define_insn_reservation "cortex_a53_store2" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "store2"))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_store")

;; Model AArch32-sized LDM Ra, {Rm, Rn, Ro}

(define_insn_reservation "cortex_a53_load3plus" 6
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "load3,load4"))
  "cortex_a53_single_issue+cortex_a53_ls_agen,
   cortex_a53_load+cortex_a53_slot0,
   cortex_a53_load")

(define_insn_reservation "cortex_a53_store3plus" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "store3,store4"))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_store+cortex_a53_slot0,
   cortex_a53_store")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branches.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Model all branches as dual-issuable from either execution slot, which
;; is not strictly true for all cases (indirect branches).

(define_insn_reservation "cortex_a53_branch" 0
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "branch,call"))
  "cortex_a53_slot_any,cortex_a53_branch")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; General-purpose register bypasses
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Model bypasses for unshifted operands to ALU instructions.
(define_bypass 1 "cortex_a53_shift"
"cortex_a53_shift")
(define_bypass 1 "cortex_a53_alu,
cortex_a53_alu_shift*,
cortex_a53_alu_rotate_imm,
cortex_a53_shift"
"cortex_a53_alu")
(define_bypass 2 "cortex_a53_alu,
cortex_a53_alu_shift*"
"cortex_a53_alu_shift*"
"aarch_forward_to_shift_is_not_shifted_reg")
;; In our model, we allow any general-purpose register operation to
;; bypass to the accumulator operand of an integer MADD-like operation.
(define_bypass 1 "cortex_a53_alu*,
cortex_a53_load*,
cortex_a53_mul"
"cortex_a53_mul"
"aarch_accumulator_forwarding")
;; Model a bypass from MLA/MUL to many ALU instructions.
(define_bypass 2 "cortex_a53_mul"
"cortex_a53_alu,
cortex_a53_alu_shift*")
;; We get neater schedules by allowing an MLA/MUL to feed an
;; early load address dependency to a load.
(define_bypass 2 "cortex_a53_mul"
"cortex_a53_load*"
"arm_early_load_addr_dep")
;; Model bypasses for loads which are to be consumed by the ALU.
(define_bypass 2 "cortex_a53_load1"
"cortex_a53_alu")
(define_bypass 3 "cortex_a53_load1"
"cortex_a53_alu_shift*")
;; Model a bypass for ALU instructions feeding stores.
(define_bypass 1 "cortex_a53_alu*"
"cortex_a53_store1,
cortex_a53_store2,
cortex_a53_store3plus"
"arm_no_early_store_addr_dep")
;; Model a bypass for load and multiply instructions feeding stores.
(define_bypass 2 "cortex_a53_mul,
cortex_a53_load1,
cortex_a53_load2,
cortex_a53_load3plus"
"cortex_a53_store1,
cortex_a53_store2,
cortex_a53_store3plus"
"arm_no_early_store_addr_dep")
;; Model a GP->FP register move as similar to stores.
(define_bypass 1 "cortex_a53_alu*"
"cortex_a53_r2f")
(define_bypass 2 "cortex_a53_mul,
cortex_a53_load1,
cortex_a53_load2,
cortex_a53_load3plus"
"cortex_a53_r2f")
;; Shifts feeding Load/Store addresses may not be ready in time.
(define_bypass 3 "cortex_a53_shift"
"cortex_a53_load*"
"arm_early_load_addr_dep")
(define_bypass 3 "cortex_a53_shift"
"cortex_a53_store*"
"arm_early_store_addr_dep")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point/Advanced SIMD.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define_automaton "cortex_a53_advsimd")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Broad Advanced SIMD type categorisation
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define_attr "cortex_a53_advsimd_type"
"advsimd_alu, advsimd_alu_q,
advsimd_mul, advsimd_mul_q,
advsimd_div_s, advsimd_div_s_q,
advsimd_div_d, advsimd_div_d_q,
advsimd_load_64, advsimd_store_64,
advsimd_load_128, advsimd_store_128,
advsimd_load_lots, advsimd_store_lots,
unknown"
(cond [
(eq_attr "type" "neon_add, neon_qadd, neon_add_halve, neon_sub, neon_qsub,\
neon_sub_halve, neon_abs, neon_neg, neon_qneg,\
neon_qabs, neon_abd, neon_minmax, neon_compare,\
neon_compare_zero, neon_arith_acc, neon_reduc_add,\
neon_reduc_add_acc, neon_reduc_minmax,\
neon_logic, neon_tst, neon_shift_imm,\
neon_shift_reg, neon_shift_acc, neon_sat_shift_imm,\
neon_sat_shift_reg, neon_ins, neon_move,\
neon_permute, neon_zip, neon_tbl1,\
neon_tbl2, neon_tbl3, neon_tbl4, neon_bsl,\
neon_cls, neon_cnt, neon_dup,\
neon_ext, neon_rbit, neon_rev,\
neon_fp_abd_s, neon_fp_abd_d,\
neon_fp_abs_s, neon_fp_abs_d,\
neon_fp_addsub_s, neon_fp_addsub_d, neon_fp_compare_s,\
neon_fp_compare_d, neon_fp_minmax_s,\
neon_fp_minmax_d, neon_fp_neg_s, neon_fp_neg_d,\
neon_fp_reduc_add_s, neon_fp_reduc_add_d,\
neon_fp_reduc_minmax_s, neon_fp_reduc_minmax_d,\
neon_fp_cvt_widen_h, neon_fp_to_int_s,neon_fp_to_int_d,\
neon_int_to_fp_s, neon_int_to_fp_d, neon_fp_round_s,\
neon_fp_recpe_s, neon_fp_recpe_d, neon_fp_recps_s,\
neon_fp_recps_d, neon_fp_recpx_s, neon_fp_recpx_d,\
neon_fp_rsqrte_s, neon_fp_rsqrte_d, neon_fp_rsqrts_s,\
neon_fp_rsqrts_d")
(const_string "advsimd_alu")
(eq_attr "type" "neon_add_q, neon_add_widen, neon_add_long,\
neon_qadd_q, neon_add_halve_q, neon_add_halve_narrow_q,\
neon_sub_q, neon_sub_widen, neon_sub_long,\
neon_qsub_q, neon_sub_halve_q, neon_sub_halve_narrow_q,\
neon_abs_q, neon_neg_q, neon_qneg_q, neon_qabs_q,\
neon_abd_q, neon_abd_long, neon_minmax_q,\
neon_compare_q, neon_compare_zero_q,\
neon_arith_acc_q, neon_reduc_add_q,\
neon_reduc_add_long, neon_reduc_add_acc_q,\
neon_reduc_minmax_q, neon_logic_q, neon_tst_q,\
neon_shift_imm_q, neon_shift_imm_narrow_q,\
neon_shift_imm_long, neon_shift_reg_q,\
neon_shift_acc_q, neon_sat_shift_imm_q,\
neon_sat_shift_imm_narrow_q, neon_sat_shift_reg_q,\
neon_ins_q, neon_move_q, neon_move_narrow_q,\
neon_permute_q, neon_zip_q,\
neon_tbl1_q, neon_tbl2_q, neon_tbl3_q,\
neon_tbl4_q, neon_bsl_q, neon_cls_q, neon_cnt_q,\
neon_dup_q, neon_ext_q, neon_rbit_q,\
neon_rev_q, neon_fp_abd_s_q, neon_fp_abd_d_q,\
neon_fp_abs_s_q, neon_fp_abs_d_q,\
neon_fp_addsub_s_q, neon_fp_addsub_d_q,\
neon_fp_compare_s_q, neon_fp_compare_d_q,\
neon_fp_minmax_s_q, neon_fp_minmax_d_q,\
neon_fp_cvt_widen_s, neon_fp_neg_s_q, neon_fp_neg_d_q,\
neon_fp_reduc_add_s_q, neon_fp_reduc_add_d_q,\
neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d_q,\
neon_fp_cvt_narrow_s_q, neon_fp_cvt_narrow_d_q,\
neon_fp_to_int_s_q, neon_fp_to_int_d_q,\
neon_int_to_fp_s_q, neon_int_to_fp_d_q,\
neon_fp_round_s_q,\
neon_fp_recpe_s_q, neon_fp_recpe_d_q,\
neon_fp_recps_s_q, neon_fp_recps_d_q,\
neon_fp_recpx_s_q, neon_fp_recpx_d_q,\
neon_fp_rsqrte_s_q, neon_fp_rsqrte_d_q,\
neon_fp_rsqrts_s_q, neon_fp_rsqrts_d_q")
(const_string "advsimd_alu_q")
(eq_attr "type" "neon_mul_b, neon_mul_h, neon_mul_s,\
neon_mul_h_scalar, neon_mul_s_scalar,\
neon_sat_mul_b, neon_sat_mul_h, neon_sat_mul_s,\
neon_sat_mul_h_scalar, neon_sat_mul_s_scalar,\
neon_mla_b, neon_mla_h, neon_mla_s,\
neon_mla_h_scalar, neon_mla_s_scalar,\
neon_fp_mul_s, neon_fp_mul_s_scalar,\
neon_fp_mul_d, neon_fp_mla_s,\
neon_fp_mla_s_scalar, neon_fp_mla_d")
(const_string "advsimd_mul")
(eq_attr "type" "neon_mul_b_q, neon_mul_h_q, neon_mul_s_q,\
neon_mul_b_long, neon_mul_h_long, neon_mul_s_long,\
neon_mul_d_long, neon_mul_h_scalar_q,\
neon_mul_s_scalar_q, neon_mul_h_scalar_long,\
neon_mul_s_scalar_long, neon_sat_mul_b_q,\
neon_sat_mul_h_q, neon_sat_mul_s_q,\
neon_sat_mul_b_long, neon_sat_mul_h_long,\
neon_sat_mul_s_long, neon_sat_mul_h_scalar_q,\
neon_sat_mul_s_scalar_q, neon_sat_mul_h_scalar_long,\
neon_sat_mul_s_scalar_long, neon_mla_b_q,\
neon_mla_h_q, neon_mla_s_q, neon_mla_b_long,\
neon_mla_h_long, neon_mla_s_long,\
neon_mla_h_scalar_q, neon_mla_s_scalar_q,\
neon_mla_h_scalar_long, neon_mla_s_scalar_long,\
neon_sat_mla_b_long, neon_sat_mla_h_long,\
neon_sat_mla_s_long, neon_sat_mla_h_scalar_long,\
neon_sat_mla_s_scalar_long,\
neon_fp_mul_s_q, neon_fp_mul_s_scalar_q,\
neon_fp_mul_d_q, neon_fp_mul_d_scalar_q,\
neon_fp_mla_s_q, neon_fp_mla_s_scalar_q,\
neon_fp_mla_d_q, neon_fp_mla_d_scalar_q")
(const_string "advsimd_mul_q")
(eq_attr "type" "neon_fp_sqrt_s, neon_fp_div_s")
(const_string "advsimd_div_s")
(eq_attr "type" "neon_fp_sqrt_s_q, neon_fp_div_s_q")
(const_string "advsimd_div_s_q")
(eq_attr "type" "neon_fp_sqrt_d, neon_fp_div_d")
(const_string "advsimd_div_d")
(eq_attr "type" "neon_fp_sqrt_d_q, neon_fp_div_d_q")
(const_string "advsimd_div_d_q")
(eq_attr "type" "neon_ldr, neon_load1_1reg,\
neon_load1_all_lanes, neon_load1_all_lanes_q,\
neon_load1_one_lane, neon_load1_one_lane_q")
(const_string "advsimd_load_64")
(eq_attr "type" "neon_str, neon_store1_1reg,\
neon_store1_one_lane,neon_store1_one_lane_q")
(const_string "advsimd_store_64")
(eq_attr "type" "neon_load1_1reg_q, neon_load1_2reg,\
neon_load2_2reg,\
neon_load2_all_lanes, neon_load2_all_lanes_q,\
neon_load2_one_lane, neon_load2_one_lane_q")
(const_string "advsimd_load_128")
(eq_attr "type" "neon_store1_1reg_q, neon_store1_2reg,\
neon_store2_2reg,\
neon_store2_one_lane, neon_store2_one_lane_q")
(const_string "advsimd_store_128")
(eq_attr "type" "neon_load1_2reg_q, neon_load1_3reg, neon_load1_3reg_q,\
neon_load1_4reg, neon_load1_4reg_q, \
neon_load2_2reg_q, neon_load2_4reg,\
neon_load2_4reg_q, neon_load3_3reg,\
neon_load3_3reg_q, neon_load3_all_lanes,\
neon_load3_all_lanes_q, neon_load3_one_lane,\
neon_load3_one_lane_q, neon_load4_4reg,\
neon_load4_4reg_q, neon_load4_all_lanes,\
neon_load4_all_lanes_q, neon_load4_one_lane,\
neon_load4_one_lane_q, neon_ldp, neon_ldp_q")
(const_string "advsimd_load_lots")
(eq_attr "type" "neon_store1_2reg_q, neon_store1_3reg,\
neon_store1_3reg_q, neon_store1_4reg,\
neon_store1_4reg_q, neon_store2_2reg_q,\
neon_store2_4reg, neon_store2_4reg_q,\
neon_store3_3reg, neon_store3_3reg_q,\
neon_store3_one_lane, neon_store3_one_lane_q,\
neon_store4_4reg, neon_store4_4reg_q,\
neon_store4_one_lane, neon_store4_one_lane_q,\
neon_stp, neon_stp_q")
(const_string "advsimd_store_lots")]
(const_string "unknown")))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point/Advanced SIMD functional units.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; We model the Advanced SIMD unit as two 64-bit units, each with three
;; pipes, FP_ALU, FP_MUL, FP_DIV. We also give convenient reservations
;; for 128-bit Advanced SIMD instructions, which use both units.
;; The floating-point/Advanced SIMD ALU pipelines.
(define_cpu_unit "cortex_a53_fp_alu_lo,\
cortex_a53_fp_alu_hi"
"cortex_a53_advsimd")
(define_reservation "cortex_a53_fp_alu"
"cortex_a53_fp_alu_lo\
|cortex_a53_fp_alu_hi")
(define_reservation "cortex_a53_fp_alu_q"
"cortex_a53_fp_alu_lo\
+cortex_a53_fp_alu_hi")
;; The floating-point/Advanced SIMD multiply/multiply-accumulate
;; pipelines.
(define_cpu_unit "cortex_a53_fp_mul_lo,\
cortex_a53_fp_mul_hi"
"cortex_a53_advsimd")
(define_reservation "cortex_a53_fp_mul"
"cortex_a53_fp_mul_lo\
|cortex_a53_fp_mul_hi")
(define_reservation "cortex_a53_fp_mul_q"
"cortex_a53_fp_mul_lo\
+cortex_a53_fp_mul_hi")
;; Floating-point/Advanced SIMD divide/square root.
(define_cpu_unit "cortex_a53_fp_div_lo,\
cortex_a53_fp_div_hi"
"cortex_a53_advsimd")
;; Once we choose a pipe, stick with it for three simulated cycles.
(define_reservation "cortex_a53_fp_div"
"(cortex_a53_fp_div_lo*3)\
|(cortex_a53_fp_div_hi*3)")
(define_reservation "cortex_a53_fp_div_q"
"(cortex_a53_fp_div_lo*3)\
+(cortex_a53_fp_div_hi*3)")
;; Cryptographic extensions
(define_cpu_unit "cortex_a53_crypto"
"cortex_a53_advsimd")
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point arithmetic.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_fpalu" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "ffariths, fadds, ffarithd, faddd, fmov,
                        f_cvt, fcmps, fcmpd, fcsel, f_rints, f_rintd,
                        f_minmaxs, f_minmaxd"))
  "cortex_a53_slot_any,cortex_a53_fp_alu")

(define_insn_reservation "cortex_a53_fconst" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "fconsts,fconstd"))
  "cortex_a53_slot_any,cortex_a53_fp_alu")

(define_insn_reservation "cortex_a53_fpmul" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "fmuls,fmuld"))
  "cortex_a53_slot_any,cortex_a53_fp_mul")

;; For multiply-accumulate, model the add (accumulate) as being issued
;; after the multiply completes.

(define_insn_reservation "cortex_a53_fpmac" 8
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "fmacs,fmacd,ffmas,ffmad"))
  "cortex_a53_slot_any,cortex_a53_fp_mul,
   nothing*3, cortex_a53_fp_alu")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point to/from core transfers.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_r2f" 6
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_mcr,f_mcrr,f_cvti2f,
                        neon_from_gp, neon_from_gp_q"))
  "cortex_a53_slot_any,cortex_a53_store,
   nothing,cortex_a53_fp_alu")

(define_insn_reservation "cortex_a53_f2r" 6
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_mrc,f_mrrc,f_cvtf2i,
                        neon_to_gp, neon_to_gp_q"))
  "cortex_a53_slot_any,cortex_a53_fp_alu,
   nothing,cortex_a53_store")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point flag transfer.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_f_flags" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_flag"))
  "cortex_a53_slot_any")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point load/store.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_f_load_64" 4
  (and (eq_attr "tune" "cortexa53")
       (ior (eq_attr "type" "f_loads,f_loadd")
            (eq_attr "cortex_a53_advsimd_type"
                     "advsimd_load_64")))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_load")

(define_insn_reservation "cortex_a53_f_load_many" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type"
                "advsimd_load_128,advsimd_load_lots"))
  "cortex_a53_single_issue+cortex_a53_ls_agen,
   cortex_a53_load+cortex_a53_slot0,
   cortex_a53_load")

(define_insn_reservation "cortex_a53_f_store_64" 0
  (and (eq_attr "tune" "cortexa53")
       (ior (eq_attr "type" "f_stores,f_stored")
            (eq_attr "cortex_a53_advsimd_type"
                     "advsimd_store_64")))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_store")

(define_insn_reservation "cortex_a53_f_store_many" 0
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type"
                "advsimd_store_128,advsimd_store_lots"))
  "cortex_a53_slot_any+cortex_a53_ls_agen,
   cortex_a53_store+cortex_a53_slot0,
   cortex_a53_store")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Advanced SIMD.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Either we want to model use of the ALU pipe, the multiply pipe or the
;; divide/sqrt pipe.  In all cases we need to check if we are a 64-bit
;; operation (in which case we model dual-issue without penalty)
;; or a 128-bit operation in which case we require in our model that we
;; issue from slot 0.

(define_insn_reservation "cortex_a53_advsimd_alu" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type" "advsimd_alu"))
  "cortex_a53_slot_any,cortex_a53_fp_alu")

(define_insn_reservation "cortex_a53_advsimd_alu_q" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type" "advsimd_alu_q"))
  "cortex_a53_slot0,cortex_a53_fp_alu_q")

(define_insn_reservation "cortex_a53_advsimd_mul" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type" "advsimd_mul"))
  "cortex_a53_slot_any,cortex_a53_fp_mul")

(define_insn_reservation "cortex_a53_advsimd_mul_q" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type" "advsimd_mul_q"))
  "cortex_a53_slot0,cortex_a53_fp_mul_q")

;; SIMD Dividers.

(define_insn_reservation "cortex_a53_advsimd_div_s" 14
  (and (eq_attr "tune" "cortexa53")
       (ior (eq_attr "type" "fdivs,fsqrts")
            (eq_attr "cortex_a53_advsimd_type" "advsimd_div_s")))
  "cortex_a53_slot0,cortex_a53_fp_mul,
   cortex_a53_fp_div")

(define_insn_reservation "cortex_a53_advsimd_div_d" 29
  (and (eq_attr "tune" "cortexa53")
       (ior (eq_attr "type" "fdivd,fsqrtd")
            (eq_attr "cortex_a53_advsimd_type" "advsimd_div_d")))
  "cortex_a53_slot0,cortex_a53_fp_mul,
   cortex_a53_fp_div")

(define_insn_reservation "cortex_a53_advsimd_div_s_q" 14
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type" "advsimd_div_s_q"))
  "cortex_a53_single_issue,cortex_a53_fp_mul_q,
   cortex_a53_fp_div_q")

(define_insn_reservation "cortex_a53_advsimd_divd_q" 29
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "cortex_a53_advsimd_type" "advsimd_div_d_q"))
  "cortex_a53_single_issue,cortex_a53_fp_mul_q,
   cortex_a53_fp_div_q")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ARMv8-A Cryptographic extensions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; We want AESE and AESMC to end up consecutive to one another.

(define_insn_reservation "cortex_a53_crypto_aese" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_aese"))
  "cortex_a53_slot0")

(define_insn_reservation "cortex_a53_crypto_aesmc" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_aesmc"))
  "cortex_a53_slot_any")

;; SHA1H

(define_insn_reservation "cortex_a53_crypto_sha1_fast" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_sha1_fast"))
  "cortex_a53_slot_any,cortex_a53_crypto")

(define_insn_reservation "cortex_a53_crypto_sha256_fast" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_sha256_fast"))
  "cortex_a53_slot0,cortex_a53_crypto")

(define_insn_reservation "cortex_a53_crypto_sha1_xor" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_sha1_xor"))
  "cortex_a53_slot0,cortex_a53_crypto")

(define_insn_reservation "cortex_a53_crypto_sha_slow" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_sha1_slow, crypto_sha256_slow"))
  "cortex_a53_slot0,cortex_a53_crypto")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point/Advanced SIMD register bypasses.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Model the late use of the accumulator operand for floating-point
;; multiply-accumulate operations as a bypass reducing the latency
;; of producing instructions to near zero.

(define_bypass 1 "cortex_a53_fp*,
                  cortex_a53_r2f,
                  cortex_a53_f_load*"
                 "cortex_a53_fpmac"
                 "aarch_accumulator_forwarding")

;; Model a bypass from the result of an FP operation to a use.

(define_bypass 4 "cortex_a53_fpalu,
                  cortex_a53_fpmul"
                 "cortex_a53_fpalu,
                  cortex_a53_fpmul,
                  cortex_a53_fpmac,
                  cortex_a53_advsimd_div*")

;; We want AESE and AESMC to end up consecutive to one another.

(define_bypass 0 "cortex_a53_crypto_aese"
                 "cortex_a53_crypto_aesmc"
                 "aarch_crypto_can_dual_issue")