Commit 00a8574a by Wilco Dijkstra

Enable instruction fusion of dependent AESE; AESMC and AESD; AESIMC pairs.

This can give up to 2x speedup on many AArch64 implementations. Also model
the crypto instructions on Cortex-A57 according to the Optimization Guide.

    gcc/
        * config/aarch64/aarch64.c (cortexa53_tunings): Enable AES fusion.
        (cortexa57_tunings): Likewise.
        (cortexa72_tunings): Likewise.
        (arch_macro_fusion_pair_p): Add support for AES fusion.
        * config/aarch64/aarch64-fusion-pairs.def: Add AES_AESMC entry.
        * config/arm/aarch-common.c (aarch_crypto_can_dual_issue):
        Allow virtual registers before reload so early scheduling works.
        * config/arm/cortex-a57.md (cortex_a57_crypto_simple): Use
        correct latency and pipeline.
        (cortex_a57_crypto_complex): Likewise.
        (cortex_a57_crypto_xor): Likewise.
        (define_bypass): Add AES bypass.

From-SVN: r233268
parent 24a179f8
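For context (not part of the commit): the pattern the fusion targets is a dependent AESE;AESMC pair, which GCC already emits back to back for the ACLE AES intrinsics. A minimal C sketch, assuming a crypto-capable target such as -O2 -mcpu=cortex-a57+crypto:

#include <arm_neon.h>

/* Illustrative only: one AES encryption round using the arm_neon.h
   intrinsics.  The compiler lowers this to an AESE followed immediately by
   a dependent AESMC, which is exactly the pair the new fusion keeps
   adjacent during scheduling.  */
uint8x16_t
aes_round (uint8x16_t state, uint8x16_t round_key)
{
  state = vaeseq_u8 (state, round_key);  /* AESE: AddRoundKey/ShiftRows/SubBytes.  */
  state = vaesmcq_u8 (state);            /* AESMC: MixColumns, consumes the AESE result.  */
  return state;
}

The decryption pair (AESD;AESIMC) arises the same way from vaesdq_u8 followed by vaesimcq_u8.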
gcc/ChangeLog
+2016-02-10  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* config/aarch64/aarch64.c (cortexa53_tunings): Enable AES fusion.
+	(cortexa57_tunings): Likewise.
+	(cortexa72_tunings): Likewise.
+	(arch_macro_fusion_pair_p): Add support for AES fusion.
+	* config/aarch64/aarch64-fusion-pairs.def: Add AES_AESMC entry.
+	* config/arm/aarch-common.c (aarch_crypto_can_dual_issue):
+	Allow virtual registers before reload so early scheduling works.
+	* config/arm/cortex-a57.md (cortex_a57_crypto_simple): Use
+	correct latency and pipeline.
+	(cortex_a57_crypto_complex): Likewise.
+	(cortex_a57_crypto_xor): Likewise.
+	(define_bypass): Add AES bypass.
+
 2016-02-10  Richard Biener  <rguenther@suse.de>
 
 	PR tree-optimization/69726
gcc/config/aarch64/aarch64-fusion-pairs.def
@@ -33,4 +33,5 @@ AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD)
 AARCH64_FUSION_PAIR ("movk+movk", MOVK_MOVK)
 AARCH64_FUSION_PAIR ("adrp+ldr", ADRP_LDR)
 AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
+AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
gcc/config/aarch64/aarch64.c
@@ -451,7 +451,7 @@ static const struct tune_params cortexa53_tunings =
   &generic_branch_cost,
   4, /* memmov_cost */
   2, /* issue_rate */
-  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
   8, /* function_align. */
   8, /* jump_align. */
@@ -476,7 +476,7 @@ static const struct tune_params cortexa57_tunings =
   &cortexa57_branch_cost,
   4, /* memmov_cost */
   3, /* issue_rate */
-  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
   16, /* function_align. */
   8, /* jump_align. */
@@ -502,7 +502,7 @@ static const struct tune_params cortexa72_tunings =
   &generic_branch_cost,
   4, /* memmov_cost */
   3, /* issue_rate */
-  (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
   16, /* function_align. */
   8, /* jump_align. */
@@ -13328,6 +13328,10 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
         }
     }
 
+  if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_AES_AESMC)
+       && aarch_crypto_can_dual_issue (prev, curr))
+    return true;
+
   if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
       && any_condjump_p (curr))
     {
gcc/config/arm/aarch-common.c
@@ -58,8 +58,11 @@ aarch_crypto_can_dual_issue (rtx_insn *producer_insn, rtx_insn *consumer_insn)
     {
       unsigned int regno = REGNO (SET_DEST (producer_set));
 
-      return REGNO (SET_DEST (consumer_set)) == regno
-             && REGNO (XVECEXP (consumer_src, 0, 0)) == regno;
+      /* Before reload the registers are virtual, so the destination of
+         consumer_set doesn't need to match.  */
+      return (REGNO (SET_DEST (consumer_set)) == regno || !reload_completed)
+             && REGNO (XVECEXP (consumer_src, 0, 0)) == regno;
     }
 
   return 0;
gcc/config/arm/cortex-a57.md
@@ -747,20 +747,20 @@
                        neon_fp_sqrt_s_q, neon_fp_sqrt_d_q"))
   "ca57_cx2_block*3")
 
-(define_insn_reservation "cortex_a57_crypto_simple" 4
+(define_insn_reservation "cortex_a57_crypto_simple" 3
   (and (eq_attr "tune" "cortexa57")
        (eq_attr "type" "crypto_aese,crypto_aesmc,crypto_sha1_fast,crypto_sha256_fast"))
-  "ca57_cx2")
+  "ca57_cx1")
 
-(define_insn_reservation "cortex_a57_crypto_complex" 7
+(define_insn_reservation "cortex_a57_crypto_complex" 6
   (and (eq_attr "tune" "cortexa57")
        (eq_attr "type" "crypto_sha1_slow,crypto_sha256_slow"))
-  "ca57_cx2+(ca57_cx2_issue,ca57_cx2)")
+  "ca57_cx1*2")
 
-(define_insn_reservation "cortex_a57_crypto_xor" 7
+(define_insn_reservation "cortex_a57_crypto_xor" 6
   (and (eq_attr "tune" "cortexa57")
        (eq_attr "type" "crypto_sha1_xor"))
-  "(ca57_cx1+ca57_cx2)")
+  "(ca57_cx1*2)|(ca57_cx2*2)")
 
 ;; We lie with calls.  They take up all issue slots, but are otherwise
 ;; not harmful.
@@ -797,3 +797,8 @@
 
 (define_bypass 1 "cortex_a57_*"
                  "cortex_a57_call,cortex_a57_branch")
+
+;; AESE+AESMC and AESD+AESIMC pairs forward with zero latency
+(define_bypass 0 "cortex_a57_crypto_simple"
+                 "cortex_a57_crypto_simple"
+                 "aarch_crypto_can_dual_issue")