Commit 9f5361c8 by Kyrylo Tkachov Committed by Kyrylo Tkachov

[AArch64] Support for LDP/STP of Q-registers

This patch adds support for generating LDPs and STPs of Q-registers.
This allows for more compact code generation and makes better use of the ISA.

It's implemented in a straightforward way by allowing 16-byte modes in the
sched-fusion machinery and adding appropriate peepholes in aarch64-ldpstp.md
as well as the patterns themselves in aarch64-simd.md.

It adds a new no_ldp_stp_qregs tuning flag.
When the flag is set, the peepholes in aarch64-ldpstp.md do not merge the
operations together into PARALLELs, and the sched fusion check no longer
brings such loads and stores together. This is enough to avoid
forming the pairs when the tuning flag is set.

I didn't see any non-noise performance effect on SPEC2017 on Cortex-A72 and Cortex-A53.

        * config/aarch64/aarch64-tuning-flags.def (no_ldp_stp_qregs): New.
        * config/aarch64/aarch64.c (xgene1_tunings): Add
        AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS to tune_flags.
        (aarch64_mode_valid_for_sched_fusion_p):
        Allow 16-byte modes.
        (aarch64_classify_address): Allow 16-byte modes for load_store_pair_p.
        * config/aarch64/aarch64-ldpstp.md: Add peepholes for LDP STP of
        128-bit modes.
        * config/aarch64/aarch64-simd.md (load_pair<VQ:mode><VQ2:mode>):
        New pattern.
        (vec_store_pair<VQ:mode><VQ2:mode>): Likewise.
        * config/aarch64/iterators.md (VQ2): New mode iterator.

        * gcc.target/aarch64/ldp_stp_q.c: New test.
        * gcc.target/aarch64/stp_vec_128_1.c: Likewise.
        * gcc.target/aarch64/ldp_stp_q_disable.c: Likewise.

From-SVN: r261796
parent de840bde
2018-06-20 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/aarch64/aarch64-tuning-flags.def (no_ldp_stp_qregs): New.
* config/aarch64/aarch64.c (xgene1_tunings): Add
AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS to tune_flags.
(aarch64_mode_valid_for_sched_fusion_p):
Allow 16-byte modes.
(aarch64_classify_address): Allow 16-byte modes for load_store_pair_p.
* config/aarch64/aarch64-ldpstp.md: Add peepholes for LDP STP of
128-bit modes.
* config/aarch64/aarch64-simd.md (load_pair<VQ:mode><VQ2:mode>):
New pattern.
(vec_store_pair<VQ:mode><VQ2:mode>): Likewise.
* config/aarch64/iterators.md (VQ2): New mode iterator.
2018-06-20 Martin Liska <mliska@suse.cz>
* tree-switch-conversion.c (jump_table_cluster::can_be_handled):
......
......@@ -91,6 +91,37 @@
aarch64_swap_ldrstr_operands (operands, false);
})
;; Merge two consecutive loads of 128-bit vector registers (modes drawn from
;; the VQ and VQ2 iterators) into a single PARALLEL of the two sets.
;; The peephole applies only when TARGET_SIMD holds, when
;; aarch64_operands_ok_for_ldpstp accepts the operands (called with true for
;; this load form), and when the active tuning does not set
;; AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS.
;; NOTE(review): aarch64_swap_ldrstr_operands is not visible here; it
;; presumably reorders the operands into ascending address order before the
;; PARALLEL is emitted -- confirm against its definition.
(define_peephole2
[(set (match_operand:VQ 0 "register_operand" "")
(match_operand:VQ 1 "memory_operand" ""))
(set (match_operand:VQ2 2 "register_operand" "")
(match_operand:VQ2 3 "memory_operand" ""))]
"TARGET_SIMD
&& aarch64_operands_ok_for_ldpstp (operands, true, <VQ:MODE>mode)
&& (aarch64_tune_params.extra_tuning_flags
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
[(parallel [(set (match_dup 0) (match_dup 1))
(set (match_dup 2) (match_dup 3))])]
{
aarch64_swap_ldrstr_operands (operands, true);
})
;; Merge two consecutive stores of 128-bit vector registers (modes drawn from
;; the VQ and VQ2 iterators) into a single PARALLEL of the two sets.
;; The peephole applies only when TARGET_SIMD holds, when
;; aarch64_operands_ok_for_ldpstp accepts the operands (called with false for
;; this store form), and when the active tuning does not set
;; AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS.
;; NOTE(review): aarch64_swap_ldrstr_operands is not visible here; it
;; presumably reorders the operands into ascending address order before the
;; PARALLEL is emitted -- confirm against its definition.
(define_peephole2
[(set (match_operand:VQ 0 "memory_operand" "")
(match_operand:VQ 1 "register_operand" ""))
(set (match_operand:VQ2 2 "memory_operand" "")
(match_operand:VQ2 3 "register_operand" ""))]
"TARGET_SIMD
&& aarch64_operands_ok_for_ldpstp (operands, false, <VQ:MODE>mode)
&& (aarch64_tune_params.extra_tuning_flags
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
[(parallel [(set (match_dup 0) (match_dup 1))
(set (match_dup 2) (match_dup 3))])]
{
aarch64_swap_ldrstr_operands (operands, false);
})
;; Handle sign/zero extended consecutive load/store.
(define_peephole2
......
......@@ -205,6 +205,34 @@
[(set_attr "type" "neon_stp")]
)
;; Load two 128-bit vector registers with one LDP of Q registers.
;; Operands 0 and 2 are the destination vector registers; operand 1 is the
;; first memory location (it must satisfy aarch64_mem_pair_operand) and
;; operand 3 is the second memory location.
;; The insn condition requires the address of operand 3 to be exactly the
;; address of operand 1 plus the size of the VQ mode, i.e. the two locations
;; are contiguous.  Only operand 1 appears in the output template, so the
;; second address is implied by the first.
(define_insn "load_pair<VQ:mode><VQ2:mode>"
[(set (match_operand:VQ 0 "register_operand" "=w")
(match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump"))
(set (match_operand:VQ2 2 "register_operand" "=w")
(match_operand:VQ2 3 "memory_operand" "m"))]
"TARGET_SIMD
&& rtx_equal_p (XEXP (operands[3], 0),
plus_constant (Pmode,
XEXP (operands[1], 0),
GET_MODE_SIZE (<VQ:MODE>mode)))"
"ldp\\t%q0, %q2, %1"
[(set_attr "type" "neon_ldp_q")]
)
;; Store two 128-bit vector registers with one STP of Q registers.
;; Operands 1 and 3 are the source vector registers; operand 0 is the first
;; memory location (it must satisfy aarch64_mem_pair_operand) and operand 2
;; is the second memory location.
;; The insn condition requires the address of operand 2 to be exactly the
;; address of operand 0 plus the size of the VQ mode, i.e. the two locations
;; are contiguous.  Only operand 0 appears in the output template, so the
;; second address is implied by the first.
(define_insn "vec_store_pair<VQ:mode><VQ2:mode>"
[(set (match_operand:VQ 0 "aarch64_mem_pair_operand" "=Ump")
(match_operand:VQ 1 "register_operand" "w"))
(set (match_operand:VQ2 2 "memory_operand" "=m")
(match_operand:VQ2 3 "register_operand" "w"))]
"TARGET_SIMD && rtx_equal_p (XEXP (operands[2], 0),
plus_constant (Pmode,
XEXP (operands[0], 0),
GET_MODE_SIZE (<VQ:MODE>mode)))"
"stp\\t%q1, %q3, %0"
[(set_attr "type" "neon_stp_q")]
)
(define_split
[(set (match_operand:VQ 0 "register_operand" "")
(match_operand:VQ 1 "register_operand" ""))]
......
......@@ -41,4 +41,7 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
are not considered cheap. */
AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
/* Disallow load/store pair instructions on Q-registers.  When this flag is
   set, the Q-register load/store-pair peepholes do not apply and 16-byte
   vector modes are not accepted for sched fusion.  */
AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp_qregs", NO_LDP_STP_QREGS)
#undef AARCH64_EXTRA_TUNING_OPTION
......@@ -880,7 +880,7 @@ static const struct tune_params xgene1_tunings =
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
(AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
&generic_prefetch_tune
};
......@@ -5690,7 +5690,10 @@ aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
return mode == SImode || mode == DImode
|| mode == SFmode || mode == DFmode
|| (aarch64_vector_mode_supported_p (mode)
&& known_eq (GET_MODE_SIZE (mode), 8));
&& (known_eq (GET_MODE_SIZE (mode), 8)
|| (known_eq (GET_MODE_SIZE (mode), 16)
&& (aarch64_tune_params.extra_tuning_flags
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
}
/* Return true if REGNO is a virtual pointer register, or an eliminable
......@@ -5847,7 +5850,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
if (load_store_pair_p)
return ((known_eq (GET_MODE_SIZE (mode), 4)
|| known_eq (GET_MODE_SIZE (mode), 8))
|| known_eq (GET_MODE_SIZE (mode), 8)
|| known_eq (GET_MODE_SIZE (mode), 16))
&& aarch64_offset_7bit_signed_scaled_p (mode, offset));
else
return (offset_9bit_signed_unscaled_p (mode, offset)
......@@ -5907,7 +5911,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
if (load_store_pair_p)
return ((known_eq (GET_MODE_SIZE (mode), 4)
|| known_eq (GET_MODE_SIZE (mode), 8))
|| known_eq (GET_MODE_SIZE (mode), 8)
|| known_eq (GET_MODE_SIZE (mode), 16))
&& aarch64_offset_7bit_signed_scaled_p (mode, offset));
else
return offset_9bit_signed_unscaled_p (mode, offset);
......
......@@ -84,6 +84,9 @@
;; Quad vector modes.
(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
;; Copy of the quad vector mode iterator VQ.  A second iterator lets a
;; pattern with two vector operands iterate the mode of each operand
;; independently, as in load_pair<VQ:mode><VQ2:mode>.
(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
;; Quad integer vector modes.
(define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
......
2018-06-20 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.target/aarch64/ldp_stp_q.c: New test.
* gcc.target/aarch64/stp_vec_128_1.c: Likewise.
* gcc.target/aarch64/ldp_stp_q_disable.c: Likewise.
2018-06-20 Martin Liska <mliska@suse.cz>
* gcc.dg/tree-ssa/vrp104.c: Grep just for GIMPLE IL.
......
/* { dg-options "-O2 -moverride=tune=none" } */
typedef float float32x4_t __attribute__ ((__vector_size__ ((16))));
float32x4_t arr[4][4];
void
foo (float32x4_t x, float32x4_t y)
{
arr[0][1] = x;
arr[1][0] = y;
arr[2][0] = x;
arr[1][1] = y;
arr[0][2] = x;
arr[0][3] = y;
arr[1][2] = x;
arr[2][1] = y;
arr[3][0] = x;
arr[3][1] = y;
arr[2][2] = x;
arr[1][3] = y;
arr[2][3] = x;
arr[3][2] = y;
}
/* { dg-final { scan-assembler-times "stp\tq\[0-9\]+, q\[0-9\]" 7 } } */
/* { dg-options "-O2 -moverride=tune=no_ldp_stp_qregs" } */
typedef float float32x4_t __attribute__ ((__vector_size__ ((16))));
float32x4_t arr[4][4];
void
foo (float32x4_t x, float32x4_t y)
{
arr[0][1] = x;
arr[1][0] = y;
arr[2][0] = x;
arr[1][1] = y;
arr[0][2] = x;
arr[0][3] = y;
arr[1][2] = x;
arr[2][1] = y;
arr[3][0] = x;
arr[3][1] = y;
arr[2][2] = x;
arr[1][3] = y;
arr[2][3] = x;
arr[3][2] = y;
}
/* { dg-final { scan-assembler-not "stp\tq\[0-9\]+, q\[0-9\]" } } */
/* { dg-do compile } */
/* { dg-options "-Ofast -moverride=tune=none" } */
typedef int int32x4_t __attribute__ ((__vector_size__ ((16))));
void
bar (int32x4_t *foo)
{
int i = 0;
int32x4_t val = { 3, 2, 5, 1 };
for (i = 0; i < 256; i+=2)
{
foo[i] = val;
foo[i+1] = val;
}
}
/* { dg-final { scan-assembler "stp\tq\[0-9\]+, q\[0-9\]" } } */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment