Commit a229966c by Richard Sandiford

[AArch64] Use SVE ADR to optimise shift-add sequences

This patch uses SVE ADR to optimise shift-and-add and uxtw-and-add
sequences.
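
For example, with FACTOR defined to 4, a loop like the one below (modelled
on the new adr_1.c test; the function name here is illustrative) multiplies
by a power of two and accumulates.  Before this patch the vectorizer emitted
a separate LSL and ADD per vector; with the new patterns the 32-bit and
64-bit cases fold into a single "adr z0.s, [z0.s, z1.s, lsl 2]".  ADR only
exists for 32-bit and 64-bit elements and shift amounts of 1 to 3, which is
why the tests check that the byte/halfword loops and the FACTOR-16 case
keep the LSL/ADD pair.

void
test_uint32 (unsigned int *restrict dst, unsigned int *restrict src,
             int count)
{
  for (int i = 0; i < count; ++i)
    dst[i] += src[i] * 4;	/* *4 is << 2, within ADR's 1-3 shift range.  */
}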

2019-08-14  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* config/aarch64/predicates.md (const_1_to_3_operand): New predicate.
	* config/aarch64/aarch64-sve.md (*aarch64_adr_uxtw)
	(*aarch64_adr<mode>_shift, *aarch64_adr_shift_uxtw): New patterns.

gcc/testsuite/
	* gcc.target/aarch64/sve/adr_1.c: New test.
	* gcc.target/aarch64/sve/adr_1_run.c: Likewise.
	* gcc.target/aarch64/sve/adr_2.c: Likewise.
	* gcc.target/aarch64/sve/adr_2_run.c: Likewise.
	* gcc.target/aarch64/sve/adr_3.c: Likewise.
	* gcc.target/aarch64/sve/adr_3_run.c: Likewise.
	* gcc.target/aarch64/sve/adr_4.c: Likewise.
	* gcc.target/aarch64/sve/adr_4_run.c: Likewise.
	* gcc.target/aarch64/sve/adr_5.c: Likewise.
	* gcc.target/aarch64/sve/adr_5_run.c: Likewise.

From-SVN: r274436
gcc/config/aarch64/aarch64-sve.md
@@ -61,6 +61,7 @@
;; ---- [INT] General binary arithmetic corresponding to rtx codes
;; ---- [INT] Addition
;; ---- [INT] Subtraction
;; ---- [INT] Take address
;; ---- [INT] Absolute difference
;; ---- [INT] Multiplication
;; ---- [INT] Highpart multiplication
@@ -1672,6 +1673,65 @@
;; Merging forms are handled through SVE_INT_BINARY.
;; -------------------------------------------------------------------------

;; -------------------------------------------------------------------------
;; ---- [INT] Take address
;; -------------------------------------------------------------------------
;; Includes:
;; - ADR
;; -------------------------------------------------------------------------

;; Unshifted ADR, with the offset being zero-extended from the low 32 bits.
(define_insn "*aarch64_adr_uxtw"
  [(set (match_operand:VNx2DI 0 "register_operand" "=w")
	(plus:VNx2DI
	  (and:VNx2DI
	    (match_operand:VNx2DI 2 "register_operand" "w")
	    (match_operand:VNx2DI 3 "aarch64_sve_uxtw_immediate"))
	  (match_operand:VNx2DI 1 "register_operand" "w")))]
  "TARGET_SVE"
  "adr\t%0.d, [%1.d, %2.d, uxtw]"
)
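
;; (Per lane, the pattern above computes %0.d = %1.d + (%2.d & 0xffffffff),
;; i.e. the offset lane is zero-extended from 32 bits before the add.)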

;; ADR with a nonzero shift.
(define_insn_and_rewrite "*aarch64_adr<mode>_shift"
  [(set (match_operand:SVE_SDI 0 "register_operand" "=w")
	(plus:SVE_SDI
	  (unspec:SVE_SDI
	    [(match_operand 4)
	     (ashift:SVE_SDI
	       (match_operand:SVE_SDI 2 "register_operand" "w")
	       (match_operand:SVE_SDI 3 "const_1_to_3_operand"))]
	    UNSPEC_PRED_X)
	  (match_operand:SVE_SDI 1 "register_operand" "w")))]
  "TARGET_SVE"
  "adr\t%0.<Vetype>, [%1.<Vetype>, %2.<Vetype>, lsl %3]"
  "&& !CONSTANT_P (operands[4])"
  {
    operands[4] = CONSTM1_RTX (<VPRED>mode);
  }
)

;; Same, but with the index being zero-extended from the low 32 bits.
(define_insn_and_rewrite "*aarch64_adr_shift_uxtw"
  [(set (match_operand:VNx2DI 0 "register_operand" "=w")
	(plus:VNx2DI
	  (unspec:VNx2DI
	    [(match_operand 5)
	     (ashift:VNx2DI
	       (and:VNx2DI
		 (match_operand:VNx2DI 2 "register_operand" "w")
		 (match_operand:VNx2DI 4 "aarch64_sve_uxtw_immediate"))
	       (match_operand:VNx2DI 3 "const_1_to_3_operand"))]
	    UNSPEC_PRED_X)
	  (match_operand:VNx2DI 1 "register_operand" "w")))]
  "TARGET_SVE"
  "adr\t%0.d, [%1.d, %2.d, uxtw %3]"
  "&& !CONSTANT_P (operands[5])"
  {
    operands[5] = CONSTM1_RTX (VNx2BImode);
  }
)
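
;; (Per lane, the two patterns above compute %0 = %1 + (%2 << %3) and
;; %0.d = %1.d + ((%2.d & 0xffffffff) << %3) respectively, with the
;; shift amount %3 restricted to 1-3 by const_1_to_3_operand.)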

;; -------------------------------------------------------------------------
;; ---- [INT] Absolute difference
;; -------------------------------------------------------------------------
;; Includes:

gcc/config/aarch64/predicates.md
@@ -39,6 +39,13 @@
  (and (match_code "const_int")
       (match_test "op == CONST0_RTX (mode)")))
(define_predicate "const_1_to_3_operand"
(match_code "const_int,const_vector")
{
op = unwrap_const_vec_duplicate (op);
return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
})
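
;; (For illustration: this accepts (const_int 2) directly and also a
;; const_vector in which every lane is (const_int 2), since
;; unwrap_const_vec_duplicate reduces such a duplicate to its element
;; before the range check.)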

(define_special_predicate "subreg_lowpart_operator"
  (and (match_code "subreg")
       (match_test "subreg_lowpart_p (op)")))

@@ -595,6 +602,11 @@
  (and (match_code "const,const_vector")
       (match_test "aarch64_sve_inc_dec_immediate_p (op)")))

(define_predicate "aarch64_sve_uxtw_immediate"
  (and (match_code "const_vector")
       (match_test "GET_MODE_UNIT_BITSIZE (GET_MODE (op)) > 32")
       (match_test "aarch64_const_vec_all_same_int_p (op, 0xffffffff)")))
(define_predicate "aarch64_sve_logical_immediate"
(and (match_code "const,const_vector")
(match_test "aarch64_sve_bitmask_immediate_p (op)")))

gcc/testsuite/gcc.target/aarch64/sve/adr_1.c
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
#ifndef FACTOR
#define FACTOR 2
#endif
#define LOOP(TYPE)                                              \
  __attribute__ ((noipa))                                       \
  void                                                          \
  test_##TYPE (TYPE *restrict dst, TYPE *restrict src,          \
               int count)                                       \
  {                                                             \
    for (int i = 0; i < count; ++i)                             \
      dst[i] += src[i] * FACTOR;                                \
  }
#define TEST_ALL(T) \
T (int8_t) \
T (int16_t) \
T (int32_t) \
T (int64_t) \
T (uint8_t) \
T (uint16_t) \
T (uint32_t) \
T (uint64_t)
TEST_ALL (LOOP)
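
/* ADR is only defined for .s and .d elements, so the 8-bit and 16-bit
   loops below are expected to keep the separate LSL and ADD.  */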
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.b,} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.b,} 2 } } */
/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.b,} } } */
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.h,} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.h,} 2 } } */
/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.h,} } } */
/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.s,} } } */
/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.s,} } } */
/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.s, \[z[0-9]\.s, z[0-9]\.s, lsl 1\]} 2 } } */
/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.d,} } } */
/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.d,} } } */
/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, lsl 1\]} 2 } } */

gcc/testsuite/gcc.target/aarch64/sve/adr_1_run.c
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include "adr_1.c"
#define N 131
#define TEST_LOOP(TYPE)                                         \
  {                                                             \
    TYPE a[N], b[N];                                            \
    for (int i = 0; i < N; ++i)                                 \
      {                                                         \
        a[i] = (TYPE) i * i + i % 5;                            \
        b[i] = (TYPE) i * 3 + i % 7;                            \
        asm volatile ("" ::: "memory");                         \
      }                                                         \
    test_##TYPE (a, b, N);                                      \
    for (int i = 0; i < N; ++i)                                 \
      {                                                         \
        TYPE expected = ((TYPE) (i * i + i % 5)                 \
                         + ((TYPE) i * 3 + i % 7) * FACTOR);    \
        if (a[i] != expected)                                   \
          __builtin_abort ();                                   \
      }                                                         \
  }

int __attribute__ ((optimize (1)))
main (void)
{
  TEST_ALL (TEST_LOOP)
}

gcc/testsuite/gcc.target/aarch64/sve/adr_2.c
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#define FACTOR 4
#include "adr_1.c"
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.b,} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.b,} 2 } } */
/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.b,} } } */
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.h,} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.h,} 2 } } */
/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.h,} } } */
/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.s,} } } */
/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.s,} } } */
/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.s, \[z[0-9]\.s, z[0-9]\.s, lsl 2\]} 2 } } */
/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.d,} } } */
/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.d,} } } */
/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, lsl 2\]} 2 } } */

gcc/testsuite/gcc.target/aarch64/sve/adr_2_run.c
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#define FACTOR 4
#include "adr_1_run.c"

gcc/testsuite/gcc.target/aarch64/sve/adr_3.c
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#define FACTOR 8
#include "adr_1.c"
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.b,} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.b,} 2 } } */
/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.b,} } } */
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.h,} 2 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.h,} 2 } } */
/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.h,} } } */
/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.s,} } } */
/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.s,} } } */
/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.s, \[z[0-9]\.s, z[0-9]\.s, lsl 3\]} 2 } } */
/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.d,} } } */
/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.d,} } } */
/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, lsl 3\]} 2 } } */

gcc/testsuite/gcc.target/aarch64/sve/adr_3_run.c
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#define FACTOR 8
#include "adr_1_run.c"

gcc/testsuite/gcc.target/aarch64/sve/adr_4.c
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#define FACTOR 16
#include "adr_1.c"
/* { dg-final { scan-assembler-times {\tadd\tz[0-9]\.[bhsd],} 8 } } */
/* { dg-final { scan-assembler-times {\tlsl\tz[0-9]\.[bhsd],} 8 } } */
/* { dg-final { scan-assembler-not {\tadr\tz[0-9]\.[bhsd],} } } */

gcc/testsuite/gcc.target/aarch64/sve/adr_4_run.c
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#define FACTOR 16
#include "adr_1_run.c"

gcc/testsuite/gcc.target/aarch64/sve/adr_5.c
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
#define LOOP(FACTOR)                                            \
  __attribute__ ((noipa))                                       \
  void                                                          \
  test_##FACTOR (uint64_t *restrict dst,                        \
                 uint64_t *restrict src, int count)             \
  {                                                             \
    for (int i = 0; i < count; ++i)                             \
      dst[i] += (src[i] & 0xffffffff) * FACTOR;                 \
  }
#define TEST_ALL(T) T (1) T (2) T (4) T (8)
TEST_ALL (LOOP)
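
/* Each (src[i] & 0xffffffff) * FACTOR computation should become an
   unshifted or shifted uxtw ADR, with shifts 1-3 for factors 2-8.  */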
/* { dg-final { scan-assembler-not {\tadd\tz[0-9]\.d,} } } */
/* { dg-final { scan-assembler-not {\tlsl\tz[0-9]\.d,} } } */
/* { dg-final { scan-assembler-not {\tand\tz[0-9]\.d,} } } */
/* { dg-final { scan-assembler-not {\tuxtw\tz[0-9]\.d,} } } */
/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, uxtw\]} 1 } } */
/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, uxtw 1\]} 1 } } */
/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, uxtw 2\]} 1 } } */
/* { dg-final { scan-assembler-times {\tadr\tz[0-9]\.d, \[z[0-9]\.d, z[0-9]\.d, uxtw 3\]} 1 } } */

gcc/testsuite/gcc.target/aarch64/sve/adr_5_run.c
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include "adr_5.c"
#define N 131
#define TEST_LOOP(FACTOR)                                       \
  {                                                             \
    uint64_t a[N], b[N];                                        \
    for (int i = 0; i < N; ++i)                                 \
      {                                                         \
        a[i] = (uint64_t) i * i + i % 5;                        \
        b[i] = (uint64_t) (i * 3) << ((i & 7) * 8);             \
        asm volatile ("" ::: "memory");                         \
      }                                                         \
    test_##FACTOR (a, b, N);                                    \
    for (int i = 0; i < N; ++i)                                 \
      {                                                         \
        uint64_t expected = ((uint64_t) (i * i + i % 5)         \
                             + (((uint64_t) (i * 3)             \
                                 << ((i & 7) * 8))              \
                                & 0xffffffff) * FACTOR);        \
        if (a[i] != expected)                                   \
          __builtin_abort ();                                   \
      }                                                         \
  }

int __attribute__ ((optimize (1)))
main (void)
{
  TEST_ALL (TEST_LOOP)
}