Commit 0b1fe8cf, authored and committed by Richard Sandiford

Optimise constant IFN_WHILE_ULTs

This patch is a combination of two changes that have to be
committed as a single unit:

(1) Try to fold IFN_WHILE_ULTs with constant arguments to a VECTOR_CST
    (which is always possible for fixed-length vectors but is not
    necessarily so for variable-length vectors)

(2) Make the SVE port recognise constants that map to PTRUE VLn,
    which includes those generated by the new fold.

(2) can't be tested without (1) and (1) would be a significant
pessimisation without (2).

The target-specific parts also start moving towards doing predicate
manipulation in a canonical VNx16BImode form, using rtx_vector_builders.
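To make the effect concrete, here is a minimal illustration (not part of
the patch; the committed tests are while_6.c through while_10.c below).
With a constant trip count of 7 over byte elements, the vectoriser's
IFN_WHILE_ULT (0, 7, ...) now folds to a constant mask, which the SVE
port can materialise with a single PTRUE:

#include <stdint.h>

void
add_seven (int8_t *restrict a)
{
  /* Schematically: before the patch the loop mask is computed at run
     time with a "whilelo p0.b, ..." instruction; after it, the mask
     folds to a VECTOR_CST and becomes "ptrue p0.b, vl7".  */
  for (int i = 0; i < 7; ++i)
    a[i] += 1;
}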

2019-08-13  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* tree.h (build_vector_a_then_b): Declare.
	* tree.c (build_vector_a_then_b): New function.
	* fold-const-call.c (fold_while_ult): Likewise.
	(fold_const_call): Use it to handle IFN_WHILE_ULT.
	* config/aarch64/aarch64-protos.h (AARCH64_FOR_SVPATTERN): New macro.
	(aarch64_svpattern): New enum.
	* config/aarch64/aarch64-sve.md (mov<PRED_ALL:mode>): Pass
	constants through aarch64_expand_mov_immediate.
	(*aarch64_sve_mov<PRED_ALL:mode>): Use aarch64_mov_operand rather
	than general_operand as the predicate for operand 1.
	(while_ult<GPI:mode><PRED_ALL:mode>): Add a '@' marker.
	* config/aarch64/aarch64.c (simd_immediate_info::PTRUE): New
	insn_type.
	(simd_immediate_info::simd_immediate_info): New overload that
	takes a scalar_int_mode and an svpattern.
	(simd_immediate_info::u): Add a "pattern" field.
	(svpattern_token): New function.
	(aarch64_get_sve_pred_bits, aarch64_widest_sve_pred_elt_size)
	(aarch64_partial_ptrue_length, aarch64_svpattern_for_vl)
	(aarch64_sve_move_pred_via_while): New functions.
	(aarch64_expand_mov_immediate): Try using
	aarch64_sve_move_pred_via_while for predicates that contain N ones
	followed by M zeros but that do not correspond to a VLnnn pattern.
	(aarch64_sve_pred_valid_immediate): New function.
	(aarch64_simd_valid_immediate): Use it instead of dealing directly
	with PTRUE and PFALSE.
	(aarch64_output_sve_mov_immediate): Handle new simd_immediate_info
	forms.

gcc/testsuite/
	* gcc.target/aarch64/sve/spill_2.c: Increase iteration counts
	beyond the range of a PTRUE.
	* gcc.target/aarch64/sve/while_6.c: New test.
	* gcc.target/aarch64/sve/while_7.c: Likewise.
	* gcc.target/aarch64/sve/while_8.c: Likewise.
	* gcc.target/aarch64/sve/while_9.c: Likewise.
	* gcc.target/aarch64/sve/while_10.c: Likewise.

From-SVN: r274402
gcc/config/aarch64/aarch64-protos.h
@@ -406,6 +406,33 @@ extern enum aarch64_key_type aarch64_ra_sign_key;
 
 extern struct tune_params aarch64_tune_params;
 
+/* The available SVE predicate patterns, known in the ACLE as "svpattern".  */
+#define AARCH64_FOR_SVPATTERN(T) \
+  T (POW2, pow2, 0) \
+  T (VL1, vl1, 1) \
+  T (VL2, vl2, 2) \
+  T (VL3, vl3, 3) \
+  T (VL4, vl4, 4) \
+  T (VL5, vl5, 5) \
+  T (VL6, vl6, 6) \
+  T (VL7, vl7, 7) \
+  T (VL8, vl8, 8) \
+  T (VL16, vl16, 9) \
+  T (VL32, vl32, 10) \
+  T (VL64, vl64, 11) \
+  T (VL128, vl128, 12) \
+  T (VL256, vl256, 13) \
+  T (MUL4, mul4, 29) \
+  T (MUL3, mul3, 30) \
+  T (ALL, all, 31)
+
+#define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE,
+enum aarch64_svpattern {
+  AARCH64_FOR_SVPATTERN (AARCH64_SVENUM)
+  AARCH64_NUM_SVPATTERNS
+};
+#undef AARCH64_SVENUM
+
 void aarch64_post_cfi_startproc (void);
 poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
 int aarch64_get_condition_code (rtx);
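AARCH64_FOR_SVPATTERN is an X-macro: each client defines T to pull out
the fields it needs, so the pattern table is written exactly once.  The
ChangeLog's svpattern_token presumably reuses it to map each enum value
to its assembly mnemonic; a sketch of that idea (the exact body in
aarch64.c may differ):

/* Sketch: map an svpattern enum value to its assembly token,
   e.g. AARCH64_SV_VL7 -> "vl7", by re-expanding the X-macro.  */
static const char *
svpattern_token (enum aarch64_svpattern pattern)
{
  switch (pattern)
    {
#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
    AARCH64_FOR_SVPATTERN (CASE)
#undef CASE
    case AARCH64_NUM_SVPATTERNS:
      break;
    }
  gcc_unreachable ();
}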
gcc/config/aarch64/aarch64-sve.md
@@ -481,12 +481,18 @@
   {
     if (GET_CODE (operands[0]) == MEM)
       operands[1] = force_reg (<MODE>mode, operands[1]);
+    if (CONSTANT_P (operands[1]))
+      {
+        aarch64_expand_mov_immediate (operands[0], operands[1]);
+        DONE;
+      }
   }
 )
 
 (define_insn "*aarch64_sve_mov<mode>"
   [(set (match_operand:PRED_ALL 0 "nonimmediate_operand" "=Upa, m, Upa, Upa")
-        (match_operand:PRED_ALL 1 "general_operand" "Upa, Upa, m, Dn"))]
+        (match_operand:PRED_ALL 1 "aarch64_mov_operand" "Upa, Upa, m, Dn"))]
   "TARGET_SVE
    && (register_operand (operands[0], <MODE>mode)
        || register_operand (operands[1], <MODE>mode))"
@@ -2923,7 +2929,7 @@
 ;; Set element I of the result if operand1 + J < operand2 for all J in [0, I],
 ;; with the comparison being unsigned.
-(define_insn "while_ult<GPI:mode><PRED_ALL:mode>"
+(define_insn "@while_ult<GPI:mode><PRED_ALL:mode>"
   [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
         (unspec:PRED_ALL [(match_operand:GPI 1 "aarch64_reg_or_zero" "rZ")
                           (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")]
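The '@' prefix asks genemit to create a mode-parameterised generator,
gen_while_ult (m1, m2, ...), so C++ code can choose the mode pair at run
time instead of calling a fixed gen_while_ultdivnx16bi-style function.
A sketch of how the new aarch64_sve_move_pred_via_while might use it
(everything beyond the names in the ChangeLog is an assumption):

/* Sketch: set the first VL elements of PRED_MODE by emitting
   WHILELO (0, VL) through the parameterised pattern above.  */
static rtx
aarch64_sve_move_pred_via_while (rtx target, machine_mode pred_mode,
                                 unsigned int vl)
{
  rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
  emit_insn (gen_while_ult (DImode, pred_mode, target,
                            const0_rtx, limit));
  return target;
}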
gcc/fold-const-call.c
@@ -691,6 +691,36 @@ fold_const_vec_convert (tree ret_type, tree arg)
+/* Try to evaluate:
+
+      IFN_WHILE_ULT (ARG0, ARG1, (TYPE) { ... })
+
+   Return the value on success and null on failure.  */
+
+static tree
+fold_while_ult (tree type, poly_uint64 arg0, poly_uint64 arg1)
+{
+  if (known_ge (arg0, arg1))
+    return build_zero_cst (type);
+
+  if (maybe_ge (arg0, arg1))
+    return NULL_TREE;
+
+  poly_uint64 diff = arg1 - arg0;
+  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
+  if (known_ge (diff, nelts))
+    return build_all_ones_cst (type);
+
+  unsigned HOST_WIDE_INT const_diff;
+  if (known_le (diff, nelts) && diff.is_constant (&const_diff))
+    {
+      tree minus_one = build_minus_one_cst (TREE_TYPE (type));
+      tree zero = build_zero_cst (TREE_TYPE (type));
+      return build_vector_a_then_b (type, const_diff, minus_one, zero);
+    }
+  return NULL_TREE;
+}
+
 /* Try to evaluate:
 
       *RESULT = FN (*ARG)
 
    in format FORMAT.  Return true on success.  */
@@ -1782,6 +1812,14 @@ fold_const_call (combined_fn fn, tree type, tree arg0, tree arg1, tree arg2)
       }
       return NULL_TREE;
 
+    case CFN_WHILE_ULT:
+      {
+        poly_uint64 parg0, parg1;
+        if (poly_int_tree_p (arg0, &parg0) && poly_int_tree_p (arg1, &parg1))
+          return fold_while_ult (type, parg0, parg1);
+        return NULL_TREE;
+      }
+
     default:
      return fold_const_call_1 (fn, type, arg0, arg1, arg2);
    }
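A worked example of fold_while_ult, with element counts assumed for
illustration:

/* For a boolean vector type with nelts = 4 + 4x:
   fold_while_ult (type, 2, 5):  diff = 3
     known_le (3, 4 + 4x) holds and 3 is constant, so the result is
     { -1, -1, -1, 0, ... }: lanes 0..2 satisfy 2 + J < 5, lane 3 does not.
   fold_while_ult (type, 0, 7):  diff = 7
     7 is neither known to reach 4 + 4x (not all-ones) nor known to fit
     in it (not an A-then-B constant), so the fold returns NULL_TREE and
     the WHILELO stays a run-time instruction.  */

This is exactly why while_6.c below expects ptrue for .b and .h but
whilelo for .s and .d with a trip count of 7.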
gcc/testsuite/gcc.target/aarch64/sve/spill_2.c
@@ -9,29 +9,30 @@ void consumer (void *);
   void                                                         \
   multi_loop_##TYPE (TYPE *x, TYPE val)                        \
   {                                                            \
-    for (int i = 0; i < 7; ++i)                                \
+    for (int i = 0; i < 9; ++i)                                \
       x[i] += val;                                             \
     consumer (x);                                              \
-    for (int i = 0; i < 7; ++i)                                \
+    for (int i = 0; i < 9; ++i)                                \
       x[i] += val;                                             \
     consumer (x);                                              \
-    for (int i = 0; i < 7; ++i)                                \
+    for (int i = 0; i < 9; ++i)                                \
       x[i] += val;                                             \
     consumer (x);                                              \
   }
 
 /* One iteration is enough.  */
 TEST_LOOP (uint8_t);
 
 /* Two iterations are enough.  We specialize the second two loops based
    on whether the first executes once or twice.  */
 TEST_LOOP (uint16_t);
 
-/* Two iterations are enough.  Complete unrolling makes sense
-   even at -O2.  */
+/* Three iterations are needed; ought to stay a loop.  */
 TEST_LOOP (uint32_t);
 
-/* Four iterations are needed; ought to stay a loop.  */
+/* Five iterations are needed; ought to stay a loop.  */
 TEST_LOOP (uint64_t);
 
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.b} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.h} 3 } } */
-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]\.s} } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.h} 8 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.s} 6 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.d} 6 } } */
 /* { dg-final { scan-assembler-not {\tldr\tz[0-9]} } } */
 /* { dg-final { scan-assembler-not {\tstr\tz[0-9]} } } */
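The bump from 7 to 9 iterations is what "beyond the range of a PTRUE"
means in the ChangeLog: 9 is not a valid PTRUE VLnnn pattern, so even
where the mask still folds to a constant, the port has to materialise it
another way (per the ChangeLog, via aarch64_sve_move_pred_via_while),
and the whilelo scans above remain meaningful.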
gcc/testsuite/gcc.target/aarch64/sve/while_10.c (new file)

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=512" } */

#include <stdint.h>

#define ADD_LOOP(TYPE, COUNT)                  \
  TYPE __attribute__ ((noinline, noclone))     \
  vec_while_##TYPE (TYPE *restrict a)          \
  {                                            \
    for (int i = 0; i < COUNT; ++i)            \
      a[i] += 1;                               \
  }

#define TEST_ALL(T)                            \
  T (int8_t, 63)                               \
  T (int16_t, 30)                              \
  T (int32_t, 15)                              \
  T (int64_t, 6)

TEST_ALL (ADD_LOOP)

/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, mul3\n} 1 } } */
/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, mul3\n} 1 } } */
/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, mul3\n} 1 } } */
/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl6\n} 1 } } */
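The expected patterns follow from the 512-bit vector length: 64 byte
lanes, 32 halfword, 16 word and 8 doubleword lanes.  63, 30 and 15 are
the largest multiples of 3 that fit, hence the mul3 encodings, while the
6 iterations of the int64_t loop match the vl6 pattern exactly.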
gcc/testsuite/gcc.target/aarch64/sve/while_6.c (new file)

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */

#include <stdint.h>

#define ADD_LOOP(TYPE)                         \
  TYPE __attribute__ ((noinline, noclone))     \
  vec_while_##TYPE (TYPE *restrict a)          \
  {                                            \
    for (int i = 0; i < 7; ++i)                \
      a[i] += 1;                               \
  }

#define TEST_ALL(T)                            \
  T (int8_t)                                   \
  T (int16_t)                                  \
  T (int32_t)                                  \
  T (int64_t)

TEST_ALL (ADD_LOOP)

/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl7\n} 1 } } */
/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl7\n} 1 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
gcc/testsuite/gcc.target/aarch64/sve/while_7.c (new file)

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */

#include <stdint.h>

#define ADD_LOOP(TYPE)                         \
  TYPE __attribute__ ((noinline, noclone))     \
  vec_while_##TYPE (TYPE *restrict a)          \
  {                                            \
    for (int i = 0; i < 8; ++i)                \
      a[i] += 1;                               \
  }

#define TEST_ALL(T)                            \
  T (int8_t)                                   \
  T (int16_t)                                  \
  T (int32_t)                                  \
  T (int64_t)

TEST_ALL (ADD_LOOP)

/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl8\n} 1 } } */
/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl8\n} 1 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
gcc/testsuite/gcc.target/aarch64/sve/while_8.c (new file)

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */

#include <stdint.h>

#define ADD_LOOP(TYPE)                         \
  TYPE __attribute__ ((noinline, noclone))     \
  vec_while_##TYPE (TYPE *restrict a)          \
  {                                            \
    for (int i = 0; i < 9; ++i)                \
      a[i] += 1;                               \
  }

#define TEST_ALL(T)                            \
  T (int8_t)                                   \
  T (int16_t)                                  \
  T (int32_t)                                  \
  T (int64_t)

TEST_ALL (ADD_LOOP)

/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b,} 1 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
gcc/testsuite/gcc.target/aarch64/sve/while_9.c (new file)

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */

#include <stdint.h>

#define ADD_LOOP(TYPE)                         \
  TYPE __attribute__ ((noinline, noclone))     \
  vec_while_##TYPE (TYPE *restrict a)          \
  {                                            \
    for (int i = 0; i < 16; ++i)               \
      a[i] += 1;                               \
  }

#define TEST_ALL(T)                            \
  T (int8_t)                                   \
  T (int16_t)                                  \
  T (int32_t)                                  \
  T (int64_t)

TEST_ALL (ADD_LOOP)

/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl16\n} 1 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
gcc/tree.c
@@ -1981,6 +1981,23 @@ build_index_vector (tree vec_type, poly_uint64 base, poly_uint64 step)
   return v.build ();
 }
 
+/* Return a VECTOR_CST of type VEC_TYPE in which the first NUM_A
+   elements are A and the rest are B.  */
+
+tree
+build_vector_a_then_b (tree vec_type, unsigned int num_a, tree a, tree b)
+{
+  gcc_assert (known_le (num_a, TYPE_VECTOR_SUBPARTS (vec_type)));
+  unsigned int count = constant_lower_bound (TYPE_VECTOR_SUBPARTS (vec_type));
+  /* Optimize the constant case.  */
+  if ((count & 1) == 0 && TYPE_VECTOR_SUBPARTS (vec_type).is_constant ())
+    count /= 2;
+  tree_vector_builder builder (vec_type, count, 2);
+  for (unsigned int i = 0; i < count * 2; ++i)
+    builder.quick_push (i < num_a ? a : b);
+  return builder.build ();
+}
+
 /* Something has messed with the elements of CONSTRUCTOR C after it was built;
    calculate TREE_CONSTANT and TREE_SIDE_EFFECTS.  */
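tree_vector_builder (vec_type, count, 2) describes the vector as count
interleaved patterns of two elements each, where each pattern's second
element repeats for the rest of the (possibly runtime-sized) vector.
A worked illustration, with element counts assumed:

/* For a variable-length mask type with 16 + 16x elements and
   num_a == 7: count == 16, so 32 elements are pushed:
     lanes  0..15 : A A A A A A A B B B B B B B B B
     lanes 16..31 : B B B B B B B B B B B B B B B B
   Each of the 16 patterns repeats its second value, so every lane past
   the first 7 is B however long the vector is at run time.  */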
gcc/tree.h
@@ -4314,6 +4314,7 @@ extern tree build_vector_from_val (tree, tree);
 extern tree build_uniform_cst (tree, tree);
 extern tree build_vec_series (tree, tree, tree);
 extern tree build_index_vector (tree, poly_uint64, poly_uint64);
+extern tree build_vector_a_then_b (tree, unsigned int, tree, tree);
 extern void recompute_constructor_flags (tree);
 extern void verify_constructor_flags (tree);
 extern tree build_constructor (tree, vec<constructor_elt, va_gc> * CXX_MEM_STAT_INFO);