Commit 5249ee4d by Richard Sandiford Committed by Richard Sandiford

Implement SLP of internal functions

SLP of calls was previously restricted to built-in functions.
This patch extends it to internal functions.

2018-07-12  Richard Sandiford  <richard.sandiford@linaro.org>

gcc/
	* internal-fn.h (vectorizable_internal_fn_p): New function.
	* tree-vect-slp.c (compatible_calls_p): Likewise.
	(vect_build_slp_tree_1): Remove nops argument.  Handle calls
	to internal functions.
	(vect_build_slp_tree_2): Update call to vect_build_slp_tree_1.

gcc/testsuite/
	* gcc.dg/vect/vect-cond-arith-6.c: New test.
	* gcc.target/aarch64/sve/cond_arith_4.c: Likewise.
	* gcc.target/aarch64/sve/cond_arith_4_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_arith_5.c: Likewise.
	* gcc.target/aarch64/sve/cond_arith_5_run.c: Likewise.
	* gcc.target/aarch64/sve/slp_14.c: Likewise.
	* gcc.target/aarch64/sve/slp_14_run.c: Likewise.

From-SVN: r262590
parent 2c58d42c
2018-07-12 Richard Sandiford <richard.sandiford@linaro.org>
* internal-fn.h (vectorizable_internal_fn_p): New function.
* tree-vect-slp.c (compatible_calls_p): Likewise.
(vect_build_slp_tree_1): Remove nops argument. Handle calls
to internal functions.
(vect_build_slp_tree_2): Update call to vect_build_slp_tree_1.
2018-07-12 Richard Sandiford <richard.sandiford@linaro.org>
* fold-const.h (inverse_conditions_p): Declare.
* fold-const.c (inverse_conditions_p): New function.
* match.pd: Use inverse_conditions_p. Add folds of view_converts
......
......@@ -160,6 +160,17 @@ direct_internal_fn_p (internal_fn fn)
return direct_internal_fn_array[fn].type0 >= -1;
}
/* Return true if FN is a direct internal function that can be vectorized by
converting the return type and all argument types to vectors of the same
number of elements.  E.g. we can vectorize an IFN_SQRT on floats as an
IFN_SQRT on vectors of N floats.

NOTE(review): like direct_internal_fn_info below, this reads
direct_internal_fn_array, so it is presumably only meaningful when
direct_internal_fn_p (FN) — confirm against the array's initializer.  */
inline bool
vectorizable_internal_fn_p (internal_fn fn)
{
return direct_internal_fn_array[fn].vectorizable;
}
/* Return optab information about internal function FN. Only meaningful
if direct_internal_fn_p (FN). */
......
2018-07-12 Richard Sandiford <richard.sandiford@linaro.org>
* gcc.dg/vect/vect-cond-arith-6.c: New test.
* gcc.target/aarch64/sve/cond_arith_4.c: Likewise.
* gcc.target/aarch64/sve/cond_arith_4_run.c: Likewise.
* gcc.target/aarch64/sve/cond_arith_5.c: Likewise.
* gcc.target/aarch64/sve/cond_arith_5_run.c: Likewise.
* gcc.target/aarch64/sve/slp_14.c: Likewise.
* gcc.target/aarch64/sve/slp_14_run.c: Likewise.
2018-07-12 Richard Sandiford <richard.sandiford@linaro.org>
* gcc.dg/vect/vect-cond-arith-4.c: New test.
* gcc.dg/vect/vect-cond-arith-5.c: Likewise.
* gcc.target/aarch64/sve/cond_arith_1.c: Likewise.
......
/* { dg-additional-options "-fdump-tree-optimized" } */
#include "tree-vect.h"
/* Iteration count; VECTOR_BITS is presumably provided by tree-vect.h —
the "+ 4" keeps N from being an exact multiple of the vector length,
forcing a scalar epilogue or masked tail.  TODO confirm.  */
#define N (VECTOR_BITS * 11 / 64 + 4)
/* The four arithmetic operations under test.  */
#define add(A, B) ((A) + (B))
#define sub(A, B) ((A) - (B))
#define mul(A, B) ((A) * (B))
#define div(A, B) ((A) / (B))
/* Define f_OP: apply OP conditionally, with a different threshold for
even (< 100) and odd (< 70) elements, so each iteration is a
two-statement SLP group of conditional operations.  */
#define DEF(OP) \
void __attribute__ ((noipa)) \
f_##OP (double *restrict a, double *restrict b, double x) \
{ \
for (int i = 0; i < N; i += 2) \
{ \
a[i] = b[i] < 100 ? OP (b[i], x) : b[i]; \
a[i + 1] = b[i + 1] < 70 ? OP (b[i + 1], x) : b[i + 1]; \
} \
}
/* Run f_OP and verify every element of A against a scalar
recomputation; the asm barrier keeps the check loop from being
vectorized or folded away.  */
#define TEST(OP) \
{ \
f_##OP (a, b, 10); \
for (int i = 0; i < N; ++i) \
{ \
int bval = (i % 17) * 10; \
int truev = OP (bval, 10); \
if (a[i] != (bval < (i & 1 ? 70 : 100) ? truev : bval)) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
}
/* Instantiate T once per tested operation.  */
#define FOR_EACH_OP(T) \
T (add) \
T (sub) \
T (mul) \
T (div)
FOR_EACH_OP (DEF)
int
main (void)
{
double a[N], b[N];
/* Initialize inputs; the barrier stops this loop from being merged
with the tested loops.  */
for (int i = 0; i < N; ++i)
{
b[i] = (i % 17) * 10;
asm volatile ("" ::: "memory");
}
FOR_EACH_OP (TEST)
return 0;
}
/* Each f_OP should vectorize via SLP and emit one conditional internal
function (.COND_ADD etc.) with no VEC_COND_EXPR fallback.  */
/* { dg-final { scan-tree-dump-times {vectorizing stmts using SLP} 4 "vect" { target vect_double_cond_arith } } } */
/* { dg-final { scan-tree-dump-times { = \.COND_ADD} 1 "optimized" { target vect_double_cond_arith } } } */
/* { dg-final { scan-tree-dump-times { = \.COND_SUB} 1 "optimized" { target vect_double_cond_arith } } } */
/* { dg-final { scan-tree-dump-times { = \.COND_MUL} 1 "optimized" { target vect_double_cond_arith } } } */
/* { dg-final { scan-tree-dump-times { = \.COND_RDIV} 1 "optimized" { target vect_double_cond_arith } } } */
/* { dg-final { scan-tree-dump-not {VEC_COND_EXPR} "optimized" { target vect_double_cond_arith } } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
#define TEST(TYPE, NAME, OP) \
void __attribute__ ((noinline, noclone)) \
test_##TYPE##_##NAME (TYPE *__restrict x, \
TYPE *__restrict y, \
TYPE z1, TYPE z2, \
TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; i += 2) \
{ \
x[i] = (pred[i] != 1 ? y[i] OP z1 : y[i]); \
x[i + 1] = (pred[i + 1] != 1 ? y[i + 1] OP z2 : y[i + 1]); \
} \
}
#define TEST_INT_TYPE(TYPE) \
TEST (TYPE, div, /)
#define TEST_FP_TYPE(TYPE) \
TEST (TYPE, add, +) \
TEST (TYPE, sub, -) \
TEST (TYPE, mul, *) \
TEST (TYPE, div, /)
#define TEST_ALL \
TEST_INT_TYPE (int32_t) \
TEST_INT_TYPE (uint32_t) \
TEST_INT_TYPE (int64_t) \
TEST_INT_TYPE (uint64_t) \
TEST_FP_TYPE (float) \
TEST_FP_TYPE (double)
TEST_ALL
/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z,} 12 } } */
/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7],} 6 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z,} 12 } } */
/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7],} 6 } } */
/* { dg-final { scan-assembler-not {\tsel\t} } } */
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include "cond_arith_4.c"
#define N 98
/* Redefine TEST so that the TEST_ALL from cond_arith_4.c now expands
to a run-and-verify block per function instead of a definition.  */
#undef TEST
#define TEST(TYPE, NAME, OP) \
{ \
TYPE x[N], y[N], pred[N], z[2] = { 5, 7 }; \
for (int i = 0; i < N; ++i) \
{ \
y[i] = i * i; \
pred[i] = i % 3; \
} \
test_##TYPE##_##NAME (x, y, z[0], z[1], pred, N); \
for (int i = 0; i < N; ++i) \
{ \
TYPE expected = i % 3 != 1 ? y[i] OP z[i & 1] : y[i]; \
if (x[i] != expected) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
}
int
main (void)
{
TEST_ALL
return 0;
}
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
#include <stdint.h>
#define TEST(DATA_TYPE, OTHER_TYPE, NAME, OP) \
void __attribute__ ((noinline, noclone)) \
test_##DATA_TYPE##_##OTHER_TYPE##_##NAME (DATA_TYPE *__restrict x, \
DATA_TYPE *__restrict y, \
DATA_TYPE z1, DATA_TYPE z2, \
DATA_TYPE *__restrict pred, \
OTHER_TYPE *__restrict foo, \
int n) \
{ \
for (int i = 0; i < n; i += 2) \
{ \
x[i] = (pred[i] != 1 ? y[i] OP z1 : y[i]); \
x[i + 1] = (pred[i + 1] != 1 ? y[i + 1] OP z2 : y[i + 1]); \
foo[i] += 1; \
foo[i + 1] += 2; \
} \
}
#define TEST_INT_TYPE(DATA_TYPE, OTHER_TYPE) \
TEST (DATA_TYPE, OTHER_TYPE, div, /)
#define TEST_FP_TYPE(DATA_TYPE, OTHER_TYPE) \
TEST (DATA_TYPE, OTHER_TYPE, add, +) \
TEST (DATA_TYPE, OTHER_TYPE, sub, -) \
TEST (DATA_TYPE, OTHER_TYPE, mul, *) \
TEST (DATA_TYPE, OTHER_TYPE, div, /)
#define TEST_ALL \
TEST_INT_TYPE (int32_t, int8_t) \
TEST_INT_TYPE (int32_t, int16_t) \
TEST_INT_TYPE (uint32_t, int8_t) \
TEST_INT_TYPE (uint32_t, int16_t) \
TEST_INT_TYPE (int64_t, int8_t) \
TEST_INT_TYPE (int64_t, int16_t) \
TEST_INT_TYPE (int64_t, int32_t) \
TEST_INT_TYPE (uint64_t, int8_t) \
TEST_INT_TYPE (uint64_t, int16_t) \
TEST_INT_TYPE (uint64_t, int32_t) \
TEST_FP_TYPE (float, int8_t) \
TEST_FP_TYPE (float, int16_t) \
TEST_FP_TYPE (double, int8_t) \
TEST_FP_TYPE (double, int16_t) \
TEST_FP_TYPE (double, int32_t)
TEST_ALL
/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
/* The load XFAILs for fixed-length SVE account for extra loads from the
constant pool. */
/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b, p[0-7]/z,} 12 { xfail { aarch64_sve && { ! vect_variable_length } } } } } */
/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7],} 12 } } */
/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z,} 12 { xfail { aarch64_sve && { ! vect_variable_length } } } } } */
/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h, p[0-7],} 12 } } */
/* 72 for x operations, 6 for foo operations. */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z,} 78 { xfail { aarch64_sve && { ! vect_variable_length } } } } } */
/* 36 for x operations, 6 for foo operations. */
/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7],} 42 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z,} 168 } } */
/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7],} 84 } } */
/* { dg-final { scan-assembler-not {\tsel\t} } } */
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include "cond_arith_5.c"
#define N 98
/* Redefine TEST so that the TEST_ALL from cond_arith_5.c expands to a
run-and-verify block per function.  Only x is checked here; foo just
has to be written without faulting.  */
#undef TEST
#define TEST(DATA_TYPE, OTHER_TYPE, NAME, OP) \
{ \
DATA_TYPE x[N], y[N], pred[N], z[2] = { 5, 7 }; \
OTHER_TYPE foo[N]; \
for (int i = 0; i < N; ++i) \
{ \
y[i] = i * i; \
pred[i] = i % 3; \
foo[i] = i * 5; \
} \
test_##DATA_TYPE##_##OTHER_TYPE##_##NAME (x, y, z[0], z[1], \
pred, foo, N); \
for (int i = 0; i < N; ++i) \
{ \
DATA_TYPE expected = i % 3 != 1 ? y[i] OP z[i & 1] : y[i]; \
if (x[i] != expected) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
}
int
main (void)
{
TEST_ALL
return 0;
}
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
#define VEC_PERM(TYPE) \
void __attribute__ ((weak)) \
vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \
{ \
for (int i = 0; i < n; ++i) \
{ \
TYPE a1 = a[i * 2]; \
TYPE a2 = a[i * 2 + 1]; \
TYPE b1 = b[i * 2]; \
TYPE b2 = b[i * 2 + 1]; \
a[i * 2] = b1 > 1 ? a1 / b1 : a1; \
a[i * 2 + 1] = b2 > 2 ? a2 / b2 : a2; \
} \
}
#define TEST_ALL(T) \
T (int32_t) \
T (uint32_t) \
T (int64_t) \
T (uint64_t) \
T (float) \
T (double)
TEST_ALL (VEC_PERM)
/* The loop should be fully-masked. The load XFAILs for fixed-length
SVE account for extra loads from the constant pool. */
/* { dg-final { scan-assembler-times {\tld1w\t} 6 { xfail { aarch64_sve && { ! vect_variable_length } } } } } */
/* { dg-final { scan-assembler-times {\tst1w\t} 3 } } */
/* { dg-final { scan-assembler-times {\tld1d\t} 6 { xfail { aarch64_sve && { ! vect_variable_length } } } } } */
/* { dg-final { scan-assembler-times {\tst1d\t} 3 } } */
/* { dg-final { scan-assembler-not {\tldr} } } */
/* { dg-final { scan-assembler-not {\tstr} } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s} 1 } } */
/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s} 1 } } */
/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s} 1 } } */
/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d} 1 } } */
/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d} 1 } } */
/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d} 1 } } */
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include "slp_14.c"
/* N1 elements are processed; N2 > N1 leaves a tail that must remain
untouched.  Both counts are even because the kernel handles pairs.  */
#define N1 (103 * 2)
#define N2 (111 * 2)
/* Run vec_slp_TYPE over the first N1 elements and verify that the
processed prefix was conditionally divided and the tail (and all of
b) was left unchanged.  */
#define HARNESS(TYPE) \
{ \
TYPE a[N2], b[N2]; \
for (unsigned int i = 0; i < N2; ++i) \
{ \
a[i] = i * 2 + i % 5; \
b[i] = i % 11; \
} \
vec_slp_##TYPE (a, b, N1 / 2); \
for (unsigned int i = 0; i < N2; ++i) \
{ \
TYPE orig_a = i * 2 + i % 5; \
TYPE orig_b = i % 11; \
TYPE expected_a = orig_a; \
if (i < N1 && orig_b > (i & 1 ? 2 : 1)) \
expected_a /= orig_b; \
if (a[i] != expected_a || b[i] != orig_b) \
__builtin_abort (); \
} \
}
int
main (void)
{
TEST_ALL (HARNESS)
}
......@@ -562,6 +562,41 @@ again:
return 0;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
to be combined into the same SLP group.  */
static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
/* The calls must take the same number of arguments and resolve to the
same combined function (built-in or internal).  */
unsigned int nargs = gimple_call_num_args (call1);
if (nargs != gimple_call_num_args (call2))
return false;
if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
return false;
if (gimple_call_internal_p (call1))
{
/* Internal functions have no fntype to compare, so instead require
the lhs and each corresponding argument to have compatible types.
NOTE(review): assumes both calls have a non-null lhs — presumably
guaranteed by the caller's earlier checks; confirm.  */
if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
TREE_TYPE (gimple_call_lhs (call2))))
return false;
for (unsigned int i = 0; i < nargs; ++i)
if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
TREE_TYPE (gimple_call_arg (call2, i))))
return false;
}
else
{
/* For ordinary calls, the callee expression and the full function
type must match exactly.  */
if (!operand_equal_p (gimple_call_fn (call1),
gimple_call_fn (call2), 0))
return false;
if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
return false;
}
return true;
}
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
caller's attempt to find the vector type in STMT with the narrowest
element type. Return true if VECTYPE is nonnull and if it is valid
......@@ -650,8 +685,8 @@ vect_two_operations_perm_ok_p (vec<gimple *> stmts, unsigned int group_size,
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
vec<gimple *> stmts, unsigned int group_size,
unsigned nops, poly_uint64 *max_nunits,
bool *matches, bool *two_operators)
poly_uint64 *max_nunits, bool *matches,
bool *two_operators)
{
unsigned int i;
gimple *first_stmt = stmts[0], *stmt = stmts[0];
......@@ -727,7 +762,9 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
{
rhs_code = CALL_EXPR;
if (gimple_call_internal_p (call_stmt)
if ((gimple_call_internal_p (call_stmt)
&& (!vectorizable_internal_fn_p
(gimple_call_internal_fn (call_stmt))))
|| gimple_call_tail_p (call_stmt)
|| gimple_call_noreturn_p (call_stmt)
|| !gimple_call_nothrow_p (call_stmt)
......@@ -873,11 +910,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
if (rhs_code == CALL_EXPR)
{
gimple *first_stmt = stmts[0];
if (gimple_call_num_args (stmt) != nops
|| !operand_equal_p (gimple_call_fn (first_stmt),
gimple_call_fn (stmt), 0)
|| gimple_call_fntype (first_stmt)
!= gimple_call_fntype (stmt))
if (!compatible_calls_p (as_a <gcall *> (first_stmt),
as_a <gcall *> (stmt)))
{
if (dump_enabled_p ())
{
......@@ -1193,8 +1227,7 @@ vect_build_slp_tree_2 (vec_info *vinfo,
bool two_operators = false;
unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
if (!vect_build_slp_tree_1 (vinfo, swap,
stmts, group_size, nops,
if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
&this_max_nunits, matches, &two_operators))
return NULL;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment