Commit ab2fc782 by Richard Sandiford (committed by Richard Sandiford)

Use gather loads for strided accesses

This patch tries to use gather loads for strided accesses,
rather than falling back to VMAT_ELEMENTWISE.
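
As a rough, illustrative sketch (not part of the patch): the kind of loop this affects, plus a scalar model of the gather load that now replaces the element-by-element (VMAT_ELEMENTWISE) loads.  The function names below are made up for illustration only.

#include <stdint.h>
#include <stddef.h>

/* A strided access of the kind the patch targets (compare the
   strided_load_1.c test below).  */
void
strided_add (int32_t *restrict dest, int32_t *restrict src,
             int64_t stride, int64_t n)
{
  for (int64_t i = 0; i < n; ++i)
    dest[i] += src[i * stride];
}

/* Scalar model of a gather load: each lane reads base[offsets[lane]];
   for a strided access the offset vector is {0, stride, 2*stride, ...}.  */
static void
gather_load_model (const int32_t *base, const int64_t *offsets,
                   int32_t *out, size_t lanes)
{
  for (size_t lane = 0; lane < lanes; ++lane)
    out[lane] = base[offsets[lane]];
}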

2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* tree-vectorizer.h (vect_create_data_ref_ptr): Take an extra
	optional tree argument.
	* tree-vect-data-refs.c (vect_check_gather_scatter): Check for
	null target hooks.
	(vect_create_data_ref_ptr): Take the iv_step as an optional argument,
	but continue to use the current value as a fallback.
	(bump_vector_ptr): Use operand_equal_p rather than tree_int_cst_compare
	to compare the updates.
	* tree-vect-stmts.c (vect_use_strided_gather_scatters_p): New function.
	(get_load_store_type): Use it when handling a strided access.
	(vect_get_strided_load_store_ops): New function.
	(vect_get_data_ptr_increment): Likewise.
	(vectorizable_load): Handle strided gather loads.  Always pass
	a step to vect_create_data_ref_ptr and bump_vector_ptr.

gcc/testsuite/
	* gcc.target/aarch64/sve/strided_load_1.c: New test.
	* gcc.target/aarch64/sve/strided_load_2.c: Likewise.
	* gcc.target/aarch64/sve/strided_load_3.c: Likewise.

Co-Authored-By: Alan Hayward <alan.hayward@arm.com>
Co-Authored-By: David Sherwood <david.sherwood@arm.com>

From-SVN: r256641
parent bfaa08b7
@@ -2,6 +2,25 @@
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
* tree-vectorizer.h (vect_create_data_ref_ptr): Take an extra
optional tree argument.
* tree-vect-data-refs.c (vect_check_gather_scatter): Check for
null target hooks.
(vect_create_data_ref_ptr): Take the iv_step as an optional argument,
but continue to use the current value as a fallback.
(bump_vector_ptr): Use operand_equal_p rather than tree_int_cst_compare
to compare the updates.
* tree-vect-stmts.c (vect_use_strided_gather_scatters_p): New function.
(get_load_store_type): Use it when handling a strided access.
(vect_get_strided_load_store_ops): New function.
(vect_get_data_ptr_increment): Likewise.
(vectorizable_load): Handle strided gather loads. Always pass
a step to vect_create_data_ref_ptr and bump_vector_ptr.
2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
* doc/md.texi (gather_load@var{m}): Document.
(mask_gather_load@var{m}): Likewise.
* genopinit.c (main): Add supports_vec_gather_load and
...
@@ -2,6 +2,14 @@
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
* gcc.target/aarch64/sve/strided_load_1.c: New test.
* gcc.target/aarch64/sve/strided_load_2.c: Likewise.
* gcc.target/aarch64/sve/strided_load_3.c: Likewise.
2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
* gcc.target/aarch64/sve/gather_load_1.c: New test.
* gcc.target/aarch64/sve/gather_load_2.c: Likewise.
* gcc.target/aarch64/sve/gather_load_3.c: Likewise.
...
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
#include <stdint.h>
#ifndef INDEX8
#define INDEX8 int8_t
#define INDEX16 int16_t
#define INDEX32 int32_t
#define INDEX64 int64_t
#endif
#define TEST_LOOP(DATA_TYPE, BITS) \
void __attribute__ ((noinline, noclone)) \
f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, \
DATA_TYPE *restrict src, \
INDEX##BITS stride, INDEX##BITS n) \
{ \
for (INDEX##BITS i = 0; i < n; ++i) \
dest[i] += src[i * stride]; \
}
#define TEST_TYPE(T, DATA_TYPE) \
T (DATA_TYPE, 8) \
T (DATA_TYPE, 16) \
T (DATA_TYPE, 32) \
T (DATA_TYPE, 64)
#define TEST_ALL(T) \
TEST_TYPE (T, int32_t) \
TEST_TYPE (T, uint32_t) \
TEST_TYPE (T, float) \
TEST_TYPE (T, int64_t) \
TEST_TYPE (T, uint64_t) \
TEST_TYPE (T, double)
TEST_ALL (TEST_LOOP)
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 9 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 12 } } */
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
#define INDEX8 uint8_t
#define INDEX16 uint16_t
#define INDEX32 uint32_t
#define INDEX64 uint64_t
#include "strided_load_1.c"
/* 8 and 16 bits are signed because the multiplication promotes to int.
Using uxtw for all 9 would be OK. */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
/* The 32-bit loop needs to honor the defined overflow in uint32_t,
so we vectorize the offset calculation. This means that the
64-bit version needs two copies. */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
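
An aside on the comment above (a minimal check, not part of the test): with 8- and 16-bit index types, i * stride is computed in int after the usual arithmetic conversions, so the offset is signed even when INDEX8/INDEX16 are unsigned, which is why sxtw rather than uxtw is expected for those loops.

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint16_t i = 3, stride = 7;
  /* Both operands promote to int (assuming int is wider than 16 bits,
     as on AArch64), so the product has type int.  */
  puts (_Generic (i * stride,
                  int: "int",
                  unsigned int: "unsigned int",
                  default: "other"));   /* prints "int" */
  return 0;
}
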
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
#include <stdint.h>
#define TEST_LOOP(DATA_TYPE, OTHER_TYPE) \
void __attribute__ ((noinline, noclone)) \
f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, \
DATA_TYPE *restrict src, \
OTHER_TYPE *restrict other, \
OTHER_TYPE mask, \
int stride, int n) \
{ \
for (int i = 0; i < n; ++i) \
dest[i] = src[i * stride] + (OTHER_TYPE) (other[i] | mask); \
}
#define TEST_ALL(T) \
T (int32_t, int16_t) \
T (uint32_t, int16_t) \
T (float, int16_t) \
T (int64_t, int32_t) \
T (uint64_t, int32_t) \
T (double, int32_t)
TEST_ALL (TEST_LOOP)
/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */
@@ -3616,9 +3616,15 @@ vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
else
{
if (DR_IS_READ (dr))
decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
{
if (targetm.vectorize.builtin_gather)
decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
}
else
decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
{
if (targetm.vectorize.builtin_scatter)
decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
}
if (!decl)
return false;
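
As a side note, the change above simply guards the optional target hooks before calling them; a stand-alone model of that pattern, with hypothetical names:

/* Hypothetical stand-in for an optional target hook.  */
typedef void *(*gather_builtin_fn) (void *vectype, void *offtype, int scale);

static void *
lookup_gather_builtin (gather_builtin_fn hook,
                       void *vectype, void *offtype, int scale)
{
  /* Targets without the hook leave it null; treat that as "no builtin".  */
  return hook ? hook (vectype, offtype, scale) : 0;
}
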
@@ -4367,6 +4373,10 @@ vect_create_addr_base_for_vector_ref (gimple *stmt,
to the initial address accessed by the data-ref in STMT. This is
similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
in bytes.
8. IV_STEP (optional, defaults to NULL): the amount that should be added
to the IV during each iteration of the loop. NULL says to move
by one copy of AGGR_TYPE up or down, depending on the step of the
data reference.
Output:
1. Declare a new ptr to vector_type, and have it point to the base of the
@@ -4399,7 +4409,8 @@ tree
vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
tree offset, tree *initial_address,
gimple_stmt_iterator *gsi, gimple **ptr_incr,
bool only_init, bool *inv_p, tree byte_offset)
bool only_init, bool *inv_p, tree byte_offset,
tree iv_step)
{
const char *base_name;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
@@ -4423,7 +4434,8 @@ vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
tree step;
bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
gcc_assert (iv_step != NULL_TREE
|| TREE_CODE (aggr_type) == ARRAY_TYPE
|| TREE_CODE (aggr_type) == VECTOR_TYPE);
if (loop_vinfo)
@@ -4564,14 +4576,17 @@ vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
aptr = aggr_ptr_init;
else
{
/* The step of the aggregate pointer is the type size. */
tree iv_step = TYPE_SIZE_UNIT (aggr_type);
/* One exception to the above is when the scalar step of the load in
LOOP is zero. In this case the step here is also zero. */
if (*inv_p)
iv_step = size_zero_node;
else if (tree_int_cst_sgn (step) == -1)
iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
if (iv_step == NULL_TREE)
{
/* The step of the aggregate pointer is the type size. */
iv_step = TYPE_SIZE_UNIT (aggr_type);
/* One exception to the above is when the scalar step of the load in
LOOP is zero. In this case the step here is also zero. */
if (*inv_p)
iv_step = size_zero_node;
else if (tree_int_cst_sgn (step) == -1)
iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
}
standard_iv_increment_position (loop, &incr_gsi, &insert_after);
@@ -4704,7 +4719,7 @@ bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
if (use == dataref_ptr)
SET_USE (use_p, new_dataref_ptr);
else
gcc_assert (tree_int_cst_compare (use, update) == 0);
gcc_assert (operand_equal_p (use, update, 0));
}
return new_dataref_ptr;
...
@@ -1849,6 +1849,44 @@ prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
return and_res;
}
/* Return true if we can use gather/scatter internal functions to
vectorize STMT, which is a grouped or strided load or store.
When returning true, fill in GS_INFO with the information required
to perform the operation. */
static bool
vect_use_strided_gather_scatters_p (gimple *stmt, loop_vec_info loop_vinfo,
gather_scatter_info *gs_info)
{
if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info)
|| gs_info->decl)
return false;
scalar_mode element_mode = SCALAR_TYPE_MODE (gs_info->element_type);
unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
tree offset_type = TREE_TYPE (gs_info->offset);
unsigned int offset_bits = TYPE_PRECISION (offset_type);
/* Enforced by vect_check_gather_scatter. */
gcc_assert (element_bits >= offset_bits);
/* If the elements are wider than the offset, convert the offset to the
same width, without changing its sign. */
if (element_bits > offset_bits)
{
bool unsigned_p = TYPE_UNSIGNED (offset_type);
offset_type = build_nonstandard_integer_type (element_bits, unsigned_p);
gs_info->offset = fold_convert (offset_type, gs_info->offset);
}
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"using gather/scatter for strided/grouped access,"
" scale = %d\n", gs_info->scale);
return true;
}
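
A small model of the width rule in this function (illustrative names, not GCC API): the offset is never wider than the data element, and a narrower offset is converted up to the element width while keeping its signedness.

#include <assert.h>
#include <stdio.h>

/* Illustrative: precision chosen for the gather offset.  */
static unsigned int
strided_offset_bits (unsigned int element_bits, unsigned int offset_bits)
{
  /* Enforced by vect_check_gather_scatter in the real code.  */
  assert (element_bits >= offset_bits);
  return element_bits > offset_bits ? element_bits : offset_bits;
}

int
main (void)
{
  /* e.g. double data (64-bit elements) indexed via an int32_t stride.  */
  printf ("%u\n", strided_offset_bits (64, 32));   /* prints 64 */
  return 0;
}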
/* STMT is a non-strided load or store, meaning that it accesses
elements with a known constant step. Return -1 if that step
is negative, 0 if it is zero, and 1 if it is greater than zero. */
@@ -2168,7 +2206,11 @@ get_load_store_type (gimple *stmt, tree vectype, bool slp, bool masked_p,
else if (STMT_VINFO_STRIDED_P (stmt_info))
{
gcc_assert (!slp);
*memory_access_type = VMAT_ELEMENTWISE;
if (loop_vinfo
&& vect_use_strided_gather_scatters_p (stmt, loop_vinfo, gs_info))
*memory_access_type = VMAT_GATHER_SCATTER;
else
*memory_access_type = VMAT_ELEMENTWISE;
}
else
{
@@ -2612,6 +2654,71 @@ vect_get_gather_scatter_ops (struct loop *loop, gimple *stmt,
offset_vectype);
}
/* Prepare to implement a grouped or strided load or store using
the gather load or scatter store operation described by GS_INFO.
STMT is the load or store statement.
Set *DATAREF_BUMP to the amount that should be added to the base
address after each copy of the vectorized statement. Set *VEC_OFFSET
to an invariant offset vector in which element I has the value
I * DR_STEP / SCALE. */
static void
vect_get_strided_load_store_ops (gimple *stmt, loop_vec_info loop_vinfo,
gather_scatter_info *gs_info,
tree *dataref_bump, tree *vec_offset)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
gimple_seq stmts;
tree bump = size_binop (MULT_EXPR,
fold_convert (sizetype, DR_STEP (dr)),
size_int (TYPE_VECTOR_SUBPARTS (vectype)));
*dataref_bump = force_gimple_operand (bump, &stmts, true, NULL_TREE);
if (stmts)
gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
/* The offset given in GS_INFO can have pointer type, so use the element
type of the vector instead. */
tree offset_type = TREE_TYPE (gs_info->offset);
tree offset_vectype = get_vectype_for_scalar_type (offset_type);
offset_type = TREE_TYPE (offset_vectype);
/* Calculate X = DR_STEP / SCALE and convert it to the appropriate type. */
tree step = size_binop (EXACT_DIV_EXPR, DR_STEP (dr),
ssize_int (gs_info->scale));
step = fold_convert (offset_type, step);
step = force_gimple_operand (step, &stmts, true, NULL_TREE);
/* Create {0, X, X*2, X*3, ...}. */
*vec_offset = gimple_build (&stmts, VEC_SERIES_EXPR, offset_vectype,
build_zero_cst (offset_type), step);
if (stmts)
gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
}
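
A hedged numeric sketch of the two outputs documented above, with made-up parameters (DR_STEP = 32 bytes, scale = 8 for double elements, 4 lanes): the base-address bump is DR_STEP * nunits, and element I of the invariant offset vector is I * DR_STEP / SCALE.

#include <stdio.h>

int
main (void)
{
  long dr_step = 32;   /* byte step of the scalar access (assumed) */
  long scale = 8;      /* gather scale, here sizeof (double) (assumed) */
  long nunits = 4;     /* vector lanes (assumed) */

  /* DATAREF_BUMP: how far the base pointer moves per vectorized copy.  */
  printf ("dataref_bump = %ld\n", dr_step * nunits);   /* 128 */

  /* VEC_OFFSET: {0, X, 2X, 3X} with X = DR_STEP / SCALE.  */
  long x = dr_step / scale;
  for (long i = 0; i < nunits; ++i)
    printf ("vec_offset[%ld] = %ld\n", i, i * x);      /* 0 4 8 12 */
  return 0;
}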
/* Return the amount that should be added to a vector pointer to move
to the next or previous copy of AGGR_TYPE. DR is the data reference
being vectorized and MEMORY_ACCESS_TYPE describes the type of
vectorization. */
static tree
vect_get_data_ptr_increment (data_reference *dr, tree aggr_type,
vect_memory_access_type memory_access_type)
{
if (memory_access_type == VMAT_INVARIANT)
return size_zero_node;
tree iv_step = TYPE_SIZE_UNIT (aggr_type);
tree step = vect_dr_behavior (dr)->step;
if (tree_int_cst_sgn (step) == -1)
iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
return iv_step;
}
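
A minimal model (illustrative names) of what this helper returns for the non-gather paths: zero for an invariant access, otherwise the size of AGGR_TYPE, negated when the step of the data reference is negative.

#include <stddef.h>

/* Illustrative stand-in for vect_get_data_ptr_increment.  */
static long
data_ptr_increment_model (size_t aggr_type_size, long dr_step, int invariant_p)
{
  if (invariant_p)
    return 0;
  return dr_step < 0 ? -(long) aggr_type_size : (long) aggr_type_size;
}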
/* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64}. */
static bool
@@ -7412,6 +7519,9 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
return true;
}
if (memory_access_type == VMAT_GATHER_SCATTER)
grouped_load = false;
if (grouped_load)
{
first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
@@ -7623,13 +7733,29 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
if (memory_access_type == VMAT_LOAD_STORE_LANES)
aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
tree bump;
tree vec_offset = NULL_TREE;
if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
{
aggr_type = NULL_TREE;
bump = NULL_TREE;
}
else if (memory_access_type == VMAT_GATHER_SCATTER)
{
aggr_type = elem_type;
vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info,
&bump, &vec_offset);
}
else
aggr_type = vectype;
{
if (memory_access_type == VMAT_LOAD_STORE_LANES)
aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
else
aggr_type = vectype;
bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type);
}
tree vec_mask = NULL_TREE;
tree vec_offset = NULL_TREE;
prev_stmt_info = NULL;
poly_uint64 group_elt = 0;
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
@@ -7661,7 +7787,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
= vect_create_data_ref_ptr (first_stmt_for_drptr, aggr_type,
at_loop, offset, &dummy, gsi,
&ptr_incr, simd_lane_access_p,
&inv_p, byte_offset);
&inv_p, byte_offset, bump);
/* Adjust the pointer by the difference to first_stmt. */
data_reference_p ptrdr
= STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt_for_drptr));
@@ -7683,7 +7809,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
= vect_create_data_ref_ptr (first_stmt, aggr_type, at_loop,
offset, &dummy, gsi, &ptr_incr,
simd_lane_access_p, &inv_p,
byte_offset);
byte_offset, bump);
if (mask)
vec_mask = vect_get_vec_def_for_operand (mask, stmt,
mask_vectype);
@@ -7692,7 +7818,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
{
if (dataref_offset)
dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
TYPE_SIZE_UNIT (aggr_type));
bump);
else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
{
gimple *def_stmt;
@@ -7701,8 +7827,8 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
vec_offset = vect_get_vec_def_for_stmt_copy (dt, vec_offset);
}
else
dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
TYPE_SIZE_UNIT (aggr_type));
dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
stmt, bump);
if (mask)
{
gimple *def_stmt;
@@ -7778,7 +7904,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
if (i > 0)
dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
stmt, NULL_TREE);
stmt, bump);
/* 2. Create the vector-load in the loop. */
switch (alignment_support_scheme)
...
@@ -1462,7 +1462,7 @@ extern void vect_record_base_alignments (vec_info *);
extern tree vect_create_data_ref_ptr (gimple *, tree, struct loop *, tree,
tree *, gimple_stmt_iterator *,
gimple **, bool, bool *,
tree = NULL_TREE);
tree = NULL_TREE, tree = NULL_TREE);
extern tree bump_vector_ptr (tree, gimple *, gimple_stmt_iterator *, gimple *,
tree);
extern tree vect_create_destination_var (tree, tree);
...