Commit ab7e60ce by Richard Sandiford Committed by Richard Sandiford

Handle SLP permutations for variable-length vectors

The SLP code currently punts for all variable-length permutes.
This patch makes it handle the easy case of N->N permutes in which
the number of vector lanes is a multiple of N.  Every permute then
uses the same mask, and that mask repeats (with a stride) every
N elements.

The patch uses the same path for constant-length vectors,
since it should be slightly cheaper in terms of compile time.

2018-08-24  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* tree-vect-slp.c (vect_transform_slp_perm_load): Separate out
	the case in which the permute needs only a single element and
	repeats for every vector of the result.  Extend that case to
	handle variable-length vectors.
	* tree-vect-stmts.c (vectorizable_load): Update accordingly.

gcc/testsuite/
	* gcc.target/aarch64/sve/slp_perm_1.c: New test.
	* gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_7.c: Likewise.

From-SVN: r263832
parent 1ade64c9
2018-08-24 Richard Sandiford <richard.sandiford@arm.com>
* tree-vect-slp.c (vect_transform_slp_perm_load): Separate out
the case in which the permute needs only a single element and
repeats for every vector of the result. Extend that case to
handle variable-length vectors.
* tree-vect-stmts.c (vectorizable_load): Update accordingly.
2018-08-24 H.J. Lu <hongjiu.lu@intel.com>
PR debug/79342
......
2018-08-24 Richard Sandiford <richard.sandiford@arm.com>
* gcc.target/aarch64/sve/slp_perm_1.c: New test.
* gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_7.c: Likewise.
2018-08-24 H.J. Lu <hongjiu.lu@intel.com>
PR debug/79342
......
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
void
f (uint8_t *restrict a, uint8_t *restrict b)
{
for (int i = 0; i < 100; ++i)
{
a[i * 8] = b[i * 8 + 7] + 1;
a[i * 8 + 1] = b[i * 8 + 6] + 2;
a[i * 8 + 2] = b[i * 8 + 5] + 3;
a[i * 8 + 3] = b[i * 8 + 4] + 4;
a[i * 8 + 4] = b[i * 8 + 3] + 5;
a[i * 8 + 5] = b[i * 8 + 2] + 6;
a[i * 8 + 6] = b[i * 8 + 1] + 7;
a[i * 8 + 7] = b[i * 8 + 0] + 8;
}
}
/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
void
f (uint8_t *restrict a, uint8_t *restrict b)
{
for (int i = 0; i < 100; ++i)
{
a[i * 8] = b[i * 8 + 3] + 1;
a[i * 8 + 1] = b[i * 8 + 2] + 2;
a[i * 8 + 2] = b[i * 8 + 1] + 3;
a[i * 8 + 3] = b[i * 8 + 0] + 4;
a[i * 8 + 4] = b[i * 8 + 7] + 5;
a[i * 8 + 5] = b[i * 8 + 6] + 6;
a[i * 8 + 6] = b[i * 8 + 5] + 7;
a[i * 8 + 7] = b[i * 8 + 4] + 8;
}
}
/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
void
f (uint8_t *restrict a, uint8_t *restrict b)
{
for (int i = 0; i < 100; ++i)
{
a[i * 8] = b[i * 8 + 1] + 1;
a[i * 8 + 1] = b[i * 8 + 0] + 2;
a[i * 8 + 2] = b[i * 8 + 3] + 3;
a[i * 8 + 3] = b[i * 8 + 2] + 4;
a[i * 8 + 4] = b[i * 8 + 5] + 5;
a[i * 8 + 5] = b[i * 8 + 4] + 6;
a[i * 8 + 6] = b[i * 8 + 7] + 7;
a[i * 8 + 7] = b[i * 8 + 6] + 8;
}
}
/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
void
f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
{
for (int i = 0; i < 100; ++i)
{
a[i * 8] = b[i * 8] + c[i * 8];
a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
}
}
/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
void
f (uint8_t *restrict a, uint8_t *restrict b,
uint8_t *restrict c, uint8_t *restrict d)
{
for (int i = 0; i < 100; ++i)
{
a[i * 8] = c[i * 8] + d[i * 8];
a[i * 8 + 1] = c[i * 8] + d[i * 8 + 1];
a[i * 8 + 2] = c[i * 8 + 2] + d[i * 8 + 2];
a[i * 8 + 3] = c[i * 8 + 2] + d[i * 8 + 3];
a[i * 8 + 4] = c[i * 8 + 4] + d[i * 8 + 4];
a[i * 8 + 5] = c[i * 8 + 4] + d[i * 8 + 5];
a[i * 8 + 6] = c[i * 8 + 6] + d[i * 8 + 6];
a[i * 8 + 7] = c[i * 8 + 6] + d[i * 8 + 7];
b[i * 8] = c[i * 8 + 1] + d[i * 8];
b[i * 8 + 1] = c[i * 8 + 1] + d[i * 8 + 1];
b[i * 8 + 2] = c[i * 8 + 3] + d[i * 8 + 2];
b[i * 8 + 3] = c[i * 8 + 3] + d[i * 8 + 3];
b[i * 8 + 4] = c[i * 8 + 5] + d[i * 8 + 4];
b[i * 8 + 5] = c[i * 8 + 5] + d[i * 8 + 5];
b[i * 8 + 6] = c[i * 8 + 7] + d[i * 8 + 6];
b[i * 8 + 7] = c[i * 8 + 7] + d[i * 8 + 7];
}
}
/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
/* { dg-final { scan-assembler {\ttrn2\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
void
f (uint8_t *restrict a, uint8_t *restrict b)
{
for (int i = 0; i < 100; ++i)
{
a[i * 8] = b[i * 8 + 3] + 1;
a[i * 8 + 1] = b[i * 8 + 6] + 1;
a[i * 8 + 2] = b[i * 8 + 0] + 1;
a[i * 8 + 3] = b[i * 8 + 2] + 1;
a[i * 8 + 4] = b[i * 8 + 1] + 1;
a[i * 8 + 5] = b[i * 8 + 7] + 1;
a[i * 8 + 6] = b[i * 8 + 5] + 1;
a[i * 8 + 7] = b[i * 8 + 4] + 1;
}
}
/* { dg-final { scan-assembler-times {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
void
f (uint8_t *restrict a, uint8_t *restrict b)
{
for (int i = 0; i < 100; ++i)
{
a[i * 8] = b[i * 8 + 1] + 1;
a[i * 8 + 1] = b[i * 8 + 7] + 2;
a[i * 8 + 2] = b[i * 8 + 1] + 3;
a[i * 8 + 3] = b[i * 8 + 7] + 4;
a[i * 8 + 4] = b[i * 8 + 1] + 5;
a[i * 8 + 5] = b[i * 8 + 7] + 6;
a[i * 8 + 6] = b[i * 8 + 1] + 7;
a[i * 8 + 7] = b[i * 8 + 7] + 8;
}
}
/* { dg-final { scan-assembler {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
......@@ -3606,13 +3606,11 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
{
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
vec_info *vinfo = stmt_info->vinfo;
tree mask_element_type = NULL_TREE, mask_type;
int vec_index = 0;
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
unsigned int mask_element;
machine_mode mode;
unsigned HOST_WIDE_INT nunits, const_vf;
if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
return false;
......@@ -3620,22 +3618,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
mode = TYPE_MODE (vectype);
/* At the moment, all permutations are represented using per-element
indices, so we can't cope with variable vector lengths or
vectorization factors. */
if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
|| !vf.is_constant (&const_vf))
return false;
/* The generic VEC_PERM_EXPR code always uses an integral type of the
same size as the vector element being permuted. */
mask_element_type = lang_hooks.types.type_for_mode
(int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
mask_type = get_vectype_for_scalar_type (mask_element_type);
vec_perm_builder mask (nunits, nunits, 1);
mask.quick_grow (nunits);
vec_perm_indices indices;
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* Initialize the vect stmts of NODE to properly insert the generated
stmts later. */
......@@ -3669,14 +3652,53 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
bool noop_p = true;
*n_perms = 0;
for (unsigned int j = 0; j < const_vf; j++)
vec_perm_builder mask;
unsigned int nelts_to_build;
unsigned int nvectors_per_build;
bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
&& multiple_p (nunits, group_size));
if (repeating_p)
{
for (int k = 0; k < group_size; k++)
/* A single vector contains a whole number of copies of the node, so:
(a) all permutes can use the same mask; and
(b) the permutes only need a single vector input. */
mask.new_vector (nunits, group_size, 3);
nelts_to_build = mask.encoded_nelts ();
nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
}
else
{
/* We need to construct a separate mask for each vector statement. */
unsigned HOST_WIDE_INT const_nunits, const_vf;
if (!nunits.is_constant (&const_nunits)
|| !vf.is_constant (&const_vf))
return false;
mask.new_vector (const_nunits, const_nunits, 1);
nelts_to_build = const_vf * group_size;
nvectors_per_build = 1;
}
unsigned int count = mask.encoded_nelts ();
mask.quick_grow (count);
vec_perm_indices indices;
for (unsigned int j = 0; j < nelts_to_build; j++)
{
unsigned int iter_num = j / group_size;
unsigned int stmt_num = j % group_size;
unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
+ SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
if (repeating_p)
{
unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
+ j * DR_GROUP_SIZE (stmt_info));
vec_index = i / nunits;
mask_element = i % nunits;
first_vec_index = 0;
mask_element = i;
}
else
{
/* Enforced before the loop when !repeating_p. */
unsigned int const_nunits = nunits.to_constant ();
vec_index = i / const_nunits;
mask_element = i % const_nunits;
if (vec_index == first_vec_index
|| first_vec_index == -1)
{
......@@ -3686,7 +3708,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
|| second_vec_index == -1)
{
second_vec_index = vec_index;
mask_element += nunits;
mask_element += const_nunits;
}
else
{
......@@ -3702,50 +3724,54 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
return false;
}
gcc_assert (mask_element < 2 * nunits);
if (mask_element != index)
noop_p = false;
mask[index++] = mask_element;
gcc_assert (mask_element < 2 * const_nunits);
}
if (mask_element != index)
noop_p = false;
mask[index++] = mask_element;
if (index == nunits && !noop_p)
if (index == count && !noop_p)
{
indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
if (!can_vec_perm_const_p (mode, indices))
{
indices.new_vector (mask, 2, nunits);
if (!can_vec_perm_const_p (mode, indices))
if (dump_enabled_p ())
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION,
vect_location,
"unsupported vect permute { ");
for (i = 0; i < count; ++i)
{
dump_printf_loc (MSG_MISSED_OPTIMIZATION,
vect_location,
"unsupported vect permute { ");
for (i = 0; i < nunits; ++i)
{
dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
dump_printf (MSG_MISSED_OPTIMIZATION, " ");
}
dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
dump_printf (MSG_MISSED_OPTIMIZATION, " ");
}
gcc_assert (analyze_only);
return false;
dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
}
++*n_perms;
gcc_assert (analyze_only);
return false;
}
if (index == nunits)
++*n_perms;
}
if (index == count)
{
if (!analyze_only)
{
if (!analyze_only)
{
tree mask_vec = NULL_TREE;
tree mask_vec = NULL_TREE;
if (! noop_p)
mask_vec = vec_perm_indices_to_tree (mask_type, indices);
if (! noop_p)
mask_vec = vect_gen_perm_mask_checked (vectype, indices);
if (second_vec_index == -1)
second_vec_index = first_vec_index;
if (second_vec_index == -1)
second_vec_index = first_vec_index;
for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
{
/* Generate the permute statement if necessary. */
tree first_vec = dr_chain[first_vec_index];
tree second_vec = dr_chain[second_vec_index];
tree first_vec = dr_chain[first_vec_index + ri];
tree second_vec = dr_chain[second_vec_index + ri];
stmt_vec_info perm_stmt_info;
if (! noop_p)
{
......@@ -3771,12 +3797,12 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
= perm_stmt_info;
}
index = 0;
first_vec_index = -1;
second_vec_index = -1;
noop_p = true;
}
index = 0;
first_vec_index = -1;
second_vec_index = -1;
noop_p = true;
}
}
......
......@@ -8003,13 +8003,18 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
if (slp)
{
grouped_load = false;
/* For SLP permutation support we need to load the whole group,
not only the number of vector stmts the permutation result
fits in. */
if (slp_perm)
/* If an SLP permutation is from N elements to N elements,
and if one vector holds a whole number of N, we can load
the inputs to the permutation in the same way as an
unpermuted sequence. In other cases we need to load the
whole group, not only the number of vector stmts the
permutation result fits in. */
if (slp_perm
&& (group_size != SLP_INSTANCE_GROUP_SIZE (slp_node_instance)
|| !multiple_p (nunits, group_size)))
{
/* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
variable VF. */
/* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for
variable VF; see vect_transform_slp_perm_load. */
unsigned int const_vf = vf.to_constant ();
unsigned int const_nunits = nunits.to_constant ();
vec_num = CEIL (group_size * const_vf, const_nunits);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment