Commit ab7e60ce by Richard Sandiford Committed by Richard Sandiford

Handle SLP permutations for variable-length vectors

The SLP code currently punts for all variable-length permutes.
This patch makes it handle the easy case of N->N permutes in which
the number of vector lanes is a multiple of N.  Every permute then
uses the same mask, and that mask repeats (with a stride) every
N elements.

The patch uses the same path for constant-length vectors,
since it should be slightly cheaper in terms of compile time.

2018-08-24  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* tree-vect-slp.c (vect_transform_slp_perm_load): Separate out
	the case in which the permute needs only a single element and
	repeats for every vector of the result.  Extend that case to
	handle variable-length vectors.
	* tree-vect-stmts.c (vectorizable_load): Update accordingly.

gcc/testsuite/
	* gcc.target/aarch64/sve/slp_perm_1.c: New test.
	* gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
	* gcc.target/aarch64/sve/slp_perm_7.c: Likewise.

From-SVN: r263832
parent 1ade64c9
2018-08-24 Richard Sandiford <richard.sandiford@arm.com>
* tree-vect-slp.c (vect_transform_slp_perm_load): Separate out
the case in which the permute needs only a single element and
repeats for every vector of the result. Extend that case to
handle variable-length vectors.
* tree-vect-stmts.c (vectorizable_load): Update accordingly.
2018-08-24 H.J. Lu <hongjiu.lu@intel.com>
PR debug/79342
......
2018-08-24 Richard Sandiford <richard.sandiford@arm.com>
* gcc.target/aarch64/sve/slp_perm_1.c: New test.
* gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_7.c: Likewise.
2018-08-24 H.J. Lu <hongjiu.lu@intel.com>
PR debug/79342
......
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
/* Fill 100 consecutive groups of eight bytes in A from the matching
   groups in B.  Within each group the load order is fully reversed
   (output lane K reads input lane 7 - K) and output lane K has the
   constant K + 1 added to it.  Both arrays are indexed up to element
   799.  The statements are deliberately written out one per lane; the
   dg-final directive that follows this function expects exactly one
   REVB on .d elements in the generated assembly.  */
void
f (uint8_t *restrict a, uint8_t *restrict b)
{
for (int i = 0; i < 100; ++i)
{
/* Load permutation for one group: { 7, 6, 5, 4, 3, 2, 1, 0 }.  */
a[i * 8] = b[i * 8 + 7] + 1;
a[i * 8 + 1] = b[i * 8 + 6] + 2;
a[i * 8 + 2] = b[i * 8 + 5] + 3;
a[i * 8 + 3] = b[i * 8 + 4] + 4;
a[i * 8 + 4] = b[i * 8 + 3] + 5;
a[i * 8 + 5] = b[i * 8 + 2] + 6;
a[i * 8 + 6] = b[i * 8 + 1] + 7;
a[i * 8 + 7] = b[i * 8 + 0] + 8;
}
}
/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
/* Fill 100 consecutive groups of eight bytes in A from the matching
   groups in B.  Each group is treated as two halves of four bytes and
   the bytes are reversed within each half; output lane K has the
   constant K + 1 added to it.  Both arrays are indexed up to element
   799.  The dg-final directive that follows this function expects
   exactly one REVB on .s elements in the generated assembly.  */
void
f (uint8_t *restrict a, uint8_t *restrict b)
{
for (int i = 0; i < 100; ++i)
{
/* Load permutation for one group: { 3, 2, 1, 0, 7, 6, 5, 4 }.  */
a[i * 8] = b[i * 8 + 3] + 1;
a[i * 8 + 1] = b[i * 8 + 2] + 2;
a[i * 8 + 2] = b[i * 8 + 1] + 3;
a[i * 8 + 3] = b[i * 8 + 0] + 4;
a[i * 8 + 4] = b[i * 8 + 7] + 5;
a[i * 8 + 5] = b[i * 8 + 6] + 6;
a[i * 8 + 6] = b[i * 8 + 5] + 7;
a[i * 8 + 7] = b[i * 8 + 4] + 8;
}
}
/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
/* Fill 100 consecutive groups of eight bytes in A from the matching
   groups in B.  Adjacent byte pairs are swapped within each group;
   output lane K has the constant K + 1 added to it.  Both arrays are
   indexed up to element 799.  The dg-final directive that follows this
   function expects exactly one REVB on .h elements in the generated
   assembly.  */
void
f (uint8_t *restrict a, uint8_t *restrict b)
{
for (int i = 0; i < 100; ++i)
{
/* Load permutation for one group: { 1, 0, 3, 2, 5, 4, 7, 6 }.  */
a[i * 8] = b[i * 8 + 1] + 1;
a[i * 8 + 1] = b[i * 8 + 0] + 2;
a[i * 8 + 2] = b[i * 8 + 3] + 3;
a[i * 8 + 3] = b[i * 8 + 2] + 4;
a[i * 8 + 4] = b[i * 8 + 5] + 5;
a[i * 8 + 5] = b[i * 8 + 4] + 6;
a[i * 8 + 6] = b[i * 8 + 7] + 7;
a[i * 8 + 7] = b[i * 8 + 6] + 8;
}
}
/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
/* Fill 100 consecutive groups of eight bytes in A with the sum of a
   permuted load from B and an in-order load from C.  Each even lane of
   B is used twice (for output lanes K and K + 1), so the odd lanes of B
   are never read; C is read lane for lane.  All three arrays are
   indexed up to element 799.  The dg-final directive that follows this
   function expects a TRN1 on .b elements in the generated assembly.  */
void
f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
{
for (int i = 0; i < 100; ++i)
{
/* Load permutation for B: { 0, 0, 2, 2, 4, 4, 6, 6 }; C is unpermuted.  */
a[i * 8] = b[i * 8] + c[i * 8];
a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
}
}
/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
/* Fill 100 consecutive groups of eight bytes in both A and B from the
   matching groups in C and D.  A receives the even lanes of C, each
   used twice, plus D in order; B receives the odd lanes of C, each used
   twice, plus D in order.  All four arrays are indexed up to element
   799.  The dg-final directives that follow this function expect both a
   TRN1 and a TRN2 on .b elements in the generated assembly.  */
void
f (uint8_t *restrict a, uint8_t *restrict b,
uint8_t *restrict c, uint8_t *restrict d)
{
for (int i = 0; i < 100; ++i)
{
/* Stores to A: load permutation for C is { 0, 0, 2, 2, 4, 4, 6, 6 }.  */
a[i * 8] = c[i * 8] + d[i * 8];
a[i * 8 + 1] = c[i * 8] + d[i * 8 + 1];
a[i * 8 + 2] = c[i * 8 + 2] + d[i * 8 + 2];
a[i * 8 + 3] = c[i * 8 + 2] + d[i * 8 + 3];
a[i * 8 + 4] = c[i * 8 + 4] + d[i * 8 + 4];
a[i * 8 + 5] = c[i * 8 + 4] + d[i * 8 + 5];
a[i * 8 + 6] = c[i * 8 + 6] + d[i * 8 + 6];
a[i * 8 + 7] = c[i * 8 + 6] + d[i * 8 + 7];
/* Stores to B: load permutation for C is { 1, 1, 3, 3, 5, 5, 7, 7 }.  */
b[i * 8] = c[i * 8 + 1] + d[i * 8];
b[i * 8 + 1] = c[i * 8 + 1] + d[i * 8 + 1];
b[i * 8 + 2] = c[i * 8 + 3] + d[i * 8 + 2];
b[i * 8 + 3] = c[i * 8 + 3] + d[i * 8 + 3];
b[i * 8 + 4] = c[i * 8 + 5] + d[i * 8 + 4];
b[i * 8 + 5] = c[i * 8 + 5] + d[i * 8 + 5];
b[i * 8 + 6] = c[i * 8 + 7] + d[i * 8 + 6];
b[i * 8 + 7] = c[i * 8 + 7] + d[i * 8 + 7];
}
}
/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
/* { dg-final { scan-assembler {\ttrn2\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
/* Fill 100 consecutive groups of eight bytes in A from the matching
   groups in B.  Every input lane is used exactly once but in an
   irregular order, and the same constant 1 is added to every lane.
   Both arrays are indexed up to element 799.  The dg-final directive
   that follows this function expects exactly one TBL on .b elements in
   the generated assembly.  */
void
f (uint8_t *restrict a, uint8_t *restrict b)
{
for (int i = 0; i < 100; ++i)
{
/* Load permutation for one group: { 3, 6, 0, 2, 1, 7, 5, 4 }.  */
a[i * 8] = b[i * 8 + 3] + 1;
a[i * 8 + 1] = b[i * 8 + 6] + 1;
a[i * 8 + 2] = b[i * 8 + 0] + 1;
a[i * 8 + 3] = b[i * 8 + 2] + 1;
a[i * 8 + 4] = b[i * 8 + 1] + 1;
a[i * 8 + 5] = b[i * 8 + 7] + 1;
a[i * 8 + 6] = b[i * 8 + 5] + 1;
a[i * 8 + 7] = b[i * 8 + 4] + 1;
}
}
/* { dg-final { scan-assembler-times {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include <stdint.h>
/* Fill 100 consecutive groups of eight bytes in A from the matching
   groups in B.  Only lanes 1 and 7 of each input group are read, in
   alternation; output lane K has the constant K + 1 added to it.  Both
   arrays are indexed up to element 799.  The dg-final directive that
   follows this function expects a TBL on .b elements in the generated
   assembly.  */
void
f (uint8_t *restrict a, uint8_t *restrict b)
{
for (int i = 0; i < 100; ++i)
{
/* Load permutation for one group: { 1, 7, 1, 7, 1, 7, 1, 7 }.  */
a[i * 8] = b[i * 8 + 1] + 1;
a[i * 8 + 1] = b[i * 8 + 7] + 2;
a[i * 8 + 2] = b[i * 8 + 1] + 3;
a[i * 8 + 3] = b[i * 8 + 7] + 4;
a[i * 8 + 4] = b[i * 8 + 1] + 5;
a[i * 8 + 5] = b[i * 8 + 7] + 6;
a[i * 8 + 6] = b[i * 8 + 1] + 7;
a[i * 8 + 7] = b[i * 8 + 7] + 8;
}
}
/* { dg-final { scan-assembler {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
...@@ -3606,13 +3606,11 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, ...@@ -3606,13 +3606,11 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
{ {
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
vec_info *vinfo = stmt_info->vinfo; vec_info *vinfo = stmt_info->vinfo;
tree mask_element_type = NULL_TREE, mask_type;
int vec_index = 0; int vec_index = 0;
tree vectype = STMT_VINFO_VECTYPE (stmt_info); tree vectype = STMT_VINFO_VECTYPE (stmt_info);
int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
unsigned int mask_element; unsigned int mask_element;
machine_mode mode; machine_mode mode;
unsigned HOST_WIDE_INT nunits, const_vf;
if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
return false; return false;
...@@ -3620,22 +3618,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, ...@@ -3620,22 +3618,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
mode = TYPE_MODE (vectype); mode = TYPE_MODE (vectype);
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* At the moment, all permutations are represented using per-element
indices, so we can't cope with variable vector lengths or
vectorization factors. */
if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
|| !vf.is_constant (&const_vf))
return false;
/* The generic VEC_PERM_EXPR code always uses an integral type of the
same size as the vector element being permuted. */
mask_element_type = lang_hooks.types.type_for_mode
(int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
mask_type = get_vectype_for_scalar_type (mask_element_type);
vec_perm_builder mask (nunits, nunits, 1);
mask.quick_grow (nunits);
vec_perm_indices indices;
/* Initialize the vect stmts of NODE to properly insert the generated /* Initialize the vect stmts of NODE to properly insert the generated
stmts later. */ stmts later. */
...@@ -3669,14 +3652,53 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, ...@@ -3669,14 +3652,53 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
bool noop_p = true; bool noop_p = true;
*n_perms = 0; *n_perms = 0;
for (unsigned int j = 0; j < const_vf; j++) vec_perm_builder mask;
unsigned int nelts_to_build;
unsigned int nvectors_per_build;
bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
&& multiple_p (nunits, group_size));
if (repeating_p)
{ {
for (int k = 0; k < group_size; k++) /* A single vector contains a whole number of copies of the node, so:
(a) all permutes can use the same mask; and
(b) the permutes only need a single vector input. */
mask.new_vector (nunits, group_size, 3);
nelts_to_build = mask.encoded_nelts ();
nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
}
else
{
/* We need to construct a separate mask for each vector statement. */
unsigned HOST_WIDE_INT const_nunits, const_vf;
if (!nunits.is_constant (&const_nunits)
|| !vf.is_constant (&const_vf))
return false;
mask.new_vector (const_nunits, const_nunits, 1);
nelts_to_build = const_vf * group_size;
nvectors_per_build = 1;
}
unsigned int count = mask.encoded_nelts ();
mask.quick_grow (count);
vec_perm_indices indices;
for (unsigned int j = 0; j < nelts_to_build; j++)
{
unsigned int iter_num = j / group_size;
unsigned int stmt_num = j % group_size;
unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
+ SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
if (repeating_p)
{ {
unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k] first_vec_index = 0;
+ j * DR_GROUP_SIZE (stmt_info)); mask_element = i;
vec_index = i / nunits; }
mask_element = i % nunits; else
{
/* Enforced before the loop when !repeating_p. */
unsigned int const_nunits = nunits.to_constant ();
vec_index = i / const_nunits;
mask_element = i % const_nunits;
if (vec_index == first_vec_index if (vec_index == first_vec_index
|| first_vec_index == -1) || first_vec_index == -1)
{ {
...@@ -3686,7 +3708,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, ...@@ -3686,7 +3708,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
|| second_vec_index == -1) || second_vec_index == -1)
{ {
second_vec_index = vec_index; second_vec_index = vec_index;
mask_element += nunits; mask_element += const_nunits;
} }
else else
{ {
...@@ -3702,50 +3724,54 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, ...@@ -3702,50 +3724,54 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
return false; return false;
} }
gcc_assert (mask_element < 2 * nunits); gcc_assert (mask_element < 2 * const_nunits);
if (mask_element != index) }
noop_p = false;
mask[index++] = mask_element; if (mask_element != index)
noop_p = false;
mask[index++] = mask_element;
if (index == nunits && !noop_p) if (index == count && !noop_p)
{
indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
if (!can_vec_perm_const_p (mode, indices))
{ {
indices.new_vector (mask, 2, nunits); if (dump_enabled_p ())
if (!can_vec_perm_const_p (mode, indices))
{ {
if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION,
vect_location,
"unsupported vect permute { ");
for (i = 0; i < count; ++i)
{ {
dump_printf_loc (MSG_MISSED_OPTIMIZATION, dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
vect_location, dump_printf (MSG_MISSED_OPTIMIZATION, " ");
"unsupported vect permute { ");
for (i = 0; i < nunits; ++i)
{
dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
dump_printf (MSG_MISSED_OPTIMIZATION, " ");
}
dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
} }
gcc_assert (analyze_only); dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
return false;
} }
gcc_assert (analyze_only);
++*n_perms; return false;
} }
if (index == nunits) ++*n_perms;
}
if (index == count)
{
if (!analyze_only)
{ {
if (!analyze_only) tree mask_vec = NULL_TREE;
{
tree mask_vec = NULL_TREE;
if (! noop_p) if (! noop_p)
mask_vec = vec_perm_indices_to_tree (mask_type, indices); mask_vec = vect_gen_perm_mask_checked (vectype, indices);
if (second_vec_index == -1) if (second_vec_index == -1)
second_vec_index = first_vec_index; second_vec_index = first_vec_index;
for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
{
/* Generate the permute statement if necessary. */ /* Generate the permute statement if necessary. */
tree first_vec = dr_chain[first_vec_index]; tree first_vec = dr_chain[first_vec_index + ri];
tree second_vec = dr_chain[second_vec_index]; tree second_vec = dr_chain[second_vec_index + ri];
stmt_vec_info perm_stmt_info; stmt_vec_info perm_stmt_info;
if (! noop_p) if (! noop_p)
{ {
...@@ -3771,12 +3797,12 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain, ...@@ -3771,12 +3797,12 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
= perm_stmt_info; = perm_stmt_info;
} }
index = 0;
first_vec_index = -1;
second_vec_index = -1;
noop_p = true;
} }
index = 0;
first_vec_index = -1;
second_vec_index = -1;
noop_p = true;
} }
} }
......
...@@ -8003,13 +8003,18 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, ...@@ -8003,13 +8003,18 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
if (slp) if (slp)
{ {
grouped_load = false; grouped_load = false;
/* For SLP permutation support we need to load the whole group, /* If an SLP permutation is from N elements to N elements,
not only the number of vector stmts the permutation result and if one vector holds a whole number of N, we can load
fits in. */ the inputs to the permutation in the same way as an
if (slp_perm) unpermuted sequence. In other cases we need to load the
whole group, not only the number of vector stmts the
permutation result fits in. */
if (slp_perm
&& (group_size != SLP_INSTANCE_GROUP_SIZE (slp_node_instance)
|| !multiple_p (nunits, group_size)))
{ {
/* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for
variable VF. */ variable VF; see vect_transform_slp_perm_load. */
unsigned int const_vf = vf.to_constant (); unsigned int const_vf = vf.to_constant ();
unsigned int const_nunits = nunits.to_constant (); unsigned int const_nunits = nunits.to_constant ();
vec_num = CEIL (group_size * const_vf, const_nunits); vec_num = CEIL (group_size * const_vf, const_nunits);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment