Handle SLP permutations for variable-length vectors

The SLP code currently punts for all variable-length permutes. This patch makes it handle the easy case of N->N permutes in which the number of vector lanes is a multiple of N. Every permute then uses the same mask, and that mask repeats (with a stride) every N elements. The patch uses the same path for constant-length vectors, since it should be slightly cheaper in terms of compile time. 2018-08-24 Richard Sandiford <richard.sandiford@arm.com> gcc/ * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out the case in which the permute needs only a single element and repeats for every vector of the result. Extend that case to handle variable-length vectors. * tree-vect-stmts.c (vectorizable_load): Update accordingly. gcc/testsuite/ * gcc.target/aarch64/sve/slp_perm_1.c: New test. * gcc.target/aarch64/sve/slp_perm_2.c: Likewise. * gcc.target/aarch64/sve/slp_perm_3.c: Likewise. * gcc.target/aarch64/sve/slp_perm_4.c: Likewise. * gcc.target/aarch64/sve/slp_perm_5.c: Likewise. * gcc.target/aarch64/sve/slp_perm_6.c: Likewise. * gcc.target/aarch64/sve/slp_perm_7.c: Likewise. From-SVN: r263832

Handle SLP permutations for variable-length vectors
The SLP code currently punts for all variable-length permutes. This patch makes it handle the easy case of N->N permutes in which the number of vector lanes is a multiple of N. Every permute then uses the same mask, and that mask repeats (with a stride) every N elements. The patch uses the same path for constant-length vectors, since it should be slightly cheaper in terms of compile time. 2018-08-24 Richard Sandiford <richard.sandiford@arm.com> gcc/ * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out the case in which the permute needs only a single element and repeats for every vector of the result. Extend that case to handle variable-length vectors. * tree-vect-stmts.c (vectorizable_load): Update accordingly. gcc/testsuite/ * gcc.target/aarch64/sve/slp_perm_1.c: New test. * gcc.target/aarch64/sve/slp_perm_2.c: Likewise. * gcc.target/aarch64/sve/slp_perm_3.c: Likewise. * gcc.target/aarch64/sve/slp_perm_4.c: Likewise. * gcc.target/aarch64/sve/slp_perm_5.c: Likewise. * gcc.target/aarch64/sve/slp_perm_6.c: Likewise. * gcc.target/aarch64/sve/slp_perm_7.c: Likewise. From-SVN: r263832
ab7e60ce · Richard Sandiford · Richard Sandiford · 1ade64c9 · ab7e60ce · ab7e60ce
Commit ab7e60ce authored Aug 24, 2018 by Richard Sandiford Committed by Richard Sandiford Aug 24, 2018
11 changed files
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
+2018-08-24  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* tree-vect-slp.c (vect_transform_slp_perm_load): Separate out
+	the case in which the permute needs only a single element and
+	repeats for every vector of the result.  Extend that case to
+	handle variable-length vectors.
+	* tree-vect-stmts.c (vectorizable_load): Update accordingly.
+
 2018-08-24  H.J. Lu  <hongjiu.lu@intel.com>

 	PR debug/79342

--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
+2018-08-24  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* gcc.target/aarch64/sve/slp_perm_1.c: New test.
+	* gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
+	* gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
+	* gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
+	* gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
+	* gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
+	* gcc.target/aarch64/sve/slp_perm_7.c: Likewise.
+
 2018-08-24  H.J. Lu  <hongjiu.lu@intel.com>

 	PR debug/79342

--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_1.c
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b)
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      a[i * 8] = b[i * 8 + 7] + 1;
+      a[i * 8 + 1] = b[i * 8 + 6] + 2;
+      a[i * 8 + 2] = b[i * 8 + 5] + 3;
+      a[i * 8 + 3] = b[i * 8 + 4] + 4;
+      a[i * 8 + 4] = b[i * 8 + 3] + 5;
+      a[i * 8 + 5] = b[i * 8 + 2] + 6;
+      a[i * 8 + 6] = b[i * 8 + 1] + 7;
+      a[i * 8 + 7] = b[i * 8 + 0] + 8;
+    }
+}
+
+/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_2.c
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b)
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      a[i * 8] = b[i * 8 + 3] + 1;
+      a[i * 8 + 1] = b[i * 8 + 2] + 2;
+      a[i * 8 + 2] = b[i * 8 + 1] + 3;
+      a[i * 8 + 3] = b[i * 8 + 0] + 4;
+      a[i * 8 + 4] = b[i * 8 + 7] + 5;
+      a[i * 8 + 5] = b[i * 8 + 6] + 6;
+      a[i * 8 + 6] = b[i * 8 + 5] + 7;
+      a[i * 8 + 7] = b[i * 8 + 4] + 8;
+    }
+}
+
+/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_3.c
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b)
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      a[i * 8] = b[i * 8 + 1] + 1;
+      a[i * 8 + 1] = b[i * 8 + 0] + 2;
+      a[i * 8 + 2] = b[i * 8 + 3] + 3;
+      a[i * 8 + 3] = b[i * 8 + 2] + 4;
+      a[i * 8 + 4] = b[i * 8 + 5] + 5;
+      a[i * 8 + 5] = b[i * 8 + 4] + 6;
+      a[i * 8 + 6] = b[i * 8 + 7] + 7;
+      a[i * 8 + 7] = b[i * 8 + 6] + 8;
+    }
+}
+
+/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_4.c
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      a[i * 8] = b[i * 8] + c[i * 8];
+      a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
+      a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
+      a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
+      a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
+      a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
+      a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
+      a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
+    }
+}
+
+/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_5.c
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b,
+   uint8_t *restrict c, uint8_t *restrict d)
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      a[i * 8] = c[i * 8] + d[i * 8];
+      a[i * 8 + 1] = c[i * 8] + d[i * 8 + 1];
+      a[i * 8 + 2] = c[i * 8 + 2] + d[i * 8 + 2];
+      a[i * 8 + 3] = c[i * 8 + 2] + d[i * 8 + 3];
+      a[i * 8 + 4] = c[i * 8 + 4] + d[i * 8 + 4];
+      a[i * 8 + 5] = c[i * 8 + 4] + d[i * 8 + 5];
+      a[i * 8 + 6] = c[i * 8 + 6] + d[i * 8 + 6];
+      a[i * 8 + 7] = c[i * 8 + 6] + d[i * 8 + 7];
+      b[i * 8] = c[i * 8 + 1] + d[i * 8];
+      b[i * 8 + 1] = c[i * 8 + 1] + d[i * 8 + 1];
+      b[i * 8 + 2] = c[i * 8 + 3] + d[i * 8 + 2];
+      b[i * 8 + 3] = c[i * 8 + 3] + d[i * 8 + 3];
+      b[i * 8 + 4] = c[i * 8 + 5] + d[i * 8 + 4];
+      b[i * 8 + 5] = c[i * 8 + 5] + d[i * 8 + 5];
+      b[i * 8 + 6] = c[i * 8 + 7] + d[i * 8 + 6];
+      b[i * 8 + 7] = c[i * 8 + 7] + d[i * 8 + 7];
+    }
+}
+
+/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
+/* { dg-final { scan-assembler {\ttrn2\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_6.c
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b)
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      a[i * 8] = b[i * 8 + 3] + 1;
+      a[i * 8 + 1] = b[i * 8 + 6] + 1;
+      a[i * 8 + 2] = b[i * 8 + 0] + 1;
+      a[i * 8 + 3] = b[i * 8 + 2] + 1;
+      a[i * 8 + 4] = b[i * 8 + 1] + 1;
+      a[i * 8 + 5] = b[i * 8 + 7] + 1;
+      a[i * 8 + 6] = b[i * 8 + 5] + 1;
+      a[i * 8 + 7] = b[i * 8 + 4] + 1;
+    }
+}
+
+/* { dg-final { scan-assembler-times {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_7.c
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b)
+{
+  for (int i = 0; i < 100; ++i)
+    {
+      a[i * 8] = b[i * 8 + 1] + 1;
+      a[i * 8 + 1] = b[i * 8 + 7] + 2;
+      a[i * 8 + 2] = b[i * 8 + 1] + 3;
+      a[i * 8 + 3] = b[i * 8 + 7] + 4;
+      a[i * 8 + 4] = b[i * 8 + 1] + 5;
+      a[i * 8 + 5] = b[i * 8 + 7] + 6;
+      a[i * 8 + 6] = b[i * 8 + 1] + 7;
+      a[i * 8 + 7] = b[i * 8 + 7] + 8;
+    }
+}
+
+/* { dg-final { scan-assembler {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -3606,13 +3606,11 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 {
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  vec_info *vinfo = stmt_info->vinfo;
-  tree mask_element_type = NULL_TREE, mask_type;
  int vec_index = 0;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
+  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  unsigned int mask_element;
  machine_mode mode;
-  unsigned HOST_WIDE_INT nunits, const_vf;

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;
@@ -3620,22 +3618,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);

  mode = TYPE_MODE (vectype);
-
-  /* At the moment, all permutations are represented using per-element
-     indices, so we can't cope with variable vector lengths or
-     vectorization factors.  */
-  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
-      || !vf.is_constant (&const_vf))
-    return false;
-
-  /* The generic VEC_PERM_EXPR code always uses an integral type of the
-     same size as the vector element being permuted.  */
-  mask_element_type = lang_hooks.types.type_for_mode
-    (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
-  mask_type = get_vectype_for_scalar_type (mask_element_type);
-  vec_perm_builder mask (nunits, nunits, 1);
-  mask.quick_grow (nunits);
-  vec_perm_indices indices;
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
@@ -3669,14 +3652,53 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
  bool noop_p = true;
  *n_perms = 0;

-  for (unsigned int j = 0; j < const_vf; j++)
+  vec_perm_builder mask;
+  unsigned int nelts_to_build;
+  unsigned int nvectors_per_build;
+  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
+		      && multiple_p (nunits, group_size));
+  if (repeating_p)
    {
-      for (int k = 0; k < group_size; k++)
+      /* A single vector contains a whole number of copies of the node, so:
+	 (a) all permutes can use the same mask; and
+	 (b) the permutes only need a single vector input.  */
+      mask.new_vector (nunits, group_size, 3);
+      nelts_to_build = mask.encoded_nelts ();
+      nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
+    }
+  else
+    {
+      /* We need to construct a separate mask for each vector statement.  */
+      unsigned HOST_WIDE_INT const_nunits, const_vf;
+      if (!nunits.is_constant (&const_nunits)
+	  || !vf.is_constant (&const_vf))
+	return false;
+      mask.new_vector (const_nunits, const_nunits, 1);
+      nelts_to_build = const_vf * group_size;
+      nvectors_per_build = 1;
+    }
+
+  unsigned int count = mask.encoded_nelts ();
+  mask.quick_grow (count);
+  vec_perm_indices indices;
+
+  for (unsigned int j = 0; j < nelts_to_build; j++)
+    {
+      unsigned int iter_num = j / group_size;
+      unsigned int stmt_num = j % group_size;
+      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
+			+ SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
+      if (repeating_p)
 	{
-	  unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
-			    + j * DR_GROUP_SIZE (stmt_info));
-	  vec_index = i / nunits;
-	  mask_element = i % nunits;
+	  first_vec_index = 0;
+	  mask_element = i;
+	}
+      else
+	{
+	  /* Enforced before the loop when !repeating_p.  */
+	  unsigned int const_nunits = nunits.to_constant ();
+	  vec_index = i / const_nunits;
+	  mask_element = i % const_nunits;
 	  if (vec_index == first_vec_index
 	      || first_vec_index == -1)
 	    {
@@ -3686,7 +3708,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 		   || second_vec_index == -1)
 	    {
 	      second_vec_index = vec_index;
-	      mask_element += nunits;
+	      mask_element += const_nunits;
 	    }
 	  else
 	    {
@@ -3702,50 +3724,54 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 	      return false;
 	    }

-	  gcc_assert (mask_element < 2 * nunits);
-	  if (mask_element != index)
-	    noop_p = false;
-	  mask[index++] = mask_element;
+	  gcc_assert (mask_element < 2 * const_nunits);
+	}
+
+      if (mask_element != index)
+	noop_p = false;
+      mask[index++] = mask_element;

-	  if (index == nunits && !noop_p)
+      if (index == count && !noop_p)
+	{
+	  indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
+	  if (!can_vec_perm_const_p (mode, indices))
 	    {
-	      indices.new_vector (mask, 2, nunits);
-	      if (!can_vec_perm_const_p (mode, indices))
+	      if (dump_enabled_p ())
 		{
-		  if (dump_enabled_p ())
+		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+				   vect_location,
+				   "unsupported vect permute { ");
+		  for (i = 0; i < count; ++i)
 		    {
-		      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-				       vect_location, 
-				       "unsupported vect permute { ");
-		      for (i = 0; i < nunits; ++i)
-			{
-			  dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
-			  dump_printf (MSG_MISSED_OPTIMIZATION, " ");
-			}
-		      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
+		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
+		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
 		    }
-		  gcc_assert (analyze_only);
-		  return false;
+		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
 		}
-
-	      ++*n_perms;
+	      gcc_assert (analyze_only);
+	      return false;
 	    }

-	  if (index == nunits)
+	  ++*n_perms;
+	}
+
+      if (index == count)
+	{
+	  if (!analyze_only)
 	    {
-	      if (!analyze_only)
-		{
-		  tree mask_vec = NULL_TREE;
+	      tree mask_vec = NULL_TREE;
 		  
-		  if (! noop_p)
-		    mask_vec = vec_perm_indices_to_tree (mask_type, indices);
+	      if (! noop_p)
+		mask_vec = vect_gen_perm_mask_checked (vectype, indices);

-		  if (second_vec_index == -1)
-		    second_vec_index = first_vec_index;
+	      if (second_vec_index == -1)
+		second_vec_index = first_vec_index;

+	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
+		{
 		  /* Generate the permute statement if necessary.  */
-		  tree first_vec = dr_chain[first_vec_index];
-		  tree second_vec = dr_chain[second_vec_index];
+		  tree first_vec = dr_chain[first_vec_index + ri];
+		  tree second_vec = dr_chain[second_vec_index + ri];
 		  stmt_vec_info perm_stmt_info;
 		  if (! noop_p)
 		    {
@@ -3771,12 +3797,12 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
 		    = perm_stmt_info;
 		}
-
-	      index = 0;
-	      first_vec_index = -1;
-	      second_vec_index = -1;
-	      noop_p = true;
 	    }
+
+	  index = 0;
+	  first_vec_index = -1;
+	  second_vec_index = -1;
+	  noop_p = true;
 	}
    }


--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -8003,13 +8003,18 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
      if (slp)
 	{
 	  grouped_load = false;
-	  /* For SLP permutation support we need to load the whole group,
-	     not only the number of vector stmts the permutation result
-	     fits in.  */
-	  if (slp_perm)
+	  /* If an SLP permutation is from N elements to N elements,
+	     and if one vector holds a whole number of N, we can load
+	     the inputs to the permutation in the same way as an
+	     unpermuted sequence.  In other cases we need to load the
+	     whole group, not only the number of vector stmts the
+	     permutation result fits in.  */
+	  if (slp_perm
+	      && (group_size != SLP_INSTANCE_GROUP_SIZE (slp_node_instance)
+		  || !multiple_p (nunits, group_size)))
 	    {
-	      /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
-		 variable VF.  */
+	      /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for
+		 variable VF; see vect_transform_slp_perm_load.  */
 	      unsigned int const_vf = vf.to_constant ();
 	      unsigned int const_nunits = nunits.to_constant ();
 	      vec_num = CEIL (group_size * const_vf, const_nunits);