Commit e09b4c37 authored by Richard Biener, committed by Richard Biener

tree-vect-stmts.c (vectorizable_load): Remove restrictions on strided SLP loads and fall back to scalar loads in case we can't chunk them.

2016-06-08  Richard Biener  <rguenther@suse.de>

	* tree-vect-stmts.c (vectorizable_load): Remove restrictions
	on strided SLP loads and fall back to scalar loads in case
	we can't chunk them.

	* gcc.dg/vect/slp-43.c: New testcase.

From-SVN: r237215
parent 72d50660
gcc/ChangeLog

2016-06-08  Richard Biener  <rguenther@suse.de>

        * tree-vect-stmts.c (vectorizable_load): Remove restrictions
        on strided SLP loads and fall back to scalar loads in case
        we can't chunk them.

2016-06-08  Richard Biener  <rguenther@suse.de>

        PR tree-optimization/71452
        * tree-ssa.c (non_rewritable_lvalue_p): Make sure that the
        type used for the SSA rewrite has enough precision to cover
        ...

gcc/testsuite/ChangeLog

2016-06-08  Richard Biener  <rguenther@suse.de>

        * gcc.dg/vect/slp-43.c: New testcase.

2016-06-08  Richard Biener  <rguenther@suse.de>

        PR tree-optimization/71452
        * gcc.dg/torture/pr71452.c: New testcase.
        ...
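The heart of the change is how vectorizable_load now picks a load granularity for strided SLP groups instead of rejecting awkward group sizes: sub-vector loads when the group size evenly divides the vector, one whole-vector load when the vector size evenly divides the group, and element-wise scalar loads otherwise. The following standalone sketch models only that arithmetic; choose_chunking, the plain-int parameters and the 4-element vector are illustrative assumptions, not GCC code.

#include <stdio.h>

/* Hypothetical stand-in for the chunking decision added to
   vectorizable_load: given the SLP group size and the number of vector
   elements (nunits), pick how many loads feed one vector statement
   (nloads) and how many group elements each load covers (lnel).  */
static void
choose_chunking (int group_size, int nunits)
{
  int nloads = nunits;           /* default: one scalar load per element */
  int lnel = 1;
  const char *kind = "scalar elements";

  if (group_size < nunits && nunits % group_size == 0)
    {
      /* Group divides the vector: load sub-vectors and concatenate.  */
      nloads = nunits / group_size;
      lnel = group_size;
      kind = "sub-vectors of group_size elements";
    }
  else if (group_size >= nunits && group_size % nunits == 0)
    {
      /* Vector divides the group: one whole-vector load.  */
      nloads = 1;
      lnel = nunits;
      kind = "a full vector";
    }
  /* Otherwise fall back to element-wise loads; before the patch this
     case was rejected as "unhandled strided group load".  */

  printf ("group_size=%2d nunits=%d -> %2d load(s) x %d element(s), %s\n",
          group_size, nunits, nloads, lnel, kind);
}

int
main (void)
{
  /* The group sizes exercised by the new testcase, against an assumed
     4-element vector (e.g. four ints in a 128-bit vector).  */
  int groups[] = { 1, 2, 3, 4, 6, 8, 16 };
  for (unsigned i = 0; i < sizeof groups / sizeof groups[0]; ++i)
    choose_chunking (groups[i], 4);
  return 0;
}

For group sizes 3 and 6 neither condition holds, so the loads stay scalar; those are exactly the cases the removed check used to reject.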
gcc/testsuite/gcc.dg/vect/slp-43.c (new file)

/* { dg-do run } */
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */

#include <string.h>
#include "tree-vect.h"

#define FOO(T,N) \
void __attribute__((noinline,noclone)) \
foo_ ## T ## _ ## N (T * __restrict__ in_, T * __restrict__ out_, int s) \
{ \
  T *in = __builtin_assume_aligned (in_, __BIGGEST_ALIGNMENT__); \
  T *out = __builtin_assume_aligned (out_, __BIGGEST_ALIGNMENT__); \
  for (int i = 0; i < 16; i++) \
    { \
      for (int j = 0; j < N; ++j) \
        out[j] = in[j]; \
      in += s*N; \
      out += N; \
    } \
}

#define TEST(T,N) \
  do { \
    memset (out, 0, 4096); \
    foo_ ## T ## _ ## N ((T *)in, (T *)out, 1); \
    if (memcmp (in, out, sizeof (T) * 16 * N) != 0) \
      __builtin_abort (); \
    for (int i = sizeof (T) * 16 * N; i < 4096; ++i) \
      if (out[i] != 0) \
        __builtin_abort (); \
  } while (0)

FOO(char, 1)
FOO(char, 2)
FOO(char, 3)
FOO(char, 4)
FOO(char, 6)
FOO(char, 8)
FOO(int, 1)
FOO(int, 2)
FOO(int, 3)
FOO(int, 4)
FOO(int, 6)
FOO(int, 8)
FOO(int, 16)

char in[4096] __attribute__((aligned(__BIGGEST_ALIGNMENT__)));
char out[4096] __attribute__((aligned(__BIGGEST_ALIGNMENT__)));

int main()
{
  check_vect ();

  for (int i = 0; i < 4096; ++i)
    {
      in[i] = i;
      __asm__ volatile ("" : : : "memory");
    }

  TEST(char, 1);
  TEST(char, 2);
  TEST(char, 3);
  TEST(char, 4);
  TEST(char, 6);
  TEST(char, 8);
  TEST(int, 1);
  TEST(int, 2);
  TEST(int, 3);
  TEST(int, 4);
  TEST(int, 6);
  TEST(int, 8);
  TEST(int, 16);

  return 0;
}

/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 13 "vect" } } */
gcc/tree-vect-stmts.c

@@ -6440,17 +6440,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
         }
     }
   else if (STMT_VINFO_STRIDED_P (stmt_info))
-    {
-      if (grouped_load
-          && slp
-          && (group_size > nunits
-              || nunits % group_size != 0))
-        {
-          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                           "unhandled strided group load\n");
-          return false;
-        }
-    }
+    ;
   else
     {
       negative = tree_int_cst_compare (nested_in_vect_loop

@@ -6744,16 +6734,29 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       running_off = offvar;
       alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (first_dr)), 0);
       int nloads = nunits;
+      int lnel = 1;
       tree ltype = TREE_TYPE (vectype);
       auto_vec<tree> dr_chain;
       if (slp)
         {
-          nloads = nunits / group_size;
-          if (group_size < nunits)
-            ltype = build_vector_type (TREE_TYPE (vectype), group_size);
-          else
-            ltype = vectype;
-          ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
+          if (group_size < nunits
+              && nunits % group_size == 0)
+            {
+              nloads = nunits / group_size;
+              lnel = group_size;
+              ltype = build_vector_type (TREE_TYPE (vectype), group_size);
+              ltype = build_aligned_type (ltype,
+                                          TYPE_ALIGN (TREE_TYPE (vectype)));
+            }
+          else if (group_size >= nunits
+                   && group_size % nunits == 0)
+            {
+              nloads = 1;
+              lnel = nunits;
+              ltype = vectype;
+              ltype = build_aligned_type (ltype,
+                                          TYPE_ALIGN (TREE_TYPE (vectype)));
+            }
           /* For SLP permutation support we need to load the whole group,
              not only the number of vector stmts the permutation result
              fits in.  */

@@ -6765,48 +6768,43 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
           else
             ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
         }
+      int group_el = 0;
+      unsigned HOST_WIDE_INT
+        elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
       for (j = 0; j < ncopies; j++)
         {
-          tree vec_inv;
-
           if (nloads > 1)
-            {
-              vec_alloc (v, nloads);
-              for (i = 0; i < nloads; i++)
-                {
-                  tree newref, newoff;
-                  gimple *incr;
-                  newref = build2 (MEM_REF, ltype, running_off, alias_off);
-                  newref = force_gimple_operand_gsi (gsi, newref, true,
-                                                     NULL_TREE, true,
-                                                     GSI_SAME_STMT);
-                  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, newref);
-                  newoff = copy_ssa_name (running_off);
-                  incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
-                                              running_off, stride_step);
-                  vect_finish_stmt_generation (stmt, incr, gsi);
-                  running_off = newoff;
-                }
-
-              vec_inv = build_constructor (vectype, v);
-              new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
-              new_stmt = SSA_NAME_DEF_STMT (new_temp);
-            }
-          else
+            vec_alloc (v, nloads);
+          for (i = 0; i < nloads; i++)
             {
-              new_stmt = gimple_build_assign (make_ssa_name (ltype),
-                                              build2 (MEM_REF, ltype,
-                                                      running_off, alias_off));
+              tree this_off = build_int_cst (TREE_TYPE (alias_off),
+                                             group_el * elsz);
+              new_stmt = gimple_build_assign (make_ssa_name (ltype),
+                                              build2 (MEM_REF, ltype,
+                                                      running_off, this_off));
               vect_finish_stmt_generation (stmt, new_stmt, gsi);
-
-              tree newoff = copy_ssa_name (running_off);
-              gimple *incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
-                                                  running_off, stride_step);
-              vect_finish_stmt_generation (stmt, incr, gsi);
-              running_off = newoff;
+              if (nloads > 1)
+                CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+                                        gimple_assign_lhs (new_stmt));
+              group_el += lnel;
+              if (! slp
+                  || group_el == group_size)
+                {
+                  tree newoff = copy_ssa_name (running_off);
+                  gimple *incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
+                                                      running_off, stride_step);
+                  vect_finish_stmt_generation (stmt, incr, gsi);
+                  running_off = newoff;
+                  group_el = 0;
+                }
             }
-
+          if (nloads > 1)
+            {
+              tree vec_inv = build_constructor (vectype, v);
+              new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
+              new_stmt = SSA_NAME_DEF_STMT (new_temp);
+            }
           if (slp)
...
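On the code-generation side, the rewritten emission loop addresses each group member at the constant offset group_el * elsz from a running pointer and only advances that pointer by stride_step once a full group has been consumed (or after every load in the non-SLP case). Below is a small hedged simulation of that control flow; the concrete numbers (a group of 3 ints, 4-element vectors, three vector statements) are assumed purely for illustration.

#include <stdio.h>

int
main (void)
{
  /* Assumed example: group_size 3 with 4-element vectors takes the
     scalar fallback, i.e. nloads = nunits and lnel = 1.  */
  int group_size = 3, nunits = 4, elsz = 4;   /* elsz: sizeof (int) */
  int nloads = nunits, lnel = 1;
  int ncopies = 3;                            /* number of vector stmts, assumed */
  int slp = 1;

  int group_el = 0;
  for (int j = 0; j < ncopies; j++)
    for (int i = 0; i < nloads; i++)
      {
        /* Mirrors the MEM_REF built with this_off in the patch.  */
        printf ("vector stmt %d, load %d: MEM[running_off + %2d]\n",
                j, i, group_el * elsz);
        group_el += lnel;
        if (!slp || group_el == group_size)
          {
            /* Mirrors the POINTER_PLUS_EXPR that bumps running_off.  */
            printf ("  running_off += stride_step\n");
            group_el = 0;
          }
      }
  return 0;
}

Each MEM line stands for one scalar (or sub-vector) load; when nloads > 1 the real code collects their results in a CONSTRUCTOR and builds the vector with vect_init_vector, as in the hunk above.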