re PR tree-optimization/52252 (An opportunity for x86 gcc vectorizer (gain up to 3 times))

gcc/ * config/i386/i386.c (ix86_reassociation_width): Add alternative for vector case. * config/i386/i386.h (TARGET_VECTOR_PARALLEL_EXECUTION): New. * config/i386/x86-tune.def (X86_TUNE_VECTOR_PARALLEL_EXECUTION): New. * tree-vect-data-refs.c (vect_shift_permute_load_chain): New. Introduces alternative way of loads group permutations. (vect_transform_grouped_load): Try alternative way of permutations. gcc/testsuite/ PR tree-optimization/52252 * gcc.target/i386/pr52252-atom.c: Test on loads group of size 3. * gcc.target/i386/pr52252-core.c: Ditto. PR tree-optimization/61403 * gcc.target/i386/pr61403.c: Test on loads and stores group of size 3. From-SVN: r211769

re PR tree-optimization/52252 (An opportunity for x86 gcc vectorizer (gain up to 3 times))
gcc/ * config/i386/i386.c (ix86_reassociation_width): Add alternative for vector case. * config/i386/i386.h (TARGET_VECTOR_PARALLEL_EXECUTION): New. * config/i386/x86-tune.def (X86_TUNE_VECTOR_PARALLEL_EXECUTION): New. * tree-vect-data-refs.c (vect_shift_permute_load_chain): New. Introduces alternative way of loads group permutations. (vect_transform_grouped_load): Try alternative way of permutations. gcc/testsuite/ PR tree-optimization/52252 * gcc.target/i386/pr52252-atom.c: Test on loads group of size 3. * gcc.target/i386/pr52252-core.c: Ditto. PR tree-optimization/61403 * gcc.target/i386/pr61403.c: Test on loads and stores group of size 3. From-SVN: r211769
f7917029 · Evgeny Stupachenko · Kirill Yukhin · f014c653 · f7917029 · f7917029
Commit f7917029 authored Jun 18, 2014 by Evgeny Stupachenko Committed by Kirill Yukhin Jun 18, 2014
9 changed files
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
+2014-06-18  Evgeny Stupachenko  <evstupac@gmail.com>
+
+	* config/i386/i386.c (ix86_reassociation_width): Add alternative for
+	vector case.
+	* config/i386/i386.h (TARGET_VECTOR_PARALLEL_EXECUTION): New.
+	* config/i386/x86-tune.def (X86_TUNE_VECTOR_PARALLEL_EXECUTION): New.
+	* tree-vect-data-refs.c (vect_shift_permute_load_chain): New.
+	Introduces alternative way of loads group permutations.
+	(vect_transform_grouped_load): Try alternative way of permutations.
+
 2014-06-18  Jakub Jelinek  <jakub@redhat.com>

 	* gimplify.c (omp_notice_variable): If n is non-NULL

--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46429,6 +46429,16 @@ ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
 {
  int res = 1;

+  /* Vector part.  */
+  if (VECTOR_MODE_P (mode))
+    {
+      if (TARGET_VECTOR_PARALLEL_EXECUTION)
+	return 2;
+      else
+	return 1;
+    }
+
+  /* Scalar part.  */
  if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
    res = 2;
  else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -433,6 +433,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 	ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS]
 #define TARGET_SLOW_PSHUFB \
 	ix86_tune_features[X86_TUNE_SLOW_PSHUFB]
+#define TARGET_VECTOR_PARALLEL_EXECUTION \
+	ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION]
 #define TARGET_FUSE_CMP_AND_BRANCH_32 \
 	ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
 #define TARGET_FUSE_CMP_AND_BRANCH_64 \

--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -390,6 +390,11 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
 DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
          m_BONNELL | m_SILVERMONT | m_INTEL)

+/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to
+   execute 2 or more vector instructions in parallel.  */
+DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel",
+          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
+
 /*****************************************************************************/
 /* AVX instruction selection tuning (some of SSE flags affects AVX, too)     */
 /*****************************************************************************/

--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
+2014-06-18  Evgeny Stupachenko  <evstupac@gmail.com>
+
+	PR tree-optimization/52252
+	* gcc.target/i386/pr52252-atom.c: Test on loads group of size 3.
+	* gcc.target/i386/pr52252-core.c: Ditto.
+
+	PR tree-optimization/61403
+	* gcc.target/i386/pr61403.c: Test on loads and stores group of size 3.
+
 2014-06-18  Jakub Jelinek  <jakub@redhat.com>

 	* gfortran.dg/gomp/declare-simd-1.f90: New test.

--- a/gcc/testsuite/gcc.target/i386/pr52252-atom.c
+++ b/gcc/testsuite/gcc.target/i386/pr52252-atom.c
+/* { dg-do compile } */
+/* { dg-require-effective-target ssse3 } */
+/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */
+#define byte unsigned char
+
+void
+matrix_mul (byte *in, byte *out, int size)
+{
+  int i;
+  for (i = 0; i < size; i++)
+    {
+      byte in0 = in[0];
+      byte in1 = in[1];
+      byte in2 = in[2];
+      byte out0, out1, out2, out3;
+      out0 = in0 + in1;
+      out1 = in0 + in2;
+      out2 = in1 + in2;
+      out3 = in0 + in1 + in2;
+      out[0] = out0;
+      out[1] = out1;
+      out[2] = out2;
+      out[3] = out3;
+      in += 3;
+      out += 4;
+    }
+}
+
+/* { dg-final { scan-assembler "palignr" } } */
--- a/gcc/testsuite/gcc.target/i386/pr52252-core.c
+++ b/gcc/testsuite/gcc.target/i386/pr52252-core.c
+/* { dg-do compile } */
+/* { dg-require-effective-target ssse3 } */
+/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=corei7" } */
+#define byte unsigned char
+
+void
+matrix_mul (byte *in, byte *out, int size)
+{
+  int i;
+  for (i = 0; i < size; i++)
+    {
+      byte in0 = in[0];
+      byte in1 = in[1];
+      byte in2 = in[2];
+      byte out0, out1, out2, out3;
+      out0 = in0 + in1;
+      out1 = in0 + in2;
+      out2 = in1 + in2;
+      out3 = in0 + in1 + in2;
+      out[0] = out0;
+      out[1] = out1;
+      out[2] = out2;
+      out[3] = out3;
+      in += 3;
+      out += 4;
+    }
+}
+
+/* { dg-final { scan-assembler "pshufb" } } */
--- a/gcc/testsuite/gcc.target/i386/pr61403.c
+++ b/gcc/testsuite/gcc.target/i386/pr61403.c
+/* { dg-do compile } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.2 -mtune=corei7" } */
+
+#include <math.h>
+
+struct XYZ
+{
+  float x;
+  float y;
+  float z;
+};
+
+void
+norm (struct XYZ *in, struct XYZ *out, int size)
+{
+  int i;
+  for (i = 0; i < size; ++i)
+    {
+      float n = sqrt (in[i].x * in[i].x + in[i].y * in[i].y + in[i].z * in[i].z);
+      out[i].x = in[i].x / n;
+      out[i].y = in[i].y / n;
+      out[i].z = in[i].z / n;
+    }
+}
+
+/* { dg-final { scan-assembler "blend" } } */
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c