Commit f7917029 by Evgeny Stupachenko Committed by Kirill Yukhin

re PR tree-optimization/52252 (An opportunity for x86 gcc vectorizer (gain up to 3 times))

gcc/
	* config/i386/i386.c (ix86_reassociation_width): Add alternative for
	vector case.
	* config/i386/i386.h (TARGET_VECTOR_PARALLEL_EXECUTION): New.
	* config/i386/x86-tune.def (X86_TUNE_VECTOR_PARALLEL_EXECUTION): New.
	* tree-vect-data-refs.c (vect_shift_permute_load_chain): New.
	Introduces alternative way of loads group permutations.
	(vect_transform_grouped_load): Try alternative way of permutations.

gcc/testsuite/
	PR tree-optimization/52252
	* gcc.target/i386/pr52252-atom.c: Test on loads group of size 3.
	* gcc.target/i386/pr52252-core.c: Ditto.

	PR tree-optimization/61403
	* gcc.target/i386/pr61403.c: Test on loads and stores group of size 3.

From-SVN: r211769
parent f014c653
2014-06-18 Evgeny Stupachenko <evstupac@gmail.com>
* config/i386/i386.c (ix86_reassociation_width): Add alternative for
vector case.
* config/i386/i386.h (TARGET_VECTOR_PARALLEL_EXECUTION): New.
* config/i386/x86-tune.def (X86_TUNE_VECTOR_PARALLEL_EXECUTION): New.
* tree-vect-data-refs.c (vect_shift_permute_load_chain): New.
Introduces alternative way of loads group permutations.
(vect_transform_grouped_load): Try alternative way of permutations.
2014-06-18 Jakub Jelinek <jakub@redhat.com>
* gimplify.c (omp_notice_variable): If n is non-NULL
......
......@@ -46429,6 +46429,16 @@ ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
{
int res = 1;
/* Vector part. */
if (VECTOR_MODE_P (mode))
{
if (TARGET_VECTOR_PARALLEL_EXECUTION)
return 2;
else
return 1;
}
/* Scalar part. */
if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
res = 2;
else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
......@@ -433,6 +433,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS]
#define TARGET_SLOW_PSHUFB \
ix86_tune_features[X86_TUNE_SLOW_PSHUFB]
/* Value of the X86_TUNE_VECTOR_PARALLEL_EXECUTION entry of
   ix86_tune_features.  ix86_reassociation_width tests it for vector
   modes.  */
#define TARGET_VECTOR_PARALLEL_EXECUTION \
ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION]
#define TARGET_FUSE_CMP_AND_BRANCH_32 \
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
#define TARGET_FUSE_CMP_AND_BRANCH_64 \
......
......@@ -390,6 +390,11 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
m_BONNELL | m_SILVERMONT | m_INTEL)
/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to
execute 2 or more vector instructions in parallel.  When this is set,
ix86_reassociation_width returns 2 for vector modes; otherwise it
returns 1 for them.  */
DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel",
m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
/*****************************************************************************/
/* AVX instruction selection tuning (some of the SSE flags affect AVX, too) */
/*****************************************************************************/
......
2014-06-18 Evgeny Stupachenko <evstupac@gmail.com>
PR tree-optimization/52252
* gcc.target/i386/pr52252-atom.c: Test on loads group of size 3.
* gcc.target/i386/pr52252-core.c: Ditto.
PR tree-optimization/61403
* gcc.target/i386/pr61403.c: Test on loads and stores group of size 3.
2014-06-18 Jakub Jelinek <jakub@redhat.com>
* gfortran.dg/gomp/declare-simd-1.f90: New test.
......
/* { dg-do compile } */
/* { dg-require-effective-target ssse3 } */
/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */
#define byte unsigned char
/* Vectorizer test kernel.  For SIZE iterations, read a group of three
   consecutive bytes from IN and write a group of four bytes to OUT.
   Despite the name, no multiplication is performed: every output is a
   sum of two or three of the inputs.  The statement shape is what the
   test exercises, so keep it as is.  */
void
matrix_mul (byte *in, byte *out, int size)
{
int i;
for (i = 0; i < size; i++)
{
/* Load group of size 3.  */
byte in0 = in[0];
byte in1 = in[1];
byte in2 = in[2];
byte out0, out1, out2, out3;
/* Each sum is assigned to a byte (unsigned char) variable, so the
   result is converted back to that type.  */
out0 = in0 + in1;
out1 = in0 + in2;
out2 = in1 + in2;
out3 = in0 + in1 + in2;
/* Store group of size 4.  */
out[0] = out0;
out[1] = out1;
out[2] = out2;
out[3] = out3;
/* Step past the three inputs consumed and the four outputs produced.  */
in += 3;
out += 4;
}
}
/* { dg-final { scan-assembler "palignr" } } */
/* { dg-do compile } */
/* { dg-require-effective-target ssse3 } */
/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=corei7" } */
#define byte unsigned char
/* Vectorizer test kernel.  Each of the SIZE iterations consumes three
   adjacent bytes of IN and produces four adjacent bytes of OUT.  The
   name notwithstanding, the outputs are plain sums of the inputs.  The
   code shape is deliberate for the test; do not restructure it.  */
void
matrix_mul (byte *in, byte *out, int size)
{
int i;
for (i = 0; i < size; i++)
{
/* Three adjacent loads form the load group.  */
byte in0 = in[0];
byte in1 = in[1];
byte in2 = in[2];
byte out0, out1, out2, out3;
/* Pairwise sums and the sum of all three; each is stored in a byte
   (unsigned char), so the value is converted to that type.  */
out0 = in0 + in1;
out1 = in0 + in2;
out2 = in1 + in2;
out3 = in0 + in1 + in2;
/* Four adjacent stores form the store group.  */
out[0] = out0;
out[1] = out1;
out[2] = out2;
out[3] = out3;
/* IN advances by 3 and OUT by 4 per iteration.  */
in += 3;
out += 4;
}
}
/* { dg-final { scan-assembler "pshufb" } } */
/* { dg-do compile } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse4.2 -mtune=corei7" } */
#include <math.h>
/* Three-component float vector.  norm reads and writes all three
   members of each element.  */
struct XYZ
{
float x;
float y;
float z;
};
/* For each i in [0, SIZE), store in OUT[i] the vector IN[i] divided by
   its Euclidean length.  The length is not checked for zero before the
   divisions.  The code shape is deliberate for the test; do not
   restructure it.  */
void
norm (struct XYZ *in, struct XYZ *out, int size)
{
int i;
for (i = 0; i < size; ++i)
{
/* n = sqrt (x*x + y*y + z*z) for in[i]; the sqrt result is stored in a
   float.  */
float n = sqrt (in[i].x * in[i].x + in[i].y * in[i].y + in[i].z * in[i].z);
/* Scale every component by 1/n; all three members of out[i] are
   written.  */
out[i].x = in[i].x / n;
out[i].y = in[i].y / n;
out[i].z = in[i].z / n;
}
}
/* { dg-final { scan-assembler "blend" } } */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment