Commit 305f1fb7 by Jakub Jelinek

re PR tree-optimization/88464 (AVX-512 vectorization of masked scatter failing…

re PR tree-optimization/88464 (AVX-512 vectorization of masked scatter failing with "not suitable for scatter store")

	PR tree-optimization/88464
	PR target/88498
	* tree-vect-stmts.c (vect_build_gather_load_calls): For NARROWING
	and mask with integral masktype, don't try to permute mask vectors,
	instead emit VEC_UNPACK_{LO,HI}_EXPR.  Fix up NOP_EXPR operand.
	(vectorizable_store): Handle masked scatters with decl and integral
	mask type.
	(permute_vec_elements): Allow scalar_dest to be NULL.
	* config/i386/i386.c (ix86_get_builtin)
	<case IX86_BUILTIN_GATHER3ALTDIV16SF>: Use lowpart_subreg for masks.
	<case IX86_BUILTIN_GATHER3ALTDIV8SF>: Don't assume mask and src have
	to be the same.

	* gcc.target/i386/avx512f-pr88462-1.c: Rename to ...
	* gcc.target/i386/avx512f-pr88464-1.c: ... this.  Fix up PR number.
	Expect 4 vectorized loops instead of 3.
	(f4): New function.
	* gcc.target/i386/avx512f-pr88462-2.c: Rename to ...
	* gcc.target/i386/avx512f-pr88464-2.c: ... this.  Fix up PR number
	and #include.
	(avx512f_test): Prepare arguments for f4 and check the results.
	* gcc.target/i386/avx512f-pr88464-3.c: New test.
	* gcc.target/i386/avx512f-pr88464-4.c: New test.

From-SVN: r267170
parent b1985ca0
/* PR tree-optimization/88464 */
/* { dg-do compile } */
/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */
/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */
/* Conditional (masked) scatter store with double data and int indices:
   for every I in [0, N) for which B[I] > -2.0, store B[I] into A[C[I]].
   Entries of C whose B[I] fails the test are never used as an index.
   The loop shape is deliberately minimal; it is one of the loops the
   scan-tree-dump-times directives at the top of this file count.  */
__attribute__((noipa)) void
f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n)
{
int i;
/* Ask GCC to assume the following loop has no loop-carried dependences
   that would block vectorization.  */
#pragma GCC ivdep
for (i = 0; i < n; ++i)
if (b[i] > -2.0)
a[c[i]] = b[i];
}
/* Conditional (masked) scatter store with double data and long indices:
   for every I in [0, N) for which B[I] > -2.0, store B[I] into A[C[I]].
   Entries of C whose B[I] fails the test are never used as an index.
   Same loop as f1 except that the index array C has element type long.  */
__attribute__((noipa)) void
f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n)
{
int i;
/* Ask GCC to assume the following loop has no loop-carried dependences
   that would block vectorization.  */
#pragma GCC ivdep
for (i = 0; i < n; ++i)
if (b[i] > -2.0)
a[c[i]] = b[i];
}
/* Conditional (masked) scatter store with float data and int indices:
   for every I in [0, N) for which B[I] > -2.0f, store B[I] into A[C[I]].
   Entries of C whose B[I] fails the test are never used as an index.
   Same loop as f1 except that A and B have element type float.  */
__attribute__((noipa)) void
f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n)
{
int i;
/* Ask GCC to assume the following loop has no loop-carried dependences
   that would block vectorization.  */
#pragma GCC ivdep
for (i = 0; i < n; ++i)
if (b[i] > -2.0f)
a[c[i]] = b[i];
}
/* Conditional (masked) scatter store with float data and long indices:
   for every I in [0, N) for which B[I] > -2.0f, store B[I] into A[C[I]].
   Entries of C whose B[I] fails the test are never used as an index.
   Here the index element type (long) differs from the data element
   type (float).  */
__attribute__((noipa)) void
f4 (float * __restrict__ a, const float * __restrict__ b, const long * __restrict__ c, int n)
{
int i;
/* Ask GCC to assume the following loop has no loop-carried dependences
   that would block vectorization.  */
#pragma GCC ivdep
for (i = 0; i < n; ++i)
if (b[i] > -2.0f)
a[c[i]] = b[i];
}
/* PR tree-optimization/88464 */
/* { dg-do run { target { avx512f } } } */
/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */
#include "avx512f-check.h"
#include "avx512f-pr88464-3.c"
/* Run-time check of the conditional scatter stores f1..f4 from the
   included avx512f-pr88464-3.c.

   Each stage fills the destination with -5.0, marks two out of every
   three source elements as "active" (value > -2.0) and gives them the
   index 1023 - I; the remaining elements get source value -5.0 (which
   fails the > -2.0 test in f1..f4) together with a huge sentinel index
   that must therefore never be used.  Because 1023 is a multiple of 3,
   destination element J is written exactly when source element
   1023 - J is active, and then holds K * (1023 - J) for that stage's
   multiplier K.  The loop following each call verifies this and, in
   the same pass, prepares the inputs for the next stage.  Calls
   abort () on the first mismatch.  */
static void
avx512f_test (void)
{
  double a[1024], b[1024];
  float c[1024], f[1024];
  int d[1024];
  long e[1024];
  int i;

  /* Inputs for f1 (double data, int indices): active when I % 3 != 0.  */
  for (i = 0; i < 1024; i++)
    {
      /* The empty asm claims to read and modify I, so the compiler cannot
	 reason about its value across iterations (presumably to keep
	 these setup/check loops from being optimized away or folded).  */
      asm volatile ("" : "+g" (i));
      a[i] = -5.0;
      b[i] = (i % 3) != 0 ? 2.0 * i : -5.0;
      d[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__;
    }
  f1 (a, b, d, 1024);

  /* Check f1; set up f2 (double data, long indices): active when
     I % 3 != 1, i.e. destination J is written when J % 3 != 2.  */
  for (i = 0; i < 1024; i++)
    {
      asm volatile ("" : "+g" (i));
      if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0))
	abort ();
      a[i] = -5.0;
      b[i] = (i % 3) != 1 ? 3.0 * i : -5.0;
      e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__;
    }
  f2 (a, b, e, 1024);

  /* Check f2; set up f3 (float data, int indices): active when
     I % 3 != 2, i.e. destination J is written when J % 3 != 1.  */
  for (i = 0; i < 1024; i++)
    {
      asm volatile ("" : "+g" (i));
      if (a[i] != ((i % 3) != 2 ? (1023 - i) * 3.0 : -5.0))
	abort ();
      c[i] = -5.0f;
      d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__;
      f[i] = (i % 3) != 2 ? 4.0f * i : -5.0f;
    }
  f3 (c, f, d, 1024);

  /* Check f3; set up f4 (float data, long indices): active when
     I % 3 != 0.  E has element type long, so the sentinel for inactive
     elements is __LONG_MAX__, matching the f2 setup above.  */
  for (i = 0; i < 1024; i++)
    {
      asm volatile ("" : "+g" (i));
      if (c[i] != ((i % 3) != 1 ? (1023 - i) * 4.0f : -5.0f))
	abort ();
      c[i] = -5.0f;
      e[i] = (i % 3) != 0 ? 1023 - i : __LONG_MAX__;
      f[i] = (i % 3) != 0 ? 5.0f * i : -5.0f;
    }
  f4 (c, f, e, 1024);

  /* Check f4.  */
  for (i = 0; i < 1024; i++)
    {
      asm volatile ("" : "+g" (i));
      if (c[i] != ((i % 3) != 0 ? (1023 - i) * 5.0f : -5.0f))
	abort ();
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment