Commit b6f9a04a by Jakub Jelinek (committed by Jakub Jelinek)

i386.c (ix86_expand_vec_perm): In merge_two use mode SUBREG of operands[0] as target.

2011-10-18  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/i386.c (ix86_expand_vec_perm): In merge_two use
	mode SUBREG of operands[0] as target.
	(valid_perm_using_mode_p): Don't ignore higher bits of d->perm.
	(expand_vec_perm_pshufb): For V8SImode vmode emit avx2_permvarv8si.
	(expand_vec_perm_1): Handle identity and some broadcast
	permutations.
	(expand_vec_perm_interleave2): Handle also 32-byte modes, using
	vperm2[fi]128 or vpunpck[lh]* followed by single insn permutation.
	For d->testing_p return true earlier to avoid creating more GC
	garbage.
	(expand_vec_perm_vpermq_perm_1): New function.
	(expand_vec_perm_vpshufb2_vpermq): For d->testing_p return true
	earlier to avoid creating more GC garbage.  Fix handling of
	V16HImode.  Avoid some SUBREGs in SET_DEST.
	(expand_vec_perm_broadcast_1): Return false for 32-byte integer
	vector modes.
	(expand_vec_perm_vpshufb4_vpermq2): New function.
	(ix86_expand_vec_perm_builtin_1): Call expand_vec_perm_vpermq_perm_1
	and expand_vec_perm_vpshufb4_vpermq2.

From-SVN: r180169
@@ -19663,7 +19663,7 @@ ix86_expand_vec_perm (rtx operands[])
       mask = expand_simple_binop (maskmode, AND, mask, vt,
                                   NULL_RTX, 0, OPTAB_DIRECT);
-      xops[0] = operands[0];
+      xops[0] = gen_lowpart (mode, operands[0]);
       xops[1] = gen_lowpart (mode, t2);
       xops[2] = gen_lowpart (mode, t1);
       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
@@ -35006,8 +35006,7 @@ valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
       return false;
     else
       for (j = 1; j < chunk; ++j)
-        if ((d->perm[i] & (d->nelt - 1)) + j
-            != (d->perm[i + j] & (d->nelt - 1)))
+        if (d->perm[i] + j != d->perm[i + j])
           return false;

   return true;
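
To see what the valid_perm_using_mode_p fix buys, here is a minimal standalone C sketch (not GCC code; valid_perm_at_chunk is an invented name for illustration). The old masking with (nelt - 1) folded second-operand indices (>= nelt) onto first-operand ones, so a chunk mixing the two operands mid-way could be wrongly accepted as a single wide-element move; keeping the high bits rejects it.

#include <stdbool.h>
#include <stdio.h>

/* Can a two-operand permutation over nelt elements be done at a
   granularity of `chunk` adjacent elements?  Indices >= nelt select
   from the second operand and must stay distinct, hence no masking.  */
static bool
valid_perm_at_chunk (const unsigned char *perm, unsigned nelt, unsigned chunk)
{
  unsigned i, j;
  for (i = 0; i < nelt; i += chunk)
    if (perm[i] & (chunk - 1))
      return false;
    else
      for (j = 1; j < chunk; ++j)
        if (perm[i] + j != perm[i + j])  /* no & (nelt - 1) masking */
          return false;
  return true;
}

int
main (void)
{
  /* First 8 bytes come from op0 elements 0-3 and op1 elements 4-7.  The
     old masking folded index 20 to 4 and accepted this as one V2DI-sized
     move, which no wide-element shuffle can actually perform.  */
  unsigned char perm[16] = { 0, 1, 2, 3, 20, 21, 22, 23,
                             8, 9, 10, 11, 28, 29, 30, 31 };
  printf ("chunk 8: %d\n", valid_perm_at_chunk (perm, 16, 8));  /* 0 */
  printf ("chunk 4: %d\n", valid_perm_at_chunk (perm, 16, 4));  /* 1 */
  return 0;
}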
@@ -35138,6 +35137,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
         emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
       else if (vmode == V32QImode)
         emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+      else
+        emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
     }
   else
     {
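
The new else arm emits avx2_permvarv8si (vpermd), which, unlike vpshufb, can move dwords across the 128-bit lane boundary, so a single insn suffices for V8SImode. A rough scalar model of the insn's semantics follows (an illustrative sketch, not the GCC pattern; note that the emitted pattern above places the selector before the source operand):

#include <stdio.h>

/* Each result dword is chosen from anywhere in the source vector by
   the low 3 bits of the corresponding selector dword.  */
static void
vpermd_model (unsigned *dst, const unsigned *sel, const unsigned *src)
{
  for (int i = 0; i < 8; ++i)
    dst[i] = src[sel[i] & 7];
}

int
main (void)
{
  unsigned src[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
  unsigned sel[8] = { 7, 6, 5, 4, 3, 2, 1, 0 };  /* full reverse: cross-lane */
  unsigned dst[8];
  vpermd_model (dst, sel, src);
  for (int i = 0; i < 8; ++i)
    printf ("%u ", dst[i]);  /* 17 16 15 14 13 12 11 10 */
  printf ("\n");
  return 0;
}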
@@ -35163,9 +35164,58 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
   if (d->op0 == d->op1)
     {
       int mask = nelt - 1;
+      bool identity_perm = true;
+      bool broadcast_perm = true;

       for (i = 0; i < nelt; i++)
-        perm2[i] = d->perm[i] & mask;
+        {
+          perm2[i] = d->perm[i] & mask;
+          if (perm2[i] != i)
+            identity_perm = false;
+          if (perm2[i])
+            broadcast_perm = false;
+        }
+
+      if (identity_perm)
+        {
+          if (!d->testing_p)
+            emit_move_insn (d->target, d->op0);
+          return true;
+        }
+      else if (broadcast_perm && TARGET_AVX2)
+        {
+          /* Use vpbroadcast{b,w,d}.  */
+          rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
+          switch (d->vmode)
+            {
+            case V32QImode:
+              op = gen_lowpart (V16QImode, op);
+              gen = gen_avx2_pbroadcastv32qi;
+              break;
+            case V16HImode:
+              op = gen_lowpart (V8HImode, op);
+              gen = gen_avx2_pbroadcastv16hi;
+              break;
+            case V8SImode:
+              op = gen_lowpart (V4SImode, op);
+              gen = gen_avx2_pbroadcastv8si;
+              break;
+            case V16QImode:
+              gen = gen_avx2_pbroadcastv16qi;
+              break;
+            case V8HImode:
+              gen = gen_avx2_pbroadcastv8hi;
+              break;
+            /* For other modes prefer other shuffles this function creates.  */
+            default:
+              break;
+            }
+          if (gen != NULL)
+            {
+              if (!d->testing_p)
+                emit_insn (gen (d->target, op));
+              return true;
+            }
+        }

       if (expand_vselect (d->target, d->op0, perm2, nelt))
         return true;
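
A minimal standalone sketch of the two new fast paths above (names invented for illustration): with d->op0 == d->op1, masked indices 0..nelt-1 in order mean the permutation is an identity and a plain register move suffices, while all-zero masked indices mean a broadcast of element 0, a single AVX2 vpbroadcast{b,w,d}.

#include <stdio.h>

enum perm_kind { PERM_IDENTITY, PERM_BROADCAST, PERM_OTHER };

static enum perm_kind
classify_one_operand_perm (const unsigned char *perm, unsigned nelt)
{
  unsigned mask = nelt - 1;
  int identity = 1, broadcast = 1;
  for (unsigned i = 0; i < nelt; ++i)
    {
      unsigned e = perm[i] & mask;
      if (e != i)
        identity = 0;
      if (e != 0)
        broadcast = 0;
    }
  return identity ? PERM_IDENTITY : broadcast ? PERM_BROADCAST : PERM_OTHER;
}

int
main (void)
{
  unsigned char id[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  unsigned char bc[8] = { 8, 8, 8, 8, 8, 8, 8, 8 };  /* op1 == op0: masks to 0 */
  printf ("%d %d\n", classify_one_operand_perm (id, 8),
          classify_one_operand_perm (bc, 8));  /* 0 1 */
  return 0;
}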
@@ -35349,93 +35399,210 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
 {
   struct expand_vec_perm_d dremap, dfinal;
   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
-  unsigned contents, h1, h2, h3, h4;
+  unsigned HOST_WIDE_INT contents;
   unsigned char remap[2 * MAX_VECT_LEN];
   rtx seq;
-  bool ok;
+  bool ok, same_halves = false;

-  if (d->op0 == d->op1)
-    return false;
-
-  /* The 256-bit unpck[lh]p[sd] instructions only operate within the 128-bit
-     lanes.  We can use similar techniques with the vperm2f128 instruction,
-     but it requires slightly different logic.  */
-  if (GET_MODE_SIZE (d->vmode) != 16)
-    return false;
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      if (d->op0 == d->op1)
+        return false;
+    }
+  else if (GET_MODE_SIZE (d->vmode) == 32)
+    {
+      if (!TARGET_AVX)
+        return false;
+      /* For 32-byte modes allow even d->op0 == d->op1.
+         The lack of cross-lane shuffling in some instructions
+         might prevent a single insn shuffle.  */
+    }
+  else
+    return false;

   /* Examine from whence the elements come.  */
   contents = 0;
   for (i = 0; i < nelt; ++i)
-    contents |= 1u << d->perm[i];
-
-  /* Split the two input vectors into 4 halves.  */
-  h1 = (1u << nelt2) - 1;
-  h2 = h1 << nelt2;
-  h3 = h2 << nelt2;
-  h4 = h3 << nelt2;
+    contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];

   memset (remap, 0xff, sizeof (remap));
   dremap = *d;

-  /* If the elements from the low halves use interleave low, and similarly
-     for interleave high.  If the elements are from mis-matched halves, we
-     can use shufps for V4SF/V4SI or do a DImode shuffle.  */
-  if ((contents & (h1 | h3)) == contents)
-    {
-      for (i = 0; i < nelt2; ++i)
-        {
-          remap[i] = i * 2;
-          remap[i + nelt] = i * 2 + 1;
-          dremap.perm[i * 2] = i;
-          dremap.perm[i * 2 + 1] = i + nelt;
-        }
-    }
-  else if ((contents & (h2 | h4)) == contents)
-    {
-      for (i = 0; i < nelt2; ++i)
-        {
-          remap[i + nelt2] = i * 2;
-          remap[i + nelt + nelt2] = i * 2 + 1;
-          dremap.perm[i * 2] = i + nelt2;
-          dremap.perm[i * 2 + 1] = i + nelt + nelt2;
-        }
-    }
-  else if ((contents & (h1 | h4)) == contents)
-    {
-      for (i = 0; i < nelt2; ++i)
-        {
-          remap[i] = i;
-          remap[i + nelt + nelt2] = i + nelt2;
-          dremap.perm[i] = i;
-          dremap.perm[i + nelt2] = i + nelt + nelt2;
-        }
-      if (nelt != 4)
-        {
-          dremap.vmode = V2DImode;
-          dremap.nelt = 2;
-          dremap.perm[0] = 0;
-          dremap.perm[1] = 3;
-        }
-    }
-  else if ((contents & (h2 | h3)) == contents)
-    {
-      for (i = 0; i < nelt2; ++i)
-        {
-          remap[i + nelt2] = i;
-          remap[i + nelt] = i + nelt2;
-          dremap.perm[i] = i + nelt2;
-          dremap.perm[i + nelt2] = i + nelt;
-        }
-      if (nelt != 4)
-        {
-          dremap.vmode = V2DImode;
-          dremap.nelt = 2;
-          dremap.perm[0] = 1;
-          dremap.perm[1] = 2;
-        }
-    }
-  else
-    return false;
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      unsigned HOST_WIDE_INT h1, h2, h3, h4;
+
+      /* Split the two input vectors into 4 halves.  */
+      h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
+      h2 = h1 << nelt2;
+      h3 = h2 << nelt2;
+      h4 = h3 << nelt2;
+
+      /* If the elements from the low halves use interleave low, and similarly
+         for interleave high.  If the elements are from mis-matched halves, we
+         can use shufps for V4SF/V4SI or do a DImode shuffle.  */
+      if ((contents & (h1 | h3)) == contents)
+        {
+          /* punpckl* */
+          for (i = 0; i < nelt2; ++i)
+            {
+              remap[i] = i * 2;
+              remap[i + nelt] = i * 2 + 1;
+              dremap.perm[i * 2] = i;
+              dremap.perm[i * 2 + 1] = i + nelt;
+            }
+        }
+      else if ((contents & (h2 | h4)) == contents)
+        {
+          /* punpckh* */
+          for (i = 0; i < nelt2; ++i)
+            {
+              remap[i + nelt2] = i * 2;
+              remap[i + nelt + nelt2] = i * 2 + 1;
+              dremap.perm[i * 2] = i + nelt2;
+              dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+            }
+        }
+      else if ((contents & (h1 | h4)) == contents)
+        {
+          /* shufps */
+          for (i = 0; i < nelt2; ++i)
+            {
+              remap[i] = i;
+              remap[i + nelt + nelt2] = i + nelt2;
+              dremap.perm[i] = i;
+              dremap.perm[i + nelt2] = i + nelt + nelt2;
+            }
+          if (nelt != 4)
+            {
+              /* shufpd */
+              dremap.vmode = V2DImode;
+              dremap.nelt = 2;
+              dremap.perm[0] = 0;
+              dremap.perm[1] = 3;
+            }
+        }
+      else if ((contents & (h2 | h3)) == contents)
+        {
+          /* shufps */
+          for (i = 0; i < nelt2; ++i)
+            {
+              remap[i + nelt2] = i;
+              remap[i + nelt] = i + nelt2;
+              dremap.perm[i] = i + nelt2;
+              dremap.perm[i + nelt2] = i + nelt;
+            }
+          if (nelt != 4)
+            {
+              /* shufpd */
+              dremap.vmode = V2DImode;
+              dremap.nelt = 2;
+              dremap.perm[0] = 1;
+              dremap.perm[1] = 2;
+            }
+        }
+      else
+        return false;
+    }
+  else
+    {
+      unsigned int nelt4 = nelt / 4, nzcnt = 0;
+      unsigned HOST_WIDE_INT q[8];
+      unsigned int nonzero_halves[4];
+
+      /* Split the two input vectors into 8 quarters.  */
+      q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
+      for (i = 1; i < 8; ++i)
+        q[i] = q[0] << (nelt4 * i);
+      for (i = 0; i < 4; ++i)
+        if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
+          {
+            nonzero_halves[nzcnt] = i;
+            ++nzcnt;
+          }
+
+      if (nzcnt == 1)
+        {
+          gcc_assert (d->op0 == d->op1);
+          nonzero_halves[1] = nonzero_halves[0];
+          same_halves = true;
+        }
+      else if (d->op0 == d->op1)
+        {
+          gcc_assert (nonzero_halves[0] == 0);
+          gcc_assert (nonzero_halves[1] == 1);
+        }
+
+      if (nzcnt <= 2)
+        {
+          if (d->perm[0] / nelt2 == nonzero_halves[1])
+            {
+              /* Attempt to increase the likelyhood that dfinal
+                 shuffle will be intra-lane.  */
+              char tmph = nonzero_halves[0];
+              nonzero_halves[0] = nonzero_halves[1];
+              nonzero_halves[1] = tmph;
+            }
+
+          /* vperm2f128 or vperm2i128.  */
+          for (i = 0; i < nelt2; ++i)
+            {
+              remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
+              remap[i + nonzero_halves[0] * nelt2] = i;
+              dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
+              dremap.perm[i] = i + nonzero_halves[0] * nelt2;
+            }
+
+          if (d->vmode != V8SFmode
+              && d->vmode != V4DFmode
+              && d->vmode != V8SImode)
+            {
+              dremap.vmode = V8SImode;
+              dremap.nelt = 8;
+              for (i = 0; i < 4; ++i)
+                {
+                  dremap.perm[i] = i + nonzero_halves[0] * 4;
+                  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
+                }
+            }
+        }
+      else if (d->op0 == d->op1)
+        return false;
+      else if (TARGET_AVX2
+               && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
+        {
+          /* vpunpckl* */
+          for (i = 0; i < nelt4; ++i)
+            {
+              remap[i] = i * 2;
+              remap[i + nelt] = i * 2 + 1;
+              remap[i + nelt2] = i * 2 + nelt2;
+              remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
+              dremap.perm[i * 2] = i;
+              dremap.perm[i * 2 + 1] = i + nelt;
+              dremap.perm[i * 2 + nelt2] = i + nelt2;
+              dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
+            }
+        }
+      else if (TARGET_AVX2
+               && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
+        {
+          /* vpunpckh* */
+          for (i = 0; i < nelt4; ++i)
+            {
+              remap[i + nelt4] = i * 2;
+              remap[i + nelt + nelt4] = i * 2 + 1;
+              remap[i + nelt2 + nelt4] = i * 2 + nelt2;
+              remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
+              dremap.perm[i * 2] = i + nelt4;
+              dremap.perm[i * 2 + 1] = i + nelt + nelt4;
+              dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
+              dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
+            }
+        }
+      else
+        return false;
+    }

   /* Use the remapping array set up above to move the elements from their
      swizzled locations into their final destinations.  */
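
A standalone sketch of the new 32-byte analysis (illustrative only, not GCC code): record in a bitmask which source elements the permutation reads, carve the two inputs into 8 quarters (one per 64-bit quarter of each operand for V32QI), and count how many 128-bit halves are touched. Two or fewer halves can be brought together with one vperm2f128/vperm2i128; otherwise the vpunpck-based paths are tried.

#include <stdio.h>

int
main (void)
{
  unsigned nelt = 32, nelt4 = nelt / 4, nzcnt = 0;
  unsigned char perm[32];
  unsigned long long contents = 0, q[8];
  unsigned nonzero_halves[4];
  unsigned i;

  /* Example: bytes 16..31 of op0 followed by bytes 16..31 of op1,
     i.e. the high 128-bit halves of both operands (indices 32..63
     select from op1).  */
  for (i = 0; i < 16; ++i)
    {
      perm[i] = 16 + i;        /* op0 high half */
      perm[i + 16] = 48 + i;   /* op1 high half */
    }

  for (i = 0; i < nelt; ++i)
    contents |= 1ULL << perm[i];

  q[0] = (1ULL << nelt4) - 1;
  for (i = 1; i < 8; ++i)
    q[i] = q[0] << (nelt4 * i);

  for (i = 0; i < 4; ++i)
    if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
      nonzero_halves[nzcnt++] = i;

  /* Two halves touched -> a single vperm2i128 can gather them.  */
  printf ("halves touched: %u (%u and %u)\n", nzcnt,
          nonzero_halves[0], nonzero_halves[1]);  /* 2 (1 and 3) */
  return 0;
}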
@@ -35444,7 +35611,15 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
     {
       unsigned e = remap[d->perm[i]];
       gcc_assert (e < nelt);
-      dfinal.perm[i] = e;
+      /* If same_halves is true, both halves of the remapped vector are the
+         same.  Avoid cross-lane accesses if possible.  */
+      if (same_halves && i >= nelt2)
+        {
+          gcc_assert (e < nelt2);
+          dfinal.perm[i] = e + nelt2;
+        }
+      else
+        dfinal.perm[i] = e;
     }
   dfinal.op0 = gen_reg_rtx (dfinal.vmode);
   dfinal.op1 = dfinal.op0;
@@ -35460,6 +35635,9 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
   if (!ok)
     return false;

+  if (d->testing_p)
+    return true;
+
   if (dremap.vmode != dfinal.vmode)
     {
       dremap.target = gen_lowpart (dremap.vmode, dremap.target);
@@ -35475,6 +35653,83 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
 }

 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+   a single vector cross-lane permutation into vpermq followed
+   by any of the single insn permutations.  */
+
+static bool
+expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
+{
+  struct expand_vec_perm_d dremap, dfinal;
+  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
+  unsigned contents[2];
+  bool ok;
+
+  if (!(TARGET_AVX2
+        && (d->vmode == V32QImode || d->vmode == V16HImode)
+        && d->op0 == d->op1))
+    return false;
+
+  contents[0] = 0;
+  contents[1] = 0;
+  for (i = 0; i < nelt2; ++i)
+    {
+      contents[0] |= 1u << (d->perm[i] / nelt4);
+      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      unsigned int cnt = 0;
+      for (j = 0; j < 4; ++j)
+        if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
+          return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  dremap = *d;
+  dremap.vmode = V4DImode;
+  dremap.nelt = 4;
+  dremap.target = gen_reg_rtx (V4DImode);
+  dremap.op0 = gen_lowpart (V4DImode, d->op0);
+  dremap.op1 = dremap.op0;
+  for (i = 0; i < 2; ++i)
+    {
+      unsigned int cnt = 0;
+      for (j = 0; j < 4; ++j)
+        if ((contents[i] & (1u << j)) != 0)
+          dremap.perm[2 * i + cnt++] = j;
+      for (; cnt < 2; ++cnt)
+        dremap.perm[2 * i + cnt] = 0;
+    }
+
+  dfinal = *d;
+  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
+  dfinal.op1 = dfinal.op0;
+  for (i = 0, j = 0; i < nelt; ++i)
+    {
+      if (i == nelt2)
+        j = 2;
+      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
+      if ((d->perm[i] / nelt4) == dremap.perm[j])
+        ;
+      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
+        dfinal.perm[i] |= nelt4;
+      else
+        gcc_unreachable ();
+    }
+
+  ok = expand_vec_perm_1 (&dremap);
+  gcc_assert (ok);
+
+  ok = expand_vec_perm_1 (&dfinal);
+  gcc_assert (ok);
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
    a two vector permutation using 2 intra-lane interleave insns
    and cross-lane shuffle for 32-byte vectors.  */
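
A standalone sketch of the feasibility test in the new expand_vec_perm_vpermq_perm_1 (illustrative; vpermq_perm_feasible is an invented name): a one-operand V32QI/V16HI cross-lane permutation can be split into vpermq (which rearranges 64-bit quarters) followed by an in-lane shuffle, provided each half of the result draws from at most two of the four source quarters.

#include <stdio.h>

static int
vpermq_perm_feasible (const unsigned char *perm, unsigned nelt)
{
  unsigned nelt2 = nelt / 2, nelt4 = nelt / 4;
  unsigned contents[2] = { 0, 0 };
  unsigned i, j;

  /* Record which 64-bit quarters each half of the result reads.  */
  for (i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (perm[i] / nelt4);
      contents[1] |= 1u << (perm[i + nelt2] / nelt4);
    }
  /* Each half may use at most 2 quarters: that is all one vpermq
     destination lane can hold.  */
  for (i = 0; i < 2; ++i)
    {
      unsigned cnt = 0;
      for (j = 0; j < 4; ++j)
        if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
          return 0;
    }
  return 1;
}

int
main (void)
{
  unsigned char rev[32];
  unsigned i;
  for (i = 0; i < 32; ++i)
    rev[i] = 31 - i;  /* byte reverse: each half uses exactly 2 quarters */
  printf ("%d\n", vpermq_perm_feasible (rev, 32));  /* 1 */
  return 0;
}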
@@ -35621,6 +35876,9 @@ expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
       || (d->vmode != V32QImode && d->vmode != V16HImode))
     return false;

+  if (d->testing_p)
+    return true;
+
   nelt = d->nelt;
   eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
@@ -35635,12 +35893,12 @@ expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
   for (i = 0; i < nelt; ++i)
     {
       unsigned j, e = d->perm[i] & (nelt / 2 - 1);
-      unsigned which = ((d->perm[i] ^ i) & (nelt / 2));
+      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

       for (j = 0; j < eltsz; ++j)
         {
           rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
-          rperm[!which][(i * eltsz + j) ^ (which ^ (nelt / 2))] = m128;
+          rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
         }
     }
@@ -35652,10 +35910,9 @@ expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
   emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

   /* Swap the 128-byte lanes of h into hp.  */
-  hp = gen_reg_rtx (V32QImode);
+  hp = gen_reg_rtx (V4DImode);
   op = gen_lowpart (V4DImode, h);
-  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, hp), op,
-                                  const2_rtx, GEN_INT (3), const0_rtx,
-                                  const1_rtx));
+  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
+                                  const1_rtx));

   vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
@@ -35666,7 +35923,7 @@ expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

   op = gen_lowpart (V32QImode, d->target);
-  emit_insn (gen_iorv32qi3 (op, l, hp));
+  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));

   return true;
 }
@@ -35994,6 +36251,15 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
       gcc_assert (ok);
       return true;

+    case V32QImode:
+    case V16HImode:
+    case V8SImode:
+    case V4DImode:
+      /* For AVX2 broadcasts of the first element vpbroadcast* or
+         vpermq should be used by expand_vec_perm_1.  */
+      gcc_assert (!TARGET_AVX2 || d->perm[0]);
+      return false;
+
     default:
       gcc_unreachable ();
     }
@@ -36018,6 +36284,117 @@ expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
   return expand_vec_perm_broadcast_1 (d);
 }

+/* Implement arbitrary permutation of two V32QImode and V16QImode operands
+   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
+   all the shorter instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
+{
+  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
+  unsigned int i, nelt, eltsz;
+  bool used[4];
+
+  if (!TARGET_AVX2
+      || d->op0 == d->op1
+      || (d->vmode != V32QImode && d->vmode != V16HImode))
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate 4 permutation masks.  If the required element is within
+     the same lane, it is shuffled in.  If the required element from the
+     other lane, force a zero by setting bit 7 in the permutation mask.
+     In the other mask the mask has non-negative elements if element
+     is requested from the other lane, but also moved to the other lane,
+     so that the result of vpshufb can have the two V2TImode halves
+     swapped.  */
+  m128 = GEN_INT (-128);
+  for (i = 0; i < 32; ++i)
+    {
+      rperm[0][i] = m128;
+      rperm[1][i] = m128;
+      rperm[2][i] = m128;
+      rperm[3][i] = m128;
+    }
+  used[0] = false;
+  used[1] = false;
+  used[2] = false;
+  used[3] = false;
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
+      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
+
+      for (j = 0; j < eltsz; ++j)
+        rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
+      used[which] = true;
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      if (!used[2 * i + 1])
+        {
+          h[i] = NULL_RTX;
+          continue;
+        }
+      vperm = gen_rtx_CONST_VECTOR (V32QImode,
+                                    gen_rtvec_v (32, rperm[2 * i + 1]));
+      vperm = force_reg (V32QImode, vperm);
+      h[i] = gen_reg_rtx (V32QImode);
+      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
+    }
+
+  /* Swap the 128-byte lanes of h[X].  */
+  for (i = 0; i < 2; ++i)
+    {
+      if (h[i] == NULL_RTX)
+        continue;
+      op = gen_reg_rtx (V4DImode);
+      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
+                                      const2_rtx, GEN_INT (3), const0_rtx,
+                                      const1_rtx));
+      h[i] = gen_lowpart (V32QImode, op);
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      if (!used[2 * i])
+        {
+          l[i] = NULL_RTX;
+          continue;
+        }
+      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
+      vperm = force_reg (V32QImode, vperm);
+      l[i] = gen_reg_rtx (V32QImode);
+      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      if (h[i] && l[i])
+        {
+          op = gen_reg_rtx (V32QImode);
+          emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
+          l[i] = op;
+        }
+      else if (h[i])
+        l[i] = h[i];
+    }
+
+  gcc_assert (l[0] && l[1]);
+  op = gen_lowpart (V32QImode, d->target);
+  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
+  return true;
+}
+
 /* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
    With all of the interface bits taken care of, perform the expansion
    in D and return true on success.  */
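
A standalone sketch of the mask classification in expand_vec_perm_vpshufb4_vpermq2 (illustrative only): each destination byte lands in one of four vpshufb control masks, keyed by which operand it reads (index bit nelt) and by whether the source and destination 128-bit lanes differ (the "xlane" masks are the ones whose vpshufb result gets its lanes swapped by vpermq afterwards).

#include <stdio.h>

int
main (void)
{
  unsigned nelt = 32, eltsz = 1;  /* V32QImode */
  unsigned counts[4] = { 0, 0, 0, 0 };
  unsigned char perm[32];
  unsigned i;

  /* Example: interleave the low 16 bytes of op0 and op1
     (indices >= 32 select from op1).  */
  for (i = 0; i < 32; ++i)
    perm[i] = (i % 2) ? 32 + i / 2 : i / 2;

  for (i = 0; i < nelt; ++i)
    {
      unsigned xlane = ((perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned which = ((perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
      ++counts[which];
    }
  /* mask 0: op0 in-lane, 1: op0 cross-lane,
     mask 2: op1 in-lane, 3: op1 cross-lane.  */
  for (i = 0; i < 4; ++i)
    printf ("mask %u: %u bytes\n", i, counts[i]);  /* 8 bytes each */
  return 0;
}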
@@ -36043,6 +36420,9 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
       if (expand_vec_perm_broadcast (d))
         return true;

+      if (expand_vec_perm_vpermq_perm_1 (d))
+        return true;
+
       /* Try sequences of three instructions.  */

       if (expand_vec_perm_pshufb2 (d))
@@ -36072,6 +36452,10 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_even_odd (d))
     return true;

+  /* Even longer sequences.  */
+  if (expand_vec_perm_vpshufb4_vpermq2 (d))
+    return true;
+
   return false;
 }