Commit dae00b16 by Richard Henderson Committed by Richard Henderson

i386: Rewrite ix86_expand_vshuffle.

1: Handle TARGET_XOP.
2: Reduce code duplication.
3: Use ASHIFT instead of MULT for scaling.
4: Fix errors in building convert-to-v16qi indices.
5: Handle v2di without sse4.1.

From-SVN: r179564
parent 067f5960
......@@ -6,6 +6,12 @@
code duplication. Do update_stmt here ...
(expand_vector_operations_1): ... not here.
* config/i386/i386.c (ix86_expand_vshuffle): Never fail. Handle
TARGET_XOP. Fix pshufb constant vector creation. Reduce code
duplication. Handle V2DI without SSE4.1.
* config/i386/i386-protos.h (ix86_expand_vshuffle): Update decl.
* config/i386/i386.md (vshuffle<V_128>): Remove assert for ok.
2011-10-05 Uros Bizjak <ubizjak@gmail.com>
* config/i386/i386.c (distance_non_agu_define): Simplify calculation
......@@ -123,7 +123,7 @@ extern bool ix86_expand_int_movcc (rtx[]);
extern bool ix86_expand_fp_movcc (rtx[]);
extern bool ix86_expand_fp_vcond (rtx[]);
extern bool ix86_expand_int_vcond (rtx[]);
extern bool ix86_expand_vshuffle (rtx[]);
extern void ix86_expand_vshuffle (rtx[]);
extern void ix86_expand_sse_unpack (rtx[], bool, bool);
extern bool ix86_expand_int_addcc (rtx[]);
extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool);
......
......@@ -19237,145 +19237,139 @@ ix86_expand_int_vcond (rtx operands[])
return true;
}
bool
void
ix86_expand_vshuffle (rtx operands[])
{
rtx target = operands[0];
rtx op0 = operands[1];
rtx op1 = operands[2];
rtx mask = operands[3];
rtx new_mask, vt, t1, t2, w_vector;
rtx vt, vec[16];
enum machine_mode mode = GET_MODE (op0);
enum machine_mode maskmode = GET_MODE (mask);
enum machine_mode maskinner = GET_MODE_INNER (mode);
rtx vec[16];
int w, i, j;
bool one_operand_shuffle = op0 == op1;
int w, e, i;
bool one_operand_shuffle = rtx_equal_p (op0, op1);
gcc_assert ((TARGET_SSSE3 || TARGET_AVX) && GET_MODE_BITSIZE (mode) == 128);
gcc_checking_assert (GET_MODE_BITSIZE (mode) == 128);
/* Number of elements in the vector. */
w = GET_MODE_BITSIZE (maskmode) / GET_MODE_BITSIZE (maskinner);
/* generate w_vector = {w, w, ...} */
for (i = 0; i < w; i++)
vec[i] = GEN_INT (w);
w_vector = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
/* mask = mask & {w-1, w-1, w-1,...} */
for (i = 0; i < w; i++)
vec[i] = GEN_INT (w - 1);
w = GET_MODE_NUNITS (mode);
e = GET_MODE_UNIT_SIZE (mode);
vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
new_mask = expand_simple_binop (maskmode, AND, mask, vt,
NULL_RTX, 0, OPTAB_DIRECT);
/* If the original vector mode is V16QImode, we can just
use pshufb directly. */
if (mode == V16QImode && one_operand_shuffle)
if (TARGET_XOP)
{
t1 = gen_reg_rtx (V16QImode);
emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask));
emit_insn (gen_rtx_SET (VOIDmode, target, t1));
return true;
/* The XOP VPPERM insn supports three inputs. By ignoring the
one_operand_shuffle special case, we avoid creating another
set of constant vectors in memory. */
one_operand_shuffle = false;
/* mask = mask & {2*w-1, ...} */
vt = GEN_INT (2*w - 1);
}
else if (mode == V16QImode)
else
{
rtx xops[6];
t1 = gen_reg_rtx (V16QImode);
t2 = gen_reg_rtx (V16QImode);
emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask));
emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, new_mask));
/* mask = mask & {w, w, ...} */
mask = expand_simple_binop (V16QImode, AND, mask, w_vector,
NULL_RTX, 0, OPTAB_DIRECT);
xops[0] = target;
xops[1] = operands[1];
xops[2] = operands[2];
xops[3] = gen_rtx_EQ (mode, mask, w_vector);
xops[4] = t1;
xops[5] = t2;
return ix86_expand_int_vcond (xops);
/* mask = mask & {w-1, ...} */
vt = GEN_INT (w - 1);
}
/* mask = mask * {w, w, ...} */
new_mask = expand_simple_binop (maskmode, MULT, new_mask, w_vector,
NULL_RTX, 0, OPTAB_DIRECT);
/* Convert mask to vector of chars. */
new_mask = simplify_gen_subreg (V16QImode, new_mask, maskmode, 0);
new_mask = force_reg (V16QImode, new_mask);
/* Build a helper mask which we will use in pshufb
(v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
(v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}
... */
for (i = 0; i < w; i++)
for (j = 0; j < 16/w; j++)
vec[i*w+j] = GEN_INT (i*16/w);
vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
vt = force_reg (V16QImode, vt);
t1 = gen_reg_rtx (V16QImode);
emit_insn (gen_ssse3_pshufbv16qi3 (t1, new_mask, vt));
new_mask = t1;
/* Convert it into the byte positions by doing
new_mask = new_mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
for (i = 0; i < w; i++)
for (j = 0; j < 16/w; j++)
vec[i*w+j] = GEN_INT (j);
vec[i] = vt;
vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
mask = expand_simple_binop (maskmode, AND, mask, vt,
NULL_RTX, 0, OPTAB_DIRECT);
vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
new_mask = expand_simple_binop (V16QImode, PLUS, new_mask, vt,
/* For non-QImode operations, convert the word permutation control
into a byte permutation control. */
if (mode != V16QImode)
{
mask = expand_simple_binop (maskmode, ASHIFT, mask,
GEN_INT (exact_log2 (e)),
NULL_RTX, 0, OPTAB_DIRECT);
t1 = gen_reg_rtx (V16QImode);
/* Convert mask to vector of chars. */
mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
/* Replicate each of the input bytes into byte positions:
(v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
(v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
(v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
for (i = 0; i < 16; ++i)
vec[i] = GEN_INT (i/e * e);
vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
vt = force_const_mem (V16QImode, vt);
if (TARGET_XOP)
emit_insn (gen_xop_pperm (mask, mask, mask, vt));
else
emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
/* Convert OP0 to vector of chars. */
op0 = simplify_gen_subreg (V16QImode, op0, mode, 0);
op0 = force_reg (V16QImode, op0);
emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask));
/* Convert it into the byte positions by doing
mask = mask + {0,1,..,16/w-1, 0,1,..,16/w-1, ...} */
for (i = 0; i < 16; ++i)
vec[i] = GEN_INT (i % e);
vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
vt = force_const_mem (V16QImode, vt);
emit_insn (gen_addv16qi3 (mask, mask, vt));
}
if (one_operand_shuffle)
/* The actual shuffle operations all operate on V16QImode. */
op0 = gen_lowpart (V16QImode, op0);
op1 = gen_lowpart (V16QImode, op1);
target = gen_lowpart (V16QImode, target);
if (TARGET_XOP)
{
/* Convert it back from vector of chars to the original mode. */
t1 = simplify_gen_subreg (mode, t1, V16QImode, 0);
emit_insn (gen_rtx_SET (VOIDmode, target, t1));
return true;
emit_insn (gen_xop_pperm (target, op0, op1, mask));
}
else if (one_operand_shuffle)
{
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
}
else
{
rtx xops[6];
rtx xops[6], t1, t2;
bool ok;
/* Shuffle the two input vectors independently. */
t1 = gen_reg_rtx (V16QImode);
t2 = gen_reg_rtx (V16QImode);
/* Convert OP1 to vector of chars. */
op1 = simplify_gen_subreg (V16QImode, op1, mode, 0);
op1 = force_reg (V16QImode, op1);
emit_insn (gen_ssse3_pshufbv16qi3 (t1, op1, new_mask));
/* mask = mask & {w, w, ...} */
mask = expand_simple_binop (V16QImode, AND, mask, w_vector,
emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
/* Then merge them together. The key is whether any given control
element contained a bit set that indicates the second word. */
mask = operands[3];
vt = GEN_INT (w);
if (maskmode == V2DImode && !TARGET_SSE4_1)
{
/* Without SSE4.1, we don't have V2DImode EQ. Perform one
more shuffle to convert the V2DI input mask into a V4SI
input mask. At which point the masking that expand_int_vcond
does will work as desired. */
rtx t3 = gen_reg_rtx (V4SImode);
emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
const0_rtx, const0_rtx,
const2_rtx, const2_rtx));
mask = t3;
maskmode = V4SImode;
e = w = 4;
}
for (i = 0; i < w; i++)
vec[i] = vt;
vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
vt = force_reg (maskmode, vt);
mask = expand_simple_binop (maskmode, AND, mask, vt,
NULL_RTX, 0, OPTAB_DIRECT);
t1 = simplify_gen_subreg (mode, t1, V16QImode, 0);
t2 = simplify_gen_subreg (mode, t2, V16QImode, 0);
xops[0] = target;
xops[1] = operands[1];
xops[2] = operands[2];
xops[3] = gen_rtx_EQ (mode, mask, w_vector);
xops[4] = t1;
xops[5] = t2;
return ix86_expand_int_vcond (xops);
xops[0] = gen_lowpart (maskmode, operands[0]);
xops[1] = gen_lowpart (maskmode, t2);
xops[2] = gen_lowpart (maskmode, t1);
xops[3] = gen_rtx_EQ (maskmode, mask, vt);
xops[4] = mask;
xops[5] = vt;
ok = ix86_expand_int_vcond (xops);
gcc_assert (ok);
}
return false;
}
/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
......@@ -6229,12 +6229,10 @@
(match_operand:<sseshuffint> 3 "register_operand" "")]
"TARGET_SSSE3 || TARGET_AVX"
{
bool ok = ix86_expand_vshuffle (operands);
gcc_assert (ok);
ix86_expand_vshuffle (operands);
DONE;
})
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Parallel bitwise logical operations
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment