Commit 2d542a9f authored and committed by Richard Henderson

re PR target/53749 (ice in expand_shift_1)

PR target/53749
        * config/i386/i386.c (ix86_rtx_costs): Add reasonable costs for
        V*QImode shifts and multiply.
        (ix86_expand_vecop_qihi): Support shifts.
        * config/i386/i386.md (any_shift): New code iterator.
        * config/i386/sse.md (ashlv16qi3): Merge ...
        (<any_shiftrt>v16qi3): ... into ...
        (<any_shift><VI1_AVX2>3): ... here.  Use ix86_expand_vecop_qihi
        to support SSE and AVX.

From-SVN: r188909
parent 7b532118

2012-06-23  Richard Henderson  <rth@redhat.com>

	PR target/53749
	* config/i386/i386.c (ix86_rtx_costs): Add reasonable costs for
	V*QImode shifts and multiply.
	(ix86_expand_vecop_qihi): Support shifts.
	* config/i386/i386.md (any_shift): New code iterator.
	* config/i386/sse.md (ashlv16qi3): Merge ...
	(<any_shiftrt>v16qi3): ... into ...
	(<any_shift><VI1_AVX2>3): ... here.  Use ix86_expand_vecop_qihi
	to support SSE and AVX.
	* config/i386/i386.c (ix86_expand_sse_unpack): Split operands[]
	parameter into src and dest.
	* config/i386/sse.md (vec_unpacku_hi_<V124_AVX2>): Update call.
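
The underlying problem: GNU C vector extensions permit whole-vector shifts of char vectors, but before this commit the i386 backend offered no V16QImode shift expander, so the middle end ICEd in expand_shift_1. A hypothetical reproducer along those lines (the actual testcase attached to the PR is not shown here):

typedef unsigned char v16qi __attribute__ ((vector_size (16)));

/* Previously ICEd with -msse2; after this commit it goes through the
   new <shift_insn><mode>3 expander (one XOP insn with -mxop, otherwise
   the word-mode emulation added below).  */
v16qi
shl3 (v16qi x)
{
  return x << 3;
}
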
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -31938,9 +31938,10 @@ ix86_set_reg_reg_cost (enum machine_mode mode)
    scanned.  In either case, *TOTAL contains the cost result.  */
 
 static bool
-ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
+ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
 		bool speed)
 {
+  enum rtx_code code = (enum rtx_code) code_i;
   enum rtx_code outer_code = (enum rtx_code) outer_code_i;
   enum machine_mode mode = GET_MODE (x);
   const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
@@ -32045,7 +32046,31 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
       /* ??? Should be SSE vector operation cost.  */
       /* At least for published AMD latencies, this really is the same
          as the latency for a simple fpu operation like fabs.  */
-      *total = cost->fabs;
+      /* V*QImode is emulated with 1-11 insns.  */
+      if (mode == V16QImode || mode == V32QImode)
+        {
+          int count;
+          if (TARGET_XOP && mode == V16QImode)
+            {
+              /* For XOP we use vpshab, which requires a broadcast of the
+                 value to the variable shift insn.  For constants this
+                 means a V16Q const in mem; even when we can perform the
+                 shift with one insn set the cost to prefer paddb.  */
+              if (CONSTANT_P (XEXP (x, 1)))
+                {
+                  *total = (cost->fabs
+                            + rtx_cost (XEXP (x, 0), code, 0, speed)
+                            + (speed ? 2 : COSTS_N_BYTES (16)));
+                  return true;
+                }
+              count = 3;
+            }
+          else
+            count = TARGET_SSSE3 ? 7 : 11;
+          *total = cost->fabs * count;
+        }
+      else
+        *total = cost->fabs;
       return false;
     }
 
   if (GET_MODE_SIZE (mode) < UNITS_PER_WORD)
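
The "prefer paddb" remark above refers to shifts that have a cheap non-XOP expansion: a left shift by 1, for instance, is a byte-wise add of the operand to itself and needs no 16-byte count vector in memory. A one-line illustration with SSE2 intrinsics (a sketch, not GCC's output):

#include <emmintrin.h>

/* V16QI << 1 as paddb: no broadcast shift-count constant required.  */
__m128i
shlv16qi_by1 (__m128i x)
{
  return _mm_add_epi8 (x, x);
}
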
@@ -32119,9 +32144,15 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
 	}
       else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
 	{
+	  /* V*QImode is emulated with 7-13 insns.  */
+	  if (mode == V16QImode || mode == V32QImode)
+	    {
+	      int extra = TARGET_XOP ? 5 : TARGET_SSSE3 ? 6 : 11;
+	      *total = cost->fmul * 2 + cost->fabs * extra;
+	    }
 	  /* Without sse4.1, we don't have PMULLD; it's emulated with 7
 	     insns, including two PMULUDQ.  */
-	  if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
+	  else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
 	    *total = cost->fmul * 2 + cost->fabs * 5;
 	  else
 	    *total = cost->fmul;
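
The 7-13 insn estimate prices the unpack/operate/repack scheme that ix86_expand_vecop_qihi emits (see the next hunk). As a rough illustration only, here is a hand-written SSE2 intrinsics equivalent for V16QImode multiply; this sketches the technique, it is not the code GCC generates:

#include <emmintrin.h>

/* Interleaving a vector with itself puts each byte in the low byte of
   a word (the high byte is a don't-care copy); the word products then
   carry the correct result in their low bytes.  GCC gathers the even
   bytes with a vec_perm; the mask+pack here merely stands in for that
   permute.  */
__m128i
mulv16qi (__m128i a, __m128i b)
{
  __m128i a_lo = _mm_unpacklo_epi8 (a, a);
  __m128i a_hi = _mm_unpackhi_epi8 (a, a);
  __m128i b_lo = _mm_unpacklo_epi8 (b, b);
  __m128i b_hi = _mm_unpackhi_epi8 (b, b);
  __m128i p_lo = _mm_mullo_epi16 (a_lo, b_lo);
  __m128i p_hi = _mm_mullo_epi16 (a_hi, b_hi);
  __m128i mask = _mm_set1_epi16 (0x00ff);
  return _mm_packus_epi16 (_mm_and_si128 (p_lo, mask),
                           _mm_and_si128 (p_hi, mask));
}
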
@@ -38448,44 +38479,66 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   rtx (*gen_ih) (rtx, rtx, rtx);
   rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
   struct expand_vec_perm_d d;
-  bool ok;
+  bool ok, full_interleave;
+  bool uns_p = false;
   int i;
 
-  if (qimode == V16QImode)
+  switch (qimode)
     {
+    case V16QImode:
       himode = V8HImode;
       gen_il = gen_vec_interleave_lowv16qi;
       gen_ih = gen_vec_interleave_highv16qi;
-    }
-  else if (qimode == V32QImode)
-    {
+      break;
+    case V32QImode:
       himode = V16HImode;
       gen_il = gen_avx2_interleave_lowv32qi;
       gen_ih = gen_avx2_interleave_highv32qi;
+      break;
+    default:
+      gcc_unreachable ();
     }
-  else
-    gcc_unreachable ();
 
-  /* Unpack data such that we've got a source byte in each low byte of
-     each word.  We don't care what goes into the high byte of each word.
-     Rather than trying to get zero in there, most convenient is to let
-     it be a copy of the low byte.  */
-  op1_l = gen_reg_rtx (qimode);
-  op1_h = gen_reg_rtx (qimode);
-  emit_insn (gen_il (op1_l, op1, op1));
-  emit_insn (gen_ih (op1_h, op1, op1));
-
-  op2_l = gen_reg_rtx (qimode);
-  op2_h = gen_reg_rtx (qimode);
-  emit_insn (gen_il (op2_l, op2, op2));
-  emit_insn (gen_ih (op2_h, op2, op2));
+  op2_l = op2_h = op2;
+  switch (code)
+    {
+    case MULT:
+      /* Unpack data such that we've got a source byte in each low byte of
+         each word.  We don't care what goes into the high byte of each word.
+         Rather than trying to get zero in there, most convenient is to let
+         it be a copy of the low byte.  */
+      op2_l = gen_reg_rtx (qimode);
+      op2_h = gen_reg_rtx (qimode);
+      emit_insn (gen_il (op2_l, op2, op2));
+      emit_insn (gen_ih (op2_h, op2, op2));
+      /* FALLTHRU */
+
+      op1_l = gen_reg_rtx (qimode);
+      op1_h = gen_reg_rtx (qimode);
+      emit_insn (gen_il (op1_l, op1, op1));
+      emit_insn (gen_ih (op1_h, op1, op1));
+      full_interleave = qimode == V16QImode;
+      break;
+
+    case ASHIFT:
+    case LSHIFTRT:
+      uns_p = true;
+      /* FALLTHRU */
+    case ASHIFTRT:
+      op1_l = gen_reg_rtx (himode);
+      op1_h = gen_reg_rtx (himode);
+      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
+      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
+      full_interleave = true;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
 
   /* Perform the operation.  */
-  res_l = expand_simple_binop (himode, code, gen_lowpart (himode, op1_l),
-                               gen_lowpart (himode, op2_l), NULL_RTX,
+  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
                                1, OPTAB_DIRECT);
-  res_h = expand_simple_binop (himode, code, gen_lowpart (himode, op1_h),
-                               gen_lowpart (himode, op2_h), NULL_RTX,
+  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
                                1, OPTAB_DIRECT);
   gcc_assert (res_l && res_h);
@@ -38498,11 +38551,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   d.one_operand_p = false;
   d.testing_p = false;
 
-  if (qimode == V16QImode)
+  if (full_interleave)
     {
       /* For SSE2, we used an full interleave, so the desired
          results are in the even elements.  */
-      for (i = 0; i < 16; ++i)
+      for (i = 0; i < 32; ++i)
         d.perm[i] = i * 2;
     }
   else
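
For the shift path just added, op1 is widened with sign or zero extension (ix86_expand_sse_unpack) rather than by self-interleave, since the high byte of each word now participates in the result. A minimal intrinsics sketch of the same shape, assuming SSE2 (GCC emits the unpack and the final even-byte vec_perm itself; the mask+pack below only stands in for that permute):

#include <emmintrin.h>

/* Emulated V16QImode arithmetic right shift: sign-extend bytes to
   words, shift the words, keep the low byte of each word.  */
__m128i
ashrv16qi (__m128i x, int count)
{
  __m128i sign = _mm_cmpgt_epi8 (_mm_setzero_si128 (), x);
  __m128i lo = _mm_srai_epi16 (_mm_unpacklo_epi8 (x, sign), count);
  __m128i hi = _mm_srai_epi16 (_mm_unpackhi_epi8 (x, sign), count);
  __m128i mask = _mm_set1_epi16 (0x00ff);
  return _mm_packus_epi16 (_mm_and_si128 (lo, mask),
                           _mm_and_si128 (hi, mask));
}

Logical and left shifts follow the same pattern with zero extension and psrlw/psllw, which is why ASHIFT and LSHIFTRT set uns_p above.
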
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -711,6 +711,9 @@
 ;; Mapping of shift-right operators
 (define_code_iterator any_shiftrt [lshiftrt ashiftrt])
 
+;; Mapping of all shift operators
+(define_code_iterator any_shift [ashift lshiftrt ashiftrt])
+
 ;; Base name for define_insn
 (define_code_attr shift_insn
   [(ashift "ashl") (lshiftrt "lshr") (ashiftrt "ashr")])
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -10550,60 +10550,42 @@
(set_attr "prefix_extra" "2")
(set_attr "mode" "TI")])
;; SSE2 doesn't have some shift variants, so define versions for XOP
(define_expand "ashlv16qi3"
[(set (match_operand:V16QI 0 "register_operand")
(ashift:V16QI
(match_operand:V16QI 1 "register_operand")
(match_operand:SI 2 "nonmemory_operand")))]
"TARGET_XOP"
{
rtx reg = gen_reg_rtx (V16QImode);
rtx par;
int i;
par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
for (i = 0; i < 16; i++)
XVECEXP (par, 0, i) = operands[2];
emit_insn (gen_vec_initv16qi (reg, par));
emit_insn (gen_xop_shav16qi3 (operands[0], operands[1], reg));
DONE;
})
(define_expand "<shift_insn>v16qi3"
[(set (match_operand:V16QI 0 "register_operand")
(any_shiftrt:V16QI
(match_operand:V16QI 1 "register_operand")
(define_expand "<shift_insn><mode>3"
[(set (match_operand:VI1_AVX2 0 "register_operand")
(any_shift:VI1_AVX2
(match_operand:VI1_AVX2 1 "register_operand")
(match_operand:SI 2 "nonmemory_operand")))]
"TARGET_XOP"
"TARGET_SSE2"
{
rtx reg = gen_reg_rtx (V16QImode);
rtx par;
bool negate = false;
rtx (*shift_insn)(rtx, rtx, rtx);
int i;
if (CONST_INT_P (operands[2]))
operands[2] = GEN_INT (-INTVAL (operands[2]));
else
negate = true;
if (TARGET_XOP && <MODE>mode == V16QImode)
{
bool negate = false;
rtx (*gen) (rtx, rtx, rtx);
rtx tmp, par;
int i;
par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
for (i = 0; i < 16; i++)
XVECEXP (par, 0, i) = operands[2];
if (<CODE> != ASHIFT)
{
if (CONST_INT_P (operands[2]))
operands[2] = GEN_INT (-INTVAL (operands[2]));
else
negate = true;
}
par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
for (i = 0; i < 16; i++)
XVECEXP (par, 0, i) = operands[2];
emit_insn (gen_vec_initv16qi (reg, par));
tmp = gen_reg_rtx (V16QImode);
emit_insn (gen_vec_initv16qi (tmp, par));
if (negate)
emit_insn (gen_negv16qi2 (reg, reg));
if (negate)
emit_insn (gen_negv16qi2 (tmp, tmp));
if (<CODE> == LSHIFTRT)
shift_insn = gen_xop_shlv16qi3;
gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
emit_insn (gen (operands[0], operands[1], tmp));
}
else
shift_insn = gen_xop_shav16qi3;
emit_insn (shift_insn (operands[0], operands[1], reg));
ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
DONE;
})
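
The XOP fast path relies on vpshlb/vpshab taking a per-byte shift count whose sign selects the direction: positive counts shift left, negative counts shift right. That is why the expander negates operands[2] for the two right-shift codes before broadcasting it. A sketch with the corresponding intrinsics (assumes -mxop; not code from the commit):

#include <x86intrin.h>

/* vpshab (_mm_sha_epi8) shifts each byte left for a positive count
   and right for a negative one, so an arithmetic right shift by N is
   a vpshab by -N broadcast to all sixteen byte lanes.  */
__m128i
ashrv16qi_xop (__m128i x, int count)
{
  __m128i neg = _mm_set1_epi8 ((char) -count);
  return _mm_sha_epi8 (x, neg);
}
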