Commit ac357108 authored by Richard Henderson, committed by Richard Henderson

i386: Cleanup and unify widening multiply patterns

Prepares for exposing builtin_mul_widen_even/odd hooks
for more efficient reduction.  Adds QImode multiplication.
Shares code between mulv4si3 and the widening multiplies.
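
A scalar sketch of the even/odd widening multiply these hooks expose, assuming V4SImode inputs producing V2DImode results (illustration only, not code from the patch):

#include <stdint.h>

/* Even case: multiply elements 0 and 2, widening to 64 bits.  This is
   what pmuludq (and the SSE4.1 pmuldq signed variant) compute.  */
static void
mul_widen_even (uint64_t dest[2], const uint32_t a[4], const uint32_t b[4])
{
  dest[0] = (uint64_t) a[0] * b[0];
  dest[1] = (uint64_t) a[2] * b[2];
}

/* Odd case: multiply elements 1 and 3.  The expanders synthesize this
   by first moving the odd elements down into the even slots.  */
static void
mul_widen_odd (uint64_t dest[2], const uint32_t a[4], const uint32_t b[4])
{
  dest[0] = (uint64_t) a[1] * b[1];
  dest[1] = (uint64_t) a[3] * b[3];
}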

From-SVN: r188957
parent f008d5dc
2012-06-25 Richard Henderson <rth@redhat.com>
* config/i386/i386.c (ix86_rtx_costs) [MULT]: Only apply XOP cost
to V16QImode.
(ix86_expand_vec_interleave): New.
(ix86_expand_mul_widen_evenodd): New.
(ix86_expand_mul_widen_hilo): New.
(ix86_expand_sse2_mulv4si3): Use ix86_expand_mul_widen_evenodd.
* config/i386/i386.md (u_bool): New code attr.
* config/i386/predicates.md
(nonimmediate_or_const_vector_operand): Remove.
* config/i386/sse.md (mul<VI4_AVX2>3): Don't use it; don't test
both AVX and SSE4_1.
(vec_widen_<s>mult_hi_<VI2_AVX2>): Remove.
(vec_widen_<s>mult_lo_<VI2_AVX2>): Remove.
(vec_widen_<s>mult_hi_v8si): Remove.
(vec_widen_<s>mult_lo_v8si): Remove.
(vec_widen_smult_hi_v4si): Remove.
(vec_widen_smult_lo_v4si): Remove.
(vec_widen_umult_hi_v4si): Remove.
(vec_widen_umult_lo_v4si): Remove.
(vec_widen_<s>mult_hi_<VI124_AVX2>): New.
(vec_widen_<s>mult_lo_<VI124_AVX2>): New.
* config/i386/i386-protos.h: Update.
2012-06-25 Christophe Lyon <christophe.lyon@st.com>
* config/arm/neon.md (UNSPEC_VLD1_DUP): Remove.
......
/* Definitions of target machine for GCC for IA-32.
Copyright (C) 1988, 1992, 1994, 1995, 1996, 1996, 1997, 1998, 1999,
2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
Free Software Foundation, Inc.
This file is part of GCC.
......@@ -224,6 +224,8 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
extern bool ix86_expand_pinsr (rtx *);
extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool);
extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
/* In i386-c.c */
......
......@@ -32101,7 +32101,7 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
/* V*QImode is emulated with 1-11 insns. */
if (mode == V16QImode || mode == V32QImode)
{
int count;
int count = 11;
if (TARGET_XOP && mode == V16QImode)
{
/* For XOP we use vpshab, which requires a broadcast of the
......@@ -32117,8 +32117,8 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
}
count = 3;
}
else
count = TARGET_SSSE3 ? 7 : 11;
else if (TARGET_SSSE3)
count = 7;
*total = cost->fabs * count;
}
else
......@@ -32199,7 +32199,11 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
/* V*QImode is emulated with 7-13 insns. */
if (mode == V16QImode || mode == V32QImode)
{
int extra = TARGET_XOP ? 5 : TARGET_SSSE3 ? 6 : 11;
int extra = 11;
if (TARGET_XOP && mode == V16QImode)
extra = 5;
else if (TARGET_SSSE3)
extra = 6;
*total = cost->fmul * 2 + cost->fabs * extra;
}
/* Without sse4.1, we don't have PMULLD; it's emulated with 7
......@@ -38519,6 +38523,34 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
expand_vec_perm_even_odd_1 (&d, odd);
}
static void
ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
{
struct expand_vec_perm_d d;
unsigned i, nelt, base;
bool ok;
d.target = targ;
d.op0 = op0;
d.op1 = op1;
d.vmode = GET_MODE (targ);
d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
d.one_operand_p = false;
d.testing_p = false;
base = high_p ? nelt / 2 : 0;
for (i = 0; i < nelt / 2; ++i)
{
d.perm[i * 2] = i + base;
d.perm[i * 2 + 1] = i + base + nelt;
}
/* Note that for AVX this isn't one instruction. */
ok = ix86_expand_vec_perm_const_1 (&d);
gcc_assert (ok);
}
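
For example, with nelt == 4 the loop above produces the selector { 0, 4, 1, 5 } when high_p is false and { 2, 6, 3, 7 } when high_p is true, i.e. the punpckldq and punpckhdq element orders. A minimal stand-alone sketch of that computation (hypothetical helper, not part of the patch):

#include <stdio.h>

/* Stand-alone copy of the selector computation, to show the orders.  */
static void
build_interleave_perm (unsigned perm[], unsigned nelt, int high_p)
{
  unsigned i, base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      perm[i * 2] = i + base;            /* element taken from op0 */
      perm[i * 2 + 1] = i + base + nelt; /* element taken from op1 */
    }
}

int
main (void)
{
  unsigned lo[4], hi[4], i;
  build_interleave_perm (lo, 4, 0);   /* -> { 0, 4, 1, 5 } (punpckldq) */
  build_interleave_perm (hi, 4, 1);   /* -> { 2, 6, 3, 7 } (punpckhdq) */
  for (i = 0; i < 4; ++i)
    printf ("%u %u\n", lo[i], hi[i]);
  return 0;
}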
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
......@@ -38627,59 +38659,148 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
}
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
bool uns_p, bool odd_p)
{
rtx op1_m1, op1_m2;
rtx op2_m1, op2_m2;
rtx res_1, res_2;
enum machine_mode mode = GET_MODE (op1);
rtx x;
/* Shift both input vectors down one element, so that elements 3
and 1 are now in the slots for elements 2 and 0. For K8, at
least, this is faster than using a shuffle. */
op1_m1 = op1 = force_reg (V4SImode, op1);
op1_m2 = gen_reg_rtx (V4SImode);
emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
gen_lowpart (V1TImode, op1),
GEN_INT (32)));
/* We only play even/odd games with vectors of SImode. */
gcc_assert (mode == V4SImode || mode == V8SImode);
if (GET_CODE (op2) == CONST_VECTOR)
/* If we're looking for the odd results, shift those members down to
the even slots. For some cpus this is faster than a PSHUFD. */
if (odd_p)
{
rtvec v;
enum machine_mode wmode = GET_MODE (dest);
/* Constant propagate the vector shift, leaving the dont-care
vector elements as zero. */
v = rtvec_alloc (4);
RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
RTVEC_ELT (v, 1) = const0_rtx;
RTVEC_ELT (v, 3) = const0_rtx;
op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
op2_m1 = force_reg (V4SImode, op2_m1);
v = rtvec_alloc (4);
RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
RTVEC_ELT (v, 1) = const0_rtx;
RTVEC_ELT (v, 3) = const0_rtx;
op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
op2_m2 = force_reg (V4SImode, op2_m2);
op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
1, OPTAB_DIRECT);
op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
1, OPTAB_DIRECT);
op1 = gen_lowpart (mode, op1);
op2 = gen_lowpart (mode, op2);
}
if (mode == V8SImode)
{
if (uns_p)
x = gen_avx2_umulv4siv4di3 (dest, op1, op2);
else
x = gen_avx2_mulv4siv4di3 (dest, op1, op2);
}
else if (uns_p)
x = gen_sse2_umulv2siv2di3 (dest, op1, op2);
else if (TARGET_SSE4_1)
x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
else if (TARGET_XOP)
{
x = force_reg (V2DImode, CONST0_RTX (V2DImode));
x = gen_xop_pmacsdql (dest, op1, op2, x);
}
else
gcc_unreachable ();
emit_insn (x);
}
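
A worked example of the odd_p shift above, assuming V4SImode operands viewed as V2DImode lanes, where GET_MODE_UNIT_BITSIZE is 32 (sketch only):

#include <stdint.h>
#include <stdio.h>

/* One 64-bit lane of a V4SImode register viewed in V2DImode: the low
   32 bits hold the even element, the high 32 bits the odd element.
   A logical right shift by 32 leaves the odd element in the even slot
   with zeros above it, ready for the even-style multiply.  */
static uint64_t
shift_odd_down (uint32_t even_elt, uint32_t odd_elt)
{
  uint64_t lane = ((uint64_t) odd_elt << 32) | even_elt;
  return lane >> 32;
}

int
main (void)
{
  printf ("%llu\n", (unsigned long long) shift_odd_down (7, 9)); /* prints 9 */
  return 0;
}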
void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
bool uns_p, bool high_p)
{
enum machine_mode wmode = GET_MODE (dest);
enum machine_mode mode = GET_MODE (op1);
rtx t1, t2, t3, t4, mask;
switch (mode)
{
op2_m1 = op2 = force_reg (V4SImode, op2);
op2_m2 = gen_reg_rtx (V4SImode);
emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
gen_lowpart (V1TImode, op2),
GEN_INT (32)));
case V4SImode:
t1 = gen_reg_rtx (mode);
t2 = gen_reg_rtx (mode);
if (TARGET_XOP && !uns_p)
{
/* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
shuffle the elements once so that all elements are in the right
place for immediate use: { A C B D }. */
emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
}
else
{
/* Put the elements into place for the multiply. */
ix86_expand_vec_interleave (t1, op1, op1, high_p);
ix86_expand_vec_interleave (t2, op2, op2, high_p);
high_p = false;
}
ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
break;
case V8SImode:
/* Shuffle the elements between the lanes. After this we
have { A B E F | C D G H } for each operand. */
t1 = gen_reg_rtx (V4DImode);
t2 = gen_reg_rtx (V4DImode);
emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
/* Shuffle the elements within the lanes. After this we
have { A A B B | C C D D } or { E E F F | G G H H }. */
t3 = gen_reg_rtx (V8SImode);
t4 = gen_reg_rtx (V8SImode);
mask = GEN_INT (high_p
? 2 + (2 << 2) + (3 << 4) + (3 << 6)
: 0 + (0 << 2) + (1 << 4) + (1 << 6));
emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
break;
case V8HImode:
case V16HImode:
t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
uns_p, OPTAB_DIRECT);
t2 = expand_binop (mode,
uns_p ? umul_highpart_optab : smul_highpart_optab,
op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
gcc_assert (t1 && t2);
ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
break;
case V16QImode:
case V32QImode:
t1 = gen_reg_rtx (wmode);
t2 = gen_reg_rtx (wmode);
ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
break;
default:
gcc_unreachable ();
}
}
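
The pshufd immediates in the V8SImode case pack per-lane dword selectors into 2-bit fields; a quick check of the arithmetic (plain C, illustration only):

#include <assert.h>

int
main (void)
{
  /* high_p: pick dwords { 2, 2, 3, 3 } within each 128-bit lane.  */
  assert ((2 + (2 << 2) + (3 << 4) + (3 << 6)) == 0xfa);
  /* !high_p: pick dwords { 0, 0, 1, 1 } within each 128-bit lane.  */
  assert ((0 + (0 << 2) + (1 << 4) + (1 << 6)) == 0x50);
  return 0;
}

So after the cross-lane vpermq produces { A B E F | C D G H }, the pshufd with 0xfa yields { E E F F | G G H H } and with 0x50 yields { A A B B | C C D D }, matching the comments in the code above.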
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
rtx res_1, res_2;
/* Widening multiply of elements 0+2, and 1+3. */
res_1 = gen_reg_rtx (V4SImode);
res_2 = gen_reg_rtx (V4SImode);
emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
op1_m1, op2_m1));
emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
op1_m2, op2_m2));
ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
op1, op2, true, false);
ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
op1, op2, true, true);
/* Move the results in element 2 down to element 1; we don't care
what goes in elements 2 and 3. Then we can merge the parts
......@@ -744,6 +744,7 @@
;; Prefix for define_insn
(define_code_attr u [(sign_extend "") (zero_extend "u")])
(define_code_attr s [(sign_extend "s") (zero_extend "u")])
(define_code_attr u_bool [(sign_extend "false") (zero_extend "true")])
;; All integer modes.
(define_mode_iterator SWI1248x [QI HI SI DI])
......
......@@ -816,13 +816,6 @@
return false;
})
;; Return true when OP is a nonimmediate or a vector constant. Note
;; that most vector constants are not legitimate operands, so we need
;; to special-case this.
(define_predicate "nonimmediate_or_const_vector_operand"
(ior (match_code "const_vector")
(match_operand 0 "nonimmediate_operand")))
;; Return true if OP is a register or a zero.
(define_predicate "reg_or_0_operand"
(ior (match_operand 0 "register_operand")
......
......@@ -5555,10 +5555,10 @@
[(set (match_operand:VI4_AVX2 0 "register_operand")
(mult:VI4_AVX2
(match_operand:VI4_AVX2 1 "nonimmediate_operand")
(match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
(match_operand:VI4_AVX2 2 "nonimmediate_operand")))]
"TARGET_SSE2"
{
if (TARGET_SSE4_1 || TARGET_AVX)
if (TARGET_SSE4_1)
{
if (CONSTANT_P (operands[2]))
operands[2] = force_const_mem (<MODE>mode, operands[2]);
......@@ -5677,198 +5677,28 @@
(define_expand "vec_widen_<s>mult_hi_<mode>"
[(match_operand:<sseunpackmode> 0 "register_operand")
(any_extend:<sseunpackmode>
(match_operand:VI2_AVX2 1 "register_operand"))
(match_operand:VI2_AVX2 2 "register_operand")]
"TARGET_SSE2"
{
rtx op1, op2, t1, t2, dest;
op1 = operands[1];
op2 = operands[2];
t1 = gen_reg_rtx (<MODE>mode);
t2 = gen_reg_rtx (<MODE>mode);
dest = gen_lowpart (<MODE>mode, operands[0]);
emit_insn (gen_mul<mode>3 (t1, op1, op2));
emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
emit_insn (gen_vec_interleave_high<mode> (dest, t1, t2));
(match_operand:VI124_AVX2 1 "register_operand"))
(match_operand:VI124_AVX2 2 "register_operand")]
; Note that SSE2 does not have signed SI multiply
"TARGET_XOP || TARGET_SSE4_1
|| (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
{
ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
<u_bool>, true);
DONE;
})
(define_expand "vec_widen_<s>mult_lo_<mode>"
[(match_operand:<sseunpackmode> 0 "register_operand")
(any_extend:<sseunpackmode>
(match_operand:VI2_AVX2 1 "register_operand"))
(match_operand:VI2_AVX2 2 "register_operand")]
"TARGET_SSE2"
{
rtx op1, op2, t1, t2, dest;
op1 = operands[1];
op2 = operands[2];
t1 = gen_reg_rtx (<MODE>mode);
t2 = gen_reg_rtx (<MODE>mode);
dest = gen_lowpart (<MODE>mode, operands[0]);
emit_insn (gen_mul<mode>3 (t1, op1, op2));
emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
emit_insn (gen_vec_interleave_low<mode> (dest, t1, t2));
DONE;
})
(define_expand "vec_widen_<s>mult_hi_v8si"
[(match_operand:V4DI 0 "register_operand")
(any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand"))
(match_operand:V8SI 2 "nonimmediate_operand")]
"TARGET_AVX2"
{
rtx t1, t2, t3, t4;
t1 = gen_reg_rtx (V4DImode);
t2 = gen_reg_rtx (V4DImode);
t3 = gen_reg_rtx (V8SImode);
t4 = gen_reg_rtx (V8SImode);
emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]),
const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]),
const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1),
GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6))));
emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2),
GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6))));
emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));
DONE;
})
(define_expand "vec_widen_<s>mult_lo_v8si"
[(match_operand:V4DI 0 "register_operand")
(any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand"))
(match_operand:V8SI 2 "nonimmediate_operand")]
"TARGET_AVX2"
{
rtx t1, t2, t3, t4;
t1 = gen_reg_rtx (V4DImode);
t2 = gen_reg_rtx (V4DImode);
t3 = gen_reg_rtx (V8SImode);
t4 = gen_reg_rtx (V8SImode);
emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]),
const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]),
const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1),
GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6))));
emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2),
GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6))));
emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));
DONE;
})
(define_expand "vec_widen_smult_hi_v4si"
[(match_operand:V2DI 0 "register_operand")
(match_operand:V4SI 1 "register_operand")
(match_operand:V4SI 2 "register_operand")]
"TARGET_SSE4_1"
{
rtx op1, op2, t1, t2;
op1 = operands[1];
op2 = operands[2];
t1 = gen_reg_rtx (V4SImode);
t2 = gen_reg_rtx (V4SImode);
if (TARGET_XOP)
{
rtx t3 = gen_reg_rtx (V2DImode);
emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
GEN_INT (1), GEN_INT (3)));
emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
GEN_INT (1), GEN_INT (3)));
emit_move_insn (t3, CONST0_RTX (V2DImode));
emit_insn (gen_xop_pmacsdqh (operands[0], t1, t2, t3));
DONE;
}
emit_insn (gen_vec_interleave_highv4si (t1, op1, op1));
emit_insn (gen_vec_interleave_highv4si (t2, op2, op2));
emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
DONE;
})
(define_expand "vec_widen_smult_lo_v4si"
[(match_operand:V2DI 0 "register_operand")
(match_operand:V4SI 1 "register_operand")
(match_operand:V4SI 2 "register_operand")]
"TARGET_SSE4_1"
{
rtx op1, op2, t1, t2;
op1 = operands[1];
op2 = operands[2];
t1 = gen_reg_rtx (V4SImode);
t2 = gen_reg_rtx (V4SImode);
if (TARGET_XOP)
{
rtx t3 = gen_reg_rtx (V2DImode);
emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
GEN_INT (1), GEN_INT (3)));
emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
GEN_INT (1), GEN_INT (3)));
emit_move_insn (t3, CONST0_RTX (V2DImode));
emit_insn (gen_xop_pmacsdql (operands[0], t1, t2, t3));
DONE;
}
emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1));
emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2));
emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
DONE;
})
(define_expand "vec_widen_umult_hi_v4si"
[(match_operand:V2DI 0 "register_operand")
(match_operand:V4SI 1 "register_operand")
(match_operand:V4SI 2 "register_operand")]
"TARGET_SSE2"
{
rtx op1, op2, t1, t2;
op1 = operands[1];
op2 = operands[2];
t1 = gen_reg_rtx (V4SImode);
t2 = gen_reg_rtx (V4SImode);
emit_insn (gen_vec_interleave_highv4si (t1, op1, op1));
emit_insn (gen_vec_interleave_highv4si (t2, op2, op2));
emit_insn (gen_sse2_umulv2siv2di3 (operands[0], t1, t2));
DONE;
})
(define_expand "vec_widen_umult_lo_v4si"
[(match_operand:V2DI 0 "register_operand")
(match_operand:V4SI 1 "register_operand")
(match_operand:V4SI 2 "register_operand")]
"TARGET_SSE2"
{
rtx op1, op2, t1, t2;
op1 = operands[1];
op2 = operands[2];
t1 = gen_reg_rtx (V4SImode);
t2 = gen_reg_rtx (V4SImode);
emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1));
emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2));
emit_insn (gen_sse2_umulv2siv2di3 (operands[0], t1, t2));
(match_operand:VI124_AVX2 1 "register_operand"))
(match_operand:VI124_AVX2 2 "register_operand")]
; Note that SSE2 does not have signed SI multiply
"TARGET_XOP || TARGET_SSE4_1
|| (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
{
ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
<u_bool>, false);
DONE;
})
......
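For reference, the kind of source loop the vectorizer can lower through the unified vec_widen_<s>mult_{hi,lo} expanders, including the newly supported QImode case (a sketch, not part of the patch):

#include <stdint.h>

void
widen_mul (int16_t *dst, const int8_t *a, const int8_t *b, int n)
{
  int i;
  /* With optimization and SSE2 or AVX2 enabled, the vectorizer may use
     the widening-multiply patterns to widen the char operands and
     multiply into short results.  */
  for (i = 0; i < n; i++)
    dst[i] = (int16_t) a[i] * (int16_t) b[i];
}

Analogous HImode and SImode loops go through the same two expanders, with the signed V4SImode case additionally gated on SSE4.1 or XOP, as the pattern condition above shows.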