Commit ac357108 by Richard Henderson

i386: Cleanup and unify widening multiply patterns

Prepares for exposing builtin_mul_widen_even/odd hooks
for more efficient reduction.  Adds QImode multiplication.
Shares code between mulv4si3 and the widening multiplies.

From-SVN: r188957
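As background for the hook change: the lo/hi forms widen adjacent halves of a vector, while the even/odd forms widen alternating lanes and so need no preparatory unpacking, which is what makes them the better building block for reductions. A scalar sketch of the two groupings, illustrative only and not part of the patch (the helper name is made up):

/* Illustrative only: lane grouping of the lo/hi versus even/odd
   widening multiplies for 4 x 32-bit inputs, 2 x 64-bit outputs.  */
static void
widen_umult_forms (const unsigned int a[4], const unsigned int b[4],
                   unsigned long long lo[2], unsigned long long hi[2],
                   unsigned long long even[2], unsigned long long odd[2])
{
  lo[0]   = (unsigned long long) a[0] * b[0];   /* lanes 0, 1  */
  lo[1]   = (unsigned long long) a[1] * b[1];
  hi[0]   = (unsigned long long) a[2] * b[2];   /* lanes 2, 3  */
  hi[1]   = (unsigned long long) a[3] * b[3];

  even[0] = (unsigned long long) a[0] * b[0];   /* lanes 0, 2  */
  even[1] = (unsigned long long) a[2] * b[2];
  odd[0]  = (unsigned long long) a[1] * b[1];   /* lanes 1, 3  */
  odd[1]  = (unsigned long long) a[3] * b[3];
}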
parent f008d5dc
2012-06-25 Richard Henderson <rth@redhat.com>
* config/i386/i386.c (ix86_rtx_costs) [MULT]: Only apply XOP cost
to V16QImode.
(ix86_expand_vec_interleave): New.
(ix86_expand_mul_widen_evenodd): New.
(ix86_expand_mul_widen_hilo): New.
(ix86_expand_sse2_mulv4si3): Use ix86_expand_mul_widen_evenodd.
* config/i386/i386.md (u_bool): New code attr.
* config/i386/predicates.md
(nonimmediate_or_const_vector_operand): Remove.
* config/i386/sse.md (mul<VI4_AVX2>3): Don't use it; don't test
both AVX and SSE4_1.
(vec_widen_<s>mult_hi_<VI2_AVX2>): Remove.
(vec_widen_<s>mult_lo_<VI2_AVX2>): Remove.
(vec_widen_<s>mult_hi_v8si): Remove.
(vec_widen_<s>mult_lo_v8si): Remove.
(vec_widen_smult_hi_v4si): Remove.
(vec_widen_smult_lo_v4si): Remove.
(vec_widen_umult_hi_v4si): Remove.
(vec_widen_umult_lo_v4si): Remove.
(vec_widen_<s>mult_hi_<VI124_AVX2>): New.
(vec_widen_<s>mult_lo_<VI124_AVX2>): New.
* config/i386/i386-protos.h: Update.
2012-06-25 Christophe Lyon <christophe.lyon@st.com>

* config/arm/neon.md (UNSPEC_VLD1_DUP): Remove.
gcc/config/i386/i386-protos.h
/* Definitions of target machine for GCC for IA-32.
   Copyright (C) 1988, 1992, 1994, 1995, 1996, 1996, 1997, 1998, 1999,
-  2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
+  2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
   Free Software Foundation, Inc.

This file is part of GCC.

@@ -224,6 +224,8 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
extern bool ix86_expand_pinsr (rtx *);
+extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool);
+extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);

/* In i386-c.c  */
gcc/config/i386/i386.c
@@ -32101,7 +32101,7 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
      /* V*QImode is emulated with 1-11 insns.  */
      if (mode == V16QImode || mode == V32QImode)
        {
-         int count;
+         int count = 11;
          if (TARGET_XOP && mode == V16QImode)
            {
              /* For XOP we use vpshab, which requires a broadcast of the
@@ -32117,8 +32117,8 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
                }
              count = 3;
            }
-         else
-           count = TARGET_SSSE3 ? 7 : 11;
+         else if (TARGET_SSSE3)
+           count = 7;
          *total = cost->fabs * count;
        }
      else
@@ -32199,7 +32199,11 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
      /* V*QImode is emulated with 7-13 insns.  */
      if (mode == V16QImode || mode == V32QImode)
        {
-         int extra = TARGET_XOP ? 5 : TARGET_SSSE3 ? 6 : 11;
+         int extra = 11;
+         if (TARGET_XOP && mode == V16QImode)
+           extra = 5;
+         else if (TARGET_SSSE3)
+           extra = 6;
          *total = cost->fmul * 2 + cost->fabs * extra;
        }
      /* Without sse4.1, we don't have PMULLD; it's emulated with 7
@@ -38519,6 +38523,34 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
  expand_vec_perm_even_odd_1 (&d, odd);
}

+static void
+ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
+{
+  struct expand_vec_perm_d d;
+  unsigned i, nelt, base;
+  bool ok;
+
+  d.target = targ;
+  d.op0 = op0;
+  d.op1 = op1;
+  d.vmode = GET_MODE (targ);
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.one_operand_p = false;
+  d.testing_p = false;
+
+  base = high_p ? nelt / 2 : 0;
+  for (i = 0; i < nelt / 2; ++i)
+    {
+      d.perm[i * 2] = i + base;
+      d.perm[i * 2 + 1] = i + base + nelt;
+    }
+
+  /* Note that for AVX this isn't one instruction.  */
+  ok = ix86_expand_vec_perm_const_1 (&d);
+  gcc_assert (ok);
+}
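As a concrete illustration (not part of the patch): for V4SImode operands the permutation built above selects indices { 2, 6, 3, 7 } when high_p and { 0, 4, 1, 5 } otherwise, i.e. punpckhdq/punpckldq semantics. A scalar sketch, with a made-up helper name:

/* Illustrative only: scalar model of the high/low interleave set up
   by ix86_expand_vec_interleave for 4 x 32-bit lanes.  */
static void
interleave_v4si (const unsigned int op0[4], const unsigned int op1[4],
                 int high_p, unsigned int out[4])
{
  unsigned base = high_p ? 2 : 0;   /* nelt / 2 for a 4-element vector */
  unsigned i;

  for (i = 0; i < 2; i++)
    {
      out[i * 2]     = op0[base + i];   /* perm index  base + i         */
      out[i * 2 + 1] = op1[base + i];   /* perm index  base + i + nelt  */
    }
}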

/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  */
@@ -38627,59 +38659,148 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
}

void
-ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
-{
-  rtx op1_m1, op1_m2;
-  rtx op2_m1, op2_m2;
-  rtx res_1, res_2;
-
-  /* Shift both input vectors down one element, so that elements 3
-     and 1 are now in the slots for elements 2 and 0.  For K8, at
-     least, this is faster than using a shuffle.  */
-  op1_m1 = op1 = force_reg (V4SImode, op1);
-  op1_m2 = gen_reg_rtx (V4SImode);
-  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
-                                 gen_lowpart (V1TImode, op1),
-                                 GEN_INT (32)));
-
-  if (GET_CODE (op2) == CONST_VECTOR)
-    {
-      rtvec v;
-
-      /* Constant propagate the vector shift, leaving the dont-care
-         vector elements as zero.  */
-      v = rtvec_alloc (4);
-      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
-      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
-      RTVEC_ELT (v, 1) = const0_rtx;
-      RTVEC_ELT (v, 3) = const0_rtx;
-      op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
-      op2_m1 = force_reg (V4SImode, op2_m1);
-
-      v = rtvec_alloc (4);
-      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
-      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
-      RTVEC_ELT (v, 1) = const0_rtx;
-      RTVEC_ELT (v, 3) = const0_rtx;
-      op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
-      op2_m2 = force_reg (V4SImode, op2_m2);
-    }
+ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
+                               bool uns_p, bool odd_p)
+{
+  enum machine_mode mode = GET_MODE (op1);
+  rtx x;
+
+  /* We only play even/odd games with vectors of SImode.  */
+  gcc_assert (mode == V4SImode || mode == V8SImode);
+
+  /* If we're looking for the odd results, shift those members down to
+     the even slots.  For some cpus this is faster than a PSHUFD.  */
+  if (odd_p)
+    {
+      enum machine_mode wmode = GET_MODE (dest);
+
+      op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
+                          GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
+                          1, OPTAB_DIRECT);
+      op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
+                          GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
+                          1, OPTAB_DIRECT);
+      op1 = gen_lowpart (mode, op1);
+      op2 = gen_lowpart (mode, op2);
+    }
+
+  if (mode == V8SImode)
+    {
+      if (uns_p)
+        x = gen_avx2_umulv4siv4di3 (dest, op1, op2);
+      else
+        x = gen_avx2_mulv4siv4di3 (dest, op1, op2);
+    }
+  else if (uns_p)
+    x = gen_sse2_umulv2siv2di3 (dest, op1, op2);
+  else if (TARGET_SSE4_1)
+    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
+  else if (TARGET_XOP)
+    {
+      x = force_reg (V2DImode, CONST0_RTX (V2DImode));
+      x = gen_xop_pmacsdql (dest, op1, op2, x);
+    }
+  else
+    gcc_unreachable ();
+  emit_insn (x);
+}
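The odd_p path above relies on x86's little-endian lane layout: a logical right shift of each wide (double-width) element by one narrow element moves every odd lane into the even slot that the even-lane multiply instruction reads. A scalar sketch for V4SImode viewed as V2DImode, illustrative only (helper name made up):

/* Illustrative only: shifting each 64-bit half right by 32 bits moves
   the odd 32-bit lanes {1, 3} down into the even slots {0, 2}.  */
static void
odd_lanes_to_even (const unsigned int v[4], unsigned int out[4])
{
  unsigned long long lo = ((unsigned long long) v[1] << 32) | v[0];
  unsigned long long hi = ((unsigned long long) v[3] << 32) | v[2];

  lo >>= 32;                      /* lane 0 now holds v[1], lane 1 is 0 */
  hi >>= 32;                      /* lane 2 now holds v[3], lane 3 is 0 */

  out[0] = (unsigned int) lo;
  out[1] = (unsigned int) (lo >> 32);
  out[2] = (unsigned int) hi;
  out[3] = (unsigned int) (hi >> 32);
}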
-  else
-    {
-      op2_m1 = op2 = force_reg (V4SImode, op2);
-      op2_m2 = gen_reg_rtx (V4SImode);
-      emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
-                                     gen_lowpart (V1TImode, op2),
-                                     GEN_INT (32)));
-    }
+
+void
+ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
+                            bool uns_p, bool high_p)
+{
+  enum machine_mode wmode = GET_MODE (dest);
+  enum machine_mode mode = GET_MODE (op1);
+  rtx t1, t2, t3, t4, mask;
+
+  switch (mode)
+    {
+    case V4SImode:
+      t1 = gen_reg_rtx (mode);
+      t2 = gen_reg_rtx (mode);
+      if (TARGET_XOP && !uns_p)
+        {
+          /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
+             shuffle the elements once so that all elements are in the right
+             place for immediate use: { A C B D }.  */
+          emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
+                                        const1_rtx, GEN_INT (3)));
+          emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
+                                        const1_rtx, GEN_INT (3)));
+        }
+      else
+        {
+          /* Put the elements into place for the multiply.  */
+          ix86_expand_vec_interleave (t1, op1, op1, high_p);
+          ix86_expand_vec_interleave (t2, op2, op2, high_p);
+          high_p = false;
+        }
+      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
+      break;
+
+    case V8SImode:
+      /* Shuffle the elements between the lanes.  After this we
+         have { A B E F | C D G H } for each operand.  */
+      t1 = gen_reg_rtx (V4DImode);
+      t2 = gen_reg_rtx (V4DImode);
+      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
+                                      const0_rtx, const2_rtx,
+                                      const1_rtx, GEN_INT (3)));
+      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
+                                      const0_rtx, const2_rtx,
+                                      const1_rtx, GEN_INT (3)));
+
+      /* Shuffle the elements within the lanes.  After this we
+         have { A A B B | C C D D } or { E E F F | G G H H }.  */
+      t3 = gen_reg_rtx (V8SImode);
+      t4 = gen_reg_rtx (V8SImode);
+      mask = GEN_INT (high_p
+                      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
+                      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
+      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
+      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
+
+      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
+      break;
+
+    case V8HImode:
+    case V16HImode:
+      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
+                         uns_p, OPTAB_DIRECT);
+      t2 = expand_binop (mode,
+                         uns_p ? umul_highpart_optab : smul_highpart_optab,
+                         op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
+      gcc_assert (t1 && t2);
+
+      ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
+      break;
+
+    case V16QImode:
+    case V32QImode:
+      t1 = gen_reg_rtx (wmode);
+      t2 = gen_reg_rtx (wmode);
+      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
+      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
+
+      emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+}
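For the HImode cases the widened product is assembled from a full low-part multiply, a high-part multiply and an interleave. A scalar model of one output lane, illustrative only (helper name made up):

/* Illustrative only: one widened 16x16->32 product assembled from the
   low and high halves, as the V8HImode/V16HImode case does with
   mul, mul_highpart and an interleave.  */
static unsigned int
widen_umult_lane (unsigned short a, unsigned short b)
{
  unsigned int prod = (unsigned int) a * b;

  unsigned short lo = (unsigned short) prod;          /* low-part multiply  */
  unsigned short hi = (unsigned short) (prod >> 16);  /* high-part multiply */

  return ((unsigned int) hi << 16) | lo;              /* interleave lo, hi  */
}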
-  /* Widening multiply of elements 0+2, and 1+3.  */
+
+void
+ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+{
+  rtx res_1, res_2;
+
  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
-                                     op1_m1, op2_m1));
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
-                                     op1_m2, op2_m2));
+  ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
+                                 op1, op2, true, false);
+  ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
+                                 op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
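The rewritten ix86_expand_sse2_mulv4si3 keeps the old scheme of building a V4SImode multiply out of two widening multiplies, but now expresses it with the shared even/odd helper: the low 32 bits of the even-lane and odd-lane products are merged back into one vector. A scalar model, illustrative only (helper name made up):

/* Illustrative only: V4SI multiply composed from even/odd widening
   multiplies, keeping only the low 32 bits of each 64-bit product.  */
static void
mulv4si_model (const unsigned int a[4], const unsigned int b[4],
               unsigned int d[4])
{
  unsigned long long even[2], odd[2];

  even[0] = (unsigned long long) a[0] * b[0];   /* evenodd, odd_p = false */
  even[1] = (unsigned long long) a[2] * b[2];
  odd[0]  = (unsigned long long) a[1] * b[1];   /* evenodd, odd_p = true  */
  odd[1]  = (unsigned long long) a[3] * b[3];

  d[0] = (unsigned int) even[0];                /* merge the low halves   */
  d[1] = (unsigned int) odd[0];
  d[2] = (unsigned int) even[1];
  d[3] = (unsigned int) odd[1];
}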
gcc/config/i386/i386.md
@@ -744,6 +744,7 @@
;; Prefix for define_insn
(define_code_attr u [(sign_extend "") (zero_extend "u")])
(define_code_attr s [(sign_extend "s") (zero_extend "u")])
+(define_code_attr u_bool [(sign_extend "false") (zero_extend "true")])

;; All integer modes.
(define_mode_iterator SWI1248x [QI HI SI DI])
gcc/config/i386/predicates.md
@@ -816,13 +816,6 @@
  return false;
})

-;; Return true when OP is a nonimmediate or a vector constant.  Note
-;; that most vector constants are not legitimate operands, so we need
-;; to special-case this.
-(define_predicate "nonimmediate_or_const_vector_operand"
-  (ior (match_code "const_vector")
-       (match_operand 0 "nonimmediate_operand")))

;; Return true if OP is a register or a zero.
(define_predicate "reg_or_0_operand"
  (ior (match_operand 0 "register_operand")
gcc/config/i386/sse.md
@@ -5555,10 +5555,10 @@
  [(set (match_operand:VI4_AVX2 0 "register_operand")
        (mult:VI4_AVX2
          (match_operand:VI4_AVX2 1 "nonimmediate_operand")
-         (match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
+         (match_operand:VI4_AVX2 2 "nonimmediate_operand")))]
  "TARGET_SSE2"
{
-  if (TARGET_SSE4_1 || TARGET_AVX)
+  if (TARGET_SSE4_1)
    {
      if (CONSTANT_P (operands[2]))
        operands[2] = force_const_mem (<MODE>mode, operands[2]);
@@ -5677,198 +5677,28 @@
(define_expand "vec_widen_<s>mult_hi_<mode>"
  [(match_operand:<sseunpackmode> 0 "register_operand")
   (any_extend:<sseunpackmode>
-     (match_operand:VI2_AVX2 1 "register_operand"))
-   (match_operand:VI2_AVX2 2 "register_operand")]
-  "TARGET_SSE2"
+     (match_operand:VI124_AVX2 1 "register_operand"))
+   (match_operand:VI124_AVX2 2 "register_operand")]
+  ; Note that SSE2 does not have signed SI multiply
+  "TARGET_XOP || TARGET_SSE4_1
+   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
{
-  rtx op1, op2, t1, t2, dest;
-
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (<MODE>mode);
-  t2 = gen_reg_rtx (<MODE>mode);
-  dest = gen_lowpart (<MODE>mode, operands[0]);
-
-  emit_insn (gen_mul<mode>3 (t1, op1, op2));
-  emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_high<mode> (dest, t1, t2));
+  ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
+                              <u_bool>, true);
  DONE;
})

(define_expand "vec_widen_<s>mult_lo_<mode>"
  [(match_operand:<sseunpackmode> 0 "register_operand")
   (any_extend:<sseunpackmode>
-     (match_operand:VI2_AVX2 1 "register_operand"))
-   (match_operand:VI2_AVX2 2 "register_operand")]
-  "TARGET_SSE2"
+     (match_operand:VI124_AVX2 1 "register_operand"))
+   (match_operand:VI124_AVX2 2 "register_operand")]
+  ; Note that SSE2 does not have signed SI multiply
+  "TARGET_XOP || TARGET_SSE4_1
+   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
{
-  rtx op1, op2, t1, t2, dest;
-
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (<MODE>mode);
-  t2 = gen_reg_rtx (<MODE>mode);
-  dest = gen_lowpart (<MODE>mode, operands[0]);
-
-  emit_insn (gen_mul<mode>3 (t1, op1, op2));
-  emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_low<mode> (dest, t1, t2));
+  ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
+                              <u_bool>, false);
  DONE;
})
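The mode-specific expanders that follow are the ones removed in favour of the two generic patterns above. These named patterns are what the middle end uses when vectorizing widening multiplications; a loop of roughly this shape (illustrative only) is the kind of code that ends up going through vec_widen_umult_lo/hi:

/* Illustrative only: a loop the vectorizer lowers to widening
   multiplies, e.g. vec_widen_umult_lo_v8hi / vec_widen_umult_hi_v8hi.  */
void
widen_mul_loop (unsigned int *restrict d, const unsigned short *restrict a,
                const unsigned short *restrict b, int n)
{
  int i;
  for (i = 0; i < n; i++)
    d[i] = (unsigned int) a[i] * b[i];
}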
(define_expand "vec_widen_<s>mult_hi_v8si"
[(match_operand:V4DI 0 "register_operand")
(any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand"))
(match_operand:V8SI 2 "nonimmediate_operand")]
"TARGET_AVX2"
{
rtx t1, t2, t3, t4;
t1 = gen_reg_rtx (V4DImode);
t2 = gen_reg_rtx (V4DImode);
t3 = gen_reg_rtx (V8SImode);
t4 = gen_reg_rtx (V8SImode);
emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]),
const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]),
const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1),
GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6))));
emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2),
GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6))));
emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));
DONE;
})
(define_expand "vec_widen_<s>mult_lo_v8si"
[(match_operand:V4DI 0 "register_operand")
(any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand"))
(match_operand:V8SI 2 "nonimmediate_operand")]
"TARGET_AVX2"
{
rtx t1, t2, t3, t4;
t1 = gen_reg_rtx (V4DImode);
t2 = gen_reg_rtx (V4DImode);
t3 = gen_reg_rtx (V8SImode);
t4 = gen_reg_rtx (V8SImode);
emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]),
const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]),
const0_rtx, const2_rtx,
const1_rtx, GEN_INT (3)));
emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1),
GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6))));
emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2),
GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6))));
emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));
DONE;
})
(define_expand "vec_widen_smult_hi_v4si"
[(match_operand:V2DI 0 "register_operand")
(match_operand:V4SI 1 "register_operand")
(match_operand:V4SI 2 "register_operand")]
"TARGET_SSE4_1"
{
rtx op1, op2, t1, t2;
op1 = operands[1];
op2 = operands[2];
t1 = gen_reg_rtx (V4SImode);
t2 = gen_reg_rtx (V4SImode);
if (TARGET_XOP)
{
rtx t3 = gen_reg_rtx (V2DImode);
emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
GEN_INT (1), GEN_INT (3)));
emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
GEN_INT (1), GEN_INT (3)));
emit_move_insn (t3, CONST0_RTX (V2DImode));
emit_insn (gen_xop_pmacsdqh (operands[0], t1, t2, t3));
DONE;
}
emit_insn (gen_vec_interleave_highv4si (t1, op1, op1));
emit_insn (gen_vec_interleave_highv4si (t2, op2, op2));
emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
DONE;
})
(define_expand "vec_widen_smult_lo_v4si"
[(match_operand:V2DI 0 "register_operand")
(match_operand:V4SI 1 "register_operand")
(match_operand:V4SI 2 "register_operand")]
"TARGET_SSE4_1"
{
rtx op1, op2, t1, t2;
op1 = operands[1];
op2 = operands[2];
t1 = gen_reg_rtx (V4SImode);
t2 = gen_reg_rtx (V4SImode);
if (TARGET_XOP)
{
rtx t3 = gen_reg_rtx (V2DImode);
emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
GEN_INT (1), GEN_INT (3)));
emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
GEN_INT (1), GEN_INT (3)));
emit_move_insn (t3, CONST0_RTX (V2DImode));
emit_insn (gen_xop_pmacsdql (operands[0], t1, t2, t3));
DONE;
}
emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1));
emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2));
emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
DONE;
})
(define_expand "vec_widen_umult_hi_v4si"
[(match_operand:V2DI 0 "register_operand")
(match_operand:V4SI 1 "register_operand")
(match_operand:V4SI 2 "register_operand")]
"TARGET_SSE2"
{
rtx op1, op2, t1, t2;
op1 = operands[1];
op2 = operands[2];
t1 = gen_reg_rtx (V4SImode);
t2 = gen_reg_rtx (V4SImode);
emit_insn (gen_vec_interleave_highv4si (t1, op1, op1));
emit_insn (gen_vec_interleave_highv4si (t2, op2, op2));
emit_insn (gen_sse2_umulv2siv2di3 (operands[0], t1, t2));
DONE;
})
(define_expand "vec_widen_umult_lo_v4si"
[(match_operand:V2DI 0 "register_operand")
(match_operand:V4SI 1 "register_operand")
(match_operand:V4SI 2 "register_operand")]
"TARGET_SSE2"
{
rtx op1, op2, t1, t2;
op1 = operands[1];
op2 = operands[2];
t1 = gen_reg_rtx (V4SImode);
t2 = gen_reg_rtx (V4SImode);
emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1));
emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2));
emit_insn (gen_sse2_umulv2siv2di3 (operands[0], t1, t2));
  DONE;
})