Commit 73e9d637 authored by Richard Henderson, committed by Richard Henderson

Handle const_vector in mulv4si3 for pre-sse4.1.

From-SVN: r188787
parent 84ddb681
2012-06-19  Richard Henderson  <rth@redhat.com>

	* config/i386/i386-protos.h (ix86_expand_sse2_mulv4si3): Declare.
	* config/i386/i386.c (ix86_expand_sse2_mulv4si3): New.
	* config/i386/predicates.md (nonimmediate_or_const_vector_operand): New.
	* config/i386/sse.md (sse2_mulv4si3): Delete.
	(mul<VI4_AVX2>3): Use ix86_expand_sse2_mulv4si3 and
	nonimmediate_or_const_vector_operand.

2012-06-19  Richard Henderson  <rth@redhat.com>

	* expmed.c (struct init_expmed_rtl): Split ...
	(init_expmed_one_mode): ... out of ...
	(init_expmed): ... here.  Initialize integer vector modes also.
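For orientation, the sequence the new ix86_expand_sse2_mulv4si3 emits on SSE2-only targets can be sketched with C intrinsics. This is an illustrative reference model, not code from the patch: the helper name is made up, while the intrinsics are the standard <emmintrin.h> wrappers for pmuludq, psrldq, pshufd and punpckldq.

/* Illustration only: an SSE2-intrinsics model of the expander's output.
   pmuludq multiplies elements 0 and 2, so the odd elements are first
   shifted down into the even slots.  */
#include <emmintrin.h>

static inline __m128i
mulv4si_sse2_model (__m128i a, __m128i b)
{
  __m128i a_odd = _mm_srli_si128 (a, 4);         /* psrldq: elts 1,3 -> 0,2 */
  __m128i b_odd = _mm_srli_si128 (b, 4);
  __m128i even  = _mm_mul_epu32 (a, b);          /* pmuludq: products 0 and 2 */
  __m128i odd   = _mm_mul_epu32 (a_odd, b_odd);  /* pmuludq: products 1 and 3 */
  /* pshufd: move the low half of product 2 down next to product 0.  */
  even = _mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0));
  odd  = _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0));
  /* punpckldq: interleave into {a0*b0, a1*b1, a2*b2, a3*b3} (mod 2^32).  */
  return _mm_unpacklo_epi32 (even, odd);
}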
gcc/config/i386/i386-protos.h
@@ -222,6 +222,7 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
extern bool ix86_expand_pinsr (rtx *);
extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
/* In i386-c.c */
extern void ix86_target_macros (void);
gcc/config/i386/i386.c
@@ -38438,6 +38438,82 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
expand_vec_perm_even_odd_1 (&d, odd);
}
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
rtx op1_m1, op1_m2;
rtx op2_m1, op2_m2;
rtx res_1, res_2;
/* Shift both input vectors down one element, so that elements 3
and 1 are now in the slots for elements 2 and 0. For K8, at
least, this is faster than using a shuffle. */
op1_m1 = op1 = force_reg (V4SImode, op1);
op1_m2 = gen_reg_rtx (V4SImode);
emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
gen_lowpart (V1TImode, op1),
GEN_INT (32)));
if (GET_CODE (op2) == CONST_VECTOR)
{
rtvec v;
/* Constant propagate the vector shift, leaving the dont-care
vector elements as zero. */
v = rtvec_alloc (4);
RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
RTVEC_ELT (v, 1) = const0_rtx;
RTVEC_ELT (v, 3) = const0_rtx;
op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
op2_m1 = force_reg (V4SImode, op2_m1);
v = rtvec_alloc (4);
RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
RTVEC_ELT (v, 1) = const0_rtx;
RTVEC_ELT (v, 3) = const0_rtx;
op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
op2_m2 = force_reg (V4SImode, op2_m2);
}
else
{
op2_m1 = op2 = force_reg (V4SImode, op2);
op2_m2 = gen_reg_rtx (V4SImode);
emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
gen_lowpart (V1TImode, op2),
GEN_INT (32)));
}
/* Widening multiply of elements 0+2, and 1+3. */
res_1 = gen_reg_rtx (V4SImode);
res_2 = gen_reg_rtx (V4SImode);
emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
op1_m1, op2_m1));
emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
op1_m2, op2_m2));
/* Move the results in element 2 down to element 1; we don't care
what goes in elements 2 and 3. Then we can merge the parts
back together with an interleave.
Note that two other sequences were tried:
(1) Use interleaves at the start instead of psrldq, which allows
us to use a single shufps to merge things back at the end.
(2) Use shufps here to combine the two vectors, then pshufd to
put the elements in the correct order.
In both cases the cost of the reformatting stall was too high
and the overall sequence slower. */
emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
const0_rtx, const0_rtx));
emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
const0_rtx, const0_rtx));
res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
/* Expand an insert into a vector register through pinsr insn.
Return true if successful. */
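The CONST_VECTOR branch above propagates the constant through the element shift at expand time: a constant operand {c0, c1, c2, c3} becomes the pair {c0, 0, c2, 0} and {c1, 0, c3, 0}, so only the non-constant input needs the run-time psrldq. A small intrinsics sketch of the resulting shape, assuming an arbitrary constant multiplier of {3, 5, 7, 9} (illustration only, helper name invented; not the emitted RTL):

/* Illustration only: multiplying by the constant {3, 5, 7, 9} after the
   expander splits the constant.  The dont-care odd lanes are simply
   zero; pmuludq never reads them.  */
#include <emmintrin.h>

static inline __m128i
mul_by_3579_model (__m128i x)
{
  __m128i c_even = _mm_setr_epi32 (3, 0, 7, 0);  /* constant elements 0 and 2 */
  __m128i c_odd  = _mm_setr_epi32 (5, 0, 9, 0);  /* constant elements 1 and 3 */
  __m128i x_odd  = _mm_srli_si128 (x, 4);        /* only x needs the shift */
  __m128i even   = _mm_mul_epu32 (x, c_even);
  __m128i odd    = _mm_mul_epu32 (x_odd, c_odd);
  even = _mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0));
  odd  = _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0));
  return _mm_unpacklo_epi32 (even, odd);         /* x * {3, 5, 7, 9} */
}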
gcc/config/i386/predicates.md
@@ -816,6 +816,13 @@
return false;
})
;; Return true when OP is a nonimmediate or a vector constant. Note
;; that most vector constants are not legitimate operands, so we need
;; to special-case this.
(define_predicate "nonimmediate_or_const_vector_operand"
(ior (match_code "const_vector")
(match_operand 0 "nonimmediate_operand")))
;; Return true if OP is a register or a zero.
(define_predicate "reg_or_0_operand"
(ior (match_operand 0 "register_operand")
gcc/config/i386/sse.md
@@ -5610,12 +5610,22 @@
(define_expand "mul<mode>3"
[(set (match_operand:VI4_AVX2 0 "register_operand")
(mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand")
(match_operand:VI4_AVX2 2 "register_operand")))]
(mult:VI4_AVX2
(match_operand:VI4_AVX2 1 "nonimmediate_operand")
(match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
"TARGET_SSE2"
{
if (TARGET_SSE4_1 || TARGET_AVX)
ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
{
if (CONSTANT_P (operands[2]))
operands[2] = force_const_mem (<MODE>mode, operands[2]);
ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
}
else
{
ix86_expand_sse2_mulv4si3 (operands[0], operands[1], operands[2]);
DONE;
}
})
(define_insn "*<sse4_1_avx2>_mul<mode>3"
@@ -5633,62 +5643,6 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "<sseinsnmode>")])
(define_insn_and_split "*sse2_mulv4si3"
[(set (match_operand:V4SI 0 "register_operand")
(mult:V4SI (match_operand:V4SI 1 "register_operand")
(match_operand:V4SI 2 "register_operand")))]
"TARGET_SSE2 && !TARGET_SSE4_1 && !TARGET_AVX
&& can_create_pseudo_p ()"
"#"
"&& 1"
[(const_int 0)]
{
rtx t1, t2, t3, t4, t5, t6, thirtytwo;
rtx op0, op1, op2;
op0 = operands[0];
op1 = operands[1];
op2 = operands[2];
t1 = gen_reg_rtx (V4SImode);
t2 = gen_reg_rtx (V4SImode);
t3 = gen_reg_rtx (V4SImode);
t4 = gen_reg_rtx (V4SImode);
t5 = gen_reg_rtx (V4SImode);
t6 = gen_reg_rtx (V4SImode);
thirtytwo = GEN_INT (32);
/* Multiply elements 2 and 0. */
emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
op1, op2));
/* Shift both input vectors down one element, so that elements 3
and 1 are now in the slots for elements 2 and 0. For K8, at
least, this is faster than using a shuffle. */
emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t2),
gen_lowpart (V1TImode, op1),
thirtytwo));
emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t3),
gen_lowpart (V1TImode, op2),
thirtytwo));
/* Multiply elements 3 and 1. */
emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
t2, t3));
/* Move the results in element 2 down to element 1; we don't care
what goes in elements 2 and 3. */
emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
const0_rtx, const0_rtx));
emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
const0_rtx, const0_rtx));
/* Merge the parts back together. */
emit_insn (gen_vec_interleave_lowv4si (op0, t5, t6));
set_unique_reg_note (get_last_insn (), REG_EQUAL,
gen_rtx_MULT (V4SImode, operands[1], operands[2]));
DONE;
})
(define_insn_and_split "mul<mode>3"
[(set (match_operand:VI8_AVX2 0 "register_operand")
(mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")