Commit 73e9d637
Authored Jun 19, 2012 by Richard Henderson
Committed by Richard Henderson, Jun 19, 2012
Handle const_vector in mulv4si3 for pre-sse4.1.
From-SVN: r188787
parent 84ddb681
Showing 5 changed files with 106 additions and 59 deletions.

gcc/ChangeLog                      +9   -0
gcc/config/i386/i386-protos.h      +1   -0
gcc/config/i386/i386.c             +76  -0
gcc/config/i386/predicates.md      +7   -0
gcc/config/i386/sse.md             +13  -59
gcc/ChangeLog
2012-06-19  Richard Henderson  <rth@redhat.com>

        * config/i386/i386-protos.h (ix86_expand_sse2_mulv4si3): Declare.
        * config/i386/i386.c (ix86_expand_sse2_mulv4si3): New.
        * config/i386/predicates.md (nonimmediate_or_const_vector_operand): New.
        * config/i386/sse.md (sse2_mulv4si3): Delete.
        (mul<VI4_AVX2>3): Use ix86_expand_sse2_mulv4si3 and
        nonimmediate_or_const_vector_operand.

2012-06-19  Richard Henderson  <rth@redhat.com>

        * expmed.c (struct init_expmed_rtl): Split ...
        (init_expmed_one_mode): ... out of ...
        (init_expmed): ... here.  Initialize integer vector modes also.
...
...
gcc/config/i386/i386-protos.h
...
...
@@ -222,6 +222,7 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
 extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
 extern bool ix86_expand_pinsr (rtx *);
+extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);

 /* In i386-c.c */
 extern void ix86_target_macros (void);
...
...
gcc/config/i386/i386.c
...
...
@@ -38438,6 +38438,82 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
   expand_vec_perm_even_odd_1 (&d, odd);
 }

+void
+ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+{
+  rtx op1_m1, op1_m2;
+  rtx op2_m1, op2_m2;
+  rtx res_1, res_2;
+
+  /* Shift both input vectors down one element, so that elements 3
+     and 1 are now in the slots for elements 2 and 0.  For K8, at
+     least, this is faster than using a shuffle.  */
+  op1_m1 = op1 = force_reg (V4SImode, op1);
+  op1_m2 = gen_reg_rtx (V4SImode);
+  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
+                                 gen_lowpart (V1TImode, op1),
+                                 GEN_INT (32)));
+
+  if (GET_CODE (op2) == CONST_VECTOR)
+    {
+      rtvec v;
+
+      /* Constant propagate the vector shift, leaving the dont-care
+         vector elements as zero.  */
+      v = rtvec_alloc (4);
+      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
+      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
+      RTVEC_ELT (v, 1) = const0_rtx;
+      RTVEC_ELT (v, 3) = const0_rtx;
+      op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
+      op2_m1 = force_reg (V4SImode, op2_m1);
+
+      v = rtvec_alloc (4);
+      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
+      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
+      RTVEC_ELT (v, 1) = const0_rtx;
+      RTVEC_ELT (v, 3) = const0_rtx;
+      op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
+      op2_m2 = force_reg (V4SImode, op2_m2);
+    }
+  else
+    {
+      op2_m1 = op2 = force_reg (V4SImode, op2);
+      op2_m2 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
+                                     gen_lowpart (V1TImode, op2),
+                                     GEN_INT (32)));
+    }
+
+  /* Widening multiply of elements 0+2, and 1+3.  */
+  res_1 = gen_reg_rtx (V4SImode);
+  res_2 = gen_reg_rtx (V4SImode);
+  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
+                                     op1_m1, op2_m1));
+  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
+                                     op1_m2, op2_m2));
+
+  /* Move the results in element 2 down to element 1; we don't care
+     what goes in elements 2 and 3.  Then we can merge the parts
+     back together with an interleave.
+
+     Note that two other sequences were tried:
+     (1) Use interleaves at the start instead of psrldq, which allows
+     us to use a single shufps to merge things back at the end.
+     (2) Use shufps here to combine the two vectors, then pshufd to
+     put the elements in the correct order.
+     In both cases the cost of the reformatting stall was too high
+     and the overall sequence slower.  */
+
+  emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
+                                const0_rtx, const0_rtx));
+  emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
+                                const0_rtx, const0_rtx));
+  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
+  set_unique_reg_note (res_1, REG_EQUAL,
+                       gen_rtx_MULT (V4SImode, op1, op2));
+}
+
 /* Expand an insert into a vector register through pinsr insn.
    Return true if successful.  */
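The RTL builder calls above map directly onto SSE2 instructions.  As an informal illustration (not part of the patch), the sequence emitted for a non-constant multiplier corresponds roughly to the following intrinsics, where mulv4si_sse2 is a hypothetical helper name:

#include <emmintrin.h>

/* Rough sketch of the pre-SSE4.1 V4SI multiply: pmuludq only multiplies
   elements 0 and 2, so the odd elements are handled by a second multiply
   on byte-shifted copies of the inputs.  */
static __m128i
mulv4si_sse2 (__m128i a, __m128i b)
{
  /* Widening multiply of elements 0 and 2 (pmuludq).  */
  __m128i lo = _mm_mul_epu32 (a, b);
  /* Shift both inputs down one element so elements 3 and 1 land in the
     slots for elements 2 and 0 (psrldq).  */
  __m128i a_hi = _mm_srli_si128 (a, 4);
  __m128i b_hi = _mm_srli_si128 (b, 4);
  /* Widening multiply of elements 1 and 3.  */
  __m128i hi = _mm_mul_epu32 (a_hi, b_hi);
  /* Move each result's element 2 down to element 1 (pshufd); elements
     2 and 3 are don't-cares.  */
  lo = _mm_shuffle_epi32 (lo, _MM_SHUFFLE (0, 0, 2, 0));
  hi = _mm_shuffle_epi32 (hi, _MM_SHUFFLE (0, 0, 2, 0));
  /* Interleave the low halves to rebuild the V4SI product (punpckldq).  */
  return _mm_unpacklo_epi32 (lo, hi);
}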
gcc/config/i386/predicates.md
...
...
@@ -816,6 +816,13 @@
   return false;
 })

+;; Return true when OP is a nonimmediate or a vector constant.  Note
+;; that most vector constants are not legitimate operands, so we need
+;; to special-case this.
+(define_predicate "nonimmediate_or_const_vector_operand"
+  (ior (match_code "const_vector")
+       (match_operand 0 "nonimmediate_operand")))
+
 ;; Return true if OP is a register or a zero.
 (define_predicate "reg_or_0_operand"
   (ior (match_operand 0 "register_operand")
...
...
gcc/config/i386/sse.md
...
...
@@ -5610,12 +5610,22 @@
 (define_expand "mul<mode>3"
   [(set (match_operand:VI4_AVX2 0 "register_operand")
-	(mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand")
-		       (match_operand:VI4_AVX2 2 "register_operand")))]
+	(mult:VI4_AVX2
+	  (match_operand:VI4_AVX2 1 "nonimmediate_operand")
+	  (match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
   "TARGET_SSE2"
 {
   if (TARGET_SSE4_1 || TARGET_AVX)
-    ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+    {
+      if (CONSTANT_P (operands[2]))
+	operands[2] = force_const_mem (<MODE>mode, operands[2]);
+      ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+    }
+  else
+    {
+      ix86_expand_sse2_mulv4si3 (operands[0], operands[1], operands[2]);
+      DONE;
+    }
 })

 (define_insn "*<sse4_1_avx2>_mul<mode>3"
...
...
@@ -5633,62 +5643,6 @@
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "<sseinsnmode>")])

-(define_insn_and_split "*sse2_mulv4si3"
-  [(set (match_operand:V4SI 0 "register_operand")
-	(mult:V4SI (match_operand:V4SI 1 "register_operand")
-		   (match_operand:V4SI 2 "register_operand")))]
-  "TARGET_SSE2 && !TARGET_SSE4_1 && !TARGET_AVX
-   && can_create_pseudo_p ()"
-  "#"
-  "&& 1"
-  [(const_int 0)]
-{
-  rtx t1, t2, t3, t4, t5, t6, thirtytwo;
-  rtx op0, op1, op2;
-
-  op0 = operands[0];
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V4SImode);
-  t2 = gen_reg_rtx (V4SImode);
-  t3 = gen_reg_rtx (V4SImode);
-  t4 = gen_reg_rtx (V4SImode);
-  t5 = gen_reg_rtx (V4SImode);
-  t6 = gen_reg_rtx (V4SImode);
-  thirtytwo = GEN_INT (32);
-
-  /* Multiply elements 2 and 0.  */
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
-				     op1, op2));
-
-  /* Shift both input vectors down one element, so that elements 3
-     and 1 are now in the slots for elements 2 and 0.  For K8, at
-     least, this is faster than using a shuffle.  */
-  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t2),
-				 gen_lowpart (V1TImode, op1),
-				 thirtytwo));
-  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, t3),
-				 gen_lowpart (V1TImode, op2),
-				 thirtytwo));
-
-  /* Multiply elements 3 and 1.  */
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
-				     t2, t3));
-
-  /* Move the results in element 2 down to element 1; we don't care
-     what goes in elements 2 and 3.  */
-  emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
-				const0_rtx, const0_rtx));
-  emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
-				const0_rtx, const0_rtx));
-
-  /* Merge the parts back together.  */
-  emit_insn (gen_vec_interleave_lowv4si (op0, t5, t6));
-
-  set_unique_reg_note (get_last_insn (), REG_EQUAL,
-		       gen_rtx_MULT (V4SImode, operands[1], operands[2]));
-  DONE;
-})
-
 (define_insn_and_split "mul<mode>3"
   [(set (match_operand:VI8_AVX2 0 "register_operand")
	(mult:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand")
...
...
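As an informal illustration of the const_vector handling this commit adds (again not part of the patch): when the multiplier is a compile-time constant such as {3, 4, 5, 6}, the run-time psrldq of the second input is unnecessary, because the two pmuludq operands can simply be materialized as {3, 0, 5, 0} and {4, 0, 6, 0}; pmuludq ignores the odd elements anyway.  A hypothetical intrinsics-level equivalent:

#include <emmintrin.h>

/* Multiply each 32-bit lane of A by the constant vector {3, 4, 5, 6}
   using only SSE2, with the shift of the constant operand folded away.  */
static __m128i
mulv4si_by_const (__m128i a)
{
  const __m128i b_even = _mm_setr_epi32 (3, 0, 5, 0);  /* elements 0 and 2 */
  const __m128i b_odd  = _mm_setr_epi32 (4, 0, 6, 0);  /* elements 1 and 3 */
  __m128i a_hi = _mm_srli_si128 (a, 4);       /* only A still needs the shift */
  __m128i lo = _mm_mul_epu32 (a, b_even);     /* a0*3, a2*5 */
  __m128i hi = _mm_mul_epu32 (a_hi, b_odd);   /* a1*4, a3*6 */
  lo = _mm_shuffle_epi32 (lo, _MM_SHUFFLE (0, 0, 2, 0));
  hi = _mm_shuffle_epi32 (hi, _MM_SHUFFLE (0, 0, 2, 0));
  return _mm_unpacklo_epi32 (lo, hi);         /* {a0*3, a1*4, a2*5, a3*6} */
}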