Commit 3fbee523, authored and committed by Bernd Schmidt

lib1funcs.asm (___umulsi3_highpart, [...]): Use a more efficient implementation.

	* config/bfin/lib1funcs.asm (___umulsi3_highpart, ___smulsi3_highpart):
	Use a more efficient implementation.
	* config/bfin/bfin.md (umulsi3_highpart, smulsi3_highpart): Emit
	inline sequences when not optimizing for size.

From-SVN: r123748
parent 9d3f9aa3
...@@ -17,6 +17,11 @@ ...@@ -17,6 +17,11 @@
(flag_macinit1hi): Tighten constraints. (flag_macinit1hi): Tighten constraints.
(flag_mul_macv2hi_parts_acconly): New pattern. (flag_mul_macv2hi_parts_acconly): New pattern.
* config/bfin/lib1funcs.asm (___umulsi3_highpart, ___smulsi3_highpart):
Use a more efficient implementation.
* config/bfin/bfin.md (umulsi3_highpart, smulsi3_highpart): Emit
inline sequences when not optimizing for size.
2007-02-11 Jie Zhang <jie.zhang@analog.com> 2007-02-11 Jie Zhang <jie.zhang@analog.com>
* config/bfin/bfin.opt (msim): New option. * config/bfin/bfin.opt (msim): New option.
(mcpu=): New option. (mcpu=): New option.
......
...@@ -1451,42 +1451,102 @@ ...@@ -1451,42 +1451,102 @@
[(set_attr "type" "mult")]) [(set_attr "type" "mult")])
(define_expand "umulsi3_highpart" (define_expand "umulsi3_highpart"
[(set (match_operand:SI 0 "register_operand" "") [(parallel
(truncate:SI [(set (match_operand:SI 0 "register_operand" "")
(lshiftrt:DI (truncate:SI
(mult:DI (zero_extend:DI (lshiftrt:DI
(match_operand:SI 1 "nonimmediate_operand" "")) (mult:DI (zero_extend:DI
(zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" ""))
(match_operand:SI 2 "register_operand" ""))) (zero_extend:DI
(const_int 32))))] (match_operand:SI 2 "register_operand" "")))
"" (const_int 32))))
{ (clobber (reg:PDI REG_A0))
rtx umulsi3_highpart_libfunc (clobber (reg:PDI REG_A1))])]
= init_one_libfunc ("__umulsi3_highpart"); ""
{
if (!optimize_size)
{
rtx a1reg = gen_rtx_REG (PDImode, REG_A1);
rtx a0reg = gen_rtx_REG (PDImode, REG_A0);
emit_insn (gen_flag_macinit1hi (a1reg,
gen_lowpart (HImode, operands[1]),
gen_lowpart (HImode, operands[2]),
GEN_INT (MACFLAG_FU)));
emit_insn (gen_lshrpdi3 (a1reg, a1reg, GEN_INT (16)));
emit_insn (gen_flag_mul_macv2hi_parts_acconly (a0reg, a1reg,
gen_lowpart (V2HImode, operands[1]),
gen_lowpart (V2HImode, operands[2]),
const1_rtx, const1_rtx,
const1_rtx, const0_rtx, a1reg,
const0_rtx, GEN_INT (MACFLAG_FU),
GEN_INT (MACFLAG_FU)));
emit_insn (gen_flag_machi_parts_acconly (a1reg,
gen_lowpart (V2HImode, operands[2]),
gen_lowpart (V2HImode, operands[1]),
const1_rtx, const0_rtx,
a1reg, const0_rtx, GEN_INT (MACFLAG_FU)));
emit_insn (gen_lshrpdi3 (a1reg, a1reg, GEN_INT (16)));
emit_insn (gen_sum_of_accumulators (operands[0], a0reg, a0reg, a1reg));
}
else
{
rtx umulsi3_highpart_libfunc
= init_one_libfunc ("__umulsi3_highpart");
emit_library_call_value (umulsi3_highpart_libfunc, emit_library_call_value (umulsi3_highpart_libfunc,
operands[0], LCT_NORMAL, SImode, operands[0], LCT_NORMAL, SImode,
2, operands[1], SImode, operands[2], SImode); 2, operands[1], SImode, operands[2], SImode);
}
DONE; DONE;
}) })
(define_expand "smulsi3_highpart" (define_expand "smulsi3_highpart"
[(set (match_operand:SI 0 "register_operand" "") [(parallel
(truncate:SI [(set (match_operand:SI 0 "register_operand" "")
(lshiftrt:DI (truncate:SI
(mult:DI (sign_extend:DI (lshiftrt:DI
(match_operand:SI 1 "nonimmediate_operand" "")) (mult:DI (sign_extend:DI
(sign_extend:DI (match_operand:SI 1 "nonimmediate_operand" ""))
(match_operand:SI 2 "register_operand" ""))) (sign_extend:DI
(const_int 32))))] (match_operand:SI 2 "register_operand" "")))
"" (const_int 32))))
{ (clobber (reg:PDI REG_A0))
rtx smulsi3_highpart_libfunc (clobber (reg:PDI REG_A1))])]
= init_one_libfunc ("__smulsi3_highpart"); ""
{
if (!optimize_size)
{
rtx a1reg = gen_rtx_REG (PDImode, REG_A1);
rtx a0reg = gen_rtx_REG (PDImode, REG_A0);
emit_insn (gen_flag_macinit1hi (a1reg,
gen_lowpart (HImode, operands[1]),
gen_lowpart (HImode, operands[2]),
GEN_INT (MACFLAG_FU)));
emit_insn (gen_lshrpdi3 (a1reg, a1reg, GEN_INT (16)));
emit_insn (gen_flag_mul_macv2hi_parts_acconly (a0reg, a1reg,
gen_lowpart (V2HImode, operands[1]),
gen_lowpart (V2HImode, operands[2]),
const1_rtx, const1_rtx,
const1_rtx, const0_rtx, a1reg,
const0_rtx, GEN_INT (MACFLAG_IS),
GEN_INT (MACFLAG_IS_M)));
emit_insn (gen_flag_machi_parts_acconly (a1reg,
gen_lowpart (V2HImode, operands[2]),
gen_lowpart (V2HImode, operands[1]),
const1_rtx, const0_rtx,
a1reg, const0_rtx, GEN_INT (MACFLAG_IS_M)));
emit_insn (gen_ashrpdi3 (a1reg, a1reg, GEN_INT (16)));
emit_insn (gen_sum_of_accumulators (operands[0], a0reg, a0reg, a1reg));
}
else
{
rtx smulsi3_highpart_libfunc
= init_one_libfunc ("__smulsi3_highpart");
emit_library_call_value (smulsi3_highpart_libfunc, emit_library_call_value (smulsi3_highpart_libfunc,
operands[0], LCT_NORMAL, SImode, operands[0], LCT_NORMAL, SImode,
2, operands[1], SImode, operands[2], SImode); 2, operands[1], SImode, operands[2], SImode);
}
DONE; DONE;
}) })
......
...@@ -123,17 +123,12 @@ ___umodsi3: ...@@ -123,17 +123,12 @@ ___umodsi3:
.type ___umulsi3_highpart, STT_FUNC; .type ___umulsi3_highpart, STT_FUNC;
___umulsi3_highpart: ___umulsi3_highpart:
R2 = R1.H * R0.H, R3 = R1.L * R0.H (FU); A1 = R1.L * R0.L (FU);
R0 = R1.L * R0.L, R1 = R1.H * R0.L (FU); A1 = A1 >> 16;
R0 >>= 16; A0 = R1.H * R0.H, A1 += R1.L * R0.H (FU);
/* Unsigned multiplication has the nice property that we can A1 += R0.L * R1.H (FU);
ignore carry on this first addition. */ A1 = A1 >> 16;
R0 = R0 + R3; R0 = (A0 += A1);
R0 = R0 + R1;
cc = ac0;
R1 = cc;
R1 = PACK(R1.l,R0.h);
R0 = R1 + R2;
RTS; RTS;
#endif #endif
...@@ -143,24 +138,11 @@ ___umulsi3_highpart: ...@@ -143,24 +138,11 @@ ___umulsi3_highpart:
.type ___smulsi3_highpart, STT_FUNC; .type ___smulsi3_highpart, STT_FUNC;
___smulsi3_highpart: ___smulsi3_highpart:
R2 = R1.L * R0.L (FU); A1 = R1.L * R0.L (FU);
R3 = R1.H * R0.L (IS,M); A1 = A1 >> 16;
R0 = R0.H * R1.H, R1 = R0.H * R1.L (IS,M); A0 = R0.H * R1.H, A1 += R0.H * R1.L (IS,M);
A1 += R1.H * R0.L (IS,M);
R1.L = R2.H + R1.L; A1 = A1 >>> 16;
cc = ac0; R0 = (A0 += A1);
R2 = cc;
R1.L = R1.L + R3.L;
cc = ac0;
R1 >>>= 16;
R3 >>>= 16;
R1 = R1 + R3;
R1 = R1 + R2;
R2 = cc;
R1 = R1 + R2;
R0 = R0 + R1;
RTS; RTS;
#endif #endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment