lib1funcs.asm (___umulsi3_highpart, __smulsi3_highpart): Use a more efficient implementation.

* config/bfin/lib1funcs.asm (___umulsi3_highpart, __smulsi3_highpart): Use a more efficient implementation.
* config/bfin/bfin.md (umulsi3_highpart, smulsi3_highpart): Emit inline sequences when not optimizing for size.

From-SVN: r123748

lib1funcs.asm (___umulsi3_highpart, __smulsi3_highpart): Use a more efficient implementation.

* config/bfin/lib1funcs.asm (___umulsi3_highpart, __smulsi3_highpart): Use a more efficient implementation.
* config/bfin/bfin.md (umulsi3_highpart, smulsi3_highpart): Emit inline sequences when not optimizing for size.

From-SVN: r123748
3fbee523 · Bernd Schmidt · Bernd Schmidt · 9d3f9aa3 · 3fbee523 · 3fbee523
Commit 3fbee523 authored Apr 12, 2007 by Bernd Schmidt Committed by Bernd Schmidt Apr 12, 2007
Show whitespace changes
Inline Side-by-side

Showing with 79 additions and 32 deletions

gcc/ChangeLog
+5 -0

gcc/config/bfin/bfin.md
+62 -2

gcc/config/bfin/lib1funcs.asm
+12 -30

No files found.
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -17,6 +17,11 @@
 	(flag_macinit1hi): Tighten constraints.
 	(flag_mul_macv2hi_parts_acconly): New pattern.

+	* config/bfin/lib1funcs.asm (___umulsi3_highpart, __smulsi3_highpart):
+	Use a more efficient implementation.
+	* config/bfin/bfin.md (umulsi3_highpart, smulsi3_highpart): Emit
+	inline sequences when not optimizing for size.
+
 	2007-02-11  Jie Zhang  <jie.zhang@analog.com>
 	* config/bfin/bfin.opt (msim): New option.
 	(mcpu=): New option.

--- a/gcc/config/bfin/bfin.md
+++ b/gcc/config/bfin/bfin.md
@@ -1451,6 +1451,7 @@
  [(set_attr "type" "mult")])

 (define_expand "umulsi3_highpart"
+  [(parallel
    [(set (match_operand:SI 0 "register_operand" "")
 	  (truncate:SI
 	   (lshiftrt:DI
@@ -1458,19 +1459,49 @@
 		      (match_operand:SI 1 "nonimmediate_operand" ""))
 		     (zero_extend:DI
 		      (match_operand:SI 2 "register_operand" "")))
-         (const_int 32))))]
+	    (const_int 32))))
+     (clobber (reg:PDI REG_A0))
+     (clobber (reg:PDI REG_A1))])]
  ""
 {
+  if (!optimize_size)
+    {
+      rtx a1reg = gen_rtx_REG (PDImode, REG_A1);
+      rtx a0reg = gen_rtx_REG (PDImode, REG_A0);
+      emit_insn (gen_flag_macinit1hi (a1reg,
+				      gen_lowpart (HImode, operands[1]),
+				      gen_lowpart (HImode, operands[2]),
+				      GEN_INT (MACFLAG_FU)));
+      emit_insn (gen_lshrpdi3 (a1reg, a1reg, GEN_INT (16)));
+      emit_insn (gen_flag_mul_macv2hi_parts_acconly (a0reg, a1reg,
+						     gen_lowpart (V2HImode, operands[1]),
+						     gen_lowpart (V2HImode, operands[2]),
+						     const1_rtx, const1_rtx,
+						     const1_rtx, const0_rtx, a1reg,
+						     const0_rtx, GEN_INT (MACFLAG_FU),
+						     GEN_INT (MACFLAG_FU)));
+      emit_insn (gen_flag_machi_parts_acconly (a1reg,
+					       gen_lowpart (V2HImode, operands[2]),
+					       gen_lowpart (V2HImode, operands[1]),
+					       const1_rtx, const0_rtx,
+					       a1reg, const0_rtx, GEN_INT (MACFLAG_FU)));
+      emit_insn (gen_lshrpdi3 (a1reg, a1reg, GEN_INT (16)));
+      emit_insn (gen_sum_of_accumulators (operands[0], a0reg, a0reg, a1reg));
+    }
+  else
+    {
      rtx umulsi3_highpart_libfunc
 	= init_one_libfunc ("__umulsi3_highpart");

      emit_library_call_value (umulsi3_highpart_libfunc,
 			       operands[0], LCT_NORMAL, SImode,
 			       2, operands[1], SImode, operands[2], SImode);
+    }
  DONE;
 })

 (define_expand "smulsi3_highpart"
+  [(parallel
    [(set (match_operand:SI 0 "register_operand" "")
 	  (truncate:SI
 	   (lshiftrt:DI
@@ -1478,15 +1509,44 @@
 		      (match_operand:SI 1 "nonimmediate_operand" ""))
 		     (sign_extend:DI
 		      (match_operand:SI 2 "register_operand" "")))
-         (const_int 32))))]
+	    (const_int 32))))
+     (clobber (reg:PDI REG_A0))
+     (clobber (reg:PDI REG_A1))])]
  ""
 {
+  if (!optimize_size)
+    {
+      rtx a1reg = gen_rtx_REG (PDImode, REG_A1);
+      rtx a0reg = gen_rtx_REG (PDImode, REG_A0);
+      emit_insn (gen_flag_macinit1hi (a1reg,
+				      gen_lowpart (HImode, operands[1]),
+				      gen_lowpart (HImode, operands[2]),
+				      GEN_INT (MACFLAG_FU)));
+      emit_insn (gen_lshrpdi3 (a1reg, a1reg, GEN_INT (16)));
+      emit_insn (gen_flag_mul_macv2hi_parts_acconly (a0reg, a1reg,
+						     gen_lowpart (V2HImode, operands[1]),
+						     gen_lowpart (V2HImode, operands[2]),
+						     const1_rtx, const1_rtx,
+						     const1_rtx, const0_rtx, a1reg,
+						     const0_rtx, GEN_INT (MACFLAG_IS),
+						     GEN_INT (MACFLAG_IS_M)));
+      emit_insn (gen_flag_machi_parts_acconly (a1reg,
+					       gen_lowpart (V2HImode, operands[2]),
+					       gen_lowpart (V2HImode, operands[1]),
+					       const1_rtx, const0_rtx,
+					       a1reg, const0_rtx, GEN_INT (MACFLAG_IS_M)));
+      emit_insn (gen_ashrpdi3 (a1reg, a1reg, GEN_INT (16)));
+      emit_insn (gen_sum_of_accumulators (operands[0], a0reg, a0reg, a1reg));
+    }
+  else
+    {
      rtx smulsi3_highpart_libfunc
 	= init_one_libfunc ("__smulsi3_highpart");

      emit_library_call_value (smulsi3_highpart_libfunc,
 			       operands[0], LCT_NORMAL, SImode,
 			       2, operands[1], SImode, operands[2], SImode);
+    }
  DONE;
 })


--- a/gcc/config/bfin/lib1funcs.asm
+++ b/gcc/config/bfin/lib1funcs.asm
@@ -123,17 +123,12 @@ ___umodsi3:
 .type ___umulsi3_highpart, STT_FUNC;

 ___umulsi3_highpart:
-	R2 = R1.H * R0.H, R3 = R1.L * R0.H (FU);
-	R0 = R1.L * R0.L, R1 = R1.H * R0.L (FU);
-	R0 >>= 16;
-	/* Unsigned multiplication has the nice property that we can
-	   ignore carry on this first addition.  */
-	R0 = R0 + R3;
-	R0 = R0 + R1;
-	cc = ac0;
-	R1 = cc;
-	R1 = PACK(R1.l,R0.h);
-	R0 = R1 + R2;
+	A1 = R1.L * R0.L (FU);
+	A1 = A1 >> 16;
+	A0 = R1.H * R0.H, A1 += R1.L * R0.H (FU);
+	A1 += R0.L * R1.H (FU);
+	A1 = A1 >> 16;
+	R0 = (A0 += A1);
 	RTS;
 #endif

@@ -143,24 +138,11 @@ ___umulsi3_highpart:
 .type ___smulsi3_highpart, STT_FUNC;

 ___smulsi3_highpart:
-	R2 = R1.L * R0.L (FU);
-	R3 = R1.H * R0.L (IS,M);
-	R0 = R0.H * R1.H, R1 = R0.H * R1.L (IS,M);
-
-	R1.L = R2.H + R1.L;
-	cc = ac0;
-	R2 = cc;
-
-	R1.L = R1.L + R3.L;
-	cc = ac0;
-	R1 >>>= 16;
-	R3 >>>= 16;
-	R1 = R1 + R3;
-	R1 = R1 + R2;
-	R2 = cc;
-	R1 = R1 + R2;
-
-	R0 = R0 + R1;
+	A1 = R1.L * R0.L (FU);
+	A1 = A1 >> 16;
+	A0 = R0.H * R1.H, A1 += R0.H * R1.L (IS,M);
+	A1 += R1.H * R0.L (IS,M);
+	A1 = A1 >>> 16;
+	R0 = (A0 += A1);
 	RTS;
 #endif
-