arm/lib1funcs.asm (__divsi3, __modsi3, __udivsi3, __umodsi3): Replace

with smaller, faster versions. From-SVN: r11070

arm/lib1funcs.asm (divsi3, modsi3, udivsi3, umodsi3): Replace
with smaller, faster versions. From-SVN: r11070
bd28bf5a · Richard Earnshaw · b920730a · bd28bf5a
Commit bd28bf5a authored Jan 19, 1996 by Richard Earnshaw
Show whitespace changes
Inline Side-by-side

Showing with 288 additions and 1504 deletions

gcc/config/arm/lib1funcs.asm
+288 -1504

No files found.
--- a/gcc/config/arm/lib1funcs.asm
+++ b/gcc/config/arm/lib1funcs.asm
 @ libgcc1 routines for ARM cpu.
-@ Division and remainder, from Appendix E of the Sparc Version 8
+@ Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
-@ Architecture Manual, with fixes from Gordon Irlam.
-@ Rewritten for the ARM by Richard Earnshaw (rwe@pegasus.esprit.ec.org)
-/* Copyright (C) 1995 Free Software Foundation, Inc.
+/* Copyright (C) 1995, 1996 Free Software Foundation, Inc.
 This file is free software; you can redistribute it and/or modify it
 under the terms of the GNU General Public License as published by the
@@ -35,243 +33,13 @@ Boston, MA 02111-1307, USA.  */
   This exception does not however invalidate any other reasons why
   the executable file might be covered by the GNU General Public License.  */
-/*
- * Input: dividend and divisor in r0 and r1 respectively.
- *
- * m4 parameters:
- *  NAME	name of function to generate
- *  OP		OP=div => r0 / r1; OP=mod => r0 % r1
- *  S		S=true => signed; S=false => unsigned
- *
- * Algorithm parameters:
- *  N		how many bits per iteration we try to get (4)
- *  WORDSIZE	total number of bits (32)
- *
- * Derived constants:
- *  TOPBITS	number of bits in the top `decade' of a number
- *
- * Important variables:
- *  Q		the partial quotient under development (initially 0)
- *  R		the remainder so far, initially the dividend
- *  ITER	number of main division loop iterations required;
- *		equal to ceil(log2(quotient) / N).  Note that this
- *		is the log base (2^N) of the quotient.
- *  V		the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- *  Current estimate for non-large dividend is
- *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
- *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
- *  different path, as the upper bits of the quotient must be developed
- *  one bit at a time.
- */
-/*
-define(N, `4')dnl
-define(WORDSIZE, `32')dnl
-define(TOPBITS, eval(WORDSIZE - N*((WORDSIZE-1)/N)))dnl
-dnl
-define(dividend, `r0')dnl
-define(divisor, `r1')dnl
-define(Q, `r2')dnl
-define(R, `r3')dnl
-define(ITER, `ip')dnl
-define(V, `lr')dnl
-dnl
-dnl m4 reminder: ifelse(a,b,c,d) => if a is b, then c, else d
-define(T, `r4')dnl
-define(SC, `r5')dnl
-ifelse(S, `true', `define(SIGN, `r6')')dnl
-define(REGLIST, `ifelse(S, `true', `{r4, r5, r6,', `{r4, r5,')')dnl
-define(ret, `ldmia	sp!, REGLIST pc}')dnl
-dnl
-dnl This is the recursive definition for developing quotient digits.
-dnl
-dnl Parameters:
-dnl  $1	the current depth, 1 <= $1 <= N
-dnl  $2	the current accumulation of quotient bits
-dnl  N	max depth
-dnl
-dnl We add a new bit to $2 and either recurse or insert the bits in
-dnl the quotient.  R, Q, and V are inputs and outputs as defined above;
-dnl the condition codes are expected to reflect the input R, and are
-dnl modified to reflect the output R.
-dnl
-define(DEVELOP_QUOTIENT_BITS,
-`	@ depth $1, accumulated bits $2
-	mov	V, V, lsr #1
-	blt	L.$1.eval(2^N+$2+999)
-	@ remainder is positive
-	subs	R, R, V
-	ifelse($1, N,
-	`	ifelse(eval(2*$2+1<0), `0',
-		`add	Q, Q, `#'eval($2*2+1)',
-		`sub	Q, Q, `#'eval(-($2*2+1))')
-		b	9f
-	', `	DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2+1)')')
-L.$1.eval(2^N+$2+999):
-	@ remainder is negative
-	adds	R, R, V
-	ifelse($1, N,
-	`	ifelse(eval(2*$2-1<0), `0',
-		`add	Q, Q, `#'eval($2*2-1)',
-		`sub	Q, Q, `#'eval(-($2*2-1))')
-		b	9f
-	', `	DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2-1)')')
-	ifelse($1, 1, `9:')')dnl
-#include "trap.h"
-ip	.req	r12
-sp	.req	r13
-lr	.req	r14
-pc	.req	r15
-.text
-	.globl NAME
-	.align 0
-NAME:
-	stmdb	sp!, REGLIST lr}
-ifelse(S, `true',
-`	@ compute sign of result; if neither is negative, no problem
-	ifelse(OP, `div', `eor	SIGN, divisor, dividend	@ compute sign',
-		`mov	SIGN, dividend')
-	cmp	divisor, #0
-	rsbmi	divisor, divisor, #0
-	beq	Ldiv_zero
-	mov	V, divisor
-	movs	R, dividend
-	rsbmi	R, R, #0	@ make dividend nonnegative
-',
-`	@ Ready to divide.  Compute size of quotient; scale comparand.
-	movs	V, divisor
-	mov	R, dividend
-	beq	Ldiv_zero
-')
-	cmp	R, V			@ if divisor exceeds dividend, done
-	mov	Q, #0
-	bcc	Lgot_result		@ (and algorithm fails otherwise)
-	mov	T, `#'(1 << (WORDSIZE - TOPBITS - 1))
-	cmp	R, T
-	mov	ITER, #0
-	bcc	Lnot_really_big
-	@ `Here the dividend is >= 2^(31-N) or so.  We must be careful here,
-	@ as our usual N-at-a-shot divide step will cause overflow and havoc.
-	@ The number of bits in the result here is N*ITER+SC, where SC <= N.
-	@ Compute ITER in an unorthodox manner: know we need to shift V into
-	@ the top decade: so do not even bother to compare to R.'
-		mov	SC, #1
-	1:
-		cmp	V, T
-		bcs	3f
-		mov	V, V, lsl `#'N
-		add	ITER, ITER, #1
-		b	1b
-	@ Now compute SC.
-	2:	adds	V, V, V
-		add	SC, SC, #1
-		bcc	Lnot_too_big
-		@ We get here if the divisor overflowed while shifting.
-		@ This means that R has the high-order bit set.
-		@ Restore V and subtract from R.
-		mov	T, T, lsl `#'TOPBITS
-		mov	V, V, lsr #1
-		add	V, T, V
-		sub	SC, SC, #1
-		b	Ldo_single_div
-	Lnot_too_big:
-	3:	cmp	V, R
-		bcc	2b
-@		beq	Ldo_single_div
-	/-* NB: these are commented out in the V8-Sparc manual as well *-/
-	/-* (I do not understand this) *-/
-	@ V > R: went too far: back up 1 step
-	@	srl	V, 1, V
-	@	dec	SC
-	@ do single-bit divide steps
-	@
-	@ We have to be careful here.  We know that R >= V, so we can do the
-	@ first divide step without thinking.  BUT, the others are conditional,
-	@ and are only done if R >= 0.  Because both R and V may have the high-
-	@ order bit set in the first step, just falling into the regular
-	@ division loop will mess up the first time around.
-	@ So we unroll slightly...
-	Ldo_single_div:
-		subs	SC, SC, #1
-		blt	Lend_regular_divide
-		sub	R, R, V
-		mov	Q, #1
-		b	Lend_single_divloop
-	Lsingle_divloop:
-		cmp	R, #0
-		mov	Q, Q, lsl #1
-		mov	V, V, lsr #1
-		@ R >= 0
-		subpl	R, R, V
-		addpl	Q, Q, #1
-		@ R < 0
-		addmi	R, R, V
-		submi	Q, Q, #1
-	Lend_single_divloop:
-		subs	SC, SC, #1
-		bge	Lsingle_divloop
-		b	Lend_regular_divide
-1:
-	add	ITER, ITER, #1
-Lnot_really_big:
-	mov	V, V, lsl `#'N
-	cmp	V, R
-	bls	1b
-	@
-	@	HOW CAN ITER EVER BE -1 HERE ?????
-	@
-	cmn	ITER, #1
-	beq	Lgot_result
-Ldivloop:
-	cmp	R, #0	@ set up for initial iteration
-	mov	Q, Q, lsl `#'N
-	DEVELOP_QUOTIENT_BITS(1, 0)
-Lend_regular_divide:
-	subs	ITER, ITER, #1
-	bge	Ldivloop
-	cmp	R, #0
-	@ non-restoring fixup here (one instruction only!)
-ifelse(OP, `div',
-`	sublt	Q, Q, #1
-', `	addlt	R, divisor, R
-')
-Lgot_result:
-ifelse(S, `true',
-`	@ check to see if answer should be < 0
-	cmp	SIGN, #0
-	ifelse(OP, `div', `rsbmi Q, Q, #0', `rsbmi R, R, #0')
-')
-	ifelse(OP, `div', `mov r0, Q', `mov r0, R')
-	ret
-Ldiv_zero:
-	@ Divide by zero trap.  If it returns, return 0 (about as
-	@ wrong as possible, but that is what SunOS does...).
-	bl	___div0
-	mov	r0, #0
-	ret
-*/
 #ifdef __APCS_26__
 #define RET	movs
+#define RETc(x)	mov##x##s
 #define RETCOND ^
 #else
 #define RET	mov
+#define RETc(x)	mov##x
 #define RETCOND
 #endif
@@ -290,1323 +58,339 @@ Ldiv_zero:
 #ifdef L_udivsi3
+dividend	.req	r0
+divisor		.req	r1
+result		.req	r2
+curbit		.req	r3
 ip		.req	r12
 sp		.req	r13
 lr		.req	r14
 pc		.req	r15
-.text
+	.text
 	.globl SYM (__udivsi3)
 	.align 0
-SYM (__udivsi3):
-	stmdb	sp!, {r4, r5, lr}
-	@ Ready to divide.  Compute size of quotient; scale comparand.
-	movs	lr, r1
-	mov	r3, r0
-	beq	Ldiv_zero
-	cmp	r3, lr			@ if r1 exceeds r0, done
-	mov	r2, #0
-	bcc	Lgot_result		@ (and algorithm fails otherwise)
-	mov	r4, #(1 << (32 - 4 - 1))
-	cmp	r3, r4
-	mov	ip, #0
-	bcc	Lnot_really_big
-	@ Here the dividend is >= 2^(31-N) or so.  We must be careful here,
-	@ as our usual N-at-a-shot divide step will cause overflow and havoc.
-	@ The number of bits in the result here is N*ITER+SC, where SC <= N.
-	@ Compute ITER in an unorthodox manner: know we need to shift V into
-	@ the top decade: so do not even bother to compare to R.
-		mov	r5, #1
-	1:
-		cmp	lr, r4
-		bcs	3f
-		mov	lr, lr, lsl #4
-		add	ip, ip, #1
-		b	1b
-	@ Now compute r5.
-	2:	adds	lr, lr, lr
-		add	r5, r5, #1
-		bcc	Lnot_too_big
-		@ We get here if the r1 overflowed while shifting.
-		@ This means that r3 has the high-order bit set.
-		@ Restore lr and subtract from r3.
-		mov	r4, r4, lsl #4
-		mov	lr, lr, lsr #1
-		add	lr, r4, lr
-		sub	r5, r5, #1
-		b	Ldo_single_div
-	Lnot_too_big:
-	3:	cmp	lr, r3
-		bcc	2b
-@		beq	Ldo_single_div
-	/* NB: these are commented out in the V8-Sparc manual as well */
-	/* (I do not understand this) */
-	@ lr > r3: went too far: back up 1 step
-	@	srl	lr, 1, lr
-	@	dec	r5
-	@ do single-bit divide steps
-	@
-	@ We have to be careful here.  We know that r3 >= lr, so we can do the
-	@ first divide step without thinking.  BUT, the others are conditional,
-	@ and are only done if r3 >= 0.  Because both r3 and lr may have the high-
-	@ order bit set in the first step, just falling into the regular
-	@ division loop will mess up the first time around.
-	@ So we unroll slightly...
-	Ldo_single_div:
-		subs	r5, r5, #1
-		blt	Lend_regular_divide
-		sub	r3, r3, lr
-		mov	r2, #1
-		b	Lend_single_divloop
-	Lsingle_divloop:
-		cmp	r3, #0
-		mov	r2, r2, lsl #1
-		mov	lr, lr, lsr #1
-		@ r3 >= 0
-		subpl	r3, r3, lr
-		addpl	r2, r2, #1
-		@ r3 < 0
-		addmi	r3, r3, lr
-		submi	r2, r2, #1
-	Lend_single_divloop:
-		subs	r5, r5, #1
-		bge	Lsingle_divloop
-		b	Lend_regular_divide
-1:
-	add	ip, ip, #1
-Lnot_really_big:
-	mov	lr, lr, lsl #4
-	cmp	lr, r3
-	bls	1b
-	@
-	@	HOW CAN ip EVER BE -1 HERE ?????
-	@
-	cmn	ip, #1
-	beq	Lgot_result
-Ldivloop:
-	cmp	r3, #0	@ set up for initial iteration
-	mov	r2, r2, lsl #4
-		@ depth 1, accumulated bits 0
-	mov	lr, lr, lsr #1
-	blt	L.1.1015
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 2, accumulated bits 1
-	mov	lr, lr, lsr #1
-	blt	L.2.1016
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 3, accumulated bits 3
-	mov	lr, lr, lsr #1
-	blt	L.3.1018
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits 7
-	mov	lr, lr, lsr #1
-	blt	L.4.1022
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #15
-		b	9f
-L.4.1022:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #13
-		b	9f
-L.3.1018:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits 5
-	mov	lr, lr, lsr #1
-	blt	L.4.1020
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #11
-		b	9f
-L.4.1020:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #9
-		b	9f
-L.2.1016:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 3, accumulated bits 1
-	mov	lr, lr, lsr #1
-	blt	L.3.1016
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits 3
-	mov	lr, lr, lsr #1
-	blt	L.4.1018
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #7
-		b	9f
-L.4.1018:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #5
-		b	9f
-L.3.1016:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits 1
-	mov	lr, lr, lsr #1
-	blt	L.4.1016
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #3
-		b	9f
-L.4.1016:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #1
-		b	9f
-L.1.1015:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 2, accumulated bits -1
-	mov	lr, lr, lsr #1
-	blt	L.2.1014
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 3, accumulated bits -1
-	mov	lr, lr, lsr #1
-	blt	L.3.1014
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits -1
-	mov	lr, lr, lsr #1
-	blt	L.4.1014
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #1
-		b	9f
-L.4.1014:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #3
-		b	9f
-L.3.1014:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits -3
-	mov	lr, lr, lsr #1
-	blt	L.4.1012
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #5
-		b	9f
-L.4.1012:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #7
-		b	9f
-L.2.1014:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 3, accumulated bits -3
-	mov	lr, lr, lsr #1
-	blt	L.3.1012
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits -5
-	mov	lr, lr, lsr #1
-	blt	L.4.1010
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #9
-		b	9f
-L.4.1010:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #11
-		b	9f
-L.3.1012:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits -7
-	mov	lr, lr, lsr #1
-	blt	L.4.1008
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #13
-		b	9f
-L.4.1008:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #15
-		b	9f
-	9:
-Lend_regular_divide:
-	subs	ip, ip, #1
-	bge	Ldivloop
-	cmp	r3, #0
-	@ non-restoring fixup here (one instruction only!)
-	sublt	r2, r2, #1
+SYM (__udivsi3):
+	cmp	divisor, #0
+	beq	Ldiv0
+	mov	curbit, #1
+	mov	result, #0
+	cmp	dividend, divisor
+	bcc	Lgot_result
+Loop1:
+	@ Unless the divisor is very big, shift it up in multiples of
+	@ four bits, since this is the amount of unwinding in the main
+	@ division loop.  Continue shifting until the divisor is 
+	@ larger than the dividend.
+	cmp	divisor, #0x10000000
+	cmpcc	divisor, dividend
+	movcc	divisor, divisor, lsl #4
+	movcc	curbit, curbit, lsl #4
+	bcc	Loop1
+Lbignum:
+	@ For very big divisors, we must shift it a bit at a time, or
+	@ we will be in danger of overflowing.
+	cmp	divisor, #0x80000000
+	cmpcc	divisor, dividend
+	movcc	divisor, divisor, lsl #1
+	movcc	curbit, curbit, lsl #1
+	bcc	Lbignum
+Loop3:
+	@ Test for possible subtractions, and note which bits
+	@ are done in the result.  On the final pass, this may subtract
+	@ too much from the dividend, but the result will be ok, since the
+	@ "bit" will have been shifted out at the bottom.
+	cmp	dividend, divisor
+	subcs	dividend, dividend, divisor
+	orrcs	result, result, curbit
+	cmp	dividend, divisor, lsr #1
+	subcs	dividend, dividend, divisor, lsr #1
+	orrcs	result, result, curbit, lsr #1
+	cmp	dividend, divisor, lsr #2
+	subcs	dividend, dividend, divisor, lsr #2
+	orrcs	result, result, curbit, lsr #2
+	cmp	dividend, divisor, lsr #3
+	subcs	dividend, dividend, divisor, lsr #3
+	orrcs	result, result, curbit, lsr #3
+	cmp	dividend, #0			@ Early termination?
+	movnes	curbit, curbit, lsr #4		@ No, any more bits to do?
+	movne	divisor, divisor, lsr #4
+	bne	Loop3
 Lgot_result:
+	mov	r0, result
+	RET	pc, lr
-	mov r0, r2
+Ldiv0:
-	ldmia	sp!, {r4, r5, pc}RETCOND
+	str	lr, [sp, #-4]!
-Ldiv_zero:
-	@ Divide by zero trap.  If it returns, return 0 (about as
-	@ wrong as possible, but that is what SunOS does...).
 	bl	SYM (__div0)
-	mov	r0, #0
+	mov	r0, #0			@ about as wrong as it could be
-	ldmia	sp!, {r4, r5, pc}RETCOND
+	ldmia	sp!, {pc}RETCOND
 #endif /* L_udivsi3 */
-#ifdef L_divsi3
+#ifdef L_umodsi3
+dividend	.req	r0
+divisor		.req	r1
+overdone	.req	r2
+curbit		.req	r3
 ip		.req	r12
 sp		.req	r13
 lr		.req	r14
 pc		.req	r15
-.text
+	.text
-	.globl SYM (__divsi3)
+	.globl SYM (__umodsi3)
 	.align 0
-SYM (__divsi3):
-	stmdb	sp!, {r4, r5, r6, lr}
-	@ compute sign of result; if neither is negative, no problem
-	eor	r6, r1, r0	@ compute sign
-	cmp	r1, #0
-	rsbmi	r1, r1, #0
-	beq	Ldiv_zero
-	mov	lr, r1
-	movs	r3, r0
-	rsbmi	r3, r3, #0	@ make dividend nonnegative
-	cmp	r3, lr			@ if r1 exceeds r0, done
-	mov	r2, #0
-	bcc	Lgot_result		@ (and algorithm fails otherwise)
-	mov	r4, #(1 << (32 - 4 - 1))
-	cmp	r3, r4
-	mov	ip, #0
-	bcc	Lnot_really_big
-	@ Here the dividend is >= 2^(31-N) or so.  We must be careful here,
-	@ as our usual N-at-a-shot divide step will cause overflow and havoc.
-	@ The number of bits in the result here is N*ITER+SC, where SC <= N.
-	@ Compute ITER in an unorthodox manner: know we need to shift V into
-	@ the top decade: so do not even bother to compare to R.
-		mov	r5, #1
-	1:
-		cmp	lr, r4
-		bcs	3f
-		mov	lr, lr, lsl #4
-		add	ip, ip, #1
-		b	1b
-	@ Now compute r5.
-	2:	adds	lr, lr, lr
-		add	r5, r5, #1
-		bcc	Lnot_too_big
-		@ We get here if the r1 overflowed while shifting.
-		@ This means that r3 has the high-order bit set.
-		@ Restore lr and subtract from r3.
-		mov	r4, r4, lsl #4
-		mov	lr, lr, lsr #1
-		add	lr, r4, lr
-		sub	r5, r5, #1
-		b	Ldo_single_div
-	Lnot_too_big:
-	3:	cmp	lr, r3
-		bcc	2b
-@		beq	Ldo_single_div
-	/* NB: these are commented out in the V8-Sparc manual as well */
-	/* (I do not understand this) */
-	@ lr > r3: went too far: back up 1 step
-	@	srl	lr, 1, lr
-	@	dec	r5
-	@ do single-bit divide steps
-	@
-	@ We have to be careful here.  We know that r3 >= lr, so we can do the
-	@ first divide step without thinking.  BUT, the others are conditional,
-	@ and are only done if r3 >= 0.  Because both r3 and lr may have the high-
-	@ order bit set in the first step, just falling into the regular
-	@ division loop will mess up the first time around.
-	@ So we unroll slightly...
-	Ldo_single_div:
-		subs	r5, r5, #1
-		blt	Lend_regular_divide
-		sub	r3, r3, lr
-		mov	r2, #1
-		b	Lend_single_divloop
-	Lsingle_divloop:
-		cmp	r3, #0
-		mov	r2, r2, lsl #1
-		mov	lr, lr, lsr #1
-		@ r3 >= 0
-		subpl	r3, r3, lr
-		addpl	r2, r2, #1
-		@ r3 < 0
-		addmi	r3, r3, lr
-		submi	r2, r2, #1
-	Lend_single_divloop:
-		subs	r5, r5, #1
-		bge	Lsingle_divloop
-		b	Lend_regular_divide
-1:
-	add	ip, ip, #1
-Lnot_really_big:
-	mov	lr, lr, lsl #4
-	cmp	lr, r3
-	bls	1b
-	@
-	@	HOW CAN ip EVER BE -1 HERE ?????
-	@
-	cmn	ip, #1
-	beq	Lgot_result
-Ldivloop:
-	cmp	r3, #0	@ set up for initial iteration
-	mov	r2, r2, lsl #4
-		@ depth 1, accumulated bits 0
-	mov	lr, lr, lsr #1
-	blt	L.1.1015
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 2, accumulated bits 1
-	mov	lr, lr, lsr #1
-	blt	L.2.1016
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 3, accumulated bits 3
-	mov	lr, lr, lsr #1
-	blt	L.3.1018
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits 7
-	mov	lr, lr, lsr #1
-	blt	L.4.1022
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #15
-		b	9f
-L.4.1022:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #13
-		b	9f
-L.3.1018:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits 5
-	mov	lr, lr, lsr #1
-	blt	L.4.1020
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #11
-		b	9f
-L.4.1020:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #9
-		b	9f
-L.2.1016:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 3, accumulated bits 1
-	mov	lr, lr, lsr #1
-	blt	L.3.1016
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits 3
-	mov	lr, lr, lsr #1
-	blt	L.4.1018
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #7
-		b	9f
-L.4.1018:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #5
-		b	9f
-L.3.1016:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits 1
-	mov	lr, lr, lsr #1
-	blt	L.4.1016
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #3
-		b	9f
-L.4.1016:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #1
-		b	9f
-L.1.1015:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 2, accumulated bits -1
-	mov	lr, lr, lsr #1
-	blt	L.2.1014
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 3, accumulated bits -1
-	mov	lr, lr, lsr #1
-	blt	L.3.1014
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits -1
-	mov	lr, lr, lsr #1
-	blt	L.4.1014
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #1
-		b	9f
-L.4.1014:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #3
-		b	9f
-L.3.1014:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits -3
-	mov	lr, lr, lsr #1
-	blt	L.4.1012
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #5
-		b	9f
-L.4.1012:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #7
-		b	9f
-L.2.1014:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 3, accumulated bits -3
-	mov	lr, lr, lsr #1
-	blt	L.3.1012
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits -5
-	mov	lr, lr, lsr #1
-	blt	L.4.1010
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #9
-		b	9f
-L.4.1010:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #11
-		b	9f
-L.3.1012:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits -7
-	mov	lr, lr, lsr #1
-	blt	L.4.1008
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #13
-		b	9f
-L.4.1008:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #15
-		b	9f
+SYM (__umodsi3):
+	cmp	divisor, #0
+	beq	Ldiv0
+	mov	curbit, #1
+	cmp	dividend, divisor
+	RETc(cc)	pc, lr
+Loop1:
+	@ Unless the divisor is very big, shift it up in multiples of
+	@ four bits, since this is the amount of unwinding in the main
+	@ division loop.  Continue shifting until the divisor is 
+	@ larger than the dividend.
+	cmp	divisor, #0x10000000
+	cmpcc	divisor, dividend
+	movcc	divisor, divisor, lsl #4
+	movcc	curbit, curbit, lsl #4
+	bcc	Loop1
+Lbignum:
+	@ For very big divisors, we must shift it a bit at a time, or
+	@ we will be in danger of overflowing.
+	cmp	divisor, #0x80000000
+	cmpcc	divisor, dividend
+	movcc	divisor, divisor, lsl #1
+	movcc	curbit, curbit, lsl #1
+	bcc	Lbignum
+Loop3:
+	@ Test for possible subtractions.  On the final pass, this may 
+	@ subtract too much from the dividend, so keep track of which
+	@ subtractions are done, we can fix them up afterwards...
+	mov	overdone, #0
+	cmp	dividend, divisor
+	subcs	dividend, dividend, divisor
+	cmp	dividend, divisor, lsr #1
+	subcs	dividend, dividend, divisor, lsr #1
+	orrcs	overdone, overdone, curbit, ror #1
+	cmp	dividend, divisor, lsr #2
+	subcs	dividend, dividend, divisor, lsr #2
+	orrcs	overdone, overdone, curbit, ror #2
+	cmp	dividend, divisor, lsr #3
+	subcs	dividend, dividend, divisor, lsr #3
+	orrcs	overdone, overdone, curbit, ror #3
+	mov	ip, curbit
+	cmp	dividend, #0			@ Early termination?
+	movnes	curbit, curbit, lsr #4		@ No, any more bits to do?
+	movne	divisor, divisor, lsr #4
+	bne	Loop3
+	@ Any subtractions that we should not have done will be recorded in
+	@ the top three bits of "overdone".  Exactly which were not needed
+	@ are governed by the position of the bit, stored in ip.
+	@ If we terminated early, because dividend became zero,
+	@ then none of the below will match, since the bit in ip will not be
+	@ in the bottom nibble.
+	ands	overdone, overdone, #0xe0000000
+	RETc(eq)	pc, lr				@ No fixups needed
+	tst	overdone, ip, ror #3
+	addne	dividend, dividend, divisor, lsr #3
+	tst	overdone, ip, ror #2
+	addne	dividend, dividend, divisor, lsr #2
+	tst	overdone, ip, ror #1
+	addne	dividend, dividend, divisor, lsr #1
+	RET	pc, lr
+Ldiv0:
+	str	lr, [sp, #-4]!
-	9:
-Lend_regular_divide:
-	subs	ip, ip, #1
-	bge	Ldivloop
-	cmp	r3, #0
-	@ non-restoring fixup here (one instruction only!)
-	sublt	r2, r2, #1
-Lgot_result:
-	@ check to see if answer should be < 0
-	cmp	r6, #0
-	rsbmi r2, r2, #0
-	mov r0, r2
-	ldmia	sp!, {r4, r5, r6, pc}RETCOND
-Ldiv_zero:
-	@ Divide by zero trap.  If it returns, return 0 (about as
-	@ wrong as possible, but that is what SunOS does...).
 	bl	SYM (__div0)
-	mov	r0, #0
+	mov	r0, #0			@ about as wrong as it could be
-	ldmia	sp!, {r4, r5, r6, pc}RETCOND
+	ldmia	sp!, {pc}RETCOND
-#endif /* L_divsi3 */
+#endif /* L_umodsi3 */
-#ifdef L_umodsi3
+#ifdef L_divsi3
+dividend	.req	r0
+divisor		.req	r1
+result		.req	r2
+curbit		.req	r3
 ip		.req	r12
 sp		.req	r13
 lr		.req	r14
 pc		.req	r15
-.text
+	.text
-	.globl SYM (__umodsi3)
+	.globl SYM (__divsi3)
 	.align 0
-SYM (__umodsi3):
-	stmdb	sp!, {r4, r5, lr}
-	@ Ready to divide.  Compute size of quotient; scale comparand.
-	movs	lr, r1
-	mov	r3, r0
-	beq	Ldiv_zero
-	cmp	r3, lr			@ if r1 exceeds r0, done
-	mov	r2, #0
-	bcc	Lgot_result		@ (and algorithm fails otherwise)
-	mov	r4, #(1 << (32 - 4 - 1))
-	cmp	r3, r4
-	mov	ip, #0
-	bcc	Lnot_really_big
-	@ Here the dividend is >= 2^(31-N) or so.  We must be careful here,
-	@ as our usual N-at-a-shot divide step will cause overflow and havoc.
-	@ The number of bits in the result here is N*ITER+SC, where SC <= N.
-	@ Compute ITER in an unorthodox manner: know we need to shift V into
-	@ the top decade: so do not even bother to compare to R.
-		mov	r5, #1
-	1:
-		cmp	lr, r4
-		bcs	3f
-		mov	lr, lr, lsl #4
-		add	ip, ip, #1
-		b	1b
-	@ Now compute r5.
-	2:	adds	lr, lr, lr
-		add	r5, r5, #1
-		bcc	Lnot_too_big
-		@ We get here if the r1 overflowed while shifting.
-		@ This means that r3 has the high-order bit set.
-		@ Restore lr and subtract from r3.
-		mov	r4, r4, lsl #4
-		mov	lr, lr, lsr #1
-		add	lr, r4, lr
-		sub	r5, r5, #1
-		b	Ldo_single_div
-	Lnot_too_big:
-	3:	cmp	lr, r3
-		bcc	2b
-@		beq	Ldo_single_div
-	/* NB: these are commented out in the V8-Sparc manual as well */
-	/* (I do not understand this) */
-	@ lr > r3: went too far: back up 1 step
-	@	srl	lr, 1, lr
-	@	dec	r5
-	@ do single-bit divide steps
-	@
-	@ We have to be careful here.  We know that r3 >= lr, so we can do the
-	@ first divide step without thinking.  BUT, the others are conditional,
-	@ and are only done if r3 >= 0.  Because both r3 and lr may have the high-
-	@ order bit set in the first step, just falling into the regular
-	@ division loop will mess up the first time around.
-	@ So we unroll slightly...
-	Ldo_single_div:
-		subs	r5, r5, #1
-		blt	Lend_regular_divide
-		sub	r3, r3, lr
-		mov	r2, #1
-		b	Lend_single_divloop
-	Lsingle_divloop:
-		cmp	r3, #0
-		mov	r2, r2, lsl #1
-		mov	lr, lr, lsr #1
-		@ r3 >= 0
-		subpl	r3, r3, lr
-		addpl	r2, r2, #1
-		@ r3 < 0
-		addmi	r3, r3, lr
-		submi	r2, r2, #1
-	Lend_single_divloop:
-		subs	r5, r5, #1
-		bge	Lsingle_divloop
-		b	Lend_regular_divide
-1:
-	add	ip, ip, #1
-Lnot_really_big:
-	mov	lr, lr, lsl #4
-	cmp	lr, r3
-	bls	1b
-	@
-	@	HOW CAN ip EVER BE -1 HERE ?????
-	@
-	cmn	ip, #1
-	beq	Lgot_result
-Ldivloop:
-	cmp	r3, #0	@ set up for initial iteration
-	mov	r2, r2, lsl #4
-		@ depth 1, accumulated bits 0
-	mov	lr, lr, lsr #1
-	blt	L.1.1015
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 2, accumulated bits 1
-	mov	lr, lr, lsr #1
-	blt	L.2.1016
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 3, accumulated bits 3
-	mov	lr, lr, lsr #1
-	blt	L.3.1018
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits 7
-	mov	lr, lr, lsr #1
-	blt	L.4.1022
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #15
-		b	9f
-L.4.1022:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #13
-		b	9f
-L.3.1018:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits 5
-	mov	lr, lr, lsr #1
-	blt	L.4.1020
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #11
-		b	9f
-L.4.1020:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #9
-		b	9f
-L.2.1016:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 3, accumulated bits 1
-	mov	lr, lr, lsr #1
-	blt	L.3.1016
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits 3
-	mov	lr, lr, lsr #1
-	blt	L.4.1018
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #7
-		b	9f
-L.4.1018:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #5
-		b	9f
-L.3.1016:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits 1
-	mov	lr, lr, lsr #1
-	blt	L.4.1016
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #3
-		b	9f
-L.4.1016:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #1
-		b	9f
-L.1.1015:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 2, accumulated bits -1
-	mov	lr, lr, lsr #1
-	blt	L.2.1014
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 3, accumulated bits -1
-	mov	lr, lr, lsr #1
-	blt	L.3.1014
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits -1
-	mov	lr, lr, lsr #1
-	blt	L.4.1014
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #1
-		b	9f
-L.4.1014:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #3
-		b	9f
-L.3.1014:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits -3
-	mov	lr, lr, lsr #1
-	blt	L.4.1012
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #5
-		b	9f
-L.4.1012:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #7
-		b	9f
-L.2.1014:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 3, accumulated bits -3
-	mov	lr, lr, lsr #1
-	blt	L.3.1012
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits -5
-	mov	lr, lr, lsr #1
-	blt	L.4.1010
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #9
-		b	9f
-L.4.1010:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #11
-		b	9f
-L.3.1012:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits -7
-	mov	lr, lr, lsr #1
-	blt	L.4.1008
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #13
-		b	9f
-L.4.1008:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #15
-		b	9f
-	9:
-Lend_regular_divide:
-	subs	ip, ip, #1
-	bge	Ldivloop
-	cmp	r3, #0
-	@ non-restoring fixup here (one instruction only!)
-	addlt	r3, r1, r3
+SYM (__divsi3):
+	eor	ip, dividend, divisor		@ Save the sign of the result.
+	mov	curbit, #1
+	mov	result, #0
+	cmp	divisor, #0
+	rsbmi	divisor, divisor, #0		@ Loops below use unsigned.
+	beq	Ldiv0
+	cmp	dividend, #0
+	rsbmi	dividend, dividend, #0
+	cmp	dividend, divisor
+	bcc	Lgot_result
+Loop1:
+	@ Unless the divisor is very big, shift it up in multiples of
+	@ four bits, since this is the amount of unwinding in the main
+	@ division loop.  Continue shifting until the divisor is 
+	@ larger than the dividend.
+	cmp	divisor, #0x10000000
+	cmpcc	divisor, dividend
+	movcc	divisor, divisor, lsl #4
+	movcc	curbit, curbit, lsl #4
+	bcc	Loop1
+Lbignum:
+	@ For very big divisors, we must shift it a bit at a time, or
+	@ we will be in danger of overflowing.
+	cmp	divisor, #0x80000000
+	cmpcc	divisor, dividend
+	movcc	divisor, divisor, lsl #1
+	movcc	curbit, curbit, lsl #1
+	bcc	Lbignum
+Loop3:
+	@ Test for possible subtractions, and note which bits
+	@ are done in the result.  On the final pass, this may subtract
+	@ too much from the dividend, but the result will be ok, since the
+	@ "bit" will have been shifted out at the bottom.
+	cmp	dividend, divisor
+	subcs	dividend, dividend, divisor
+	orrcs	result, result, curbit
+	cmp	dividend, divisor, lsr #1
+	subcs	dividend, dividend, divisor, lsr #1
+	orrcs	result, result, curbit, lsr #1
+	cmp	dividend, divisor, lsr #2
+	subcs	dividend, dividend, divisor, lsr #2
+	orrcs	result, result, curbit, lsr #2
+	cmp	dividend, divisor, lsr #3
+	subcs	dividend, dividend, divisor, lsr #3
+	orrcs	result, result, curbit, lsr #3
+	cmp	dividend, #0			@ Early termination?
+	movnes	curbit, curbit, lsr #4		@ No, any more bits to do?
+	movne	divisor, divisor, lsr #4
+	bne	Loop3
 Lgot_result:
+	mov	r0, result
+	cmp	ip, #0
+	rsbmi	r0, r0, #0
+	RET	pc, lr
-	mov r0, r3
+Ldiv0:
-	ldmia	sp!, {r4, r5, pc}RETCOND
+	str	lr, [sp, #-4]!
-Ldiv_zero:
-	@ Divide by zero trap.  If it returns, return 0 (about as
-	@ wrong as possible, but that is what SunOS does...).
 	bl	SYM (__div0)
-	mov	r0, #0
+	mov	r0, #0			@ about as wrong as it could be
-	ldmia	sp!, {r4, r5, pc}RETCOND
+	ldmia	sp!, {pc}RETCOND
-#endif /* L_umodsi3 */
+#endif /* L_divsi3 */
 #ifdef L_modsi3
+dividend	.req	r0
+divisor		.req	r1
+overdone	.req	r2
+curbit		.req	r3
 ip		.req	r12
 sp		.req	r13
 lr		.req	r14
 pc		.req	r15
-.text
+	.text
 	.globl SYM (__modsi3)
 	.align 0
-SYM (__modsi3):
-	stmdb	sp!, {r4, r5, r6, lr}
-	@ compute sign of result; if neither is negative, no problem
-	mov	r6, r0
-	cmp	r1, #0
-	rsbmi	r1, r1, #0
-	beq	Ldiv_zero
-	mov	lr, r1
-	movs	r3, r0
-	rsbmi	r3, r3, #0	@ make dividend nonnegative
-	cmp	r3, lr			@ if r1 exceeds r0, done
-	mov	r2, #0
-	bcc	Lgot_result		@ (and algorithm fails otherwise)
-	mov	r4, #(1 << (32 - 4 - 1))
-	cmp	r3, r4
-	mov	ip, #0
-	bcc	Lnot_really_big
-	@ Here the dividend is >= 2^(31-N) or so.  We must be careful here,
-	@ as our usual N-at-a-shot divide step will cause overflow and havoc.
-	@ The number of bits in the result here is N*ITER+SC, where SC <= N.
-	@ Compute ITER in an unorthodox manner: know we need to shift V into
-	@ the top decade: so do not even bother to compare to R.
-		mov	r5, #1
-	1:
-		cmp	lr, r4
-		bcs	3f
-		mov	lr, lr, lsl #4
-		add	ip, ip, #1
-		b	1b
-	@ Now compute r5.
-	2:	adds	lr, lr, lr
-		add	r5, r5, #1
-		bcc	Lnot_too_big
-		@ We get here if the r1 overflowed while shifting.
-		@ This means that r3 has the high-order bit set.
-		@ Restore lr and subtract from r3.
-		mov	r4, r4, lsl #4
-		mov	lr, lr, lsr #1
-		add	lr, r4, lr
-		sub	r5, r5, #1
-		b	Ldo_single_div
-	Lnot_too_big:
-	3:	cmp	lr, r3
-		bcc	2b
-@		beq	Ldo_single_div
-	/* NB: these are commented out in the V8-Sparc manual as well */
-	/* (I do not understand this) */
-	@ lr > r3: went too far: back up 1 step
-	@	srl	lr, 1, lr
-	@	dec	r5
-	@ do single-bit divide steps
-	@
-	@ We have to be careful here.  We know that r3 >= lr, so we can do the
-	@ first divide step without thinking.  BUT, the others are conditional,
-	@ and are only done if r3 >= 0.  Because both r3 and lr may have the high-
-	@ order bit set in the first step, just falling into the regular
-	@ division loop will mess up the first time around.
-	@ So we unroll slightly...
-	Ldo_single_div:
-		subs	r5, r5, #1
-		blt	Lend_regular_divide
-		sub	r3, r3, lr
-		mov	r2, #1
-		b	Lend_single_divloop
-	Lsingle_divloop:
-		cmp	r3, #0
-		mov	r2, r2, lsl #1
-		mov	lr, lr, lsr #1
-		@ r3 >= 0
-		subpl	r3, r3, lr
-		addpl	r2, r2, #1
-		@ r3 < 0
-		addmi	r3, r3, lr
-		submi	r2, r2, #1
-	Lend_single_divloop:
-		subs	r5, r5, #1
-		bge	Lsingle_divloop
-		b	Lend_regular_divide
-1:
+SYM (__modsi3):
-	add	ip, ip, #1
+	mov	curbit, #1
-Lnot_really_big:
+	cmp	divisor, #0
-	mov	lr, lr, lsl #4
+	rsbmi	divisor, divisor, #0		@ Loops below use unsigned.
-	cmp	lr, r3
+	beq	Ldiv0
-	bls	1b
+	@ Need to save the sign of the dividend, unfortunately, we need
-	@
+	@ ip later on; this is faster than pushing lr and using that.
-	@	HOW CAN ip EVER BE -1 HERE ?????
+	str	dividend, [sp, #-4]!
-	@
+	cmp	dividend, #0
-	cmn	ip, #1
+	rsbmi	dividend, dividend, #0
+	cmp	dividend, divisor
+	bcc	Lgot_result
+Loop1:
+	@ Unless the divisor is very big, shift it up in multiples of
+	@ four bits, since this is the amount of unwinding in the main
+	@ division loop.  Continue shifting until the divisor is 
+	@ larger than the dividend.
+	cmp	divisor, #0x10000000
+	cmpcc	divisor, dividend
+	movcc	divisor, divisor, lsl #4
+	movcc	curbit, curbit, lsl #4
+	bcc	Loop1
+Lbignum:
+	@ For very big divisors, we must shift it a bit at a time, or
+	@ we will be in danger of overflowing.
+	cmp	divisor, #0x80000000
+	cmpcc	divisor, dividend
+	movcc	divisor, divisor, lsl #1
+	movcc	curbit, curbit, lsl #1
+	bcc	Lbignum
+Loop3:
+	@ Test for possible subtractions.  On the final pass, this may 
+	@ subtract too much from the dividend, so keep track of which
+	@ subtractions are done, we can fix them up afterwards...
+	mov	overdone, #0
+	cmp	dividend, divisor
+	subcs	dividend, dividend, divisor
+	cmp	dividend, divisor, lsr #1
+	subcs	dividend, dividend, divisor, lsr #1
+	orrcs	overdone, overdone, curbit, ror #1
+	cmp	dividend, divisor, lsr #2
+	subcs	dividend, dividend, divisor, lsr #2
+	orrcs	overdone, overdone, curbit, ror #2
+	cmp	dividend, divisor, lsr #3
+	subcs	dividend, dividend, divisor, lsr #3
+	orrcs	overdone, overdone, curbit, ror #3
+	mov	ip, curbit
+	cmp	dividend, #0			@ Early termination?
+	movnes	curbit, curbit, lsr #4		@ No, any more bits to do?
+	movne	divisor, divisor, lsr #4
+	bne	Loop3
+	@ Any subtractions that we should not have done will be recorded in
+	@ the top three bits of "overdone".  Exactly which were not needed
+	@ are governed by the position of the bit, stored in ip.
+	@ If we terminated early, because dividend became zero,
+	@ then none of the below will match, since the bit in ip will not be
+	@ in the bottom nibble.
+	ands	overdone, overdone, #0xe0000000
 	beq	Lgot_result
+	tst	overdone, ip, ror #3
-Ldivloop:
+	addne	dividend, dividend, divisor, lsr #3
-	cmp	r3, #0	@ set up for initial iteration
+	tst	overdone, ip, ror #2
-	mov	r2, r2, lsl #4
+	addne	dividend, dividend, divisor, lsr #2
-		@ depth 1, accumulated bits 0
+	tst	overdone, ip, ror #1
-	mov	lr, lr, lsr #1
+	addne	dividend, dividend, divisor, lsr #1
-	blt	L.1.1015
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 2, accumulated bits 1
-	mov	lr, lr, lsr #1
-	blt	L.2.1016
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 3, accumulated bits 3
-	mov	lr, lr, lsr #1
-	blt	L.3.1018
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits 7
-	mov	lr, lr, lsr #1
-	blt	L.4.1022
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #15
-		b	9f
-L.4.1022:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #13
-		b	9f
-L.3.1018:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits 5
-	mov	lr, lr, lsr #1
-	blt	L.4.1020
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #11
-		b	9f
-L.4.1020:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #9
-		b	9f
-L.2.1016:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 3, accumulated bits 1
-	mov	lr, lr, lsr #1
-	blt	L.3.1016
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits 3
-	mov	lr, lr, lsr #1
-	blt	L.4.1018
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #7
-		b	9f
-L.4.1018:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #5
-		b	9f
-L.3.1016:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits 1
-	mov	lr, lr, lsr #1
-	blt	L.4.1016
-	@ remainder is positive
-	subs	r3, r3, lr
-		add	r2, r2, #3
-		b	9f
-L.4.1016:
-	@ remainder is negative
-	adds	r3, r3, lr
-		add	r2, r2, #1
-		b	9f
-L.1.1015:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 2, accumulated bits -1
-	mov	lr, lr, lsr #1
-	blt	L.2.1014
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 3, accumulated bits -1
-	mov	lr, lr, lsr #1
-	blt	L.3.1014
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits -1
-	mov	lr, lr, lsr #1
-	blt	L.4.1014
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #1
-		b	9f
-L.4.1014:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #3
-		b	9f
-L.3.1014:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits -3
-	mov	lr, lr, lsr #1
-	blt	L.4.1012
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #5
-		b	9f
-L.4.1012:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #7
-		b	9f
-L.2.1014:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 3, accumulated bits -3
-	mov	lr, lr, lsr #1
-	blt	L.3.1012
-	@ remainder is positive
-	subs	r3, r3, lr
-			@ depth 4, accumulated bits -5
-	mov	lr, lr, lsr #1
-	blt	L.4.1010
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #9
-		b	9f
-L.4.1010:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #11
-		b	9f
-L.3.1012:
-	@ remainder is negative
-	adds	r3, r3, lr
-			@ depth 4, accumulated bits -7
-	mov	lr, lr, lsr #1
-	blt	L.4.1008
-	@ remainder is positive
-	subs	r3, r3, lr
-		sub	r2, r2, #13
-		b	9f
-L.4.1008:
-	@ remainder is negative
-	adds	r3, r3, lr
-		sub	r2, r2, #15
-		b	9f
-	9:
-Lend_regular_divide:
-	subs	ip, ip, #1
-	bge	Ldivloop
-	cmp	r3, #0
-	@ non-restoring fixup here (one instruction only!)
-	addlt	r3, r1, r3
 Lgot_result:
-	@ check to see if answer should be < 0
+	ldr	ip, [sp], #4
-	cmp	r6, #0
+	cmp	ip, #0
-	rsbmi r3, r3, #0
+	rsbmi	dividend, dividend, #0
+	RET	pc, lr
-	mov r0, r3
-	ldmia	sp!, {r4, r5, r6, pc}RETCOND
-Ldiv_zero:
+Ldiv0:
-	@ Divide by zero trap.  If it returns, return 0 (about as
+	str	lr, [sp, #-4]!
-	@ wrong as possible, but that is what SunOS does...).
 	bl	SYM (__div0)
-	mov	r0, #0
+	mov	r0, #0			@ about as wrong as it could be
-	ldmia	sp!, {r4, r5, r6, pc}RETCOND
+	ldmia	sp!, {pc}RETCOND
 #endif /* L_modsi3 */