Commit bd28bf5a by Richard Earnshaw

arm/lib1funcs.asm (__divsi3, __modsi3, __udivsi3, __umodsi3): Replace with smaller, faster versions.

From-SVN: r11070
parent b920730a
@ libgcc1 routines for ARM cpu.
@ Division and remainder, from Appendix E of the Sparc Version 8
@ Architecture Manual, with fixes from Gordon Irlam.
@ Rewritten for the ARM by Richard Earnshaw (rwe@pegasus.esprit.ec.org)
@ Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
/* Copyright (C) 1995 Free Software Foundation, Inc.
/* Copyright (C) 1995, 1996 Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
......@@ -35,243 +33,13 @@ Boston, MA 02111-1307, USA. */
This exception does not however invalidate any other reasons why
the executable file might be covered by the GNU General Public License. */
/*
* Input: dividend and divisor in r0 and r1 respectively.
*
* m4 parameters:
* NAME name of function to generate
* OP OP=div => r0 / r1; OP=mod => r0 % r1
* S S=true => signed; S=false => unsigned
*
* Algorithm parameters:
* N how many bits per iteration we try to get (4)
* WORDSIZE total number of bits (32)
*
* Derived constants:
* TOPBITS number of bits in the top `decade' of a number
*
* Important variables:
* Q the partial quotient under development (initially 0)
* R the remainder so far, initially the dividend
* ITER number of main division loop iterations required;
* equal to ceil(log2(quotient) / N). Note that this
* is the log base (2^N) of the quotient.
* V the current comparand, initially divisor*2^(ITER*N-1)
*
* Cost:
* Current estimate for non-large dividend is
* ceil(log2(quotient) / N) * (10 + 7N/2) + C
* A large dividend is one greater than 2^(31-TOPBITS) and takes a
* different path, as the upper bits of the quotient must be developed
* one bit at a time.
*/
/*
define(N, `4')dnl
define(WORDSIZE, `32')dnl
define(TOPBITS, eval(WORDSIZE - N*((WORDSIZE-1)/N)))dnl
dnl
define(dividend, `r0')dnl
define(divisor, `r1')dnl
define(Q, `r2')dnl
define(R, `r3')dnl
define(ITER, `ip')dnl
define(V, `lr')dnl
dnl
dnl m4 reminder: ifelse(a,b,c,d) => if a is b, then c, else d
define(T, `r4')dnl
define(SC, `r5')dnl
ifelse(S, `true', `define(SIGN, `r6')')dnl
define(REGLIST, `ifelse(S, `true', `{r4, r5, r6,', `{r4, r5,')')dnl
define(ret, `ldmia sp!, REGLIST pc}')dnl
dnl
dnl This is the recursive definition for developing quotient digits.
dnl
dnl Parameters:
dnl $1 the current depth, 1 <= $1 <= N
dnl $2 the current accumulation of quotient bits
dnl N max depth
dnl
dnl We add a new bit to $2 and either recurse or insert the bits in
dnl the quotient. R, Q, and V are inputs and outputs as defined above;
dnl the condition codes are expected to reflect the input R, and are
dnl modified to reflect the output R.
dnl
define(DEVELOP_QUOTIENT_BITS,
` @ depth $1, accumulated bits $2
mov V, V, lsr #1
blt L.$1.eval(2^N+$2+999)
@ remainder is positive
subs R, R, V
ifelse($1, N,
` ifelse(eval(2*$2+1<0), `0',
`add Q, Q, `#'eval($2*2+1)',
`sub Q, Q, `#'eval(-($2*2+1))')
b 9f
', ` DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2+1)')')
L.$1.eval(2^N+$2+999):
@ remainder is negative
adds R, R, V
ifelse($1, N,
` ifelse(eval(2*$2-1<0), `0',
`add Q, Q, `#'eval($2*2-1)',
`sub Q, Q, `#'eval(-($2*2-1))')
b 9f
', ` DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2-1)')')
ifelse($1, 1, `9:')')dnl
#include "trap.h"
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
.globl NAME
.align 0
NAME:
stmdb sp!, REGLIST lr}
ifelse(S, `true',
` @ compute sign of result; if neither is negative, no problem
ifelse(OP, `div', `eor SIGN, divisor, dividend @ compute sign',
`mov SIGN, dividend')
cmp divisor, #0
rsbmi divisor, divisor, #0
beq Ldiv_zero
mov V, divisor
movs R, dividend
rsbmi R, R, #0 @ make dividend nonnegative
',
` @ Ready to divide. Compute size of quotient; scale comparand.
movs V, divisor
mov R, dividend
beq Ldiv_zero
')
cmp R, V @ if divisor exceeds dividend, done
mov Q, #0
bcc Lgot_result @ (and algorithm fails otherwise)
mov T, `#'(1 << (WORDSIZE - TOPBITS - 1))
cmp R, T
mov ITER, #0
bcc Lnot_really_big
@ `Here the dividend is >= 2^(31-N) or so. We must be careful here,
@ as our usual N-at-a-shot divide step will cause overflow and havoc.
@ The number of bits in the result here is N*ITER+SC, where SC <= N.
@ Compute ITER in an unorthodox manner: know we need to shift V into
@ the top decade: so do not even bother to compare to R.'
mov SC, #1
1:
cmp V, T
bcs 3f
mov V, V, lsl `#'N
add ITER, ITER, #1
b 1b
@ Now compute SC.
2: adds V, V, V
add SC, SC, #1
bcc Lnot_too_big
@ We get here if the divisor overflowed while shifting.
@ This means that R has the high-order bit set.
@ Restore V and subtract from R.
mov T, T, lsl `#'TOPBITS
mov V, V, lsr #1
add V, T, V
sub SC, SC, #1
b Ldo_single_div
Lnot_too_big:
3: cmp V, R
bcc 2b
@ beq Ldo_single_div
/-* NB: these are commented out in the V8-Sparc manual as well *-/
/-* (I do not understand this) *-/
@ V > R: went too far: back up 1 step
@ srl V, 1, V
@ dec SC
@ do single-bit divide steps
@
@ We have to be careful here. We know that R >= V, so we can do the
@ first divide step without thinking. BUT, the others are conditional,
@ and are only done if R >= 0. Because both R and V may have the high-
@ order bit set in the first step, just falling into the regular
@ division loop will mess up the first time around.
@ So we unroll slightly...
Ldo_single_div:
subs SC, SC, #1
blt Lend_regular_divide
sub R, R, V
mov Q, #1
b Lend_single_divloop
Lsingle_divloop:
cmp R, #0
mov Q, Q, lsl #1
mov V, V, lsr #1
@ R >= 0
subpl R, R, V
addpl Q, Q, #1
@ R < 0
addmi R, R, V
submi Q, Q, #1
Lend_single_divloop:
subs SC, SC, #1
bge Lsingle_divloop
b Lend_regular_divide
1:
add ITER, ITER, #1
Lnot_really_big:
mov V, V, lsl `#'N
cmp V, R
bls 1b
@
@ HOW CAN ITER EVER BE -1 HERE ?????
@
cmn ITER, #1
beq Lgot_result
Ldivloop:
cmp R, #0 @ set up for initial iteration
mov Q, Q, lsl `#'N
DEVELOP_QUOTIENT_BITS(1, 0)
Lend_regular_divide:
subs ITER, ITER, #1
bge Ldivloop
cmp R, #0
@ non-restoring fixup here (one instruction only!)
ifelse(OP, `div',
` sublt Q, Q, #1
', ` addlt R, divisor, R
')
Lgot_result:
ifelse(S, `true',
` @ check to see if answer should be < 0
cmp SIGN, #0
ifelse(OP, `div', `rsbmi Q, Q, #0', `rsbmi R, R, #0')
')
ifelse(OP, `div', `mov r0, Q', `mov r0, R')
ret
Ldiv_zero:
@ Divide by zero trap. If it returns, return 0 (about as
@ wrong as possible, but that is what SunOS does...).
bl ___div0
mov r0, #0
ret
*/
/* Return-instruction helpers used by the division routines in this file.
   RET        mnemonic for an unconditional return, used as "RET pc, lr".
   RETc(x)    the same mnemonic with condition code x pasted in, used as
              "RETc(cc) pc, lr" for a conditional return.
   RETCOND    suffix appended to an "ldmia sp!, {..., pc}" that returns by
              popping pc.
   When __APCS_26__ is defined these expand to the flag-setting forms
   (movs / mov<x>s / ^); otherwise to the plain forms (mov / mov<x> /
   nothing).
   NOTE(review): presumably the flag-setting forms are needed because the
   26-bit APCS expects the caller's PSR bits to be restored on return --
   confirm against the APCS documentation.  */
#ifdef __APCS_26__
#define RET movs
#define RETc(x) mov##x##s
#define RETCOND ^
#else
#define RET mov
#define RETc(x) mov##x
#define RETCOND
#endif
......@@ -290,1323 +58,339 @@ Ldiv_zero:
#ifdef L_udivsi3
dividend .req r0
divisor .req r1
result .req r2
curbit .req r3
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
.text
.globl SYM (__udivsi3)
.align 0
SYM (__udivsi3):
stmdb sp!, {r4, r5, lr}
@ Ready to divide. Compute size of quotient; scale comparand.
movs lr, r1
mov r3, r0
beq Ldiv_zero
cmp r3, lr @ if r1 exceeds r0, done
mov r2, #0
bcc Lgot_result @ (and algorithm fails otherwise)
mov r4, #(1 << (32 - 4 - 1))
cmp r3, r4
mov ip, #0
bcc Lnot_really_big
@ Here the dividend is >= 2^(31-N) or so. We must be careful here,
@ as our usual N-at-a-shot divide step will cause overflow and havoc.
@ The number of bits in the result here is N*ITER+SC, where SC <= N.
@ Compute ITER in an unorthodox manner: know we need to shift V into
@ the top decade: so do not even bother to compare to R.
mov r5, #1
1:
cmp lr, r4
bcs 3f
mov lr, lr, lsl #4
add ip, ip, #1
b 1b
@ Now compute r5.
2: adds lr, lr, lr
add r5, r5, #1
bcc Lnot_too_big
@ We get here if the r1 overflowed while shifting.
@ This means that r3 has the high-order bit set.
@ Restore lr and subtract from r3.
mov r4, r4, lsl #4
mov lr, lr, lsr #1
add lr, r4, lr
sub r5, r5, #1
b Ldo_single_div
Lnot_too_big:
3: cmp lr, r3
bcc 2b
@ beq Ldo_single_div
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
@ lr > r3: went too far: back up 1 step
@ srl lr, 1, lr
@ dec r5
@ do single-bit divide steps
@
@ We have to be careful here. We know that r3 >= lr, so we can do the
@ first divide step without thinking. BUT, the others are conditional,
@ and are only done if r3 >= 0. Because both r3 and lr may have the high-
@ order bit set in the first step, just falling into the regular
@ division loop will mess up the first time around.
@ So we unroll slightly...
Ldo_single_div:
subs r5, r5, #1
blt Lend_regular_divide
sub r3, r3, lr
mov r2, #1
b Lend_single_divloop
Lsingle_divloop:
cmp r3, #0
mov r2, r2, lsl #1
mov lr, lr, lsr #1
@ r3 >= 0
subpl r3, r3, lr
addpl r2, r2, #1
@ r3 < 0
addmi r3, r3, lr
submi r2, r2, #1
Lend_single_divloop:
subs r5, r5, #1
bge Lsingle_divloop
b Lend_regular_divide
1:
add ip, ip, #1
Lnot_really_big:
mov lr, lr, lsl #4
cmp lr, r3
bls 1b
@
@ HOW CAN ip EVER BE -1 HERE ?????
@
cmn ip, #1
beq Lgot_result
Ldivloop:
cmp r3, #0 @ set up for initial iteration
mov r2, r2, lsl #4
@ depth 1, accumulated bits 0
mov lr, lr, lsr #1
blt L.1.1015
@ remainder is positive
subs r3, r3, lr
@ depth 2, accumulated bits 1
mov lr, lr, lsr #1
blt L.2.1016
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits 3
mov lr, lr, lsr #1
blt L.3.1018
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 7
mov lr, lr, lsr #1
blt L.4.1022
@ remainder is positive
subs r3, r3, lr
add r2, r2, #15
b 9f
L.4.1022:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #13
b 9f
L.3.1018:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 5
mov lr, lr, lsr #1
blt L.4.1020
@ remainder is positive
subs r3, r3, lr
add r2, r2, #11
b 9f
L.4.1020:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #9
b 9f
L.2.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits 1
mov lr, lr, lsr #1
blt L.3.1016
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 3
mov lr, lr, lsr #1
blt L.4.1018
@ remainder is positive
subs r3, r3, lr
add r2, r2, #7
b 9f
L.4.1018:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #5
b 9f
L.3.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 1
mov lr, lr, lsr #1
blt L.4.1016
@ remainder is positive
subs r3, r3, lr
add r2, r2, #3
b 9f
L.4.1016:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #1
b 9f
L.1.1015:
@ remainder is negative
adds r3, r3, lr
@ depth 2, accumulated bits -1
mov lr, lr, lsr #1
blt L.2.1014
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits -1
mov lr, lr, lsr #1
blt L.3.1014
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -1
mov lr, lr, lsr #1
blt L.4.1014
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #1
b 9f
L.4.1014:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #3
b 9f
L.3.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -3
mov lr, lr, lsr #1
blt L.4.1012
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #5
b 9f
L.4.1012:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #7
b 9f
L.2.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits -3
mov lr, lr, lsr #1
blt L.3.1012
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -5
mov lr, lr, lsr #1
blt L.4.1010
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #9
b 9f
L.4.1010:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #11
b 9f
L.3.1012:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -7
mov lr, lr, lsr #1
blt L.4.1008
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #13
b 9f
L.4.1008:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #15
b 9f
9:
Lend_regular_divide:
subs ip, ip, #1
bge Ldivloop
cmp r3, #0
@ non-restoring fixup here (one instruction only!)
sublt r2, r2, #1
@ __udivsi3: unsigned 32-bit division by shift-and-subtract.
@ In:   dividend (r0), divisor (r1); the names are .req aliases declared
@       earlier in this section.
@ Out:  r0 = quotient.
@ Uses: result (r2) accumulates quotient bits; curbit (r3) is the quotient
@       bit that corresponds to the current alignment of divisor.
@       r0-r3 and the condition flags are modified.
@ A zero divisor branches to Ldiv0, which is defined outside this routine.
SYM (__udivsi3):
cmp divisor, #0
beq Ldiv0
mov curbit, #1
mov result, #0
@ Unsigned dividend < divisor: the quotient is the 0 already in result.
cmp dividend, divisor
bcc Lgot_result
Loop1:
@ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main
@ division loop. Continue shifting until the divisor is
@ larger than the dividend.
@ The first cmp leaves carry clear only while divisor < 0x10000000, so the
@ cmpcc and both shifts run only when a 4-bit shift cannot lose top bits;
@ the loop repeats while divisor is also still below dividend.
cmp divisor, #0x10000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #4
movcc curbit, curbit, lsl #4
bcc Loop1
Lbignum:
@ For very big divisors, we must shift it a bit at a time, or
@ we will be in danger of overflowing.
@ Same pattern as Loop1 with a 1-bit step: stop once bit 31 of divisor is
@ set or divisor is no longer below dividend.
cmp divisor, #0x80000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #1
movcc curbit, curbit, lsl #1
bcc Lbignum
Loop3:
@ Test for possible subtractions, and note which bits
@ are done in the result. On the final pass, this may subtract
@ too much from the dividend, but the result will be ok, since the
@ "bit" will have been shifted out at the bottom.
@ Each pass tries divisor, divisor>>1, divisor>>2 and divisor>>3 in turn;
@ whenever the subtraction fits (carry set) the matching curbit>>k is
@ ORed into result.
cmp dividend, divisor
subcs dividend, dividend, divisor
orrcs result, result, curbit
cmp dividend, divisor, lsr #1
subcs dividend, dividend, divisor, lsr #1
orrcs result, result, curbit, lsr #1
cmp dividend, divisor, lsr #2
subcs dividend, dividend, divisor, lsr #2
orrcs result, result, curbit, lsr #2
cmp dividend, divisor, lsr #3
subcs dividend, dividend, divisor, lsr #3
orrcs result, result, curbit, lsr #3
@ Loop again only if dividend is non-zero AND curbit is still non-zero
@ after moving down four places: movnes runs only when dividend != 0 and
@ itself sets Z when curbit shifts out to zero.
cmp dividend, #0 @ Early termination?
movnes curbit, curbit, lsr #4 @ No, any more bits to do?
movne divisor, divisor, lsr #4
bne Loop3
Lgot_result:
mov r0, result
RET pc, lr
mov r0, r2
ldmia sp!, {r4, r5, pc}RETCOND
Ldiv_zero:
@ Divide by zero trap. If it returns, return 0 (about as
@ wrong as possible, but that is what SunOS does...).
Ldiv0:
str lr, [sp, #-4]!
bl SYM (__div0)
mov r0, #0
ldmia sp!, {r4, r5, pc}RETCOND
mov r0, #0 @ about as wrong as it could be
ldmia sp!, {pc}RETCOND
#endif /* L_udivsi3 */
#ifdef L_divsi3
#ifdef L_umodsi3
dividend .req r0
divisor .req r1
overdone .req r2
curbit .req r3
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
.globl SYM (__divsi3)
.text
.globl SYM (__umodsi3)
.align 0
SYM (__divsi3):
stmdb sp!, {r4, r5, r6, lr}
@ compute sign of result; if neither is negative, no problem
eor r6, r1, r0 @ compute sign
cmp r1, #0
rsbmi r1, r1, #0
beq Ldiv_zero
mov lr, r1
movs r3, r0
rsbmi r3, r3, #0 @ make dividend nonnegative
cmp r3, lr @ if r1 exceeds r0, done
mov r2, #0
bcc Lgot_result @ (and algorithm fails otherwise)
mov r4, #(1 << (32 - 4 - 1))
cmp r3, r4
mov ip, #0
bcc Lnot_really_big
@ Here the dividend is >= 2^(31-N) or so. We must be careful here,
@ as our usual N-at-a-shot divide step will cause overflow and havoc.
@ The number of bits in the result here is N*ITER+SC, where SC <= N.
@ Compute ITER in an unorthodox manner: know we need to shift V into
@ the top decade: so do not even bother to compare to R.
mov r5, #1
1:
cmp lr, r4
bcs 3f
mov lr, lr, lsl #4
add ip, ip, #1
b 1b
@ Now compute r5.
2: adds lr, lr, lr
add r5, r5, #1
bcc Lnot_too_big
@ We get here if the r1 overflowed while shifting.
@ This means that r3 has the high-order bit set.
@ Restore lr and subtract from r3.
mov r4, r4, lsl #4
mov lr, lr, lsr #1
add lr, r4, lr
sub r5, r5, #1
b Ldo_single_div
Lnot_too_big:
3: cmp lr, r3
bcc 2b
@ beq Ldo_single_div
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
@ lr > r3: went too far: back up 1 step
@ srl lr, 1, lr
@ dec r5
@ do single-bit divide steps
@
@ We have to be careful here. We know that r3 >= lr, so we can do the
@ first divide step without thinking. BUT, the others are conditional,
@ and are only done if r3 >= 0. Because both r3 and lr may have the high-
@ order bit set in the first step, just falling into the regular
@ division loop will mess up the first time around.
@ So we unroll slightly...
Ldo_single_div:
subs r5, r5, #1
blt Lend_regular_divide
sub r3, r3, lr
mov r2, #1
b Lend_single_divloop
Lsingle_divloop:
cmp r3, #0
mov r2, r2, lsl #1
mov lr, lr, lsr #1
@ r3 >= 0
subpl r3, r3, lr
addpl r2, r2, #1
@ r3 < 0
addmi r3, r3, lr
submi r2, r2, #1
Lend_single_divloop:
subs r5, r5, #1
bge Lsingle_divloop
b Lend_regular_divide
1:
add ip, ip, #1
Lnot_really_big:
mov lr, lr, lsl #4
cmp lr, r3
bls 1b
@
@ HOW CAN ip EVER BE -1 HERE ?????
@
cmn ip, #1
beq Lgot_result
Ldivloop:
cmp r3, #0 @ set up for initial iteration
mov r2, r2, lsl #4
@ depth 1, accumulated bits 0
mov lr, lr, lsr #1
blt L.1.1015
@ remainder is positive
subs r3, r3, lr
@ depth 2, accumulated bits 1
mov lr, lr, lsr #1
blt L.2.1016
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits 3
mov lr, lr, lsr #1
blt L.3.1018
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 7
mov lr, lr, lsr #1
blt L.4.1022
@ remainder is positive
subs r3, r3, lr
add r2, r2, #15
b 9f
L.4.1022:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #13
b 9f
L.3.1018:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 5
mov lr, lr, lsr #1
blt L.4.1020
@ remainder is positive
subs r3, r3, lr
add r2, r2, #11
b 9f
L.4.1020:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #9
b 9f
L.2.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits 1
mov lr, lr, lsr #1
blt L.3.1016
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 3
mov lr, lr, lsr #1
blt L.4.1018
@ remainder is positive
subs r3, r3, lr
add r2, r2, #7
b 9f
L.4.1018:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #5
b 9f
L.3.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 1
mov lr, lr, lsr #1
blt L.4.1016
@ remainder is positive
subs r3, r3, lr
add r2, r2, #3
b 9f
L.4.1016:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #1
b 9f
L.1.1015:
@ remainder is negative
adds r3, r3, lr
@ depth 2, accumulated bits -1
mov lr, lr, lsr #1
blt L.2.1014
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits -1
mov lr, lr, lsr #1
blt L.3.1014
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -1
mov lr, lr, lsr #1
blt L.4.1014
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #1
b 9f
L.4.1014:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #3
b 9f
L.3.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -3
mov lr, lr, lsr #1
blt L.4.1012
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #5
b 9f
L.4.1012:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #7
b 9f
L.2.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits -3
mov lr, lr, lsr #1
blt L.3.1012
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -5
mov lr, lr, lsr #1
blt L.4.1010
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #9
b 9f
L.4.1010:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #11
b 9f
L.3.1012:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -7
mov lr, lr, lsr #1
blt L.4.1008
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #13
b 9f
L.4.1008:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #15
b 9f
@ __umodsi3: unsigned 32-bit remainder by shift-and-subtract.
@ In:   dividend (r0), divisor (r1); the names are .req aliases declared
@       earlier in this section.
@ Out:  r0 = remainder (dividend is reduced in place).
@ Uses: curbit (r3) tracks the alignment of divisor; overdone (r2) records
@       the shifted subtractions made on the current pass; ip keeps the
@       curbit value of the last pass.  r0-r3, ip and the condition flags
@       are modified.
@ A zero divisor branches to Ldiv0, which is defined outside this routine.
SYM (__umodsi3):
cmp divisor, #0
beq Ldiv0
mov curbit, #1
@ Unsigned dividend < divisor: r0 already holds the remainder, so return.
cmp dividend, divisor
RETc(cc) pc, lr
Loop1:
@ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main
@ division loop. Continue shifting until the divisor is
@ larger than the dividend.
@ The first cmp leaves carry clear only while divisor < 0x10000000, so the
@ cmpcc and both shifts run only when a 4-bit shift cannot lose top bits.
cmp divisor, #0x10000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #4
movcc curbit, curbit, lsl #4
bcc Loop1
Lbignum:
@ For very big divisors, we must shift it a bit at a time, or
@ we will be in danger of overflowing.
@ Same pattern as Loop1 with a 1-bit step: stop once bit 31 of divisor is
@ set or divisor is no longer below dividend.
cmp divisor, #0x80000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #1
movcc curbit, curbit, lsl #1
bcc Lbignum
Loop3:
@ Test for possible subtractions. On the final pass, this may
@ subtract too much from the dividend, so keep track of which
@ subtractions are done, we can fix them up afterwards...
@ overdone is cleared at the top of every pass, so after the loop it
@ describes the last pass only.  The rotate (ror) wraps curbit round to
@ the top of the word when curbit>>k would have fallen off the bottom,
@ which is how such a subtraction ends up flagged in bits 31..29.
mov overdone, #0
cmp dividend, divisor
subcs dividend, dividend, divisor
cmp dividend, divisor, lsr #1
subcs dividend, dividend, divisor, lsr #1
orrcs overdone, overdone, curbit, ror #1
cmp dividend, divisor, lsr #2
subcs dividend, dividend, divisor, lsr #2
orrcs overdone, overdone, curbit, ror #2
cmp dividend, divisor, lsr #3
subcs dividend, dividend, divisor, lsr #3
orrcs overdone, overdone, curbit, ror #3
@ Remember the curbit used on this pass before it is shifted down.
mov ip, curbit
@ Loop again only if dividend is non-zero AND curbit is still non-zero
@ after moving down four places.  On exit the lsr #4 of divisor has been
@ skipped, so divisor still holds the value used in the last pass.
cmp dividend, #0 @ Early termination?
movnes curbit, curbit, lsr #4 @ No, any more bits to do?
movne divisor, divisor, lsr #4
bne Loop3
@ Any subtractions that we should not have done will be recorded in
@ the top three bits of "overdone". Exactly which were not needed
@ are governed by the position of the bit, stored in ip.
@ If we terminated early, because dividend became zero,
@ then none of the below will match, since the bit in ip will not be
@ in the bottom nibble.
@ Keep only bits 31..29 of overdone; if none is set, r0 is the remainder.
ands overdone, overdone, #0xe0000000
RETc(eq) pc, lr @ No fixups needed
@ Each tst repeats the rotation used when the bit was recorded; a match
@ means that subtraction is undone by adding the same shifted divisor back.
tst overdone, ip, ror #3
addne dividend, dividend, divisor, lsr #3
tst overdone, ip, ror #2
addne dividend, dividend, divisor, lsr #2
tst overdone, ip, ror #1
addne dividend, dividend, divisor, lsr #1
RET pc, lr
9:
Lend_regular_divide:
subs ip, ip, #1
bge Ldivloop
cmp r3, #0
@ non-restoring fixup here (one instruction only!)
sublt r2, r2, #1
Lgot_result:
@ check to see if answer should be < 0
cmp r6, #0
rsbmi r2, r2, #0
mov r0, r2
ldmia sp!, {r4, r5, r6, pc}RETCOND
Ldiv_zero:
@ Divide by zero trap. If it returns, return 0 (about as
@ wrong as possible, but that is what SunOS does...).
Ldiv0:
str lr, [sp, #-4]!
bl SYM (__div0)
mov r0, #0
ldmia sp!, {r4, r5, r6, pc}RETCOND
mov r0, #0 @ about as wrong as it could be
ldmia sp!, {pc}RETCOND
#endif /* L_divsi3 */
#endif /* L_umodsi3 */
#ifdef L_umodsi3
#ifdef L_divsi3
dividend .req r0
divisor .req r1
result .req r2
curbit .req r3
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
.globl SYM (__umodsi3)
.text
.globl SYM (__divsi3)
.align 0
SYM (__umodsi3):
stmdb sp!, {r4, r5, lr}
@ Ready to divide. Compute size of quotient; scale comparand.
movs lr, r1
mov r3, r0
beq Ldiv_zero
cmp r3, lr @ if r1 exceeds r0, done
mov r2, #0
bcc Lgot_result @ (and algorithm fails otherwise)
mov r4, #(1 << (32 - 4 - 1))
cmp r3, r4
mov ip, #0
bcc Lnot_really_big
@ Here the dividend is >= 2^(31-N) or so. We must be careful here,
@ as our usual N-at-a-shot divide step will cause overflow and havoc.
@ The number of bits in the result here is N*ITER+SC, where SC <= N.
@ Compute ITER in an unorthodox manner: know we need to shift V into
@ the top decade: so do not even bother to compare to R.
mov r5, #1
1:
cmp lr, r4
bcs 3f
mov lr, lr, lsl #4
add ip, ip, #1
b 1b
@ Now compute r5.
2: adds lr, lr, lr
add r5, r5, #1
bcc Lnot_too_big
@ We get here if the r1 overflowed while shifting.
@ This means that r3 has the high-order bit set.
@ Restore lr and subtract from r3.
mov r4, r4, lsl #4
mov lr, lr, lsr #1
add lr, r4, lr
sub r5, r5, #1
b Ldo_single_div
Lnot_too_big:
3: cmp lr, r3
bcc 2b
@ beq Ldo_single_div
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
@ lr > r3: went too far: back up 1 step
@ srl lr, 1, lr
@ dec r5
@ do single-bit divide steps
@
@ We have to be careful here. We know that r3 >= lr, so we can do the
@ first divide step without thinking. BUT, the others are conditional,
@ and are only done if r3 >= 0. Because both r3 and lr may have the high-
@ order bit set in the first step, just falling into the regular
@ division loop will mess up the first time around.
@ So we unroll slightly...
Ldo_single_div:
subs r5, r5, #1
blt Lend_regular_divide
sub r3, r3, lr
mov r2, #1
b Lend_single_divloop
Lsingle_divloop:
cmp r3, #0
mov r2, r2, lsl #1
mov lr, lr, lsr #1
@ r3 >= 0
subpl r3, r3, lr
addpl r2, r2, #1
@ r3 < 0
addmi r3, r3, lr
submi r2, r2, #1
Lend_single_divloop:
subs r5, r5, #1
bge Lsingle_divloop
b Lend_regular_divide
1:
add ip, ip, #1
Lnot_really_big:
mov lr, lr, lsl #4
cmp lr, r3
bls 1b
@
@ HOW CAN ip EVER BE -1 HERE ?????
@
cmn ip, #1
beq Lgot_result
Ldivloop:
cmp r3, #0 @ set up for initial iteration
mov r2, r2, lsl #4
@ depth 1, accumulated bits 0
mov lr, lr, lsr #1
blt L.1.1015
@ remainder is positive
subs r3, r3, lr
@ depth 2, accumulated bits 1
mov lr, lr, lsr #1
blt L.2.1016
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits 3
mov lr, lr, lsr #1
blt L.3.1018
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 7
mov lr, lr, lsr #1
blt L.4.1022
@ remainder is positive
subs r3, r3, lr
add r2, r2, #15
b 9f
L.4.1022:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #13
b 9f
L.3.1018:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 5
mov lr, lr, lsr #1
blt L.4.1020
@ remainder is positive
subs r3, r3, lr
add r2, r2, #11
b 9f
L.4.1020:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #9
b 9f
L.2.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits 1
mov lr, lr, lsr #1
blt L.3.1016
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 3
mov lr, lr, lsr #1
blt L.4.1018
@ remainder is positive
subs r3, r3, lr
add r2, r2, #7
b 9f
L.4.1018:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #5
b 9f
L.3.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 1
mov lr, lr, lsr #1
blt L.4.1016
@ remainder is positive
subs r3, r3, lr
add r2, r2, #3
b 9f
L.4.1016:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #1
b 9f
L.1.1015:
@ remainder is negative
adds r3, r3, lr
@ depth 2, accumulated bits -1
mov lr, lr, lsr #1
blt L.2.1014
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits -1
mov lr, lr, lsr #1
blt L.3.1014
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -1
mov lr, lr, lsr #1
blt L.4.1014
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #1
b 9f
L.4.1014:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #3
b 9f
L.3.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -3
mov lr, lr, lsr #1
blt L.4.1012
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #5
b 9f
L.4.1012:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #7
b 9f
L.2.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits -3
mov lr, lr, lsr #1
blt L.3.1012
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -5
mov lr, lr, lsr #1
blt L.4.1010
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #9
b 9f
L.4.1010:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #11
b 9f
L.3.1012:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -7
mov lr, lr, lsr #1
blt L.4.1008
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #13
b 9f
L.4.1008:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #15
b 9f
9:
Lend_regular_divide:
subs ip, ip, #1
bge Ldivloop
cmp r3, #0
@ non-restoring fixup here (one instruction only!)
addlt r3, r1, r3
@ __divsi3: signed 32-bit division.  Both operands are made non-negative,
@ divided with the unsigned shift-and-subtract loops below, and the
@ quotient is negated at the end if the operand signs differed.
@ In:   dividend (r0), divisor (r1); the names are .req aliases declared
@       earlier in this section.
@ Out:  r0 = quotient.
@ Uses: result (r2) accumulates quotient bits; curbit (r3) is the quotient
@       bit that corresponds to the current alignment of divisor; ip holds
@       dividend EOR divisor, whose top bit is the sign of the result.
@       r0-r3, ip and the condition flags are modified.
@ A zero divisor branches to Ldiv0, which is defined outside this routine.
SYM (__divsi3):
eor ip, dividend, divisor @ Save the sign of the result.
mov curbit, #1
mov result, #0
@ rsbmi has no S suffix, so the flags from the cmp survive it and the beq
@ still tests the original divisor against zero.
cmp divisor, #0
rsbmi divisor, divisor, #0 @ Loops below use unsigned.
beq Ldiv0
cmp dividend, #0
rsbmi dividend, dividend, #0
@ Unsigned |dividend| < |divisor|: the quotient is the 0 already in result.
cmp dividend, divisor
bcc Lgot_result
Loop1:
@ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main
@ division loop. Continue shifting until the divisor is
@ larger than the dividend.
@ The first cmp leaves carry clear only while divisor < 0x10000000, so the
@ cmpcc and both shifts run only when a 4-bit shift cannot lose top bits.
cmp divisor, #0x10000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #4
movcc curbit, curbit, lsl #4
bcc Loop1
Lbignum:
@ For very big divisors, we must shift it a bit at a time, or
@ we will be in danger of overflowing.
@ Same pattern as Loop1 with a 1-bit step: stop once bit 31 of divisor is
@ set or divisor is no longer below dividend.
cmp divisor, #0x80000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #1
movcc curbit, curbit, lsl #1
bcc Lbignum
Loop3:
@ Test for possible subtractions, and note which bits
@ are done in the result. On the final pass, this may subtract
@ too much from the dividend, but the result will be ok, since the
@ "bit" will have been shifted out at the bottom.
@ Each pass tries divisor, divisor>>1, divisor>>2 and divisor>>3 in turn;
@ whenever the subtraction fits (carry set) the matching curbit>>k is
@ ORed into result.
cmp dividend, divisor
subcs dividend, dividend, divisor
orrcs result, result, curbit
cmp dividend, divisor, lsr #1
subcs dividend, dividend, divisor, lsr #1
orrcs result, result, curbit, lsr #1
cmp dividend, divisor, lsr #2
subcs dividend, dividend, divisor, lsr #2
orrcs result, result, curbit, lsr #2
cmp dividend, divisor, lsr #3
subcs dividend, dividend, divisor, lsr #3
orrcs result, result, curbit, lsr #3
@ Loop again only if dividend is non-zero AND curbit is still non-zero
@ after moving down four places: movnes runs only when dividend != 0 and
@ itself sets Z when curbit shifts out to zero.
cmp dividend, #0 @ Early termination?
movnes curbit, curbit, lsr #4 @ No, any more bits to do?
movne divisor, divisor, lsr #4
bne Loop3
Lgot_result:
mov r0, result
@ ip is negative exactly when the original operands had different signs;
@ in that case the quotient is negated.
cmp ip, #0
rsbmi r0, r0, #0
RET pc, lr
mov r0, r3
ldmia sp!, {r4, r5, pc}RETCOND
Ldiv_zero:
@ Divide by zero trap. If it returns, return 0 (about as
@ wrong as possible, but that is what SunOS does...).
Ldiv0:
str lr, [sp, #-4]!
bl SYM (__div0)
mov r0, #0
ldmia sp!, {r4, r5, pc}RETCOND
mov r0, #0 @ about as wrong as it could be
ldmia sp!, {pc}RETCOND
#endif /* L_umodsi3 */
#endif /* L_divsi3 */
#ifdef L_modsi3
dividend .req r0
divisor .req r1
overdone .req r2
curbit .req r3
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
.text
.globl SYM (__modsi3)
.align 0
SYM (__modsi3):
stmdb sp!, {r4, r5, r6, lr}
@ compute sign of result; if neither is negative, no problem
mov r6, r0
cmp r1, #0
rsbmi r1, r1, #0
beq Ldiv_zero
mov lr, r1
movs r3, r0
rsbmi r3, r3, #0 @ make dividend nonnegative
cmp r3, lr @ if r1 exceeds r0, done
mov r2, #0
bcc Lgot_result @ (and algorithm fails otherwise)
mov r4, #(1 << (32 - 4 - 1))
cmp r3, r4
mov ip, #0
bcc Lnot_really_big
@ Here the dividend is >= 2^(31-N) or so. We must be careful here,
@ as our usual N-at-a-shot divide step will cause overflow and havoc.
@ The number of bits in the result here is N*ITER+SC, where SC <= N.
@ Compute ITER in an unorthodox manner: know we need to shift V into
@ the top decade: so do not even bother to compare to R.
mov r5, #1
1:
cmp lr, r4
bcs 3f
mov lr, lr, lsl #4
add ip, ip, #1
b 1b
@ Now compute r5.
2: adds lr, lr, lr
add r5, r5, #1
bcc Lnot_too_big
@ We get here if the r1 overflowed while shifting.
@ This means that r3 has the high-order bit set.
@ Restore lr and subtract from r3.
mov r4, r4, lsl #4
mov lr, lr, lsr #1
add lr, r4, lr
sub r5, r5, #1
b Ldo_single_div
Lnot_too_big:
3: cmp lr, r3
bcc 2b
@ beq Ldo_single_div
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
@ lr > r3: went too far: back up 1 step
@ srl lr, 1, lr
@ dec r5
@ do single-bit divide steps
@
@ We have to be careful here. We know that r3 >= lr, so we can do the
@ first divide step without thinking. BUT, the others are conditional,
@ and are only done if r3 >= 0. Because both r3 and lr may have the high-
@ order bit set in the first step, just falling into the regular
@ division loop will mess up the first time around.
@ So we unroll slightly...
Ldo_single_div:
subs r5, r5, #1
blt Lend_regular_divide
sub r3, r3, lr
mov r2, #1
b Lend_single_divloop
Lsingle_divloop:
cmp r3, #0
mov r2, r2, lsl #1
mov lr, lr, lsr #1
@ r3 >= 0
subpl r3, r3, lr
addpl r2, r2, #1
@ r3 < 0
addmi r3, r3, lr
submi r2, r2, #1
Lend_single_divloop:
subs r5, r5, #1
bge Lsingle_divloop
b Lend_regular_divide
1:
add ip, ip, #1
Lnot_really_big:
mov lr, lr, lsl #4
cmp lr, r3
bls 1b
@
@ HOW CAN ip EVER BE -1 HERE ?????
@
cmn ip, #1
@ __modsi3: signed 32-bit remainder.
@ NOTE(review): dividend, divisor, curbit and overdone are register
@ aliases defined outside this chunk -- confirm the mapping there.
@ Both operands are replaced by their magnitudes and the loops below
@ treat them as unsigned.  The original dividend is pushed on the
@ stack; the ldr ip, [sp], #4 after Lgot_result pops it again so that
@ the sign of the dividend can be applied to the result.
@ A zero divisor leaves via Ldiv0 before anything has been pushed.
SYM (__modsi3):
mov curbit, #1
cmp divisor, #0
rsbmi divisor, divisor, #0 @ Loops below use unsigned.
@ rsbmi has no S suffix, so the flags tested here are still those of
@ the compare against zero above.
beq Ldiv0
@ Need to save the sign of the dividend, unfortunately, we need
@ ip later on; this is faster than pushing lr and using that.
str dividend, [sp, #-4]!
cmp dividend, #0
rsbmi dividend, dividend, #0
@ Unsigned compare of the magnitudes: when the dividend is already
@ lower than the divisor no subtraction is needed at all.
cmp dividend, divisor
bcc Lgot_result
Loop1:
@ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main
@ division loop. Continue shifting until the divisor is
@ larger than the dividend.
@ The second compare only runs when the first left carry clear
@ (divisor below 0x10000000); otherwise carry stays set, both movcc
@ are skipped and the loop falls through.  curbit is shifted in step
@ with divisor, so it records how far divisor has been moved.
cmp divisor, #0x10000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #4
movcc curbit, curbit, lsl #4
bcc Loop1
Lbignum:
@ For very big divisors, we must shift it a bit at a time, or
@ we will be in danger of overflowing.
@ Same flag chaining as Loop1, one bit per pass, and stopping once
@ the top bit of divisor is set.
cmp divisor, #0x80000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #1
movcc curbit, curbit, lsl #1
bcc Lbignum
Loop3:
@ Test for possible subtractions. On the final pass, this may
@ subtract too much from the dividend, so keep track of which
@ subtractions are done, we can fix them up afterwards...
@ Each pass tries divisor, divisor/2, divisor/4 and divisor/8.  The
@ last three are logged in overdone as curbit rotated right by the
@ same amount.  When curbit sits in the low nibble the rotate can wrap
@ the bit round to the top of the word, which is what the 0xe0000000
@ mask after the loop looks for.  The full-divisor subtraction is not
@ logged (presumably because divisor is never below its original
@ value, so that subtraction is always legitimate).
mov overdone, #0
cmp dividend, divisor
subcs dividend, dividend, divisor
cmp dividend, divisor, lsr #1
subcs dividend, dividend, divisor, lsr #1
orrcs overdone, overdone, curbit, ror #1
cmp dividend, divisor, lsr #2
subcs dividend, dividend, divisor, lsr #2
orrcs overdone, overdone, curbit, ror #2
cmp dividend, divisor, lsr #3
subcs dividend, dividend, divisor, lsr #3
orrcs overdone, overdone, curbit, ror #3
@ Keep the bit position of this pass in ip; curbit itself may be
@ shifted away just below.
mov ip, curbit
cmp dividend, #0 @ Early termination?
@ movnes sets the flags, so Z after it means curbit has been shifted
@ out to zero and the following movne/bne are skipped.  When the
@ dividend is already zero none of the three conditional
@ instructions execute.
movnes curbit, curbit, lsr #4 @ No, any more bits to do?
movne divisor, divisor, lsr #4
bne Loop3
@ Any subtractions that we should not have done will be recorded in
@ the top three bits of "overdone". Exactly which were not needed
@ are governed by the position of the bit, stored in ip.
@ If we terminated early, because dividend became zero,
@ then none of the below will match, since the bit in ip will not be
@ in the bottom nibble.
@ Nothing recorded in the top three bits means dividend already holds
@ the remainder magnitude.  Otherwise control continues past this
@ point to the tst/addne fix-up sequence.
ands overdone, overdone, #0xe0000000
beq Lgot_result
@ Ldivloop: one pass of the main divide loop, fully unrolled as a
@ four-level decision tree of non-restoring steps.
@ NOTE(review): this code uses raw register numbers (r2, r3, lr, ip)
@ rather than the dividend/divisor aliases used elsewhere in the
@ file -- confirm which implementation it belongs to.
@ r2 is shifted left four places on entry.  At every level lr is
@ halved, then lr is subtracted from r3 when r3 was non-negative or
@ added to r3 when it was negative.  Each of the sixteen leaves adds
@ an odd value between -15 and +15 to r2 and rejoins at local label 9.
@ The mov that halves lr has no S suffix, so each blt tests the flags
@ left by the preceding cmp, subs or adds on r3.
@ The numeric label suffix appears to be 1015 plus the accumulated
@ bits value at the point of the branch.
Ldivloop:
cmp r3, #0 @ set up for initial iteration
mov r2, r2, lsl #4
@ depth 1, accumulated bits 0
mov lr, lr, lsr #1
blt L.1.1015
@ remainder is positive
subs r3, r3, lr
@ depth 2, accumulated bits 1
mov lr, lr, lsr #1
blt L.2.1016
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits 3
mov lr, lr, lsr #1
blt L.3.1018
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 7
mov lr, lr, lsr #1
blt L.4.1022
@ remainder is positive
subs r3, r3, lr
add r2, r2, #15
b 9f
L.4.1022:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #13
b 9f
L.3.1018:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 5
mov lr, lr, lsr #1
blt L.4.1020
@ remainder is positive
subs r3, r3, lr
add r2, r2, #11
b 9f
L.4.1020:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #9
b 9f
L.2.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits 1
mov lr, lr, lsr #1
blt L.3.1016
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 3
mov lr, lr, lsr #1
blt L.4.1018
@ remainder is positive
subs r3, r3, lr
add r2, r2, #7
b 9f
L.4.1018:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #5
b 9f
L.3.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 1
mov lr, lr, lsr #1
blt L.4.1016
@ remainder is positive
subs r3, r3, lr
add r2, r2, #3
b 9f
L.4.1016:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #1
b 9f
@ Lower half of the tree: the remainder was already negative on
@ entry, so the accumulated value starts at -1.
L.1.1015:
@ remainder is negative
adds r3, r3, lr
@ depth 2, accumulated bits -1
mov lr, lr, lsr #1
blt L.2.1014
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits -1
mov lr, lr, lsr #1
blt L.3.1014
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -1
mov lr, lr, lsr #1
blt L.4.1014
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #1
b 9f
L.4.1014:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #3
b 9f
L.3.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -3
mov lr, lr, lsr #1
blt L.4.1012
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #5
b 9f
L.4.1012:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #7
b 9f
L.2.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits -3
mov lr, lr, lsr #1
blt L.3.1012
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -5
mov lr, lr, lsr #1
blt L.4.1010
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #9
b 9f
L.4.1010:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #11
b 9f
L.3.1012:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -7
mov lr, lr, lsr #1
blt L.4.1008
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #13
b 9f
L.4.1008:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #15
b 9f
@ Loop tail, final fix-up, result and divide-by-zero exits.
@ NOTE(review): two register conventions are mixed in this region.
@ Some lines use raw registers (r0, r1, r3, r6, and ldmia of
@ r4-r6 plus pc); others use the dividend/divisor/overdone aliases
@ with a single pushed word.  Confirm which set is meant to remain.
9:
Lend_regular_divide:
@ ip counts the remaining Ldivloop passes; repeat while it is >= 0.
subs ip, ip, #1
bge Ldivloop
cmp r3, #0
@ non-restoring fixup here (one instruction only!)
addlt r3, r1, r3
@ Undo, last first, any of the divisor/8, divisor/4 and divisor/2
@ subtractions whose marker (the saved bit in ip rotated right by
@ the same amount) is set in overdone.
tst overdone, ip, ror #3
addne dividend, dividend, divisor, lsr #3
tst overdone, ip, ror #2
addne dividend, dividend, divisor, lsr #2
tst overdone, ip, ror #1
addne dividend, dividend, divisor, lsr #1
Lgot_result:
@ check to see if answer should be < 0
cmp r6, #0
rsbmi r3, r3, #0
mov r0, r3
@ NOTE(review): this ldmia loads pc, i.e. it returns.  The four
@ instructions after it are then never reached by falling through,
@ and the word pushed with str dividend, [sp, #-4]! is popped here as
@ if it were r4 -- confirm.
ldmia sp!, {r4, r5, r6, pc}RETCOND
@ Pop the saved original dividend and negate the result when that
@ dividend was negative.
ldr ip, [sp], #4
cmp ip, #0
rsbmi dividend, dividend, #0
RET pc, lr
Ldiv_zero:
@ Divide by zero trap. If it returns, return 0 (about as
@ wrong as possible, but that is what SunOS does...).
Ldiv0:
@ Save lr across the call to __div0.
str lr, [sp, #-4]!
bl SYM (__div0)
mov r0, #0
@ NOTE(review): only one word (lr) was pushed just above, yet this
@ ldmia pops four and returns, leaving the two lines after it
@ unreachable.  The single-register ldmia at the end is the one that
@ matches the push -- confirm.
ldmia sp!, {r4, r5, r6, pc}RETCOND
mov r0, #0 @ about as wrong as it could be
ldmia sp!, {pc}RETCOND
#endif /* L_modsi3 */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment