Commit bd28bf5a by Richard Earnshaw

arm/lib1funcs.asm (__divsi3, __modsi3, __udivsi3, __umodsi3): Replace with smaller, faster versions.

From-SVN: r11070
parent b920730a
@ libgcc1 routines for ARM cpu.
@ Division and remainder, from Appendix E of the Sparc Version 8
@ Architecture Manual, with fixes from Gordon Irlam.
@ Rewritten for the ARM by Richard Earnshaw (rwe@pegasus.esprit.ec.org)
@ Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
/* Copyright (C) 1995 Free Software Foundation, Inc.
/* Copyright (C) 1995, 1996 Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
......@@ -35,243 +33,13 @@ Boston, MA 02111-1307, USA. */
This exception does not however invalidate any other reasons why
the executable file might be covered by the GNU General Public License. */
/*
* Input: dividend and divisor in r0 and r1 respectively.
*
* m4 parameters:
* NAME name of function to generate
* OP OP=div => r0 / r1; OP=mod => r0 % r1
* S S=true => signed; S=false => unsigned
*
* Algorithm parameters:
* N how many bits per iteration we try to get (4)
* WORDSIZE total number of bits (32)
*
* Derived constants:
* TOPBITS number of bits in the top `decade' of a number
*
* Important variables:
* Q the partial quotient under development (initially 0)
* R the remainder so far, initially the dividend
* ITER number of main division loop iterations required;
* equal to ceil(log2(quotient) / N). Note that this
* is the log base (2^N) of the quotient.
* V the current comparand, initially divisor*2^(ITER*N-1)
*
* Cost:
* Current estimate for non-large dividend is
* ceil(log2(quotient) / N) * (10 + 7N/2) + C
* A large dividend is one greater than 2^(31-TOPBITS) and takes a
* different path, as the upper bits of the quotient must be developed
* one bit at a time.
*/
/*
define(N, `4')dnl
define(WORDSIZE, `32')dnl
define(TOPBITS, eval(WORDSIZE - N*((WORDSIZE-1)/N)))dnl
dnl
define(dividend, `r0')dnl
define(divisor, `r1')dnl
define(Q, `r2')dnl
define(R, `r3')dnl
define(ITER, `ip')dnl
define(V, `lr')dnl
dnl
dnl m4 reminder: ifelse(a,b,c,d) => if a is b, then c, else d
define(T, `r4')dnl
define(SC, `r5')dnl
ifelse(S, `true', `define(SIGN, `r6')')dnl
define(REGLIST, `ifelse(S, `true', `{r4, r5, r6,', `{r4, r5,')')dnl
define(ret, `ldmia sp!, REGLIST pc}')dnl
dnl
dnl This is the recursive definition for developing quotient digits.
dnl
dnl Parameters:
dnl $1 the current depth, 1 <= $1 <= N
dnl $2 the current accumulation of quotient bits
dnl N max depth
dnl
dnl We add a new bit to $2 and either recurse or insert the bits in
dnl the quotient. R, Q, and V are inputs and outputs as defined above;
dnl the condition codes are expected to reflect the input R, and are
dnl modified to reflect the output R.
dnl
define(DEVELOP_QUOTIENT_BITS,
` @ depth $1, accumulated bits $2
mov V, V, lsr #1
blt L.$1.eval(2^N+$2+999)
@ remainder is positive
subs R, R, V
ifelse($1, N,
` ifelse(eval(2*$2+1<0), `0',
`add Q, Q, `#'eval($2*2+1)',
`sub Q, Q, `#'eval(-($2*2+1))')
b 9f
', ` DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2+1)')')
L.$1.eval(2^N+$2+999):
@ remainder is negative
adds R, R, V
ifelse($1, N,
` ifelse(eval(2*$2-1<0), `0',
`add Q, Q, `#'eval($2*2-1)',
`sub Q, Q, `#'eval(-($2*2-1))')
b 9f
', ` DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2-1)')')
ifelse($1, 1, `9:')')dnl
#include "trap.h"
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
.globl NAME
.align 0
NAME:
stmdb sp!, REGLIST lr}
ifelse(S, `true',
` @ compute sign of result; if neither is negative, no problem
ifelse(OP, `div', `eor SIGN, divisor, dividend @ compute sign',
`mov SIGN, dividend')
cmp divisor, #0
rsbmi divisor, divisor, #0
beq Ldiv_zero
mov V, divisor
movs R, dividend
rsbmi R, R, #0 @ make dividend nonnegative
',
` @ Ready to divide. Compute size of quotient; scale comparand.
movs V, divisor
mov R, dividend
beq Ldiv_zero
')
cmp R, V @ if divisor exceeds dividend, done
mov Q, #0
bcc Lgot_result @ (and algorithm fails otherwise)
mov T, `#'(1 << (WORDSIZE - TOPBITS - 1))
cmp R, T
mov ITER, #0
bcc Lnot_really_big
@ `Here the dividend is >= 2^(31-N) or so. We must be careful here,
@ as our usual N-at-a-shot divide step will cause overflow and havoc.
@ The number of bits in the result here is N*ITER+SC, where SC <= N.
@ Compute ITER in an unorthodox manner: know we need to shift V into
@ the top decade: so do not even bother to compare to R.'
mov SC, #1
1:
cmp V, T
bcs 3f
mov V, V, lsl `#'N
add ITER, ITER, #1
b 1b
@ Now compute SC.
2: adds V, V, V
add SC, SC, #1
bcc Lnot_too_big
@ We get here if the divisor overflowed while shifting.
@ This means that R has the high-order bit set.
@ Restore V and subtract from R.
mov T, T, lsl `#'TOPBITS
mov V, V, lsr #1
add V, T, V
sub SC, SC, #1
b Ldo_single_div
Lnot_too_big:
3: cmp V, R
bcc 2b
@ beq Ldo_single_div
/-* NB: these are commented out in the V8-Sparc manual as well *-/
/-* (I do not understand this) *-/
@ V > R: went too far: back up 1 step
@ srl V, 1, V
@ dec SC
@ do single-bit divide steps
@
@ We have to be careful here. We know that R >= V, so we can do the
@ first divide step without thinking. BUT, the others are conditional,
@ and are only done if R >= 0. Because both R and V may have the high-
@ order bit set in the first step, just falling into the regular
@ division loop will mess up the first time around.
@ So we unroll slightly...
Ldo_single_div:
subs SC, SC, #1
blt Lend_regular_divide
sub R, R, V
mov Q, #1
b Lend_single_divloop
Lsingle_divloop:
cmp R, #0
mov Q, Q, lsl #1
mov V, V, lsr #1
@ R >= 0
subpl R, R, V
addpl Q, Q, #1
@ R < 0
addmi R, R, V
submi Q, Q, #1
Lend_single_divloop:
subs SC, SC, #1
bge Lsingle_divloop
b Lend_regular_divide
1:
add ITER, ITER, #1
Lnot_really_big:
mov V, V, lsl `#'N
cmp V, R
bls 1b
@
@ HOW CAN ITER EVER BE -1 HERE ?????
@
cmn ITER, #1
beq Lgot_result
Ldivloop:
cmp R, #0 @ set up for initial iteration
mov Q, Q, lsl `#'N
DEVELOP_QUOTIENT_BITS(1, 0)
Lend_regular_divide:
subs ITER, ITER, #1
bge Ldivloop
cmp R, #0
@ non-restoring fixup here (one instruction only!)
ifelse(OP, `div',
` sublt Q, Q, #1
', ` addlt R, divisor, R
')
Lgot_result:
ifelse(S, `true',
` @ check to see if answer should be < 0
cmp SIGN, #0
ifelse(OP, `div', `rsbmi Q, Q, #0', `rsbmi R, R, #0')
')
ifelse(OP, `div', `mov r0, Q', `mov r0, R')
ret
Ldiv_zero:
@ Divide by zero trap. If it returns, return 0 (about as
@ wrong as possible, but that is what SunOS does...).
bl ___div0
mov r0, #0
ret
*/
/* Return-instruction helpers, selected by the calling standard in use.
   RET       is used as "RET pc, lr" to return through the link register.
   RETc(x)   is the same return with condition code x pasted into the
             mnemonic, e.g. "RETc(cc) pc, lr".
   RETCOND   is appended to a load-multiple that pops pc, e.g.
             "ldmia sp!, {pc}RETCOND".
   When __APCS_26__ is defined the flag-setting forms (movs, mov<x>s) and
   the "^" suffix are used; otherwise the plain forms are used and RETCOND
   expands to nothing.
   NOTE(review): presumably the APCS-26 forms exist so that the return also
   restores the processor status flags of the caller -- confirm against the
   ARM architecture manual.  */
#ifdef __APCS_26__
#define RET movs
#define RETc(x) mov##x##s
#define RETCOND ^
#else
#define RET mov
#define RETc(x) mov##x
#define RETCOND
#endif
......@@ -290,1323 +58,339 @@ Ldiv_zero:
#ifdef L_udivsi3
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
dividend .req r0
divisor .req r1
result .req r2
curbit .req r3
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
.globl SYM (__udivsi3)
.align 0
SYM (__udivsi3):
stmdb sp!, {r4, r5, lr}
@ Ready to divide. Compute size of quotient; scale comparand.
movs lr, r1
mov r3, r0
beq Ldiv_zero
cmp r3, lr @ if r1 exceeds r0, done
mov r2, #0
bcc Lgot_result @ (and algorithm fails otherwise)
mov r4, #(1 << (32 - 4 - 1))
cmp r3, r4
mov ip, #0
bcc Lnot_really_big
@ Here the dividend is >= 2^(31-N) or so. We must be careful here,
@ as our usual N-at-a-shot divide step will cause overflow and havoc.
@ The number of bits in the result here is N*ITER+SC, where SC <= N.
@ Compute ITER in an unorthodox manner: know we need to shift V into
@ the top decade: so do not even bother to compare to R.
mov r5, #1
1:
cmp lr, r4
bcs 3f
mov lr, lr, lsl #4
add ip, ip, #1
b 1b
@ Now compute r5.
2: adds lr, lr, lr
add r5, r5, #1
bcc Lnot_too_big
@ We get here if the r1 overflowed while shifting.
@ This means that r3 has the high-order bit set.
@ Restore lr and subtract from r3.
mov r4, r4, lsl #4
mov lr, lr, lsr #1
add lr, r4, lr
sub r5, r5, #1
b Ldo_single_div
Lnot_too_big:
3: cmp lr, r3
bcc 2b
@ beq Ldo_single_div
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
@ lr > r3: went too far: back up 1 step
@ srl lr, 1, lr
@ dec r5
@ do single-bit divide steps
@
@ We have to be careful here. We know that r3 >= lr, so we can do the
@ first divide step without thinking. BUT, the others are conditional,
@ and are only done if r3 >= 0. Because both r3 and lr may have the high-
@ order bit set in the first step, just falling into the regular
@ division loop will mess up the first time around.
@ So we unroll slightly...
Ldo_single_div:
subs r5, r5, #1
blt Lend_regular_divide
sub r3, r3, lr
mov r2, #1
b Lend_single_divloop
Lsingle_divloop:
cmp r3, #0
mov r2, r2, lsl #1
mov lr, lr, lsr #1
@ r3 >= 0
subpl r3, r3, lr
addpl r2, r2, #1
@ r3 < 0
addmi r3, r3, lr
submi r2, r2, #1
Lend_single_divloop:
subs r5, r5, #1
bge Lsingle_divloop
b Lend_regular_divide
1:
add ip, ip, #1
Lnot_really_big:
mov lr, lr, lsl #4
cmp lr, r3
bls 1b
@
@ HOW CAN ip EVER BE -1 HERE ?????
@
cmn ip, #1
beq Lgot_result
Ldivloop:
cmp r3, #0 @ set up for initial iteration
mov r2, r2, lsl #4
@ depth 1, accumulated bits 0
mov lr, lr, lsr #1
blt L.1.1015
@ remainder is positive
subs r3, r3, lr
@ depth 2, accumulated bits 1
mov lr, lr, lsr #1
blt L.2.1016
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits 3
mov lr, lr, lsr #1
blt L.3.1018
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 7
mov lr, lr, lsr #1
blt L.4.1022
@ remainder is positive
subs r3, r3, lr
add r2, r2, #15
b 9f
L.4.1022:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #13
b 9f
L.3.1018:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 5
mov lr, lr, lsr #1
blt L.4.1020
@ remainder is positive
subs r3, r3, lr
add r2, r2, #11
b 9f
L.4.1020:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #9
b 9f
L.2.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits 1
mov lr, lr, lsr #1
blt L.3.1016
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 3
mov lr, lr, lsr #1
blt L.4.1018
@ remainder is positive
subs r3, r3, lr
add r2, r2, #7
b 9f
L.4.1018:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #5
b 9f
L.3.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 1
mov lr, lr, lsr #1
blt L.4.1016
@ remainder is positive
subs r3, r3, lr
add r2, r2, #3
b 9f
L.4.1016:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #1
b 9f
L.1.1015:
@ remainder is negative
adds r3, r3, lr
@ depth 2, accumulated bits -1
mov lr, lr, lsr #1
blt L.2.1014
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits -1
mov lr, lr, lsr #1
blt L.3.1014
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -1
mov lr, lr, lsr #1
blt L.4.1014
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #1
b 9f
L.4.1014:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #3
b 9f
L.3.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -3
mov lr, lr, lsr #1
blt L.4.1012
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #5
b 9f
L.4.1012:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #7
b 9f
L.2.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits -3
mov lr, lr, lsr #1
blt L.3.1012
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -5
mov lr, lr, lsr #1
blt L.4.1010
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #9
b 9f
L.4.1010:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #11
b 9f
L.3.1012:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -7
mov lr, lr, lsr #1
blt L.4.1008
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #13
b 9f
L.4.1008:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #15
b 9f
9:
Lend_regular_divide:
subs ip, ip, #1
bge Ldivloop
cmp r3, #0
@ non-restoring fixup here (one instruction only!)
sublt r2, r2, #1
@ __udivsi3: unsigned 32-bit division.
@ In:   dividend (r0), divisor (r1).
@ Out:  r0 = dividend / divisor (copied from "result" at Lgot_result).
@ Uses: result (r2) accumulates quotient bits; curbit (r3) is the quotient
@       bit that corresponds to the current alignment of the divisor.
@       r0-r3 and the condition flags are overwritten.
@ A zero divisor branches to Ldiv0, which is defined outside this block.
SYM (__udivsi3):
cmp divisor, #0
beq Ldiv0
mov curbit, #1
mov result, #0
@ If dividend < divisor (unsigned) the quotient is 0, already in result.
cmp dividend, divisor
bcc Lgot_result
Loop1:
@ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main
@ division loop. Continue shifting until the divisor is
@ larger than the dividend.
@ The cmpcc only runs when the first cmp left carry clear, so the
@ conditional shifts and the branch are taken only while
@ divisor < 0x10000000 AND divisor < dividend (both unsigned).
cmp divisor, #0x10000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #4
movcc curbit, curbit, lsl #4
bcc Loop1
Lbignum:
@ For very big divisors, we must shift it a bit at a time, or
@ we will be in danger of overflowing.
@ Same chained-compare pattern as Loop1, with a limit of 0x80000000
@ and a shift of one bit per iteration.
cmp divisor, #0x80000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #1
movcc curbit, curbit, lsl #1
bcc Lbignum
Loop3:
@ Test for possible subtractions, and note which bits
@ are done in the result. On the final pass, this may subtract
@ too much from the dividend, but the result will be ok, since the
@ "bit" will have been shifted out at the bottom.
@ Four trial subtractions per pass: divisor, divisor/2, divisor/4 and
@ divisor/8, each recording the matching curbit position in result when
@ the subtraction is made (carry set means dividend >= operand).
cmp dividend, divisor
subcs dividend, dividend, divisor
orrcs result, result, curbit
cmp dividend, divisor, lsr #1
subcs dividend, dividend, divisor, lsr #1
orrcs result, result, curbit, lsr #1
cmp dividend, divisor, lsr #2
subcs dividend, dividend, divisor, lsr #2
orrcs result, result, curbit, lsr #2
cmp dividend, divisor, lsr #3
subcs dividend, dividend, divisor, lsr #3
orrcs result, result, curbit, lsr #3
@ Loop exit: if the remaining dividend is zero the movnes is skipped and
@ Z stays set from the cmp; otherwise movnes sets Z when curbit has been
@ shifted down to zero. The movne and bne below act on that updated Z.
cmp dividend, #0 @ Early termination?
movnes curbit, curbit, lsr #4 @ No, any more bits to do?
movne divisor, divisor, lsr #4
bne Loop3
Lgot_result:
@ Move the quotient into the return register and return via lr.
mov r0, result
RET pc, lr
mov r0, r2
ldmia sp!, {r4, r5, pc}RETCOND
Ldiv_zero:
@ Divide by zero trap. If it returns, return 0 (about as
@ wrong as possible, but that is what SunOS does...).
Ldiv0:
str lr, [sp, #-4]!
bl SYM (__div0)
mov r0, #0
ldmia sp!, {r4, r5, pc}RETCOND
mov r0, #0 @ about as wrong as it could be
ldmia sp!, {pc}RETCOND
#endif /* L_udivsi3 */
#ifdef L_divsi3
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
.globl SYM (__divsi3)
.align 0
SYM (__divsi3):
stmdb sp!, {r4, r5, r6, lr}
@ compute sign of result; if neither is negative, no problem
eor r6, r1, r0 @ compute sign
cmp r1, #0
rsbmi r1, r1, #0
beq Ldiv_zero
mov lr, r1
movs r3, r0
rsbmi r3, r3, #0 @ make dividend nonnegative
cmp r3, lr @ if r1 exceeds r0, done
mov r2, #0
bcc Lgot_result @ (and algorithm fails otherwise)
mov r4, #(1 << (32 - 4 - 1))
cmp r3, r4
mov ip, #0
bcc Lnot_really_big
@ Here the dividend is >= 2^(31-N) or so. We must be careful here,
@ as our usual N-at-a-shot divide step will cause overflow and havoc.
@ The number of bits in the result here is N*ITER+SC, where SC <= N.
@ Compute ITER in an unorthodox manner: know we need to shift V into
@ the top decade: so do not even bother to compare to R.
mov r5, #1
1:
cmp lr, r4
bcs 3f
mov lr, lr, lsl #4
add ip, ip, #1
b 1b
@ Now compute r5.
2: adds lr, lr, lr
add r5, r5, #1
bcc Lnot_too_big
@ We get here if the r1 overflowed while shifting.
@ This means that r3 has the high-order bit set.
@ Restore lr and subtract from r3.
mov r4, r4, lsl #4
mov lr, lr, lsr #1
add lr, r4, lr
sub r5, r5, #1
b Ldo_single_div
Lnot_too_big:
3: cmp lr, r3
bcc 2b
@ beq Ldo_single_div
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
@ lr > r3: went too far: back up 1 step
@ srl lr, 1, lr
@ dec r5
@ do single-bit divide steps
@
@ We have to be careful here. We know that r3 >= lr, so we can do the
@ first divide step without thinking. BUT, the others are conditional,
@ and are only done if r3 >= 0. Because both r3 and lr may have the high-
@ order bit set in the first step, just falling into the regular
@ division loop will mess up the first time around.
@ So we unroll slightly...
Ldo_single_div:
subs r5, r5, #1
blt Lend_regular_divide
sub r3, r3, lr
mov r2, #1
b Lend_single_divloop
Lsingle_divloop:
cmp r3, #0
mov r2, r2, lsl #1
mov lr, lr, lsr #1
@ r3 >= 0
subpl r3, r3, lr
addpl r2, r2, #1
@ r3 < 0
addmi r3, r3, lr
submi r2, r2, #1
Lend_single_divloop:
subs r5, r5, #1
bge Lsingle_divloop
b Lend_regular_divide
1:
add ip, ip, #1
Lnot_really_big:
mov lr, lr, lsl #4
cmp lr, r3
bls 1b
@
@ HOW CAN ip EVER BE -1 HERE ?????
@
cmn ip, #1
beq Lgot_result
Ldivloop:
cmp r3, #0 @ set up for initial iteration
mov r2, r2, lsl #4
@ depth 1, accumulated bits 0
mov lr, lr, lsr #1
blt L.1.1015
@ remainder is positive
subs r3, r3, lr
@ depth 2, accumulated bits 1
mov lr, lr, lsr #1
blt L.2.1016
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits 3
mov lr, lr, lsr #1
blt L.3.1018
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 7
mov lr, lr, lsr #1
blt L.4.1022
@ remainder is positive
subs r3, r3, lr
add r2, r2, #15
b 9f
L.4.1022:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #13
b 9f
L.3.1018:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 5
mov lr, lr, lsr #1
blt L.4.1020
@ remainder is positive
subs r3, r3, lr
add r2, r2, #11
b 9f
L.4.1020:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #9
b 9f
L.2.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits 1
mov lr, lr, lsr #1
blt L.3.1016
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 3
mov lr, lr, lsr #1
blt L.4.1018
@ remainder is positive
subs r3, r3, lr
add r2, r2, #7
b 9f
L.4.1018:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #5
b 9f
L.3.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 1
mov lr, lr, lsr #1
blt L.4.1016
@ remainder is positive
subs r3, r3, lr
add r2, r2, #3
b 9f
L.4.1016:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #1
b 9f
L.1.1015:
@ remainder is negative
adds r3, r3, lr
@ depth 2, accumulated bits -1
mov lr, lr, lsr #1
blt L.2.1014
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits -1
mov lr, lr, lsr #1
blt L.3.1014
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -1
mov lr, lr, lsr #1
blt L.4.1014
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #1
b 9f
L.4.1014:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #3
b 9f
L.3.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -3
mov lr, lr, lsr #1
blt L.4.1012
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #5
b 9f
L.4.1012:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #7
b 9f
L.2.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits -3
mov lr, lr, lsr #1
blt L.3.1012
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -5
mov lr, lr, lsr #1
blt L.4.1010
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #9
b 9f
L.4.1010:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #11
b 9f
L.3.1012:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -7
mov lr, lr, lsr #1
blt L.4.1008
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #13
b 9f
L.4.1008:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #15
b 9f
9:
Lend_regular_divide:
subs ip, ip, #1
bge Ldivloop
cmp r3, #0
@ non-restoring fixup here (one instruction only!)
sublt r2, r2, #1
Lgot_result:
@ check to see if answer should be < 0
cmp r6, #0
rsbmi r2, r2, #0
mov r0, r2
ldmia sp!, {r4, r5, r6, pc}RETCOND
Ldiv_zero:
@ Divide by zero trap. If it returns, return 0 (about as
@ wrong as possible, but that is what SunOS does...).
bl SYM (__div0)
mov r0, #0
ldmia sp!, {r4, r5, r6, pc}RETCOND
#endif /* L_divsi3 */
#ifdef L_umodsi3
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
dividend .req r0
divisor .req r1
overdone .req r2
curbit .req r3
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
.globl SYM (__umodsi3)
.align 0
SYM (__umodsi3):
stmdb sp!, {r4, r5, lr}
@ Ready to divide. Compute size of quotient; scale comparand.
movs lr, r1
mov r3, r0
beq Ldiv_zero
cmp r3, lr @ if r1 exceeds r0, done
mov r2, #0
bcc Lgot_result @ (and algorithm fails otherwise)
mov r4, #(1 << (32 - 4 - 1))
cmp r3, r4
mov ip, #0
bcc Lnot_really_big
@ Here the dividend is >= 2^(31-N) or so. We must be careful here,
@ as our usual N-at-a-shot divide step will cause overflow and havoc.
@ The number of bits in the result here is N*ITER+SC, where SC <= N.
@ Compute ITER in an unorthodox manner: know we need to shift V into
@ the top decade: so do not even bother to compare to R.
mov r5, #1
1:
cmp lr, r4
bcs 3f
mov lr, lr, lsl #4
add ip, ip, #1
b 1b
@ Now compute r5.
2: adds lr, lr, lr
add r5, r5, #1
bcc Lnot_too_big
@ We get here if the r1 overflowed while shifting.
@ This means that r3 has the high-order bit set.
@ Restore lr and subtract from r3.
mov r4, r4, lsl #4
mov lr, lr, lsr #1
add lr, r4, lr
sub r5, r5, #1
b Ldo_single_div
Lnot_too_big:
3: cmp lr, r3
bcc 2b
@ beq Ldo_single_div
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
@ lr > r3: went too far: back up 1 step
@ srl lr, 1, lr
@ dec r5
@ do single-bit divide steps
@
@ We have to be careful here. We know that r3 >= lr, so we can do the
@ first divide step without thinking. BUT, the others are conditional,
@ and are only done if r3 >= 0. Because both r3 and lr may have the high-
@ order bit set in the first step, just falling into the regular
@ division loop will mess up the first time around.
@ So we unroll slightly...
Ldo_single_div:
subs r5, r5, #1
blt Lend_regular_divide
sub r3, r3, lr
mov r2, #1
b Lend_single_divloop
Lsingle_divloop:
cmp r3, #0
mov r2, r2, lsl #1
mov lr, lr, lsr #1
@ r3 >= 0
subpl r3, r3, lr
addpl r2, r2, #1
@ r3 < 0
addmi r3, r3, lr
submi r2, r2, #1
Lend_single_divloop:
subs r5, r5, #1
bge Lsingle_divloop
b Lend_regular_divide
1:
add ip, ip, #1
Lnot_really_big:
mov lr, lr, lsl #4
cmp lr, r3
bls 1b
@
@ HOW CAN ip EVER BE -1 HERE ?????
@
cmn ip, #1
beq Lgot_result
Ldivloop:
cmp r3, #0 @ set up for initial iteration
mov r2, r2, lsl #4
@ depth 1, accumulated bits 0
mov lr, lr, lsr #1
blt L.1.1015
@ remainder is positive
subs r3, r3, lr
@ depth 2, accumulated bits 1
mov lr, lr, lsr #1
blt L.2.1016
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits 3
mov lr, lr, lsr #1
blt L.3.1018
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 7
mov lr, lr, lsr #1
blt L.4.1022
@ remainder is positive
subs r3, r3, lr
add r2, r2, #15
b 9f
L.4.1022:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #13
b 9f
L.3.1018:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 5
mov lr, lr, lsr #1
blt L.4.1020
@ remainder is positive
subs r3, r3, lr
add r2, r2, #11
b 9f
L.4.1020:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #9
b 9f
L.2.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits 1
mov lr, lr, lsr #1
blt L.3.1016
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 3
mov lr, lr, lsr #1
blt L.4.1018
@ remainder is positive
subs r3, r3, lr
add r2, r2, #7
b 9f
L.4.1018:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #5
b 9f
L.3.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 1
mov lr, lr, lsr #1
blt L.4.1016
@ remainder is positive
subs r3, r3, lr
add r2, r2, #3
b 9f
L.4.1016:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #1
b 9f
L.1.1015:
@ remainder is negative
adds r3, r3, lr
@ depth 2, accumulated bits -1
mov lr, lr, lsr #1
blt L.2.1014
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits -1
mov lr, lr, lsr #1
blt L.3.1014
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -1
mov lr, lr, lsr #1
blt L.4.1014
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #1
b 9f
L.4.1014:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #3
b 9f
L.3.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -3
mov lr, lr, lsr #1
blt L.4.1012
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #5
b 9f
L.4.1012:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #7
b 9f
L.2.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits -3
mov lr, lr, lsr #1
blt L.3.1012
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -5
mov lr, lr, lsr #1
blt L.4.1010
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #9
b 9f
L.4.1010:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #11
b 9f
@ __umodsi3: unsigned 32-bit remainder.
@ In:   dividend (r0), divisor (r1).
@ Out:  r0 = dividend % divisor; the remainder is developed in place in
@       the dividend register.
@ Uses: overdone (r2) records which of the scaled subtractions of the
@       current pass were made; curbit (r3) tracks the alignment of the
@       divisor; ip keeps the curbit value of the most recent pass.
@       r0-r3, ip and the condition flags are overwritten.
@ A zero divisor branches to Ldiv0.
SYM (__umodsi3):
cmp divisor, #0
beq Ldiv0
mov curbit, #1
@ If dividend < divisor (unsigned), r0 already holds the remainder, so
@ return immediately.
cmp dividend, divisor
RETc(cc) pc, lr
Loop1:
@ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main
@ division loop. Continue shifting until the divisor is
@ larger than the dividend.
@ The cmpcc only runs when the first cmp left carry clear, so the
@ conditional shifts and the branch are taken only while
@ divisor < 0x10000000 AND divisor < dividend (both unsigned).
cmp divisor, #0x10000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #4
movcc curbit, curbit, lsl #4
bcc Loop1
Lbignum:
@ For very big divisors, we must shift it a bit at a time, or
@ we will be in danger of overflowing.
@ Same chained-compare pattern as Loop1, with a limit of 0x80000000
@ and a shift of one bit per iteration.
cmp divisor, #0x80000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #1
movcc curbit, curbit, lsl #1
bcc Lbignum
Loop3:
@ Test for possible subtractions. On the final pass, this may
@ subtract too much from the dividend, so keep track of which
@ subtractions are done, we can fix them up afterwards...
@ overdone is cleared at the start of every pass, so after the loop it
@ describes the last pass only. The rotate (ror) rather than a shift
@ means a curbit that would fall off the bottom reappears in the top
@ bits of overdone.
mov overdone, #0
cmp dividend, divisor
subcs dividend, dividend, divisor
cmp dividend, divisor, lsr #1
subcs dividend, dividend, divisor, lsr #1
orrcs overdone, overdone, curbit, ror #1
cmp dividend, divisor, lsr #2
subcs dividend, dividend, divisor, lsr #2
orrcs overdone, overdone, curbit, ror #2
cmp dividend, divisor, lsr #3
subcs dividend, dividend, divisor, lsr #3
orrcs overdone, overdone, curbit, ror #3
@ Remember the curbit of this pass before it is shifted down.
mov ip, curbit
@ Loop exit: if the remaining dividend is zero the movnes is skipped and
@ Z stays set from the cmp; otherwise movnes sets Z when curbit has been
@ shifted down to zero. The movne and bne below act on that updated Z,
@ so on exit divisor still has the alignment used by the last pass.
cmp dividend, #0 @ Early termination?
movnes curbit, curbit, lsr #4 @ No, any more bits to do?
movne divisor, divisor, lsr #4
bne Loop3
@ Any subtractions that we should not have done will be recorded in
@ the top three bits of "overdone". Exactly which were not needed
@ are governed by the position of the bit, stored in ip.
@ If we terminated early, because dividend became zero,
@ then none of the below will match, since the bit in ip will not be
@ in the bottom nibble.
ands overdone, overdone, #0xe0000000
RETc(eq) pc, lr @ No fixups needed
@ Each tst checks whether the recorded bit for one scaled subtraction
@ wrapped into the top bits; if so that subtraction is undone by adding
@ the same scaled divisor back.
tst overdone, ip, ror #3
addne dividend, dividend, divisor, lsr #3
tst overdone, ip, ror #2
addne dividend, dividend, divisor, lsr #2
tst overdone, ip, ror #1
addne dividend, dividend, divisor, lsr #1
RET pc, lr
L.3.1012:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -7
mov lr, lr, lsr #1
blt L.4.1008
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #13
@ Divide-by-zero path: save the return address on the stack, call
@ __div0, and if that call returns, return 0 to the original caller by
@ popping the saved return address straight into pc.
Ldiv0:
str lr, [sp, #-4]!
bl SYM (__div0)
mov r0, #0 @ about as wrong as it could be
ldmia sp!, {pc}RETCOND
b 9f
L.4.1008:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #15
b 9f
#endif /* L_umodsi3 */
9:
Lend_regular_divide:
subs ip, ip, #1
bge Ldivloop
cmp r3, #0
@ non-restoring fixup here (one instruction only!)
addlt r3, r1, r3
#ifdef L_divsi3
dividend .req r0
divisor .req r1
result .req r2
curbit .req r3
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
.globl SYM (__divsi3)
.align 0
@ __divsi3: signed 32-bit division.
@ In:   dividend (r0), divisor (r1).
@ Out:  r0 = quotient; it is negated at Lgot_result when the sign bit of
@       ip (dividend EOR divisor) is set, i.e. when the operand signs differ.
@ Uses: result (r2) accumulates quotient bits; curbit (r3) is the quotient
@       bit that corresponds to the current alignment of the divisor.
@       r0-r3, ip and the condition flags are overwritten.
@ A zero divisor branches to Ldiv0, which is defined outside this block.
SYM (__divsi3):
eor ip, dividend, divisor @ Save the sign of the result.
mov curbit, #1
mov result, #0
@ rsbmi does not update the flags, so the beq below still tests the
@ result of this cmp against zero.
cmp divisor, #0
rsbmi divisor, divisor, #0 @ Loops below use unsigned.
beq Ldiv0
@ Replace a negative dividend by its negation as well.
cmp dividend, #0
rsbmi dividend, dividend, #0
@ If dividend < divisor (unsigned) the quotient is 0, already in result.
cmp dividend, divisor
bcc Lgot_result
Loop1:
@ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main
@ division loop. Continue shifting until the divisor is
@ larger than the dividend.
@ The cmpcc only runs when the first cmp left carry clear, so the
@ conditional shifts and the branch are taken only while
@ divisor < 0x10000000 AND divisor < dividend (both unsigned).
cmp divisor, #0x10000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #4
movcc curbit, curbit, lsl #4
bcc Loop1
Lbignum:
@ For very big divisors, we must shift it a bit at a time, or
@ we will be in danger of overflowing.
@ Same chained-compare pattern as Loop1, with a limit of 0x80000000
@ and a shift of one bit per iteration.
cmp divisor, #0x80000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #1
movcc curbit, curbit, lsl #1
bcc Lbignum
Loop3:
@ Test for possible subtractions, and note which bits
@ are done in the result. On the final pass, this may subtract
@ too much from the dividend, but the result will be ok, since the
@ "bit" will have been shifted out at the bottom.
@ Four trial subtractions per pass: divisor, divisor/2, divisor/4 and
@ divisor/8, each recording the matching curbit position in result when
@ the subtraction is made (carry set means dividend >= operand).
cmp dividend, divisor
subcs dividend, dividend, divisor
orrcs result, result, curbit
cmp dividend, divisor, lsr #1
subcs dividend, dividend, divisor, lsr #1
orrcs result, result, curbit, lsr #1
cmp dividend, divisor, lsr #2
subcs dividend, dividend, divisor, lsr #2
orrcs result, result, curbit, lsr #2
cmp dividend, divisor, lsr #3
subcs dividend, dividend, divisor, lsr #3
orrcs result, result, curbit, lsr #3
@ Loop exit: if the remaining dividend is zero the movnes is skipped and
@ Z stays set from the cmp; otherwise movnes sets Z when curbit has been
@ shifted down to zero. The movne and bne below act on that updated Z.
cmp dividend, #0 @ Early termination?
movnes curbit, curbit, lsr #4 @ No, any more bits to do?
movne divisor, divisor, lsr #4
bne Loop3
Lgot_result:
@ Move the magnitude of the quotient into r0, then negate it if the
@ saved sign word in ip is negative.
mov r0, result
cmp ip, #0
rsbmi r0, r0, #0
RET pc, lr
mov r0, r3
ldmia sp!, {r4, r5, pc}RETCOND
Ldiv_zero:
@ Divide by zero trap. If it returns, return 0 (about as
@ wrong as possible, but that is what SunOS does...).
Ldiv0:
str lr, [sp, #-4]!
bl SYM (__div0)
mov r0, #0
ldmia sp!, {r4, r5, pc}RETCOND
mov r0, #0 @ about as wrong as it could be
ldmia sp!, {pc}RETCOND
#endif /* L_umodsi3 */
#endif /* L_divsi3 */
#ifdef L_modsi3
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
dividend .req r0
divisor .req r1
overdone .req r2
curbit .req r3
ip .req r12
sp .req r13
lr .req r14
pc .req r15
.text
.globl SYM (__modsi3)
.align 0
SYM (__modsi3):
stmdb sp!, {r4, r5, r6, lr}
@ compute sign of result; if neither is negative, no problem
mov r6, r0
cmp r1, #0
rsbmi r1, r1, #0
beq Ldiv_zero
mov lr, r1
movs r3, r0
rsbmi r3, r3, #0 @ make dividend nonnegative
cmp r3, lr @ if r1 exceeds r0, done
mov r2, #0
bcc Lgot_result @ (and algorithm fails otherwise)
mov r4, #(1 << (32 - 4 - 1))
cmp r3, r4
mov ip, #0
bcc Lnot_really_big
@ Here the dividend is >= 2^(31-N) or so. We must be careful here,
@ as our usual N-at-a-shot divide step will cause overflow and havoc.
@ The number of bits in the result here is N*ITER+SC, where SC <= N.
@ Compute ITER in an unorthodox manner: know we need to shift V into
@ the top decade: so do not even bother to compare to R.
mov r5, #1
1:
cmp lr, r4
bcs 3f
mov lr, lr, lsl #4
add ip, ip, #1
b 1b
@ Now compute r5.
2: adds lr, lr, lr
add r5, r5, #1
bcc Lnot_too_big
@ We get here if the r1 overflowed while shifting.
@ This means that r3 has the high-order bit set.
@ Restore lr and subtract from r3.
mov r4, r4, lsl #4
mov lr, lr, lsr #1
add lr, r4, lr
sub r5, r5, #1
b Ldo_single_div
Lnot_too_big:
3: cmp lr, r3
bcc 2b
@ beq Ldo_single_div
/* NB: these are commented out in the V8-Sparc manual as well */
/* (I do not understand this) */
@ lr > r3: went too far: back up 1 step
@ srl lr, 1, lr
@ dec r5
@ do single-bit divide steps
@
@ We have to be careful here. We know that r3 >= lr, so we can do the
@ first divide step without thinking. BUT, the others are conditional,
@ and are only done if r3 >= 0. Because both r3 and lr may have the high-
@ order bit set in the first step, just falling into the regular
@ division loop will mess up the first time around.
@ So we unroll slightly...
Ldo_single_div:
subs r5, r5, #1
blt Lend_regular_divide
sub r3, r3, lr
mov r2, #1
b Lend_single_divloop
Lsingle_divloop:
cmp r3, #0
mov r2, r2, lsl #1
mov lr, lr, lsr #1
@ r3 >= 0
subpl r3, r3, lr
addpl r2, r2, #1
@ r3 < 0
addmi r3, r3, lr
submi r2, r2, #1
Lend_single_divloop:
subs r5, r5, #1
bge Lsingle_divloop
b Lend_regular_divide
1:
add ip, ip, #1
Lnot_really_big:
mov lr, lr, lsl #4
cmp lr, r3
bls 1b
@
@ HOW CAN ip EVER BE -1 HERE ?????
@
cmn ip, #1
SYM (__modsi3):
mov curbit, #1
cmp divisor, #0
rsbmi divisor, divisor, #0 @ Loops below use unsigned.
beq Ldiv0
@ Need to save the sign of the dividend, unfortunately, we need
@ ip later on; this is faster than pushing lr and using that.
str dividend, [sp, #-4]!
cmp dividend, #0
rsbmi dividend, dividend, #0
cmp dividend, divisor
bcc Lgot_result
Loop1:
@ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main
@ division loop. Continue shifting until the divisor is
@ larger than the dividend.
cmp divisor, #0x10000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #4
movcc curbit, curbit, lsl #4
bcc Loop1
Lbignum:
@ For very big divisors, we must shift it a bit at a time, or
@ we will be in danger of overflowing.
cmp divisor, #0x80000000
cmpcc divisor, dividend
movcc divisor, divisor, lsl #1
movcc curbit, curbit, lsl #1
bcc Lbignum
Loop3:
@ Test for possible subtractions. On the final pass, this may
@ subtract too much from the dividend, so keep track of which
@ subtractions are done, we can fix them up afterwards...
@ overdone is cleared on every pass, so after the loop it only
@ describes the last pass that ran.
mov overdone, #0
@ Trial subtraction of the divisor itself; this one is not recorded.
cmp dividend, divisor
subcs dividend, dividend, divisor
@ Trial subtractions of the divisor shifted right by 1, 2 and 3.
@ Each one that is taken is recorded by or-ing curbit rotated right
@ by the same amount into overdone.  When the rotate wraps past
@ bit 0 the mark lands in the top bits of overdone.
cmp dividend, divisor, lsr #1
subcs dividend, dividend, divisor, lsr #1
orrcs overdone, overdone, curbit, ror #1
cmp dividend, divisor, lsr #2
subcs dividend, dividend, divisor, lsr #2
orrcs overdone, overdone, curbit, ror #2
cmp dividend, divisor, lsr #3
subcs dividend, dividend, divisor, lsr #3
orrcs overdone, overdone, curbit, ror #3
@ Keep the bit position used on this pass in ip before curbit is
@ shifted down.
mov ip, curbit
cmp dividend, #0 @ Early termination?
movnes curbit, curbit, lsr #4 @ No, any more bits to do?
@ movnes rewrites Z from the shifted curbit, so the next two
@ instructions only run while curbit is still nonzero.  If the
@ dividend was already zero, Z is still set from the cmp and the
@ loop exits as well.
movne divisor, divisor, lsr #4
bne Loop3
@ Any subtractions that we should not have done will be recorded in
@ the top three bits of "overdone". Exactly which were not needed
@ are governed by the position of the bit, stored in ip.
@ If we terminated early, because dividend became zero,
@ then none of the below will match, since the bit in ip will not be
@ in the bottom nibble.
@ Keep only the top three bits; if none is set, no fix-up is needed.
ands overdone, overdone, #0xe0000000
beq Lgot_result
@ Four quotient bits per pass.  r2 is shifted left by four, then four
@ add-or-subtract steps are made, each one halving lr first.  A step
@ subtracts lr from r3 when the previous result was non-negative and
@ adds lr to r3 when it was negative (the first step looks at the
@ sign of r3 on entry).  None of the mov instructions update the
@ flags, so every blt tests the preceding cmp, subs or adds.
@ Each leaf adds the net signed digit for its path to r2, that is
@ the sum of +8 or -8, +4 or -4, +2 or -2 and +1 or -1 (always odd,
@ from -15 to +15), and then branches to the 9: label that follows
@ the tree.
Ldivloop:
cmp r3, #0 @ set up for initial iteration
mov r2, r2, lsl #4
@ depth 1, accumulated bits 0
mov lr, lr, lsr #1
blt L.1.1015
@ remainder is positive
subs r3, r3, lr
@ depth 2, accumulated bits 1
mov lr, lr, lsr #1
blt L.2.1016
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits 3
mov lr, lr, lsr #1
blt L.3.1018
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 7
mov lr, lr, lsr #1
blt L.4.1022
@ remainder is positive
subs r3, r3, lr
add r2, r2, #15
b 9f
L.4.1022:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #13
b 9f
L.3.1018:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 5
mov lr, lr, lsr #1
blt L.4.1020
@ remainder is positive
subs r3, r3, lr
add r2, r2, #11
b 9f
L.4.1020:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #9
b 9f
L.2.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits 1
mov lr, lr, lsr #1
blt L.3.1016
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits 3
mov lr, lr, lsr #1
blt L.4.1018
@ remainder is positive
subs r3, r3, lr
add r2, r2, #7
b 9f
L.4.1018:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #5
b 9f
L.3.1016:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits 1
mov lr, lr, lsr #1
blt L.4.1016
@ remainder is positive
subs r3, r3, lr
add r2, r2, #3
b 9f
L.4.1016:
@ remainder is negative
adds r3, r3, lr
add r2, r2, #1
b 9f
L.1.1015:
@ remainder is negative
adds r3, r3, lr
@ depth 2, accumulated bits -1
mov lr, lr, lsr #1
blt L.2.1014
@ remainder is positive
subs r3, r3, lr
@ depth 3, accumulated bits -1
mov lr, lr, lsr #1
blt L.3.1014
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -1
mov lr, lr, lsr #1
blt L.4.1014
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #1
b 9f
L.4.1014:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #3
b 9f
L.3.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -3
mov lr, lr, lsr #1
blt L.4.1012
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #5
b 9f
L.4.1012:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #7
b 9f
L.2.1014:
@ remainder is negative
adds r3, r3, lr
@ depth 3, accumulated bits -3
mov lr, lr, lsr #1
blt L.3.1012
@ remainder is positive
subs r3, r3, lr
@ depth 4, accumulated bits -5
mov lr, lr, lsr #1
blt L.4.1010
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #9
b 9f
L.4.1010:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #11
b 9f
L.3.1012:
@ remainder is negative
adds r3, r3, lr
@ depth 4, accumulated bits -7
mov lr, lr, lsr #1
blt L.4.1008
@ remainder is positive
subs r3, r3, lr
sub r2, r2, #13
b 9f
L.4.1008:
@ remainder is negative
adds r3, r3, lr
sub r2, r2, #15
b 9f
9:
Lend_regular_divide:
@ Count down ip and run another Ldivloop pass while it is still >= 0.
subs ip, ip, #1
bge Ldivloop
@ If the remainder in r3 ended up negative, add r1 to it once.
cmp r3, #0
@ non-restoring fixup here (one instruction only!)
addlt r3, r1, r3
@ Fix-up for the trial subtractions recorded in overdone: for each of
@ the shifts 3, 2 and 1, test overdone against ip rotated right by
@ that amount and, when the bit is set, add the divisor shifted right
@ by the same amount back onto the dividend.
@ NOTE(review): the instructions above use raw register numbers and
@ the ones below use the named aliases; they look like parts of two
@ different implementations and should be checked against the real
@ file.
tst overdone, ip, ror #3
addne dividend, dividend, divisor, lsr #3
tst overdone, ip, ror #2
addne dividend, dividend, divisor, lsr #2
tst overdone, ip, ror #1
addne dividend, dividend, divisor, lsr #1
@ Result exit.  Two separate return sequences follow this label.
Lgot_result:
@ check to see if answer should be < 0
@ First sequence: negate r3 when r6 is negative, copy it to r0, then
@ pop r4, r5, r6 and pc.
cmp r6, #0
rsbmi r3, r3, #0
mov r0, r3
ldmia sp!, {r4, r5, r6, pc}RETCOND
@ Second sequence: pop one word into ip (post-indexed load), negate
@ dividend when that word is negative, then return through RET.
@ In __modsi3 the only word pushed before branching here is the
@ original signed dividend.
@ NOTE(review): the ldmia above loads pc, so these instructions are
@ not reached by falling through and have no label of their own;
@ check against the real file which sequence belongs here.
@ RET and RETCOND are macros defined outside this excerpt.
ldr ip, [sp], #4
cmp ip, #0
rsbmi dividend, dividend, #0
RET pc, lr
Ldiv_zero:
@ Divide by zero trap. If it returns, return 0 (about as
@ wrong as possible, but that is what SunOS does...).
Ldiv0:
@ Save the return address, call the __div0 handler, and if it comes
@ back return 0 in r0.
@ __modsi3 branches here before it pushes the dividend, so on that
@ path the saved lr is the only word this routine has on the stack.
str lr, [sp, #-4]!
bl SYM (__div0)
mov r0, #0
@ NOTE(review): this pops four words although only lr is pushed
@ above, and because it loads pc the two instructions after it are
@ not reached by falling through.  The final ldmia, which pops pc
@ alone, is the one that matches the single push; check against the
@ real file.
ldmia sp!, {r4, r5, r6, pc}RETCOND
mov r0, #0 @ about as wrong as it could be
ldmia sp!, {pc}RETCOND
#endif /* L_modsi3 */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment