I searched the web to try to find an unsigned division to divide a 64 bit value by a 32 bit value. Some of the best implementations used an additive process with a negated denominator. Converting these to unsigned proved to be impossible.
I eventually wrote my own assembler routine. Hopefully, this can save someone else all the time I spent researching this.
I make use of the IAR ARM compiler. The REPT and ENDR repeat the instructions between these two statements. Functions are called with the first four parameters in registers R0 to R3. R12 does not need to be preserved.
Basic Implementation
//=======================================================================
// Description: Divide 64 bit hi:lo by 32 bit den
// Prototype: DWORD div64by32(DWORD hi, DWORD lo, DWORD den);
// Parameters: hi = high part numerator/dividend - R0
// lo = low part numerator/dividend - R1
// den = denominator/divisor - R2
// Returns: Result Quotient - R0
// Notes: Use subtractive process - use carry of shifter to check
// if numerator is big enough
//=======================================================================
PUBLIC div64by32
div64by32:
#define hi r0
#define lo r1
#define den r2
#define top r12 // work area for top of numerator
#define work r3 // work area
cmp hi, den // if hi > den do 32 bits, else 64 bits
bpl do64bits
REPT 32
adds lo, lo, lo // shift numerator through carry
adcs hi, hi, hi
subscc work, hi, den // if carry not set, compare
subcs hi, hi, den // if carry set, subtract
addcs lo, lo, #1 // if carry set, and 1 to quotient
ENDR
mov r0, lo // move result into R0
mov pc, lr // return
do64bits:
mov top, #0
REPT 64
adds lo, lo, lo // shift numerator through carry
adcs hi, hi, hi
adcs top, top, top
subscc work, top, den // if carry not set, compare
subcs top, top, den // if carry set, subtract
addcs lo, lo, #1 // if carry set, and 1 to quotient
ENDR
mov r0, lo // move result into R0
mov pc, lr // return
Extended Implementation with Boundary Checks
I excluded the boundary checks in the basic implementation, as these are seldom used, and do not save much time. They can be added as follows:
PUBLIC div64by32
div64by32:
#define hi r0
#define lo r1
#define den r2
#define top r12 // work area for top of numerator
#define work r3 // work area
// check that denominator is not zero
cmp den, #0 // exception if den == zero
beq div0
// check if denominator is 1
subs top, den, #1 // top = den - 1
beq div1
// check if denominator is greater than numerator
cmp hi, #0 // if hi > 0 then larger
cmpeq lo, den // keep prev flasg if hi==0; else new comparison
beq divequal // denomitor == numerator
bls divless // denominator > numerator
// check for power of 2
tst den, top // if(den & (den - 1) == 0)
beq powerOf2 // den is a power of 2
cmp hi, den // if hi > den do 32 bits, else 64 bits
bpl do64bits
REPT 32
adds lo, lo, lo // shift numerator through carry
adcs hi, hi, hi
subscc work, hi, den // if carry not set, compare
subcs hi, hi, den // if carry set, subtract
addcs lo, lo, #1 // if carry set, and 1 to quotient
ENDR
mov r0, lo // move result into R0
mov pc, lr // return
do64bits:
mov top, #0
REPT 64
adds lo, lo, lo // shift numerator through carry
adcs hi, hi, hi
adcs top, top, top
subscc work, top, den // if carry not set, compare
subcs top, top, den // if carry set, subtract
addcs lo, lo, #1 // if carry set, and 1 to quotient
ENDR
mov r0, lo // move result into R0
mov pc, lr // return
// divide by zero
div0:
mov r0, #0xFFFFFFFF
mov pc, lr // return
// divide by 1
div1:
mov r0, lo
mov pc, lr // return
// denominator == numerator
divequal:
mov r0, #1
mov pc, lr // return
// denominator > numerator
divless:
mov r0, #0
mov pc, lr // return
// power of 2
powerOf2:
REPT 31
movs den, den, rrx
bcs powerend
movs hi, hi, rrx // divide numerator by 2, carry is zero
movs lo, lo, rrx
ENDR
powerend:
mov r0, lo
mov pc, lr // return