Unsigned division for an ARM processor - 64bit/32bit

Selwyn Jackson

selwyn@idwiz.co.za

24th August 2012
I searched the web to try to find an unsigned division to divide a 64 bit value by a 32 bit value. Some of the best implementations used an additive process with a negated denominator. Converting these to unsigned proved to be impossible.
I eventually wrote my own assembler routine. Hopefully, this can save someone else all the time I spent researching this.
I make use of the IAR ARM compiler. The REPT and ENDR repeat the instructions between these two statements. Functions are called with the first four parameters in registers R0 to R3. R12 does not need to be preserved.

//=======================================================================
//  Description: Divide 64 bit hi:lo by 32 bit den
//  Prototype:   DWORD div64by32(DWORD hi, DWORD lo, DWORD den);
//  Parameters:  hi = high part numerator/dividend - R0
//               lo = low part numerator/dividend - R1
//               den = denominator/divisor - R2
//  Returns:     Result Quotient - R0
//  Notes:       Use subtractive process - use carry of shifter to check
//               if numerator is big enough
//=======================================================================   
    PUBLIC div64by32
div64by32:
    #define hi  r0
    #define lo  r1
    #define den r2
    #define top r12     // work area for top of numerator
    #define work r3     // work area
 
    cmp     hi, den                     // if hi > den do 32 bits, else 64 bits
    bpl     do64bits
    REPT    32
        adds    lo, lo, lo              // shift numerator through carry
        adcs    hi, hi, hi
        subscc  work, hi, den           // if carry not set, compare       
        subcs   hi, hi, den             // if carry set, subtract
        addcs   lo, lo, #1              // if carry set, and 1 to quotient
    ENDR
    mov     r0, lo                      // move result into R0
    mov     pc, lr                      // return
 
do64bits:
    mov     top, #0
    REPT    64
        adds    lo, lo, lo              // shift numerator through carry
        adcs    hi, hi, hi
        adcs    top, top, top
        subscc  work, top, den          // if carry not set, compare       
        subcs   top, top, den           // if carry set, subtract
        addcs   lo, lo, #1              // if carry set, and 1 to quotient
    ENDR
    mov     r0, lo                      // move result into R0
    mov     pc, lr                      // return
		

I excluded the boundary checks, as these are seldom used, and do not save much time. They can be added as follows:

    PUBLIC div64by32
div64by32:
    #define hi  r0
    #define lo  r1
    #define den r2
    #define top r12     // work area for top of numerator
    #define work r3     // work area
 
    // check that denominator is not zero
    cmp     den, #0                     // exception if den == zero
    beq     div0
 
    // check if denominator is 1
    subs    top, den, #1                // top = den - 1
    beq     div1                
   
    // check if denominator is greater than numerator
    cmp     hi, #0                      // if hi > 0 then larger
    cmpeq   lo, den                     // keep prev flasg if hi==0; else new comparison
    beq     divequal                    // denomitor == numerator
    bls     divless                     // denominator > numerator
 
    // check for power of 2
    tst     den, top                    // if(den & (den - 1) == 0)
    beq     powerOf2                    // den is a power of 2
 
    cmp     hi, den                     // if hi > den do 32 bits, else 64 bits
    bpl     do64bits
    REPT    32
        adds    lo, lo, lo              // shift numerator through carry
        adcs    hi, hi, hi
        subscc  work, hi, den           // if carry not set, compare       
        subcs   hi, hi, den             // if carry set, subtract
        addcs   lo, lo, #1              // if carry set, and 1 to quotient
    ENDR
    mov     r0, lo                      // move result into R0
    mov     pc, lr                      // return
 
do64bits:
    mov     top, #0
    REPT    64
        adds    lo, lo, lo              // shift numerator through carry
        adcs    hi, hi, hi
        adcs    top, top, top
        subscc  work, top, den          // if carry not set, compare       
        subcs   top, top, den           // if carry set, subtract
        addcs   lo, lo, #1              // if carry set, and 1 to quotient
    ENDR
    mov     r0, lo                      // move result into R0
    mov     pc, lr                      // return
 
    // divide by zero
div0:
    mov     r0, #0xFFFFFFFF
    mov     pc, lr                      // return
 
    // divide by 1
div1:
    mov     r0, lo
    mov     pc, lr                      // return
   
    // denominator == numerator
divequal:
    mov     r0, #1
    mov     pc, lr                      // return
   
    // denominator > numerator
divless:
    mov     r0, #0
    mov     pc, lr                      // return
 
    // power of 2
powerOf2:
    REPT    31
        movs    den, den, rrx
        bcs     powerend
        movs    hi, hi, rrx             // divide numerator by 2, carry is zero
        movs    lo, lo, rrx
    ENDR
powerend:
    mov     r0, lo
    mov     pc, lr                      // return