.globl  ..__u64div

//      unsigned 64 bit divide
//      divide r4:r3 by r6:r5 with the result in r4:r3
//
//      In each register pair the first register named is the high-order
//      word (r4 and r6 are the high words, r3 and r5 the low words).
//      Only the quotient is returned; no remainder is produced.
//      A zero divisor raises a trap (twi) instead of returning.

// Table entry describing the routine: five unaligned 32-bit words giving
// the entry label, the end label (__u64div.e), two zero words, and the
// label __u64div.b, which the source places right after "no prologue".
// NOTE(review): presumably the NT function-table layout (begin, end,
// exception handler, handler data, end of prologue) - confirm against
// the toolchain documentation.
        .pdata
        .align  2
        .ualong ..__u64div,__u64div.e,0,0,__u64div.b

        .text
        .align  2
//----------------------------------------------------------------------
// ..__u64div - unsigned 64-bit by 64-bit divide (quotient only)
//
// In:    r4:r3 = dividend (r4 = high word, r3 = low word)
//        r6:r5 = divisor  (r6 = high word, r5 = low word)
// Out:   r4:r3 = quotient
// Trap:  twi fires when the divisor is zero.
// Uses:  r0, r5-r12, cr0, cr1 and the carry bit (via subfc/subfe,
//        addc/adde and addic.) may be modified.  No memory or stack is
//        touched and LR is only used for the return (blr / beqlr).
//
// Method: shift-and-subtract division on left-justified operands.
//   r6:r5        divisor, shifted left until bit 0 (msb) of r6 is set
//   r8:r7:r4:r3  one long shift register: the dividend sits in the upper
//                words and moves left; quotient bits are entered at the
//                bottom of r3 and end up in r4:r3
//   r0           number of left shifts (= quotient bits) still allowed
//   r12          constant 32
//   r9/r10       current shift amount s and its complement 32 - s
//   r11          scratch for bits carried between words
//
// The code depends on slw/srw yielding 0 for shift amounts of 32 and
// above, so "shift by 32" and "complement shift by 32" need no special
// handling.
//----------------------------------------------------------------------
..__u64div:
        .function       ..__u64div

//      no prologue

__u64div.b:
        or.     r0,r4,r6        // cr0 <- (high words both zero?)
        bne     __u64div.1
// if both r4 and r6 are zero, use 32 bit divide
// (r4 is already zero, so r4:r3 is the full 64 bit quotient)
        divwu   r3,r3,r5
// TO=0x6 selects "equal" and "logically less than"; against 0 only
// equality can hold, so this traps exactly when r5 == 0
        twi     0x6,r5,0        // trap if division by zero
        blr

__u64div.1:
// start by left justifying divisor
// this allows us to use cntlzw to speed
// calculations
//
        li      r12,32          // r12 = 32 for rest of routine

// shift r6:r5 left by r9 = number of leading zeros in r6
        cntlzw  r9,r6
        subf.   r10,r9,r12      // r10 = 32 - r9; cr0.eq iff r6 == 0
        slw     r6,r6,r9
        srw     r11,r5,r10      // bits of r5 that move up into r6
        slw     r5,r5,r9
        or      r6,r6,r11
// li does not alter cr0, so the bne below still tests the subf. result
        li      r0,32           // if divisor >= 2^32, quotient is at most 32 bits
        bne     __u64div.2
// r10 == 0 iff r6 ==0, so may need to shift again
// (the shift by 32 above has moved the old r5 into r6 and zeroed r5)
        twi     0x6,r6,0        // trap if division by zero
        cntlzw  r9,r6
        subf    r10,r9,r12
        slw     r6,r6,r9
        li      r0,64           // quotient is up to 64 bits long
__u64div.2:
// now need to shift dividend by same amount
// note that first 32 bits are taken care of by
// changing the shift count.
//
// r8:r7:r4 = (r4:r3) << r9, r8 receiving the bits shifted out of the
// top; r3 becomes the (empty) quotient word
        srw     r8,r4,r10
        slw     r7,r4,r9
        srw     r11,r3,r10
        slw     r4,r3,r9
        or      r7,r7,r11
        li      r3,0

__u64div.3:
// main loop.
//
// begin by left justifying dividend
// and adjusting shift count appropriately
        cntlzw  r9,r8           // r9 = shift needed to bring a 1 to the top of r8
        subf.   r0,r9,r0        // charge that shift against the remaining count
        blt     __u64div.6      // done when count goes < 0

// shift r8:r7:r4:r3 left by r9 (r10 = 32 - r9).  The cr0 result of this
// subf. is consumed by the beq after the shift sequence; none of the
// instructions in between alter cr0.
        subf.   r10,r9,r12
        slw     r8,r8,r9
        srw     r11,r7,r10
        or      r8,r8,r11
        slw     r7,r7,r9
        srw     r11,r4,r10
        or      r7,r7,r11
        slw     r4,r4,r9
        srw     r11,r3,r10
        or      r4,r4,r11
        slw     r3,r3,r9

// if r10 == 0, high bit may not be one - try again
// (r8 was zero, so the sequence above moved everything up one whole word)
        beq     __u64div.3

// assume dividend >= divisor
// r8:r7 -= r6:r5 as a 64 bit subtract (borrow passes through the carry
// bit).  Both r8 and r6 have their top bit set here, so the sign of the
// result tells whether the assumption held.
        subfc   r7,r5,r7
        subfe.  r8,r6,r8
        ori     r3,r3,1         // tentatively enter a 1 quotient bit
        bge     __u64div.3      // result not negative: the bit stands

// dividend was smaller than divisor: the bit just set is wrong
// check for last quotient bit
        cmpwi   cr1,r0,0

// shift dividend/quotient left one bit
        beq     cr1,__u64div.5 // exit if last quotient bit

// rotate each word left by one, then replace its low bit with the bit
// rotated out of the top of the next lower word
        rlwinm  r8,r8,1,0,31
        rlwinm  r7,r7,1,0,31
        rlwinm  r4,r4,1,0,31
        rlwimi  r8,r7,0,31,31
        rlwimi  r7,r4,0,31,31
        rlwimi  r4,r3,1,31,31
        rlwinm  r3,r3,1,0,29   // also clears previous quotient bit

// add in divisor (now worth 1/2 subtracted value)
// i.e. 2*(R - D) + D = 2*R - D: same as shifting first, then subtracting
        addc    r7,r7,r5        // low words; carry feeds the adde below
        adde    r8,r8,r6

__u64div.4:
// decrement count
        addic.  r0,r0,-1

// set quotient bit
// (ori leaves cr0 alone, so beqlr still tests the addic. result)
        ori     r3,r3,1

        beqlr                   // if count has gone to zero, return

// shift dividend/quotient left one bit
        srwi.   r9,r8,31       // test upper bit of new dividend
        beq     __u64div.3		// if upper bit is 0 go back to original loop

// upper bit is 1: shift left one bit as above; the previous quotient
// bit is kept this time, so r3 takes a plain shift
        rlwinm  r8,r8,1,0,31
        rlwinm  r7,r7,1,0,31
        rlwinm  r4,r4,1,0,31
        rlwimi  r8,r7,0,31,31
        rlwimi  r7,r4,0,31,31
        rlwimi  r4,r3,1,31,31
        slwi    r3,r3,1

// subtract divisor from dividend and repeat
        subfc   r7,r5,r7
        subfe   r8,r6,r8

        b       __u64div.4

__u64div.5:
// eliminate last quotient bit
// (reached with count == 0, so r4:r3 needs no further shifting)
        rlwinm  r3,r3,0,0,30   // clears quotient bit
        blr

__u64div.6:
// get correct shift count
// (undo the subtraction made at the top of the loop: r9 = shifts left over)
        add     r9,r9,r0
// and shift quotient by it
        subf    r10,r9,r12
        slw     r4,r4,r9
        srw     r11,r3,r10
        slw     r3,r3,r9
        or      r4,r4,r11
        blr
__u64div.e:


        .debug$S
// Symbol information for the object file: one leading 32-bit word
// followed by two records.  Each record starts with a 16-bit length that
// counts the bytes after the length field, then a 16-bit record type.
// Strings are stored as a length byte followed by the characters.
// NOTE(review): the leading word is presumably the debug-format
// signature/version - confirm against the CodeView specification.
        .ualong         1

// record 1: object file name (2 + 4 + 1 + 10 = 17 bytes)
        .uashort        17
        .uashort        0x9            # S_OBJNAME
        .ualong         0
        .byte           10, "u64div.obj"

// record 2: compile information (2 + 1 + 1 + 1 + 1 + 1 + 17 = 24 bytes)
// NOTE(review): the two zero bytes presumably complete the flags field
// that begins with the language byte - confirm.
        .uashort        24
        .uashort        0x1            # S_COMPILE
        .byte           0x42           # Target processor = PPC 604
        .byte           3              # Language = ASM
        .byte           0
        .byte           0
        .byte           17, "PowerPC Assembler"