I know that you are going for speed as well as compactness, and I can't figure out how to beat Garth's improved FIG UM/MOD for speed, but if I was going for smallest size, I would probably reuse code for UD/MOD and UM/MOD like so (58 code bytes total, headers not included):
Code: Select all
; - - - - - - - - - - - - - - - - - - - - - - - - - - -
; UM/MOD ( ud u1 -- u2 u3 )
; Dividend ud is 32-bits unsigned. Divisor u1,
; Remainder u2 and Quotient u3 are 16-bits unsigned
; Invalid inputs and/or outputs are silently ignored:
; there is no zero-divisor check, and the high cell of
; the 32-bit quotient is simply discarded, so a quotient
; that does not fit in 16 bits is truncated.
; 1823 cycles (+ NEXT) best case (zero quotient)
; 2239 cycles (+ NEXT) worst case (zero divisor)
; (cycle figures are the author's, counted for the
; listing as posted; recount if xudivmod or swap change)
;
; xudivmod leaves the quotient low cell in 3OS, the
; quotient high cell in NOS and the remainder in TOS;
; the two instructions after the call reshape that
; into ( remainder quotient ).
umslashmod:
jsr xudivmod ; (do the dirty work) -> ( qlo qhi rem )
inx ; NIP: drop the quotient high cell -> ( qlo rem )
jmp swap ; SWAP -> ( rem qlo ); swap is defined elsewhere and is presumably what ends the word
; - - - - - - - - - - - - - - - - - - - - - - - - - - -
; UD/MOD ( ud1 u1 -- u2 ud2 )
; Dividend ud1 and Quotient ud2 are 32-bits unsigned
; Divisor u1 and Remainder u2 are 16-bits unsigned
; Invalid inputs and/or outputs are silently ignored
; (in particular there is no zero-divisor check).
; 1856 cycles (+ NEXT) best case (zero quotient)
; 2272 cycles (+ NEXT) worst case (zero divisor)
; (cycle figures are the author's, counted for the
; listing as posted; recount if xudivmod or dashrot change)
;
; xudivmod leaves the quotient low cell in 3OS, the
; quotient high cell in NOS and the remainder in TOS;
; -ROT tucks the remainder underneath the quotient.
udslashmod:
jsr xudivmod ; (do the dirty work) -> ( qlo qhi rem )
jmp dashrot ; -ROT -> ( rem qlo qhi ); dashrot is defined elsewhere and is presumably what ends the word
; - - - - - - - - - - - - - - - - - - - - - - - - - - -
; Internal divmod routine used by UM/MOD and UD/MOD
; ( ud1 u1 -- ud2 u2 )
; ud1 is dividend, u1 is divisor
; ud2 is quotient, u2 is remainder
;
; Bit-at-a-time shift-and-subtract division, 32 passes.
; In:   TOS        = divisor (16 bits)
;       NOS:3OS    = dividend (h:l), i.e. stackh/stackl,x
;                    and stackh+1/stackl+1,x
; Out:  TOS        = remainder
;       NOS:3OS    = quotient (h:l)
; Uses: A, Y, flags, the byte N, and one byte of hardware
;       stack inside the loop (pushed and pulled again on
;       every pass). X is not modified.
; A zero divisor is not detected.
;
; The partial remainder is kept in N:A. Before each shift
; it is smaller than the divisor, so after the shift it
; can be up to twice the divisor, which needs 17 bits when
; the divisor is greater than $8000. The 17th bit falls
; out of "rol N" into the carry; if it is set the
; remainder is certainly larger than any 16-bit divisor
; and the subtraction is forced. Without that check the
; bit is lost and e.g. $00010000 / $FFFF yields 0 rem 0
; instead of 1 rem 1.
;
; This check makes the loop a few bytes longer and a few
; cycles per pass slower than a loop that only does the
; 16-bit compare, so any cycle totals quoted for the
; callers must be recounted against this version.
xudivmod:
ldy #32 ;[2] init loop counter
lda #0 ;[2] init partial remainder
sta N ;[3] in N:A (h:l)
xudm2:
asl stackl+1,x ;[6] dividend in NOS:3OS (h:l)
rol stackh+1,x ;[6] is gradually replaced
rol stackl,x ;[6] with the quotient
rol stackh,x ;[6]
rol ;[2] N:A is gradually replaced
rol N ;[5] with the remainder; C = bit 16
pha ;[3] park the low byte (PHA keeps C)
bcs xudm4 ;[3]* bit 16 set: must subtract
cmp TOS ;[3] TOS holds divisor
lda N ;[3] partial remainder >= TOS?
sbc TOS+1 ;[3]
bcc xudm3 ;[3]* no: quotient bit stays 0
xudm4:
pla ;[4] yes: take the divisor away
sec ;[2] from the partial remainder,
sbc TOS ;[3] low byte first so the
pha ;[3] borrow ripples into the
lda N ;[3] high byte (PHA/LDA keep C)
sbc TOS+1 ;[3]
sta N ;[3]
inc stackl+1,x ;[6] set the low quotient bit
xudm3:
pla ;[4] A = low byte of the remainder
dey ;[2] loop 32 times
bne xudm2 ;[3]*
sta TOS ;[3] remainder replaces the divisor
ldy N ;[3]
sty TOS+1 ;[3]
rts ;[6]
YMMV, untested, as usual.
Mike B.