138 lines
3.1 KiB
ArmAsm
138 lines
3.1 KiB
ArmAsm
.define .dvu4
|
|
|
|
! 4-byte divide routine for z80
|
|
! parameters:
|
|
! stack: divisor
|
|
! dividend
|
|
! stack: quotient (out)
|
|
! bc de: remainder (out) (high part in bc)
|
|
|
|
|
|
|
|
! a n-byte divide may be implemented
|
|
! using 2 (virtual) registers:
|
|
! - a n-byte register containing
|
|
! the divisor
|
|
! - a 2n-byte shiftregister (VSR)
|
|
!
|
|
! Initially, the VSR contains the dividend
|
|
! in its low (right) n bytes and zeroes in its
|
|
! high n bytes. The dividend is shifted
|
|
! left into a "window" bit by bit. After
|
|
! each shift, the contents of the window
|
|
! is compared with the divisor. If it is
|
|
! higher or equal, the divisor is subtracted from
|
|
! it and a "1" bit is inserted in the
|
|
! VSR from the right side; else a "0" bit
|
|
! is inserted. These bits are shifted left
|
|
! too during subsequent iterations.
|
|
! At the end, the rightmost part of VSR
|
|
! contains the quotient.
|
|
! For n=4, we need 2*4+4 = 12 bytes of
|
|
! registers. Unfortunately we only have
|
|
! 5 2-byte registers on the z80
|
|
! (bc,de,hl,ix and iy). Therefore we use
|
|
! an overlay technique for the rightmost
|
|
! 4 bytes of the VSR. The 32 iterations
|
|
! are split up into two groups: during
|
|
! the first 16 iterations we use the high
|
|
! order 16 bits of the dividend; during
|
|
! the last 16 iterations we use the
|
|
! low order 16 bits.
|
|
! register allocation:
!   VSR        iy hl ix	(iy:hl is the 32-bit window; ix holds the
!			 16 dividend bits currently being shifted
!			 in and collects the quotient bits)
!   divisor    de bc	(de = high word, bc = low word)
!   a			iteration counter of the sub-loop
!
! calling sequence (all words are 2 bytes, top of stack first):
!   in:   return address, divisor low, divisor high,
!	  dividend low, dividend high
!   out:  stack: quotient low (top), quotient high
!	  bc de: remainder (bc = high word, de = low word)
!
! The "window >= divisor" decision is taken from the borrow of a
! full 32-bit subtract. The sign bit of the difference must not be
! used for this: the operands are unsigned, and e.g.
! 0 - 0xFFFFFFFF = 1 has its sign bit clear although the window is
! smaller than the divisor.
!
! notes:
!   - no check is made for a zero divisor.
!   - the routine keeps its state in fixed memory locations
!     (.flag, .low, .retaddr, .result), so it is not reentrant.
!   - iy is saved on the stack on entry and restored before return;
!     hl, ix and a are destroyed.
.dvu4:
	! initialization
	pop hl			! save return address
	ld (.retaddr),hl
	pop bc			! bc := low word of divisor
	pop de			! de := high word of divisor
	pop hl			! low word of dividend is kept in
	ld (.low),hl		! memory until the second pass
	pop ix			! ix := high word of dividend
	push iy			! save caller's iy (LB)
	xor a			! a := 0
	ld (.flag),a		! flag := 0: first pass of main loop
	ld h,a			! window iy:hl := 0
	ld l,a
	ld iy,0			! now the VSR is initialized

	! main loop, done twice
1:
	ld a,16
	! sub-loop, done 16 times
2:
	! shift the 48-bit VSR iy:hl:ix left by one bit
	add iy,iy
	add ix,ix		! carry := next dividend bit
	adc hl,hl		! shift it into the window
	jr nc,3f
	inc iy			! propagate carry out of hl into iy
3:
	! trial subtract: window (iy:hl) - divisor (de:bc).
	! push, pop and 16-bit inc leave the flags alone, so the
	! borrow of the low half survives until the second sbc.
	or a			! clear carry, a is unchanged
	sbc hl,bc		! low half, carry := borrow
	push hl			! park the new low half
	push iy
	pop hl			! hl := high half of window
	sbc hl,de		! high half, carry := final borrow
	jr c,4f
	! window >= divisor: keep the difference and
	! insert a "1" bit into the VSR
	push hl
	pop iy			! iy := new high half
	pop hl			! hl := new low half
	inc ix			! bit 0 of ix is 0 after the shift
	jr 5f
4:
	! window < divisor: undo the low-half subtract (iy was
	! never modified), a "0" bit stays in the VSR
	pop hl
	add hl,bc
5:
	dec a
	jr nz,2b

	ld a,(.flag)		! see if this was the first or the
				! second pass of the main loop
	or a			! 0=first, 1=second
	jr nz,6f
	inc a			! a := 1
	ld (.flag),a		! flag := 1
	ld (.result),ix		! save high word of quotient
	ld ix,(.low)		! second pass shifts in the low
				! word of the dividend
	jr 1b
6:
	! clean up
	push iy			! transfer remainder
	pop bc			! from iy-hl to bc-de
	ex de,hl
	pop iy			! restore caller's iy (LB)
	ld hl,(.result)		! high word of quotient
	push hl
	push ix			! low word of quotient
	ld hl,(.retaddr)
	jp (hl)			! return

.data
.flag:		.byte 0		! 0 during first pass, 1 during second
.low:		.word 0		! low word of dividend
.retaddr:	.word 0		! caller's return address
.result:	.word 0		! high word of quotient