Added end library and floating point processor support

This commit is contained in:
ceriel 1991-02-01 15:09:58 +00:00
parent e64fb88a5d
commit 0f4e675b50
13 changed files with 718 additions and 124 deletions

View file

@ -1,5 +1,9 @@
LIST LIST
Makefile Makefile
compmodule compmodule
em_end.s
etext.s
edata.s
end.s end.s
libem_s.a libem_s.a
READ_ME

View file

@ -26,7 +26,7 @@ shp.s
sig.s sig.s
cms.s cms.s
gto.s gto.s
ffp.s fp68881.s
fat.s fat.s
trp.s trp.s
dia.s dia.s

View file

@ -1,28 +1,33 @@
# $Header$ # $Header$
MACH=m68k2 MACH=m68k2
all: libem_o.a end.o ASAR=aal
all: libem_o.a end.a
install: all install: all
../../install libem_o.a tail_em ../../install libem_o.a tail_em
../../install end.o end_em ../../install end.a end_em
cmp: all cmp: all
-../../compare libem_o.a tail_em -../../compare libem_o.a tail_em
-../../compare end.o end_em -../../compare end.a end_em
end.o: end.s end.a: em_end.s etext.s edata.s end.s
$(MACH) -I../../../h -c em_end.s
$(MACH) -I../../../h -c edata.s
$(MACH) -I../../../h -c etext.s
$(MACH) -I../../../h -c end.s $(MACH) -I../../../h -c end.s
$(ASAR) cr end.a em_end.o etext.o edata.o end.o
libem_o.a: libem_s.a libem_o.a: libem_s.a
ASAR=aal ; export ASAR ;\ ASAR=$(ASAR) ; export ASAR ;\
march . libem_o.a march . libem_o.a
clean: clean:
rm -f *.o libem_o.a rm -f *.o libem_o.a end.a
opr : opr :
make pr | opr make pr | opr
pr: pr:
@arch pv libem_s.a | pr -h `pwd`/libem_s.a @arch pv libem_s.a | pr -h `pwd`/libem_s.a
@pr `pwd`/end.s @pr `pwd`/em_end.s `pwd`/edata.s `pwd`/etext.s `pwd`/end.s

View file

@ -1,5 +1,4 @@
The original EM library routines saved all registers The routines in mli.s, mlu.s, dvi.s, and dvu.s are written by
(including scratch registers) in global data; hence they Kai-Uwe Bloem and were published on the comp.os.minix newsgroup.
were not reentrant. He allowed us to use them for ACK, but requested that
The new routines do not save registers d0,d1,d2,a0 and a1. they do not fall under the ACK copyright notice. So, they don't.
They are reentrant.

View file

@ -5,38 +5,96 @@
.sect .bss .sect .bss
! signed long divide ! signed long divide
!-----------------------------------------------------------------------------
! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
! #1 01/12/90 initial revision. Minor reduction of shift operations.
! #2 03/07/90 use 68000 divu instruction wherever possible. This change
! makes #1 superfluous. (derived from my GNU division routine)
!-----------------------------------------------------------------------------
! Some common cases can be handled in a special, much faster way :
! 1) divisor = 0
! => cause trap, then return to user. Result is undefined
! 2) dividend < divisor
! => quotient = 0, remainder = dividend
! 3) divisor < 0x10000 ( i.e. divisor is only 16 bits wide )
! => quotient and remainder can be calculated quite fast by repeated
! application of 68000 divu operations (ca. 400 cycles)
! 4) otherwise (due to #2, #3 dividend, divisor both wider than 16 bits)
! => do slow division by shift and subtract
!-----------------------------------------------------------------------------
! register usage:
! : d0 divisor
! d1 dividend
! exit : d1 quotient
! d2 remainder
.sect .text .sect .text
.dvi: .dvi:
move.l (sp)+,a0 ! return address move.l (sp)+,a1 ! return address
move.l (sp)+,d0 move.l (sp)+,d0 ! divisor
move.l (sp)+,d1 move.l (sp)+,d2 ! dividend
move.l d3,-(sp) ! save d3 and d4 move.l d3,a0 ! save d3
move.l d4,-(sp) move.l d4,-(sp) ! save result sign register
clr.l d4 clr.l d4
tst.l d0 ! divisor tst.l d2
bpl 1f bpl 0f ! dividend is negative ?
neg.l d0 neg.l d2 ! yes - negate
not d4 not.l d4 ! and note negation in d4
1: 0:
tst.l d1 ! dividend tst.l d0
bpl 2f bpl 0f ! divisor is negative ?
neg.l d1 neg.l d0 ! yes - negate
not d4 not.w d4 ! note negation
swap d4 0:
not d4 clr.l d1 ! prepare quotient
swap d4 ! === case 1: divisor = 0
tst.l d0 ! divisor = 0 ?
beq 9f ! yes - divide by zero trap
! === case 2: dividend < divisor
cmp.l d0,d2 ! dividend < divisor ?
bcs 8f ! yes - division already finished
! === case 3: divisor <= 0x0ffff
cmp.l #0x0ffff,d0 ! is divisor only 16 bits wide ?
bhi 2f
move.w d2,d3 ! save dividend.l
clr.w d2 ! prepare dividend.h for divu operation
swap d2
beq 0f ! dividend.h is all zero, no divu necessary
divu d0,d2
0: move.w d2,d1 ! save quotient.h
swap d1
move.w d3,d2 ! divide dividend.l
divu d0,d2 ! (d2.h = remainder of prev divu)
move.w d2,d1 ! save qoutient.l
clr.w d2 ! get remainder
swap d2
bra 8f
! === case 4: divisor and dividend both > 0x0ffff
2: 2:
move.l d1,-(sp) move #32-1,d3 ! loop count
move.l d0,-(sp) 4:
jsr .dvu lsl.l #1,d2 ! shift dividend ...
tst d4 roxl.l #1,d1 ! ... into d1
beq 5f cmp.l d0,d1 ! compare with divisor
neg.l d1 ! quotient bcs 5f
sub.l d0,d1 ! bigger, subtract divisor
add #1,d2 ! note subtraction in result
5: 5:
tst.l d4 dbra d3,4b
bpl 6f exg d1,d2 ! get results in the correct registers
neg.l d2 ! remainder 8:
6: tst.w d4 ! quotient < 0 ?
move.l (sp)+,d4 ! restore d4 and d3 bpl 0f
move.l (sp)+,d3 neg.l d1 ! yes - negate
jmp (a0) 0: tst.l d4 ! remainder < 0 ?
bpl 0f
neg.l d2
0: move.l (sp)+,d4 ! restore d4
move.l a0,d3 ! restore d3
jmp (a1)
EIDIVZ = 6
9: move.w #EIDIVZ,-(sp)
jsr .trp

View file

@ -5,34 +5,77 @@
.sect .bss .sect .bss
! unsigned long divide ! unsigned long divide
!-----------------------------------------------------------------------------
! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
! #1 01/12/90 initial revision. Minor reduction of shift operations.
! #2 03/07/90 use 68000 divu instruction wherever possible. This change
! makes #1 superfluous. (derived from my GNU division routine)
!-----------------------------------------------------------------------------
! Some common cases can be handled in a special, much faster way :
! 1) divisor = 0
! => cause trap, then return to user. Result is undefined
! 2) dividend < divisor
! => quotient = 0, remainder = dividend
! 3) divisor < 0x10000 ( i.e. divisor is only 16 bits wide )
! => quotient and remainder can be calculated quite fast by repeated
! application of 68000 divu operations (ca. 400 cycles)
! 4) otherwise (due to #2, #3 dividend, divisor both wider than 16 bits)
! => do slow division by shift and subtract
!-----------------------------------------------------------------------------
! register usage: ! register usage:
! : d0 divisor ! : d0 divisor
! d1 dividend ! d1 dividend
! exit : d1 quotient ! exit : d1 quotient
! d2 remainder ! d2 remainder
.sect .text .sect .text
.dvu: .dvu:
move.l d3,a0 ! save d3
move.l (sp)+,a1 ! return address move.l (sp)+,a1 ! return address
move.l (sp)+,d0 move.l (sp)+,d0 ! divisor
move.l (sp)+,d1 move.l (sp)+,d2 ! dividend
move.l d3,-(sp) ! save d3 clr.l d1 ! prepare quotient
tst.l d0 ! === case 1: divisor = 0
bne 0f tst.l d0 ! divisor = 0 ?
move.l (sp)+,d3 beq 9f ! yes - divide by zero trap
move.w #EIDIVZ,-(sp) ! === case 2: dividend < divisor
jsr .trp cmp.l d0,d2 ! dividend < divisor ?
0: bcs 8f ! yes - division already finished
clr.l d2 ! === case 3: divisor <= 0x0ffff
move.l #32,d3 cmp.l #0x0ffff,d0 ! is divisor only 16 bits wide ?
3: bhi 2f
lsl.l #1,d1 move.w d2,d3 ! save dividend.l
roxl.l #1,d2 clr.w d2 ! prepare dividend.h for divu operation
cmp.l d0,d2 swap d2
blt 4f beq 0f ! dividend.h is all zero, no divu necessary
sub.l d0,d2 divu d0,d2
add #1,d1 0: move.w d2,d1 ! save quotient.h
swap d1
move.w d3,d2 ! divide dividend.l
divu d0,d2 ! (d2.h = remainder of prev divu)
move.w d2,d1 ! save qoutient.l
clr.w d2 ! get remainder
swap d2
bra 8f
! === case 4: divisor and dividend both > 0x0ffff
2:
move #32-1,d3 ! loop count
4: 4:
sub #1,d3 lsl.l #1,d2 ! shift dividend ...
bgt 3b roxl.l #1,d1 ! ... into d1
move.l (sp)+,d3 cmp.l d0,d1 ! compare with divisor
bcs 5f
sub.l d0,d1 ! bigger, subtract divisor
add #1,d2 ! note subtraction in result
5:
dbra d3,4b
exg d1,d2 ! get results in the correct registers
8:
move.l a0,d3 ! restore d3
jmp (a1) jmp (a1)
EIDIVZ = 6
9: move.w #EIDIVZ,-(sp)
jsr .trp

9
mach/m68k2/libem/edata.s Normal file
View file

@ -0,0 +1,9 @@
! edata.s: defines the global symbol _edata in the .data section.
! The four .sect lines below name the sections before any of them is
! used; presumably this fixes their relative order -- confirm against
! the ACK assembler documentation.
.sect .text
.sect .rom
.sect .data
.sect .bss
.define _edata
! Align the data section, then place _edata at the current position in
! .data. Where that position ends up in the final image depends on the
! link order of this module (not visible here).
.sect .data
.align 4
.sect .data
_edata:

22
mach/m68k2/libem/em_end.s Normal file
View file

@ -0,0 +1,22 @@
! em_end.s: defines the globals endtext, enddata, endbss and __end.
! The four .sect lines below name the sections before any of them is
! used; presumably this fixes their relative order -- confirm against
! the ACK assembler documentation.
.sect .text
.sect .rom
.sect .data
.sect .bss
.define endtext,enddata,endbss,__end
! Align the current position in every section before the labels below
! are placed.
.sect .text
.align 4
.sect .rom
.align 4
.sect .data
.align 4
.sect .bss
.align 4
.sect .end ! section used only to hold __end and endbss (_end is not defined in this file)
.sect .text
endtext:
.sect .data
enddata:
! __end and endbss are two names for the same address in .end.
.sect .end
__end:
endbss:

View file

@ -1,16 +1,7 @@
.define endtext,enddata,endbss,_etext,_edata,_end
.sect .text .sect .text
.sect .rom .sect .rom
.sect .data .sect .data
.sect .bss .sect .bss
.sect .end ! only for declaration of _end and endbss. .define _end
.sect .end ! only for declaration of _end, __end and endbss.
.sect .text
endtext:
_etext:
.sect .data
enddata:
_edata:
.sect .end
_end: _end:
endbss:

9
mach/m68k2/libem/etext.s Normal file
View file

@ -0,0 +1,9 @@
! etext.s: defines the global symbol _etext in the .text section.
! The four .sect lines below name the sections before any of them is
! used; presumably this fixes their relative order -- confirm against
! the ACK assembler documentation.
.sect .text
.sect .rom
.sect .data
.sect .bss
.define _etext
! Align the text section, then place _etext at the current position in
! .text. Where that position ends up in the final image depends on the
! link order of this module (not visible here).
.sect .text
.align 4
.sect .text
_etext:

352
mach/m68k2/libem/fp68881.s Normal file
View file

@ -0,0 +1,352 @@
.define .adf4, .adf8, .sbf4, .sbf8, .mlf4, .mlf8, .dvf4, .dvf8
.define .ngf4, .ngf8, .fif4, .fif8, .fef4, .fef8
.define .cif4, .cif8, .cuf4, .cuf8, .cfi, .cfu, .cff4, .cff8
.define .cmf4, .cmf8
.sect .text
.sect .rom
.sect .data
.sect .bss
! $Header$
! Implement interface to floating point package for M68881
.sect .text
! .adf4: single-precision add.
! After the return address is popped, the two 4-byte operands are read
! at (sp) and 4(sp); their sum overwrites the operand at 4(sp).
! sp is left pointing at the operand at (sp); removing it is presumably
! the caller's job -- confirm against the code generator.
! Uses a0 and fp0.
.adf4:
move.l (sp)+,a0 ! a0 = return address
fmove.s (sp),fp0 ! fp0 = operand on top of the stack
fadd.s 4(sp),fp0 ! fp0 += operand below it
fmove.s fp0,4(sp) ! sum replaces the lower operand
jmp (a0) ! return through the popped address
! .adf8: double-precision add.
! After the return address is popped, the two 8-byte operands are read
! at (sp) and 8(sp); their sum overwrites the operand at 8(sp).
! sp is left pointing at the operand at (sp); removing it is presumably
! the caller's job -- confirm against the code generator.
! Uses a0 and fp0.
.adf8:
move.l (sp)+,a0 ! a0 = return address
fmove.d (sp),fp0 ! fp0 = operand on top of the stack
fadd.d 8(sp),fp0 ! fp0 += operand below it
fmove.d fp0,8(sp) ! sum replaces the lower operand
jmp (a0) ! return through the popped address
! .sbf4: single-precision subtract.
! Computes (operand at 4(sp)) - (operand at (sp)), offsets taken after
! the return address is popped, and stores the difference at 4(sp).
! sp is left pointing at the operand at (sp).
! Uses a0, fp0 and fp1.
.sbf4:
move.l (sp)+,a0 ! a0 = return address
fmove.s (sp),fp0 ! fp0 = subtrahend (top of stack)
fmove.s 4(sp),fp1 ! fp1 = minuend
fsub fp0,fp1 ! fp1 = fp1 - fp0
fmove.s fp1,4(sp) ! difference replaces the minuend
jmp (a0) ! return through the popped address
! .sbf8: double-precision subtract.
! Computes (operand at 8(sp)) - (operand at (sp)), offsets taken after
! the return address is popped, and stores the difference at 8(sp).
! sp is left pointing at the operand at (sp).
! Uses a0, fp0 and fp1.
.sbf8:
move.l (sp)+,a0 ! a0 = return address
fmove.d (sp),fp0 ! fp0 = subtrahend (top of stack)
fmove.d 8(sp),fp1 ! fp1 = minuend
fsub fp0,fp1 ! fp1 = fp1 - fp0
fmove.d fp1,8(sp) ! difference replaces the minuend
jmp (a0) ! return through the popped address
! .mlf4: single-precision multiply.
! After the return address is popped, the two 4-byte operands are read
! at (sp) and 4(sp); their product overwrites the operand at 4(sp).
! sp is left pointing at the operand at (sp).
! Uses a0 and fp0.
.mlf4:
move.l (sp)+,a0 ! a0 = return address
fmove.s (sp),fp0 ! fp0 = operand on top of the stack
fmul.s 4(sp),fp0 ! fp0 *= operand below it
fmove.s fp0,4(sp) ! product replaces the lower operand
jmp (a0) ! return through the popped address
! .mlf8: double-precision multiply.
! After the return address is popped, the two 8-byte operands are read
! at (sp) and 8(sp); their product overwrites the operand at 8(sp).
! sp is left pointing at the operand at (sp).
! Uses a0 and fp0.
.mlf8:
move.l (sp)+,a0 ! a0 = return address
fmove.d (sp),fp0 ! fp0 = operand on top of the stack
fmul.d 8(sp),fp0 ! fp0 *= operand below it
fmove.d fp0,8(sp) ! product replaces the lower operand
jmp (a0) ! return through the popped address
! .dvf4: single-precision divide.
! Computes (operand at 4(sp)) / (operand at (sp)), offsets taken after
! the return address is popped, and stores the quotient at 4(sp).
! No explicit check for a zero divisor is made here.
! sp is left pointing at the operand at (sp).
! Uses a0, fp0 and fp1.
.dvf4:
move.l (sp)+,a0 ! a0 = return address
fmove.s (sp),fp0 ! fp0 = divisor (top of stack)
fmove.s 4(sp),fp1 ! fp1 = dividend
fdiv fp0,fp1 ! fp1 = fp1 / fp0
fmove.s fp1,4(sp) ! quotient replaces the dividend
jmp (a0) ! return through the popped address
! .dvf8: double-precision divide.
! Computes (operand at 8(sp)) / (operand at (sp)), offsets taken after
! the return address is popped, and stores the quotient at 8(sp).
! No explicit check for a zero divisor is made here.
! sp is left pointing at the operand at (sp).
! Uses a0, fp0 and fp1.
.dvf8:
move.l (sp)+,a0 ! a0 = return address
fmove.d (sp),fp0 ! fp0 = divisor (top of stack)
fmove.d 8(sp),fp1 ! fp1 = dividend
fdiv fp0,fp1 ! fp1 = fp1 / fp0
fmove.d fp1,8(sp) ! quotient replaces the dividend
jmp (a0) ! return through the popped address
! .ngf4: single-precision negate, in place.
! Unlike the binary operations in this file, the return address stays on
! the stack, so the operand is addressed at 4(sp) and the routine
! returns with rts.
! Uses fp0.
.ngf4:
fmove.s 4(sp),fp0 ! fp0 = operand (just above the return address)
fneg fp0 ! fp0 = -fp0
fmove.s fp0,4(sp) ! write the result back over the operand
rts
! .ngf8: double-precision negate, in place.
! Unlike the binary operations in this file, the return address stays on
! the stack, so the operand is addressed at 4(sp) and the routine
! returns with rts.
! Uses fp0.
.ngf8:
fmove.d 4(sp),fp0 ! fp0 = operand (just above the return address)
fneg fp0 ! fp0 = -fp0
fmove.d fp0,4(sp) ! write the result back over the operand
rts
! .fif4: multiply two singles and split the product into integer and
! fraction parts.
! Stack after the return address is popped:
!   (sp)   long: address of the result area (read, not popped)
!   4(sp)  single operand
!   8(sp)  single operand
! Result area: (a1) = integer part (product truncated toward zero),
!              4(a1) = fraction part (product - integer part).
! Uses a0, a1, fp0 and fp1.
.fif4:
move.l (sp)+,a0 ! a0 = return address
move.l (sp),a1 ! a1 = address of the result area
fmove.s 4(sp),fp0
fmove.s 8(sp),fp1
fmul fp0,fp1 ! fp1 = product
fintrz fp1,fp0 ! fp0 = product truncated toward zero
fsub fp0,fp1 ! fp1 = product - integer part
fmove.s fp1,4(a1) ! store fraction part
fmove.s fp0,(a1) ! store integer part
jmp (a0) ! return through the popped address
! .fif8: multiply two doubles and split the product into integer and
! fraction parts.
! Stack after the return address is popped:
!   (sp)   long: address of the result area (read, not popped)
!   4(sp)  double operand
!   12(sp) double operand
! Result area: (a1) = integer part (product truncated toward zero),
!              8(a1) = fraction part (product - integer part).
! Uses a0, a1, fp0 and fp1.
.fif8:
move.l (sp)+,a0 ! a0 = return address
move.l (sp),a1 ! a1 = address of the result area
fmove.d 4(sp),fp0
fmove.d 12(sp),fp1
fmul fp0,fp1 ! fp1 = product
fintrz fp1,fp0 ! fp0 = product truncated toward zero
fsub fp0,fp1 ! fp1 = product - integer part
fmove.d fp1,8(a1) ! store fraction part
fmove.d fp0,(a1) ! store integer part
jmp (a0) ! return through the popped address
! .fef4: split a single into a 16-bit exponent and a single mantissa.
! Stack after the return address is popped:
!   (sp)   long: address of the result area (read, not popped)
!   4(sp)  single operand
! Result area: (a1) = exponent word, 2(a1) = mantissa (single).
! Three outcomes:
!   - status bit 0x2000 set after fgetexp (infinite operand, according
!     to the original comment): exponent 129, mantissa +0.5 or -0.5
!     depending on the sign of the operand;
!   - fgetman result is zero: exponent 0, mantissa = that zero result;
!   - otherwise: exponent = fgetexp result + 1, mantissa = fgetman / 2.
! Uses a0, a1, d0, fp0 and fp1.
.fef4:
move.l (sp)+,a0 ! a0 = return address
move.l (sp),a1 ! a1 = address of the result area
fmove.s 4(sp),fp0
fgetexp fp0,fp1 ! fp1 = exponent of the operand
fmove.l fpsr,d0
! NOTE(review): 0x2000 looks like the OPERR bit of the FPSR exception
! byte, which fgetexp is expected to raise for an infinite source --
! confirm against the MC68881 manual.
and.l #0x2000,d0 ! set if Infinity
beq 1f
move.w #129,(a1) ! exponent reported for the infinite case
fmove.s 4(sp),fp0 ! reload the operand to set the FPU condition codes
fblt 2f ! negative operand?
move.l #0x3f000000,2(a1) ! mantissa = +0.5 (single bit pattern)
jmp (a0)
2:
move.l #0xbf000000,2(a1) ! mantissa = -0.5 (single bit pattern)
jmp (a0)
1:
fmove.l fp1,d0 ! d0 = exponent as an integer
add.l #1,d0 ! +1 compensates for the halving of the mantissa below
fgetman fp0 ! fp0 = mantissa of the operand
fbne 1f ! non-zero mantissa: go scale it
clr.l d0 ! zero: exponent 0, fp0 stored as is
bra 2f
1:
fmove.l #2,fp1
fdiv fp1,fp0 ! fp0 = mantissa / 2
2:
fmove.s fp0,2(a1) ! store mantissa
move.w d0,(a1) ! store exponent (low word of d0)
jmp (a0) ! return through the popped address
! .fef8: split a double into a 16-bit exponent and a double mantissa.
! Stack after the return address is popped:
!   (sp)   long: address of the result area (read, not popped)
!   4(sp)  double operand
! Result area: (a1) = exponent word, 2(a1) = mantissa (double).
! Three outcomes:
!   - status bit 0x2000 set after fgetexp (infinite operand, according
!     to the original comment): exponent 1025, mantissa +0.5 or -0.5
!     depending on the sign of the operand;
!   - fgetman result is zero: exponent 0, mantissa = that zero result;
!   - otherwise: exponent = fgetexp result + 1, mantissa = fgetman / 2.
! Uses a0, a1, d0, fp0 and fp1.
.fef8:
move.l (sp)+,a0 ! a0 = return address
move.l (sp),a1 ! a1 = address of the result area
fmove.d 4(sp),fp0
fgetexp fp0,fp1 ! fp1 = exponent of the operand
fmove.l fpsr,d0
! NOTE(review): 0x2000 looks like the OPERR bit of the FPSR exception
! byte, which fgetexp is expected to raise for an infinite source --
! confirm against the MC68881 manual.
and.l #0x2000,d0 ! set if Infinity
beq 1f
move.w #1025,(a1) ! exponent reported for the infinite case
fmove.d 4(sp),fp0 ! reload the operand to set the FPU condition codes
fblt 2f ! negative operand?
move.l #0x3fe00000,2(a1) ! high long of +0.5 (double bit pattern)
clr.l 6(a1) ! low long of the mantissa = 0
jmp (a0)
2:
move.l #0xbfe00000,2(a1) ! high long of -0.5 (double bit pattern)
clr.l 6(a1) ! low long of the mantissa = 0
jmp (a0)
1:
fmove.l fp1,d0 ! d0 = exponent as an integer
add.l #1,d0 ! +1 compensates for the halving of the mantissa below
fgetman fp0 ! fp0 = mantissa of the operand
fbne 1f ! non-zero mantissa: go scale it
clr.l d0 ! zero: exponent 0, fp0 stored as is
bra 2f
1:
fmove.l #2,fp1
fdiv fp1,fp0 ! fp0 = mantissa / 2
2:
fmove.d fp0,2(a1) ! store mantissa
move.w d0,(a1) ! store exponent (low word of d0)
jmp (a0) ! return through the popped address
! .cif4: convert a signed integer to single precision.
! Stack after the return address is popped:
!   (sp)   word: size of the integer in bytes (2 selects the word path;
!          any other value is handled as a 4-byte integer)
!   2(sp)  the integer itself
! Result: 2-byte source -> single stored at (sp), covering the size
! word and the integer; otherwise -> single stored at 2(sp), over the
! integer. sp itself is not adjusted here.
! Uses a0 and fp0.
.cif4:
move.l (sp)+,a0 ! a0 = return address
cmp.w #2,(sp) ! 2-byte source integer?
bne 1f
fmove.w 2(sp),fp0 ! fp0 = signed word
fmove.s fp0,(sp)
jmp (a0)
1:
fmove.l 2(sp),fp0 ! fp0 = signed long
fmove.s fp0,2(sp)
jmp (a0)
! .cif8: convert a signed integer to double precision.
! Stack after the return address is popped:
!   (sp)   word: size of the integer in bytes (2 selects the word path;
!          any other value is handled as a 4-byte integer)
!   2(sp)  the integer itself
! Result: the double is stored at (sp) in both cases.
! NOTE(review): the 8-byte store covers more than the 4 or 6 bytes of
! arguments read here; presumably the caller reserves the extra space
! -- confirm against the code generator.
! Uses a0 and fp0.
.cif8:
move.l (sp)+,a0 ! a0 = return address
cmp.w #2,(sp) ! 2-byte source integer?
bne 1f
fmove.w 2(sp),fp0 ! fp0 = signed word
fmove.d fp0,(sp)
jmp (a0)
1:
fmove.l 2(sp),fp0 ! fp0 = signed long
fmove.d fp0,(sp)
jmp (a0)
! .cuf4: convert an unsigned integer to single precision.
! Stack after the return address is popped:
!   (sp)   word: size of the integer in bytes (2 selects the word path;
!          any other value is handled as a 4-byte integer)
!   2(sp)  the integer itself
! The integer is first loaded as a signed value; if it was negative the
! loaded value is corrected by adding 2^16 (word) or 2^32 (long).
! Result: 2-byte source -> single stored at (sp); otherwise -> single
! stored at 2(sp). sp itself is not adjusted here.
! Uses a0 and fp0.
.cuf4:
move.l (sp)+,a0 ! a0 = return address
cmp.w #2,(sp) ! 2-byte source integer?
bne 2f
fmove.w 2(sp),fp0 ! loaded as a signed word
tst.w 2(sp)
bge 1f ! top bit clear: value is already correct
fadd.l #65536,fp0 ! undo the sign interpretation: add 2^16
1:
fmove.s fp0,(sp)
jmp (a0)
2:
fmove.l 2(sp),fp0 ! loaded as a signed long
tst.l 2(sp)
bge 1f ! top bit clear: value is already correct
! Subtracting -2^31 twice adds 2^32 in total; presumably done this way
! because 2^32 does not fit in a long immediate.
fsub.l #-2147483648,fp0
fsub.l #-2147483648,fp0
1:
fmove.s fp0,2(sp)
jmp (a0)
! .cuf8: convert an unsigned integer to double precision.
! Stack after the return address is popped:
!   (sp)   word: size of the integer in bytes (2 selects the word path;
!          any other value is handled as a 4-byte integer)
!   2(sp)  the integer itself
! The integer is first loaded as a signed value; if it was negative the
! loaded value is corrected by adding 2^16 (word) or 2^32 (long).
! Result: the double is stored at (sp) in both cases.
! NOTE(review): the 8-byte store covers more than the 4 or 6 bytes of
! arguments read here; presumably the caller reserves the extra space
! -- confirm against the code generator.
! Uses a0, d0 and fp0.
.cuf8:
move.l (sp)+,a0 ! a0 = return address
move.w (sp),d0 ! d0 = size of the source integer
cmp.w #2,d0
bne 2f
fmove.w 2(sp),fp0 ! loaded as a signed word
tst.w 2(sp)
bge 1f ! top bit clear: value is already correct
fadd.l #65536,fp0 ! undo the sign interpretation: add 2^16
bra 1f
2:
fmove.l 2(sp),fp0 ! loaded as a signed long
tst.l 2(sp)
bge 1f ! top bit clear: value is already correct
! Subtracting -2^31 twice adds 2^32 in total; presumably done this way
! because 2^32 does not fit in a long immediate.
fsub.l #-2147483648,fp0
fsub.l #-2147483648,fp0
1:
fmove.d fp0,(sp)
jmp (a0)
! .cfi: convert a floating point value to a signed integer, truncating
! toward zero.
! Stack after the return address is popped:
!   (sp)   word: size of the integer result in bytes (2 selects a word;
!          any other value is handled as a 4-byte integer)
!   2(sp)  word: size of the floating point source in bytes (4 selects
!          single; any other value is handled as double)
!   4(sp)  the floating point value
! Result: stored so that it ends on the last byte of the source value:
!   single source: word at 6(sp),  long at 4(sp)
!   double source: word at 10(sp), long at 8(sp)
! sp on exit equals sp after the return address was popped.
! Uses a0, d0, d1 and fp0.
!
! The value is passed through fintrz before it is stored. Without it,
! the fmove.w / fmove.l stores would round according to the current FPCR
! rounding mode (round-to-nearest by default) instead of truncating.
! The unsigned counterpart .cfu uses fintrz for the same purpose.
.cfi:
move.l (sp)+,a0 ! a0 = return address
move.w (sp),d1 ! d1 = size of the integer result
move.w 2(sp),d0 ! d0 = size of the floating point source
cmp.w #4,d0
bne 1f
fmove.s 4(sp),fp0 ! single source
bra 2f
1:
fmove.d 4(sp),fp0 ! double source
add.l #4,sp ! bias sp so the stores below land in the last 4 bytes of the double
2:
fintrz fp0,fp0 ! drop the fraction: truncate toward zero
cmp.w #2,d1
bne 1f
fmove.w fp0,6(sp) ! 2-byte result
bra 2f
1:
fmove.l fp0,4(sp) ! 4-byte result
2:
cmp.w #4,d0
beq 1f
sub.l #4,sp ! undo the bias applied for a double source
1:
jmp (a0) ! return through the popped address
! .cfu: convert a floating point value to an unsigned integer.
! Stack after the return address is popped:
!   (sp)   word: size of the integer result in bytes (2 selects a word;
!          any other value is handled as a 4-byte integer)
!   2(sp)  word: size of the floating point source in bytes (4 selects
!          single; any other value is handled as double)
!   4(sp)  the floating point value
! The absolute value of the source is converted. Values whose bit
! pattern compares below that of 2^31 (this includes all negative
! inputs, since the compare is a signed integer compare on the raw bits)
! are truncated and converted directly. Larger values have 2^31
! subtracted before the conversion and bit 31 of the integer flipped
! afterwards.
! Result: stored so that it ends on the last byte of the source value:
!   single source: word at 6(sp),  long at 4(sp)
!   double source: word at 10(sp), long at 8(sp)
! sp on exit equals sp after the return address was popped.
! Uses a0, d0, d1, d2 and fp0.
.cfu:
move.l (sp)+,a0 ! a0 = return address
move.w (sp),d1 ! d1 = size of the integer result
move.w 2(sp),d2 ! d2 = size of the floating point source
cmp.w #4,d2
bne 1f
fmove.s 4(sp),fp0 ! single source
fabs fp0
cmp.l #0x4f000000,4(sp) ! raw bits against 2^31 as a single
bge 2f
fintrz fp0,fp0 ! below 2^31: truncate and convert directly
fmove.l fp0,d0
bra 3f
2:
fadd.l #-2147483648,fp0 ! bring the value into signed long range
fintrz fp0,fp0
fmove.l fp0,d0
bchg #31,d0 ! add the 2^31 back by flipping the top bit
bra 3f
1:
fmove.d 4(sp),fp0 ! double source
add.l #4,sp ! bias sp so the stores below land in the last 4 bytes of the double
fabs fp0
cmp.l #0x41e00000,(sp) ! high long of the raw bits against 2^31 as a double
bge 1f
fintrz fp0,fp0 ! below 2^31: truncate and convert directly
fmove.l fp0,d0
bra 3f
1:
fadd.l #-2147483648,fp0 ! bring the value into signed long range
fintrz fp0,fp0
fmove.l fp0,d0
bchg #31,d0 ! add the 2^31 back by flipping the top bit
3:
cmp.w #2,d1
bne 1f
move.w d0,6(sp) ! 2-byte result
bra 2f
1:
move.l d0,4(sp) ! 4-byte result
2:
cmp.w #4,d2
beq 1f
sub.l #4,sp ! undo the bias applied for a double source
1:
jmp (a0) ! return through the popped address
! .cff4: convert double to single precision.
! After the return address is popped, the double is read at (sp) and the
! single result is stored at 4(sp), i.e. in the upper-addressed half of
! the double's slot. sp itself is not adjusted here.
! Uses a0 and fp0.
.cff4:
move.l (sp)+,a0 ! a0 = return address
fmove.d (sp),fp0 ! fp0 = double source
fmove.s fp0,4(sp) ! rounded to single on store
jmp (a0) ! return through the popped address
! .cff8: convert single to double precision.
! After the return address is popped, the single is read at (sp) and the
! double result is stored at (sp).
! NOTE(review): the 8-byte store covers 4 bytes beyond the single that
! was read; presumably the caller reserves that space -- confirm
! against the code generator.
! Uses a0 and fp0.
.cff8:
move.l (sp)+,a0 ! a0 = return address
fmove.s (sp),fp0 ! fp0 = single source
fmove.d fp0,(sp) ! widened to double on store
jmp (a0) ! return through the popped address
! .cmf4: three-way compare of two singles.
! After the return address is popped, the operands are read at (sp) and
! 4(sp). Result in d0:
!    0  if the operands compare equal
!   -1  if (operand at 4(sp)) < (operand at (sp))
!    1  otherwise (neither the equal nor the less-than branch is taken)
! The operands are left on the stack.
! Uses a0, d0, fp0 and fp1.
.cmf4:
move.l (sp)+,a0 ! a0 = return address
clr.l d0 ! default result: equal
fmove.s (sp),fp0
fmove.s 4(sp),fp1
fcmp fp0,fp1 ! condition codes reflect fp1 compared with fp0
fbeq 2f
fblt 1f
add.l #1,d0 ! d0 = 1
jmp (a0)
1:
sub.l #1,d0 ! d0 = -1
2:
jmp (a0)
! .cmf8: three-way compare of two doubles.
! After the return address is popped, the operands are read at (sp) and
! 8(sp). Result in d0:
!    0  if the operands compare equal
!   -1  if (operand at 8(sp)) < (operand at (sp))
!    1  otherwise (neither the equal nor the less-than branch is taken)
! The operands are left on the stack.
! Uses a0, d0, fp0 and fp1.
.cmf8:
move.l (sp)+,a0 ! a0 = return address
clr.l d0 ! default result: equal
fmove.d (sp),fp0
fmove.d 8(sp),fp1
fcmp fp0,fp1 ! condition codes reflect fp1 compared with fp0
fbeq 2f
fblt 1f
add.l #1,d0 ! d0 = 1
jmp (a0)
1:
sub.l #1,d0 ! d0 = -1
2:
jmp (a0)

View file

@ -4,31 +4,92 @@
.sect .data .sect .data
.sect .bss .sect .bss
! signed long multiply
!-----------------------------------------------------------------------------
! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
! #1 01/12/90 initial revision
!-----------------------------------------------------------------------------
! 3 cases worth recognizing :
! 1) both the upper word of u and v are zero
! => 1 mult : Low*Low
! 2) only one of the upper words is zero
! => 2 mult : Low*HighLow
! 3) both upper words are not zero
! => 4 mult : HighLow*HighLow
! there are other cases (e.g. lower word is zero but high word is not, or
! one operand is all zero). However, this seems not to be very common, so
! they are ignored for the price of superfluous multiplications in these
! cases.
!-----------------------------------------------------------------------------
! entry : d0 multiplicand
! d1 multiplier
! exit : d0 high order result
! d1 low order result
! d2,a0,a1 : destroyed
.sect .text .sect .text
.mli: .mli:
move.l (sp)+,a0 move.l (sp)+,a1 ! return address
move.l (sp)+,d1 move.l d3,a0 ! save register
move.l (sp)+,d0 movem.w (sp)+,d0-d3 ! get v and u
move.l d5,-(sp) move.w d5,-(sp) ! save sign register
clr d5 move.w d2,d5
tst.l d0 bge 0f ! negate u if neccessary
bpl 1f neg.w d1
neg.l d0 negx.w d0
not d5 0: tst.w d0
1: bge 0f ! negate v if neccessary
tst.l d1 eor.w d0,d5
bpl 2f neg.w d1
neg.l d1 negx.w d0
not d5 0: bne 1f ! case 2) or 3)
2: tst.w d2
move.l d0,-(sp) bne 2f ! case 2)
move.l d1,-(sp) ! === case 1: _l x _l ===
jsr .mlu mulu d3,d1 ! r.l = u.l x v.l
tst d5 9: ! (r.h is already zero)
beq 3f tst.w d5 ! negate result if neccessary
bpl 0f
neg.l d1 neg.l d1
negx.l d0 negx.l d0
0: move.w (sp)+,d5 ! return
move.l a0,d3
jmp (a1)
! === possibly case 2) or case 3) ===
1:
tst.w d2
bne 3f ! case 3)
! === case 2: _l x hl ===
exg d0,d2 ! exchange u and v
exg d1,d3 ! (minimizes number of distinct cases)
2:
mulu d1,d2 ! a = v.l x u.h
mulu d3,d1 ! r.l = v.l x u.l
swap d2 ! a = a << 16
clr.l d3
move.w d2,d3
clr.w d2
add.l d2,d1 ! r += a
addx.l d3,d0
bra 9b
! === case 3: hl x hl ===
3: 3:
move.l (sp)+,d5 move.l d4,-(sp) ! need more registers
jmp (a0) move.w d2,d4
mulu d1,d4 ! a = v.l x u.h
mulu d3,d1 ! r.l = u.l x v.l
mulu d0,d3 ! b = v.h x u.l
mulu d2,d0 ! r.h = u.h x v.h
swap d1 ! (just for simplicity)
add.w d4,d1 ! r += a << 16
clr.w d4
swap d4
addx.l d4,d0
add.w d3,d1 ! r += b << 16
clr.w d3
swap d3
addx.l d3,d0
swap d1
move.l (sp)+,d4 ! return
bra 9b

View file

@ -4,38 +4,79 @@
.sect .data .sect .data
.sect .bss .sect .bss
! unsigned long multiply
!-----------------------------------------------------------------------------
! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
! #1 01/12/90 initial revision
!-----------------------------------------------------------------------------
! 3 cases worth recognizing :
! 1) both the upper word of u and v are zero
! => 1 mult : Low*Low
! 2) only one of the upper words is zero
! => 2 mult : Low*HighLow
! 3) both upper words are not zero
! => 4 mult : HighLow*HighLow
! there are other cases (e.g. lower word is zero but high word is not, or
! one operand is all zero). However, this seems not to be very common, so
! they are ignored for the price of superfluous multiplications in these
! cases.
!-----------------------------------------------------------------------------
! entry : d0 multiplicand ! entry : d0 multiplicand
! d1 multiplier ! d1 multiplier
! exit : d0 high order result ! exit : d0 high order result
! d1 low order result ! d1 low order result
! d2,a0,a1 : destroyed
.sect .text .sect .text
.mlu: .mlu:
move.l (sp)+,a1 move.l (sp)+,a1 ! return address
move.l (sp)+,d1 move.l d3,a0 ! save register
move.l (sp)+,d0 movem.w (sp)+,d0-d3 ! get v and u
movem.l d3/d4/d6,-(sp) tst.w d0
move.l d1,d3 bne 1f ! case 2) or 3)
move.l d1,d2 tst.w d2
swap d2 bne 2f ! case 2)
move.l d2,d4 ! === case 1: _l x _l ===
mulu d0,d1 mulu d3,d1 ! r.l = u.l x v.l
mulu d0,d2 move.l a0,d3 ! (r.h is already zero)
swap d0 jmp (a1) ! return
mulu d0,d3 ! === possibly case 2) or case 3) ===
mulu d4,d0 1:
clr.l d6 tst.w d2
swap d1 bne 3f ! case 3)
add d2,d1 ! === case 2: _l x hl ===
addx.l d6,d0 exg d0,d2 ! exchange u and v
add d3,d1 exg d1,d3 ! (minimizes number of distinct cases)
addx.l d6,d0 2:
swap d1 mulu d1,d2 ! a = v.l x u.h
clr d2 mulu d3,d1 ! r.l = v.l x u.l
clr d3 swap d2 ! a = a << 16
swap d2 clr.l d3
swap d3 move.w d2,d3
add.l d2,d0 clr.w d2
add.l d3,d0 add.l d2,d1 ! r += a
movem.l (sp)+,d3/d4/d6 addx.l d3,d0
move.l a0,d3 ! return
jmp (a1)
! === case 3: hl x hl ===
3:
move.l d4,-(sp) ! need more registers
move.w d2,d4
mulu d1,d4 ! a = v.l x u.h
mulu d3,d1 ! r.l = u.l x v.l
mulu d0,d3 ! b = v.h x u.l
mulu d2,d0 ! r.h = u.h x v.h
swap d1 ! (just for simplicity)
add.w d4,d1 ! r += a << 16
clr.w d4
swap d4
addx.l d4,d0
add.w d3,d1 ! r += b << 16
clr.w d3
swap d3
addx.l d3,d0
swap d1
move.l (sp)+,d4 ! return
move.l a0,d3
jmp (a1) jmp (a1)