Added end library and floating point processor support

1991-02-01 15:09:58 +00:00 · 1991-02-01 15:09:58 +00:00 · 0f4e675b50
commit 0f4e675b50
parent e64fb88a5d
13 changed files with 718 additions and 124 deletions
--- a/mach/m68k2/libem/.distr
+++ b/mach/m68k2/libem/.distr
@ -1,5 +1,9 @@
 LIST
 Makefile
 compmodule
+em_end.s
+etext.s
+edata.s
 end.s
 libem_s.a
+READ_ME
--- a/mach/m68k2/libem/LIST
+++ b/mach/m68k2/libem/LIST
@ -26,7 +26,7 @@ shp.s
 sig.s
 cms.s
 gto.s
-ffp.s
+fp68881.s
 fat.s
 trp.s
 dia.s
--- a/mach/m68k2/libem/Makefile
+++ b/mach/m68k2/libem/Makefile
@ -1,28 +1,33 @@
 # $Header$
 MACH=m68k2
-all:            libem_o.a end.o
+ASAR=aal
+all:		libem_o.a end.a

 install: 	all
 		../../install libem_o.a tail_em
-		../../install end.o end_em
+		../../install end.a end_em

 cmp:		all
 		-../../compare libem_o.a tail_em
-		-../../compare end.o end_em
+		-../../compare end.a end_em

-end.o:		end.s
+end.a:		em_end.s etext.s edata.s end.s
+		$(MACH) -I../../../h -c em_end.s
+		$(MACH) -I../../../h -c edata.s
+		$(MACH) -I../../../h -c etext.s
 		$(MACH) -I../../../h -c end.s
+		$(ASAR) cr end.a em_end.o etext.o edata.o end.o

 libem_o.a:	libem_s.a
-		ASAR=aal ; export ASAR ;\
+		ASAR=$(ASAR) ; export ASAR ;\
 		march . libem_o.a

 clean:
-		rm -f *.o libem_o.a
+		rm -f *.o libem_o.a end.a

 opr :
 		make pr | opr

 pr:
 		@arch pv libem_s.a | pr -h `pwd`/libem_s.a
-		@pr `pwd`/end.s
+		@pr `pwd`/em_end.s `pwd`/edata.s `pwd`/etext.s `pwd`/end.s
--- a/mach/m68k2/libem/READ_ME
+++ b/mach/m68k2/libem/READ_ME
@ -1,5 +1,4 @@
-The original EM library routines saved all registers
-(including scratch registers) in global data; hence they
-were not reentrant.
-The new routines do not save registers d0,d1,d2,a0 and a1.
-They are reentrant.
+The routines in mli.s, mlu.s, dvi.s, and dvu.s are written by
+Kai-Uwe Bloem and were published on the comp.os.minix newsgroup.
+He allowed us to use them for ACK, but requested that
+they do not fall under the ACK copyright notice. So, they don't.
--- a/mach/m68k2/libem/dvi.s
+++ b/mach/m68k2/libem/dvi.s
@ -5,38 +5,96 @@
 .sect .bss

 ! signed long divide
+ !-----------------------------------------------------------------------------
+ ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
+ !   #1  01/12/90  initial revision. Minor reduce of shift operations.
+ !   #2  03/07/90  use 68000 divu instruction wherever possible. This change
+ !		   makes #1 superfluous. (derived from my GNU division routine)
+ !-----------------------------------------------------------------------------
+ ! Some common cases can be handled in a special, much faster way :
+ !      1) divisor = 0
+ !          => cause trap, then return to user. Result is undefined
+ !      2) dividend < divisor
+ !          => quotient = 0, remainder = dividend
+ !      3) divisor < 0x10000 ( i.e. divisor is only 16 bits wide )
+ !          => quotient and remainder can be calculated quite fast by repeated
+ !             application of 68000 divu operations (ca. 400 cycles)
+ !      4) otherwise (due to #2, #3 dividend, divisor both wider than 16 bits)
+ !          => do slow division by shift and subtract
+ !-----------------------------------------------------------------------------
+
+
+ ! register usage:
+ ! entry : divisor and dividend on the stack (divisor on top)
+ ! exit  : d1 quotient
+ !         d2 remainder
+
 	.sect .text
 .dvi:
-	move.l	(sp)+,a0	! return address
-	move.l	(sp)+,d0
-	move.l	(sp)+,d1
-	move.l	d3,-(sp)	! save d3 and d4
-	move.l	d4,-(sp)
+! Signed 32-bit divide.  The return address, the divisor and the dividend
+! are popped from the stack (in that order).  On exit d1 holds the
+! quotient and d2 the remainder; the quotient is negated when the operand
+! signs differ, the remainder when the dividend was negative.
+! d3 is parked in a0 and d4 on the stack; both are restored on exit.
+! d4 records the signs: all 32 bits are inverted for a negative dividend,
+! the low word is inverted again for a negative divisor.
+	move.l	(sp)+,a1	! return address
+	move.l	(sp)+,d0	! divisor
+	move.l	(sp)+,d2	! dividend
+	move.l	d3,a0		! save d3
+	move.l	d4,-(sp)	! save result sign register
 	clr.l	d4
-	tst.l	d0	! divisor
-	bpl	1f
-	neg.l	d0
-	not	d4
-1:
-	tst.l	d1	! dividend
-	bpl	2f
-	neg.l	d1
-	not	d4
-	swap	d4
-	not	d4
-	swap	d4
+	tst.l	d2
+	bpl	0f		! dividend is negative ?
+	neg.l	d2		! yes - negate
+	not.l	d4		! and note negation in d4
+0:
+	tst.l	d0
+	bpl	0f		! divisor is negative ?
+	neg.l	d0		! yes - negate
+	not.w	d4		! note negation
+0:
+	clr.l	d1		! prepare quotient
+! === case 1: divisor = 0
+	tst.l	d0		! divisor = 0 ?
+	beq	9f		! yes - divide by zero trap
+! === case 2: dividend < divisor
+	cmp.l	d0,d2		! dividend < divisor ?
+	bcs	8f		! yes - division already finished
+! === case 3: divisor <= 0x0ffff
+	cmp.l	#0x0ffff,d0	! is divisor only 16 bits wide ?
+	bhi	2f
+	move.w	d2,d3		! save dividend.l
+	clr.w	d2		! prepare dividend.h for divu operation
+	swap	d2
+	beq	0f		! dividend.h is all zero, no divu necessary
+	divu	d0,d2
+0:	move.w	d2,d1		! save quotient.h
+	swap	d1
+	move.w	d3,d2		! divide dividend.l
+	divu	d0,d2		! (d2.h = remainder of prev divu)
+	move.w	d2,d1		! save quotient.l
+	clr.w	d2		! get remainder
+	swap	d2
+	bra	8f
+! === case 4: divisor and dividend both > 0x0ffff
 2:
-	move.l	d1,-(sp)
-	move.l	d0,-(sp)
-	jsr	.dvu
-	tst	d4
-	beq	5f
-	neg.l	d1	! quotient
+	move	#32-1,d3	! loop count
+4:
+	lsl.l	#1,d2		! shift dividend ...
+	roxl.l	#1,d1		!  ... into d1
+	cmp.l	d0,d1		! compare with divisor
+	bcs	5f
+	sub.l	d0,d1		! bigger, subtract divisor
+	add	#1,d2		! note subtraction in result
 5:
-	tst.l	d4
-	bpl	6f
-	neg.l	d2	! remainder
-6:
-	move.l	(sp)+,d4	! restore d4 and d3
-	move.l	(sp)+,d3
-	jmp	(a0)
+	dbra	d3,4b
+	exg	d1,d2		! get results in the correct registers
+8:
+	tst.w	d4		! quotient < 0 ?
+	bpl	0f
+	neg.l	d1		! yes - negate
+0:	tst.l	d4		! remainder < 0 ?
+	bpl	0f
+	neg.l	d2
+0:	move.l	(sp)+,d4	! restore d4
+	move.l	a0,d3		! restore d3
+	jmp	(a1)
+
+! Divide by zero: raise the trap, then return to the caller with an
+! undefined result.  The register saves are undone and the return
+! address is put back on the stack before the call, so nothing depends
+! on .trp preserving a0/a1.
+! NOTE(review): the rts assumes .trp removes its argument word before
+! returning - confirm against trp.s.
+EIDIVZ	= 6
+9:	move.l	(sp)+,d4	! restore d4 (d3 has not been modified yet)
+	move.l	a1,-(sp)	! return address back on the stack
+	move.w	#EIDIVZ,-(sp)
+	jsr	.trp
+	rts			! back to the caller if .trp returns
--- a/mach/m68k2/libem/dvu.s
+++ b/mach/m68k2/libem/dvu.s
@ -5,34 +5,77 @@
 .sect .bss

 ! unsigned long divide
+ !-----------------------------------------------------------------------------
+ ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
+ !   #1  01/12/90  initial revision. Minor reduce of shift operations.
+ !   #2  03/07/90  use 68000 divu instruction wherever possible. This change
+ !		   makes #1 superfluous. (derived from my GNU division routine)
+ !-----------------------------------------------------------------------------
+ ! Some common cases can be handled in a special, much faster way :
+ !      1) divisor = 0
+ !          => cause trap, then return to user. Result is undefined
+ !      2) dividend < divisor
+ !          => quotient = 0, remainder = dividend
+ !      3) divisor < 0x10000 ( i.e. divisor is only 16 bits wide )
+ !          => quotient and remainder can be calculated quite fast by repeated
+ !             application of 68000 divu operations (ca. 400 cycles)
+ !      4) otherwise (due to #2, #3 dividend, divisor both wider than 16 bits)
+ !          => do slow division by shift and subtract
+ !-----------------------------------------------------------------------------
+
+
 ! register usage:
 ! 	 : d0 divisor
 !         d1 dividend
 ! exit  : d1 quotient
 !         d2 remainder
+
 	.sect .text
 .dvu:
+! Unsigned 32-bit divide.  The return address, the divisor and the
+! dividend are popped from the stack (in that order).  On exit d1 holds
+! the quotient and d2 the remainder.  d3 is parked in a0 and restored
+! before returning through a1; d0 is used as scratch.
+	move.l	d3,a0		! save d3
 	move.l	(sp)+,a1	! return address
-	move.l	(sp)+,d0
-	move.l	(sp)+,d1
-	move.l	d3,-(sp)	! save d3
-	tst.l	d0
-	bne	0f
-	move.l	(sp)+,d3
-	move.w	#EIDIVZ,-(sp)
-	jsr	.trp
-0:
-	clr.l	d2
-	move.l	#32,d3
-3:
-	lsl.l	#1,d1
-	roxl.l	#1,d2
-	cmp.l	d0,d2
-	blt	4f
-	sub.l	d0,d2
-	add	#1,d1
+	move.l	(sp)+,d0	! divisor
+	move.l	(sp)+,d2	! dividend
+	clr.l	d1		! prepare quotient
+! === case 1: divisor = 0
+	tst.l	d0		! divisor = 0 ?
+	beq	9f		! yes - divide by zero trap
+! === case 2: dividend < divisor
+	cmp.l	d0,d2		! dividend < divisor ?
+	bcs	8f		! yes - division already finished
+! === case 3: divisor <= 0x0ffff
+	cmp.l	#0x0ffff,d0	! is divisor only 16 bits wide ?
+	bhi	2f
+	move.w	d2,d3		! save dividend.l
+	clr.w	d2		! prepare dividend.h for divu operation
+	swap	d2
+	beq	0f		! dividend.h is all zero, no divu necessary
+	divu	d0,d2
+0:	move.w	d2,d1		! save quotient.h
+	swap	d1
+	move.w	d3,d2		! divide dividend.l
+	divu	d0,d2		! (d2.h = remainder of prev divu)
+	move.w	d2,d1		! save quotient.l
+	clr.w	d2		! get remainder
+	swap	d2
+	bra	8f
+! === case 4: divisor and dividend both > 0x0ffff
+2:
+	move	#32-1,d3	! loop count
 4:
-	sub	#1,d3
-	bgt	3b
-	move.l	(sp)+,d3
+	lsl.l	#1,d2		! shift dividend ...
+	roxl.l	#1,d1		!  ... into d1
+	cmp.l	d0,d1		! compare with divisor
+	bcs	5f
+	sub.l	d0,d1		! bigger, subtract divisor
+	add	#1,d2		! note subtraction in result
+5:
+	dbra	d3,4b
+	exg	d1,d2		! get results in the correct registers
+8:
+	move.l	a0,d3		! restore d3
 	jmp	(a1)
+
+! Divide by zero: raise the trap, then return to the caller with an
+! undefined result.  d3 has not been modified at this point; the return
+! address is put back on the stack before the call, so nothing depends
+! on .trp preserving a0/a1.
+! NOTE(review): the rts assumes .trp removes its argument word before
+! returning - confirm against trp.s.
+EIDIVZ	= 6
+9:	move.l	a1,-(sp)	! return address back on the stack
+	move.w	#EIDIVZ,-(sp)
+	jsr	.trp
+	rts			! back to the caller if .trp returns
--- a/mach/m68k2/libem/edata.s
+++ b/mach/m68k2/libem/edata.s
@ -0,0 +1,9 @@
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+! Exports _edata, a label placed in the .data section after an .align 4.
+! NOTE(review): the four .sect lines above presumably only declare the
+! sections in a fixed order - confirm against the assembler manual.
+.define	_edata
+.sect .data
+	.align 4
+	.sect .data
+_edata:
--- a/mach/m68k2/libem/em_end.s
+++ b/mach/m68k2/libem/em_end.s
@ -0,0 +1,22 @@
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+! Exports endtext, enddata, endbss and __end.  Every section gets an
+! .align 4 first; then endtext is placed in .text, enddata in .data,
+! and __end / endbss share one address in the .end section.
+.define	endtext,enddata,endbss,__end
+.sect .text
+	.align 4
+.sect .rom
+	.align 4
+.sect .data
+	.align 4
+.sect .bss
+	.align 4
+.sect .end ! only for declaration of _end, __end and endbss.
+
+	.sect .text
+endtext:
+	.sect .data
+enddata:
+	.sect .end
+__end:
+endbss:
--- a/mach/m68k2/libem/end.s
+++ b/mach/m68k2/libem/end.s
@ -1,16 +1,7 @@
-.define	endtext,enddata,endbss,_etext,_edata,_end
 .sect .text
 .sect .rom
 .sect .data
 .sect .bss
-.sect .end ! only for declaration of _end and endbss.
-
-	.sect .text
-endtext:
-_etext:
-	.sect .data
-enddata:
-_edata:
-	.sect .end
+! After this change the file exports only _end, a label in the .end
+! section; the other end-of-section labels now live in em_end.s,
+! etext.s and edata.s.
+.define	_end
+.sect .end ! only for declaration of _end, __end and endbss.
 _end:
-endbss:
--- a/mach/m68k2/libem/etext.s
+++ b/mach/m68k2/libem/etext.s
@ -0,0 +1,9 @@
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+! Exports _etext, a label placed in the .text section after an .align 4.
+! NOTE(review): the four .sect lines above presumably only declare the
+! sections in a fixed order - confirm against the assembler manual.
+.define	_etext
+.sect .text
+	.align 4
+	.sect .text
+_etext:
--- a/mach/m68k2/libem/fp68881.s
+++ b/mach/m68k2/libem/fp68881.s
@ -0,0 +1,352 @@
+.define .adf4, .adf8, .sbf4, .sbf8, .mlf4, .mlf8, .dvf4, .dvf8
+.define .ngf4, .ngf8, .fif4, .fif8, .fef4, .fef8
+.define .cif4, .cif8, .cuf4, .cuf8, .cfi, .cfu, .cff4, .cff8
+.define .cmf4, .cmf8
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+
+!	$Header$
+
+!	Implement interface to floating point package for M68881
+
+	.sect .text
+.adf4:
+! Single precision add.  With the return address popped into a0, the
+! float at (sp) is added to the float at 4(sp) and the sum is stored at
+! 4(sp).  sp is left pointing at the first operand.
+	move.l	(sp)+,a0
+	fmove.s (sp),fp0
+	fadd.s	4(sp),fp0
+	fmove.s	fp0,4(sp)
+	jmp	(a0)
+
+.adf8:
+! Double precision add.  With the return address popped into a0, the
+! double at (sp) is added to the double at 8(sp) and the sum is stored
+! at 8(sp).  sp is left pointing at the first operand.
+	move.l	(sp)+,a0
+	fmove.d (sp),fp0
+	fadd.d	8(sp),fp0
+	fmove.d	fp0,8(sp)
+	jmp	(a0)
+
+.sbf4:
+! Single precision subtract: stores (float at 4(sp)) - (float at (sp))
+! at 4(sp), offsets taken after the return address is popped into a0.
+	move.l	(sp)+,a0
+	fmove.s (sp),fp0
+	fmove.s	4(sp),fp1
+	fsub	fp0,fp1		! fp1 = fp1 - fp0
+	fmove.s	fp1,4(sp)
+	jmp	(a0)
+
+.sbf8:
+! Double precision subtract: stores (double at 8(sp)) - (double at (sp))
+! at 8(sp), offsets taken after the return address is popped into a0.
+	move.l	(sp)+,a0
+	fmove.d (sp),fp0
+	fmove.d	8(sp),fp1
+	fsub	fp0,fp1		! fp1 = fp1 - fp0
+	fmove.d	fp1,8(sp)
+	jmp	(a0)
+
+.mlf4:
+! Single precision multiply: the product of the floats at (sp) and
+! 4(sp) is stored at 4(sp), offsets taken after the return address is
+! popped into a0.
+	move.l	(sp)+,a0
+	fmove.s (sp),fp0
+	fmul.s	4(sp),fp0
+	fmove.s	fp0,4(sp)
+	jmp	(a0)
+
+.mlf8:
+! Double precision multiply: the product of the doubles at (sp) and
+! 8(sp) is stored at 8(sp), offsets taken after the return address is
+! popped into a0.
+	move.l	(sp)+,a0
+	fmove.d (sp),fp0
+	fmul.d	8(sp),fp0
+	fmove.d	fp0,8(sp)
+	jmp	(a0)
+
+.dvf4:
+! Single precision divide: stores (float at 4(sp)) / (float at (sp))
+! at 4(sp), offsets taken after the return address is popped into a0.
+	move.l	(sp)+,a0
+	fmove.s (sp),fp0
+	fmove.s	4(sp),fp1
+	fdiv	fp0,fp1		! fp1 = fp1 / fp0
+	fmove.s	fp1,4(sp)
+	jmp	(a0)
+
+.dvf8:
+! Double precision divide: stores (double at 8(sp)) / (double at (sp))
+! at 8(sp), offsets taken after the return address is popped into a0.
+	move.l	(sp)+,a0
+	fmove.d (sp),fp0
+	fmove.d	8(sp),fp1
+	fdiv	fp0,fp1		! fp1 = fp1 / fp0
+	fmove.d	fp1,8(sp)
+	jmp	(a0)
+
+.ngf4:
+! Single precision negate, in place: the float just above the return
+! address (at 4(sp)) is replaced by its negation.  Unlike most routines
+! in this file it returns with rts.
+	fmove.s	4(sp),fp0
+	fneg	fp0
+	fmove.s	fp0,4(sp)
+	rts
+
+.ngf8:
+! Double precision negate, in place: the double just above the return
+! address (at 4(sp)) is replaced by its negation.  Returns with rts.
+	fmove.d	4(sp),fp0
+	fneg	fp0
+	fmove.d	fp0,4(sp)
+	rts
+
+.fif4:
+! Single precision multiply and split.  After the return address is
+! popped: (sp) holds a pointer to the result area, 4(sp) and 8(sp) the
+! two float operands.  The product is split into its integer part
+! (rounded towards zero) and the remaining fraction.
+	move.l	(sp)+,a0
+	move.l	(sp),a1		! a1 = result area
+	fmove.s	4(sp),fp0
+	fmove.s	8(sp),fp1
+	fmul	fp0,fp1		! fp1 = product
+	fintrz	fp1,fp0		! fp0 = integer part of the product
+	fsub	fp0,fp1		! fp1 = fraction
+	fmove.s	fp1,4(a1)	! fraction goes to 4(a1)
+	fmove.s	fp0,(a1)	! integer part goes to (a1)
+	jmp	(a0)
+
+.fif8:
+! Double precision multiply and split.  After the return address is
+! popped: (sp) holds a pointer to the result area, 4(sp) and 12(sp) the
+! two double operands.  The product is split into its integer part
+! (rounded towards zero) and the remaining fraction.
+	move.l	(sp)+,a0
+	move.l	(sp),a1		! a1 = result area
+	fmove.d	4(sp),fp0
+	fmove.d	12(sp),fp1
+	fmul	fp0,fp1		! fp1 = product
+	fintrz	fp1,fp0		! fp0 = integer part of the product
+	fsub	fp0,fp1		! fp1 = fraction
+	fmove.d	fp1,8(a1)	! fraction goes to 8(a1)
+	fmove.d	fp0,(a1)	! integer part goes to (a1)
+	jmp	(a0)
+
+.fef4:
+! Split a float into exponent and fraction.  After the return address
+! is popped: (sp) holds a pointer to the result area, 4(sp) the float.
+! The result area receives a 16-bit exponent at (a1) and a float
+! fraction at 2(a1).
+	move.l	(sp)+,a0
+	move.l	(sp),a1		! a1 = result area
+	fmove.s	4(sp),fp0
+	fgetexp	fp0,fp1		! fp1 = exponent of the operand
+	fmove.l	fpsr,d0
+	and.l	#0x2000,d0	! set if Infinity
+	beq	1f
+	! operand flagged as infinite: exponent 129, fraction +0.5 or
+	! -0.5 (0x3f000000 / 0xbf000000) depending on the operand's sign
+	move.w	#129,(a1)
+	fmove.s	4(sp),fp0	! reload to set the FPU condition codes
+	fblt	2f
+	move.l	#0x3f000000,2(a1)
+	jmp	(a0)
+2:
+	move.l	#0xbf000000,2(a1)
+	jmp	(a0)
+1:
+	fmove.l	fp1,d0
+	add.l	#1,d0		! exponent + 1, matching the halved mantissa below
+	fgetman	fp0		! fp0 = mantissa of the operand
+	fbne	1f
+	clr.l	d0		! zero mantissa: exponent 0 is stored as well
+	bra	2f
+1:
+	fmove.l	#2,fp1
+	fdiv	fp1,fp0		! halve the mantissa
+2:
+	fmove.s	fp0,2(a1)
+	move.w	d0,(a1)
+	jmp	(a0)
+
+.fef8:
+! Split a double into exponent and fraction.  After the return address
+! is popped: (sp) holds a pointer to the result area, 4(sp) the double.
+! The result area receives a 16-bit exponent at (a1) and a double
+! fraction at 2(a1).
+	move.l	(sp)+,a0
+	move.l	(sp),a1		! a1 = result area
+	fmove.d	4(sp),fp0
+	fgetexp	fp0,fp1		! fp1 = exponent of the operand
+	fmove.l	fpsr,d0
+	and.l	#0x2000,d0	! set if Infinity
+	beq	1f
+	! operand flagged as infinite: exponent 1025, fraction +0.5 or
+	! -0.5 (high long 0x3fe00000 / 0xbfe00000, low long zero)
+	move.w	#1025,(a1)
+	fmove.d	4(sp),fp0	! reload to set the FPU condition codes
+	fblt	2f
+	move.l	#0x3fe00000,2(a1)
+	clr.l	6(a1)
+	jmp	(a0)
+2:
+	move.l	#0xbfe00000,2(a1)
+	clr.l	6(a1)
+	jmp	(a0)
+1:
+	fmove.l	fp1,d0
+	add.l	#1,d0		! exponent + 1, matching the halved mantissa below
+	fgetman	fp0		! fp0 = mantissa of the operand
+	fbne	1f
+	clr.l	d0		! zero mantissa: exponent 0 is stored as well
+	bra	2f
+1:
+	fmove.l	#2,fp1
+	fdiv	fp1,fp0		! halve the mantissa
+2:
+	fmove.d	fp0,2(a1)
+	move.w	d0,(a1)
+	jmp	(a0)
+
+.cif4:
+! Convert a signed integer to single precision.  After the return
+! address is popped, the word at (sp) selects the source width: 2 means
+! a 16-bit integer at 2(sp), anything else a 32-bit integer at 2(sp).
+! The float is stored at (sp) for a 16-bit source and at 2(sp) for a
+! 32-bit source.
+! NOTE(review): the word at (sp) is presumably the source size in
+! bytes, pushed by the caller - confirm against the code generator.
+	move.l	(sp)+,a0
+	cmp.w	#2,(sp)
+	bne	1f
+	fmove.w	2(sp),fp0
+	fmove.s	fp0,(sp)
+	jmp	(a0)
+1:
+	fmove.l	2(sp),fp0
+	fmove.s	fp0,2(sp)
+	jmp	(a0)
+
+.cif8:
+! Convert a signed integer to double precision.  After the return
+! address is popped, the word at (sp) selects the source width: 2 means
+! a 16-bit integer at 2(sp), anything else a 32-bit integer at 2(sp).
+! In both cases the 8-byte result is stored at (sp).
+! NOTE(review): the result is wider than size word plus integer, so
+! the caller presumably reserves the extra stack space - confirm.
+	move.l	(sp)+,a0
+	cmp.w	#2,(sp)
+	bne	1f
+	fmove.w	2(sp),fp0
+	fmove.d	fp0,(sp)
+	jmp	(a0)
+1:
+	fmove.l	2(sp),fp0
+	fmove.d	fp0,(sp)
+	jmp	(a0)
+
+.cuf4:
+! Convert an unsigned integer to single precision.  After the return
+! address is popped, the word at (sp) selects the source width: 2 means
+! 16 bits at 2(sp), anything else 32 bits at 2(sp).  The value is
+! loaded as signed; if that came out negative, 2^16 (or 2^32) is added
+! to obtain the unsigned value.  The float is stored at (sp) for a
+! 16-bit source and at 2(sp) for a 32-bit source.
+	move.l	(sp)+,a0
+	cmp.w	#2,(sp)
+	bne	2f
+	fmove.w	2(sp),fp0
+	tst.w	2(sp)
+	bge	1f
+	fadd.l	#65536,fp0	! correct for the sign bit: + 2^16
+1:
+	fmove.s	fp0,(sp)
+	jmp	(a0)
+2:
+	fmove.l	2(sp),fp0
+	tst.l	2(sp)
+	bge	1f
+	fsub.l	#-2147483648,fp0	! subtracting -2^31 twice adds 2^32
+	fsub.l	#-2147483648,fp0
+1:
+	fmove.s	fp0,2(sp)
+	jmp	(a0)
+
+.cuf8:
+! Convert an unsigned integer to double precision.  After the return
+! address is popped, the word at (sp) selects the source width: 2 means
+! 16 bits at 2(sp), anything else 32 bits at 2(sp).  The value is
+! loaded as signed; if that came out negative, 2^16 (or 2^32) is added
+! to obtain the unsigned value.  The 8-byte result is stored at (sp)
+! in both cases.
+	move.l	(sp)+,a0
+	move.w	(sp),d0
+	cmp.w	#2,d0
+	bne	2f
+	fmove.w	2(sp),fp0
+	tst.w	2(sp)
+	bge	1f
+	fadd.l	#65536,fp0	! correct for the sign bit: + 2^16
+	bra	1f
+2:
+	fmove.l	2(sp),fp0
+	tst.l	2(sp)
+	bge	1f
+	fsub.l	#-2147483648,fp0	! subtracting -2^31 twice adds 2^32
+	fsub.l	#-2147483648,fp0
+1:
+	fmove.d	fp0,(sp)
+	jmp	(a0)
+
+.cfi:
+! Convert floating point to signed integer.  After the return address
+! is popped: the word at (sp) (kept in d1) selects the integer width
+! (2 = 16 bits, otherwise 32 bits), the word at 2(sp) (kept in d0)
+! selects the source format (4 = single, otherwise double), and the
+! operand starts at 4(sp).  For a double source sp is advanced by 4
+! while the result is stored and moved back afterwards, so the result
+! ends up in the last 4 bytes of the operand slot (a 16-bit result in
+! the last 2 bytes).
+	move.l	(sp)+,a0
+	move.w	(sp),d1
+	move.w	2(sp),d0
+	cmp.w	#4,d0
+	bne	1f
+	fmove.s	4(sp),fp0
+	bra	2f
+1:
+	fmove.d	4(sp),fp0
+	add.l	#4,sp		! double source: address the slot 4 bytes higher
+2:
+	cmp.w	#2,d1
+	bne	1f
+	fmove.w	fp0,6(sp)
+	bra	2f
+1:
+	fmove.l	fp0,4(sp)
+2:
+	cmp.w	#4,d0
+	beq	1f
+	sub.l	#4,sp		! undo the adjustment made for a double source
+1:
+	jmp	(a0)
+
+.cfu:
+! Convert floating point to unsigned integer.  Stack layout as for
+! .cfi: the word at (sp) (kept in d1) selects the integer width, the
+! word at 2(sp) (kept in d2) selects the source format (4 = single,
+! otherwise double), and the operand starts at 4(sp).  The absolute
+! value of the operand is converted.  When the raw high long of the
+! operand compares (signed) at or above the bit pattern of 2^31, 2^31
+! is subtracted before the truncating conversion and bit 31 of the
+! integer result is flipped afterwards.
+	move.l	(sp)+,a0
+	move.w	(sp),d1
+	move.w	2(sp),d2
+	cmp.w	#4,d2
+	bne	1f
+	fmove.s	4(sp),fp0
+	fabs	fp0
+	cmp.l	#0x4f000000,4(sp)	! 0x4f000000 = 2^31 as a single
+	bge	2f
+	fintrz	fp0,fp0
+	fmove.l	fp0,d0
+	bra	3f
+2:
+	fadd.l	#-2147483648,fp0
+	fintrz	fp0,fp0
+	fmove.l	fp0,d0
+	bchg	#31,d0
+	bra	3f
+1:
+	fmove.d	4(sp),fp0
+	add.l	#4,sp		! double source: address the slot 4 bytes higher
+	fabs	fp0
+	cmp.l	#0x41e00000,(sp)	! high long of 2^31 as a double
+	bge	1f
+	fintrz	fp0,fp0
+	fmove.l	fp0,d0
+	bra	3f
+1:
+	fadd.l	#-2147483648,fp0
+	fintrz	fp0,fp0
+	fmove.l	fp0,d0
+	bchg	#31,d0
+3:
+	cmp.w	#2,d1
+	bne	1f
+	move.w	d0,6(sp)
+	bra	2f
+1:
+	move.l	d0,4(sp)
+2:
+	cmp.w	#4,d2
+	beq	1f
+	sub.l	#4,sp		! undo the adjustment made for a double source
+1:
+	jmp	(a0)
+
+.cff4:
+! Convert double to single: after the return address is popped, the
+! double at (sp) is converted and the float is stored at 4(sp), i.e. in
+! the upper half of the operand slot.
+	move.l	(sp)+,a0
+	fmove.d	(sp),fp0
+	fmove.s	fp0,4(sp)
+	jmp	(a0)
+
+.cff8:
+! Convert single to double: after the return address is popped, the
+! float at (sp) is converted and the 8-byte result is stored at (sp).
+! NOTE(review): the result is 4 bytes wider than the operand, so the
+! caller presumably reserves the extra stack space - confirm.
+	move.l	(sp)+,a0
+	fmove.s	(sp),fp0
+	fmove.d	fp0,(sp)
+	jmp	(a0)
+
+.cmf4:
+! Single precision compare.  After the return address is popped, the
+! float at 4(sp) is compared with the float at (sp).  d0 returns 0 when
+! fbeq is taken (equal), -1 when fblt is taken (4(sp) less than (sp)),
+! and 1 otherwise.  The operands are left on the stack.
+	move.l	(sp)+,a0
+	clr.l	d0
+	fmove.s	(sp),fp0
+	fmove.s	4(sp),fp1
+	fcmp	fp0,fp1		! condition codes reflect fp1 - fp0
+	fbeq	2f
+	fblt	1f
+	add.l	#1,d0
+	jmp	(a0)
+1:
+	sub.l	#1,d0
+2:
+	jmp	(a0)
+
+.cmf8:
+! Double precision compare.  After the return address is popped, the
+! double at 8(sp) is compared with the double at (sp).  d0 returns 0
+! when fbeq is taken (equal), -1 when fblt is taken (8(sp) less than
+! (sp)), and 1 otherwise.  The operands are left on the stack.
+	move.l	(sp)+,a0
+	clr.l	d0
+	fmove.d	(sp),fp0
+	fmove.d	8(sp),fp1
+	fcmp	fp0,fp1		! condition codes reflect fp1 - fp0
+	fbeq	2f
+	fblt	1f
+	add.l	#1,d0
+	jmp	(a0)
+1:
+	sub.l	#1,d0
+2:
+	jmp	(a0)
--- a/mach/m68k2/libem/mli.s
+++ b/mach/m68k2/libem/mli.s
@ -4,31 +4,92 @@
 .sect .data
 .sect .bss

+ ! signed long multiply
+ !-----------------------------------------------------------------------------
+ ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
+ !   #1  01/12/90  initial revision
+ !-----------------------------------------------------------------------------
+ !   3 cases worth to recognize :
+ !	1) both the upper word of u and v are zero
+ !	    => 1 mult : Low*Low
+ !	2) only one of the upper words is zero
+ !	    => 2 mult : Low*HighLow
+ !	3) both upper words are not zero
+ !	    => 4 mult : HighLow*HighLow
+ !   there are other cases (e.g. lower word is zero but high word is not, or
+ !   one operand is all zero). However, this seems not to be very common, so
+ !   they are ignored for the price of superfluous multiplications in these
+ !   cases.
+ !-----------------------------------------------------------------------------
+
+ ! entry : d0 multiplicand
+ !         d1 multiplier
+ ! exit  : d0 high order result
+ !         d1 low order result
+ !         d2,a0,a1 : destroyed

 	.sect .text
 .mli:
-	move.l	(sp)+,a0
-	move.l	(sp)+,d1
-	move.l	(sp)+,d0
-	move.l	d5,-(sp)
-	clr	d5
-	tst.l	d0
-	bpl	1f
-	neg.l	d0
-	not	d5
-1:
-	tst.l	d1
-	bpl	2f
-	neg.l	d1
-	not	d5
-2:
-	move.l	d0,-(sp)
-	move.l	d1,-(sp)
-	jsr	.mlu
-	tst	d5
-	beq	3f
+! Signed 32 x 32 -> 64 bit multiply.  The return address and both
+! operands are popped from the stack; the operands arrive as four words:
+! d0:d1 = v (high, low word), d2:d3 = u (high, low word).  Both are made
+! non-negative, multiplied as unsigned numbers, and the 64-bit product
+! d0:d1 is negated when the operand signs differ.  d3 is parked in a0
+! and the low word of d5 on the stack; both are restored on exit.
+! The sign of d5.w holds the result sign: it starts as u.h and is
+! xor-ed with v.h when v is negative.
+	move.l	(sp)+,a1	! return address
+	move.l	d3,a0		! save register
+	movem.w	(sp)+,d0-d3	! get v and u
+	move.w	d5,-(sp)	! save sign register
+	move.w	d2,d5
+	bge	0f		! negate u if necessary
+	neg.w	d3		! u lives in d2:d3 (the test above is on u.h)
+	negx.w	d2
+	ext.l	d2		! movem.w sign-extended d2; keep its upper half
+				! in step with the new word, because case 2 may
+				! move all of d2 into the high result d0
+0:	tst.w	d0
+	bge	0f		! negate v if necessary
+	eor.w	d0,d5
+	neg.w	d1
+	negx.w	d0
+0:	bne	1f		! case 2) or 3)
+				! (flags are from tst.w d0, or from neg/negx
+				! when v was negated)
+	tst.w	d2
+	bne	2f		! case 2)
+! === case 1: _l x _l ===
+	mulu	d3,d1		! r.l = u.l x v.l
+9:				! (r.h is already zero)
+	tst.w	d5		! negate result if necessary
+	bpl	0f
 	neg.l	d1
 	negx.l	d0
+0:	move.w	(sp)+,d5	! return
+	move.l	a0,d3
+	jmp	(a1)
+! === possibly case 2) or case 3) ===
+1:
+	tst.w	d2
+	bne	3f		! case 3)
+! === case 2: _l x hl ===
+	exg	d0,d2		! exchange u and v
+	exg	d1,d3		! (minimizes number of distinct cases)
+2:
+	mulu	d1,d2		! a = v.l x u.h
+	mulu	d3,d1		! r.l = v.l x u.l
+	swap	d2		! a = a << 16
+	clr.l	d3
+	move.w	d2,d3
+	clr.w	d2
+	add.l	d2,d1		! r += a
+	addx.l	d3,d0
+	bra	9b
+! === case 3: hl x hl ===
 3:
-	move.l	(sp)+,d5
-	jmp	(a0)
+	move.l	d4,-(sp)	! need more registers
+	move.w	d2,d4
+	mulu	d1,d4		! a = v.l x u.h
+	mulu	d3,d1		! r.l = u.l x v.l
+	mulu	d0,d3		! b = v.h x u.l
+	mulu	d2,d0		! r.h = u.h x v.h
+	swap	d1		! (just for simplicity)
+	add.w	d4,d1		! r += a << 16
+	clr.w	d4
+	swap	d4
+	addx.l	d4,d0
+	add.w	d3,d1		! r += b << 16
+	clr.w	d3
+	swap	d3
+	addx.l	d3,d0
+	swap	d1
+	move.l	(sp)+,d4	! return
+	bra	9b
--- a/mach/m68k2/libem/mlu.s
+++ b/mach/m68k2/libem/mlu.s
@ -4,38 +4,79 @@
 .sect .data
 .sect .bss

+ ! unsigned long multiply
+ !-----------------------------------------------------------------------------
+ ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
+ !   #1  01/12/90  initial revision
+ !-----------------------------------------------------------------------------
+ !   3 cases worth to recognize :
+ !	1) both the upper word of u and v are zero
+ !	    => 1 mult : Low*Low
+ !	2) only one of the upper words is zero
+ !	    => 2 mult : Low*HighLow
+ !	3) both upper words are not zero
+ !	    => 4 mult : HighLow*HighLow
+ !   there are other cases (e.g. lower word is zero but high word is not, or
+ !   one operand is all zero). However, this seems not to be very common, so
+ !   they are ignored for the price of superfluous multiplications in these
+ !   cases.
+ !-----------------------------------------------------------------------------
+
 ! entry : d0 multiplicand
 !         d1 multiplier
 ! exit  : d0 high order result
 !         d1 low order result
+ !         d2,a0,a1 : destroyed

 	.sect .text
 .mlu:
-	move.l	(sp)+,a1
-	move.l	(sp)+,d1
-	move.l	(sp)+,d0
-	movem.l	d3/d4/d6,-(sp)
-	move.l	d1,d3
-	move.l	d1,d2
-	swap	d2
-	move.l	d2,d4
-	mulu	d0,d1
-	mulu	d0,d2
-	swap	d0
-	mulu	d0,d3
-	mulu	d4,d0
-	clr.l	d6
-	swap	d1
-	add	d2,d1
-	addx.l	d6,d0
-	add	d3,d1
-	addx.l	d6,d0
-	swap	d1
-	clr	d2
-	clr	d3
-	swap	d2
-	swap	d3
-	add.l	d2,d0
-	add.l	d3,d0
-	movem.l	(sp)+,d3/d4/d6
+! Unsigned 32 x 32 -> 64 bit multiply.  The return address and both
+! operands are popped from the stack; the operands arrive as four words:
+! d0:d1 = v (high, low word), d2:d3 = u (high, low word).  The product
+! is returned in d0 (high long) and d1 (low long).  d3 is parked in a0
+! and restored on every exit path; d4 is saved on the stack in case 3
+! only.  Each path returns through a1.
+	move.l	(sp)+,a1	! return address
+	move.l	d3,a0		! save register
+	movem.w	(sp)+,d0-d3	! get v and u
+	tst.w	d0
+	bne	1f		! case 2) or 3)
+	tst.w	d2
+	bne	2f		! case 2)
+! === case 1: _l x _l ===
+	mulu	d3,d1		! r.l = u.l x v.l
+	move.l	a0,d3		! (r.h is already zero)
+	jmp	(a1)		! return
+! === possibly case 2) or case 3) ===
+1:
+	tst.w	d2
+	bne	3f		! case 3)
+! === case 2: _l x hl ===
+	exg	d0,d2		! exchange u and v
+	exg	d1,d3		! (minimizes number of distinct cases)
+2:
+	mulu	d1,d2		! a = v.l x u.h
+	mulu	d3,d1		! r.l = v.l x u.l
+	swap	d2		! a = a << 16
+	clr.l	d3
+	move.w	d2,d3
+	clr.w	d2
+	add.l	d2,d1		! r += a
+	addx.l	d3,d0
+	move.l	a0,d3		! return
+	jmp	(a1)
+! === case 3: hl x hl ===
+3:
+	move.l	d4,-(sp)	! need more registers
+	move.w	d2,d4
+	mulu	d1,d4		! a = v.l x u.h
+	mulu	d3,d1		! r.l = u.l x v.l
+	mulu	d0,d3		! b = v.h x u.l
+	mulu	d2,d0		! r.h = u.h x v.h
+	swap	d1		! (just for simplicity)
+	add.w	d4,d1		! r += a << 16
+	clr.w	d4
+	swap	d4
+	addx.l	d4,d0
+	add.w	d3,d1		! r += b << 16
+	clr.w	d3
+	swap	d3
+	addx.l	d3,d0
+	swap	d1
+	move.l	(sp)+,d4	! return
+	move.l	a0,d3
 	jmp	(a1)