Added end library and floating point processor support

1991-02-01 15:09:58 +00:00 · 1991-02-01 15:09:58 +00:00 · 0f4e675b50
commit 0f4e675b50
parent e64fb88a5d
13 changed files with 718 additions and 124 deletions
--- a/mach/m68k2/libem/.distr
+++ b/mach/m68k2/libem/.distr
@ -1,5 +1,9 @@
 LIST
 Makefile
 compmodule
 em_end.s
 etext.s
 edata.s
 end.s
 libem_s.a
 READ_ME
--- a/mach/m68k2/libem/LIST
+++ b/mach/m68k2/libem/LIST
@ -26,7 +26,7 @@ shp.s
 sig.s
 cms.s
 gto.s
-ffp.s
+fp68881.s
 fat.s
 trp.s
 dia.s
--- a/mach/m68k2/libem/Makefile
+++ b/mach/m68k2/libem/Makefile
@ -1,28 +1,33 @@
 # $Header$
 # Builds libem_o.a and end.a for the machine named by MACH.  ASAR names the
 # archiver: it is used directly for end.a and exported to march for libem_o.a.
 MACH=m68k2
-all:            libem_o.a end.o
+ASAR=aal
 all:		libem_o.a end.a
 install: 	all
 		../../install libem_o.a tail_em
-		../../install end.o end_em
+		../../install end.a end_em
 cmp:		all
 		-../../compare libem_o.a tail_em
-		-../../compare end.o end_em
+		-../../compare end.a end_em
 # end.a archives the four separately assembled end-of-segment label modules.
-end.o:		end.s
+end.a:		em_end.s etext.s edata.s end.s
 		$(MACH) -I../../../h -c em_end.s
 		$(MACH) -I../../../h -c edata.s
 		$(MACH) -I../../../h -c etext.s
 		$(MACH) -I../../../h -c end.s
 		$(ASAR) cr end.a em_end.o etext.o edata.o end.o
 libem_o.a:	libem_s.a
-		ASAR=aal ; export ASAR ;\
+		ASAR=$(ASAR) ; export ASAR ;\
 		march . libem_o.a
 clean:
-		rm -f *.o libem_o.a
+		rm -f *.o libem_o.a end.a
 # opr/pr: print the library archive listing and the end-label sources.
 opr :
 		make pr | opr
 pr:
 		@arch pv libem_s.a | pr -h `pwd`/libem_s.a
-		@pr `pwd`/end.s
+		@pr `pwd`/em_end.s `pwd`/edata.s `pwd`/etext.s `pwd`/end.s
--- a/mach/m68k2/libem/READ_ME
+++ b/mach/m68k2/libem/READ_ME
@ -1,5 +1,4 @@
-The original EM library routines saved all registers
+The routines in mli.s, mlu.s, dvi.s, and dvu.s are written by
-(including scratch registers) in global data; hence they
+Kai-Uwe Bloem and were published on the comp.os.minix newsgroup.
-were not reentrant.
+He allowed us to use them for ACK, but requested that
-The new routines do not save registers d0,d1,d2,a0 and a1.
+they do not fall under the ACK copyright notice. So, they don't.
 They are reentrant.
--- a/mach/m68k2/libem/dvi.s
+++ b/mach/m68k2/libem/dvi.s
@ -5,38 +5,96 @@
 .sect .bss
 ! signed long divide
 !-----------------------------------------------------------------------------
 ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
 !   #1  01/12/90  initial revision. Minor reduce of shift operations.
 !   #2  03/07/90  use 68000 divu instruction wherever possible. This change
 !		   makes #1 superfluous. (derived from my GNU division routine)
 !-----------------------------------------------------------------------------
 ! Some common cases can be handled in a special, much faster way :
 !      1) divisor = 0
 !          => cause trap, then return to user. Result is undefined
 !      2) dividend < divisor
 !          => quotient = 0, remainder = dividend
 !      3) divisor < 0x10000 ( i.e. divisor is only 16 bits wide )
 !          => quotient and remainder can be calculated quite fast by repeated
 !             application of 68000 divu operations (ca. 400 cycles)
 !      4) otherwise (due to #2, #3 dividend, divisor both wider than 16 bits)
 !          => do slow division by shift and subtract
 !-----------------------------------------------------------------------------
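 ! calling convention (all three stack items are popped by this routine):
 ! entry : (sp)  return address
 !         4(sp) divisor
 !         8(sp) dividend
 ! exit  : d1 quotient
 !         d2 remainder
 !         d0,a0,a1 : destroyed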
 	.sect .text
 ! .dvi: signed 32-bit divide.  Pops the return address, the divisor and the
 ! dividend (in that order) and returns the quotient in d1 and the remainder
 ! in d2.  d3 is parked in a0 and d4 on the stack while used as scratch.
 ! Sign bookkeeping in d4: bit 31 is set iff the dividend was negative (sign
 ! of the remainder); bit 15 is set iff exactly one operand was negative (sign
 ! of the quotient).  The division itself is done on the absolute values.
 .dvi:
-	move.l	(sp)+,a0	! return address
+	move.l	(sp)+,a1	! return address
-	move.l	(sp)+,d0
+	move.l	(sp)+,d0	! divisor
-	move.l	(sp)+,d1
+	move.l	(sp)+,d2	! dividend
-	move.l	d3,-(sp)	! save d3 and d4
+	move.l	d3,a0		! save d3
-	move.l	d4,-(sp)
+	move.l	d4,-(sp)	! save result sign register
 	clr.l	d4
-	tst.l	d0	! divisor
+	tst.l	d2
-	bpl	1f
+	bpl	0f		! dividend is negative ?
-	neg.l	d0
+	neg.l	d2		! yes - negate
-	not	d4
+	not.l	d4		! and note negation in d4
-1:
+0:
-	tst.l	d1	! dividend
+	tst.l	d0
-	bpl	2f
+	bpl	0f		! divisor is negative ?
-	neg.l	d1
+	neg.l	d0		! yes - negate
-	not	d4
+	not.w	d4		! note negation
-	swap	d4
+0:
-	not	d4
+	clr.l	d1		! prepare quotient
-	swap	d4
+! === case 1: divisor = 0
 	tst.l	d0		! divisor = 0 ?
 	beq	9f		! yes - divide by zero trap
 ! === case 2: dividend < divisor
 	cmp.l	d0,d2		! dividend < divisor ?
 	bcs	8f		! yes - division already finished
 ! === case 3: divisor <= 0x0ffff
 	cmp.l	#0x0ffff,d0	! is divisor only 16 bits wide ?
 	bhi	2f
 	move.w	d2,d3		! save dividend.l
 	clr.w	d2		! prepare dividend.h for divu operation
 	swap	d2
 	beq	0f		! dividend.h is all zero, no divu necessary
 	divu	d0,d2
 0:	move.w	d2,d1		! save quotient.h
 	swap	d1
 	move.w	d3,d2		! divide dividend.l
 	divu	d0,d2		! (d2.h = remainder of prev divu)
 	move.w	d2,d1		! save quotient.l
 	clr.w	d2		! get remainder
 	swap	d2
 	bra	8f
 ! === case 4: divisor and dividend both > 0x0ffff
 2:
-	move.l	d1,-(sp)
+	move	#32-1,d3	! loop count
-	move.l	d0,-(sp)
+4:
-	jsr	.dvu
+	lsl.l	#1,d2		! shift dividend ...
-	tst	d4
+	roxl.l	#1,d1		!  ... into d1
-	beq	5f
+	cmp.l	d0,d1		! compare with divisor
-	neg.l	d1	! quotient
+	bcs	5f
 	sub.l	d0,d1		! bigger, subtract divisor
 	add	#1,d2		! note subtraction in result
 5:
-	tst.l	d4
+	dbra	d3,4b
-	bpl	6f
+	exg	d1,d2		! get results in the correct registers
-	neg.l	d2	! remainder
+8:
-6:
+	tst.w	d4		! quotient < 0 ?
-	move.l	(sp)+,d4	! restore d4 and d3
+	bpl	0f
-	move.l	(sp)+,d3
+	neg.l	d1		! yes - negate
-	jmp	(a0)
+0:	tst.l	d4		! remainder < 0 ?
 	bpl	0f
 	neg.l	d2
 0:	move.l	(sp)+,d4	! restore d4
 	move.l	a0,d3		! restore d3
 	jmp	(a1)
 EIDIVZ	= 6
 ! === divide by zero: raise trap EIDIVZ, then return to the caller with an
 ! undefined result.  Without the final branch, execution would run off the
 ! end of this routine as soon as .trp returns.
 ! Assumes .trp pops its word argument and preserves a0, a1 and d4 - TODO confirm.
 9:	move.w	#EIDIVZ,-(sp)
 	jsr	.trp
 	bra	8b		! common exit: restore d4 and d3, return
--- a/mach/m68k2/libem/dvu.s
+++ b/mach/m68k2/libem/dvu.s
@ -5,34 +5,77 @@
 .sect .bss
 ! unsigned long divide
 !-----------------------------------------------------------------------------
 ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
 !   #1  01/12/90  initial revision. Minor reduce of shift operations.
 !   #2  03/07/90  use 68000 divu instruction wherever possible. This change
 !		   makes #1 superfluous. (derived from my GNU division routine)
 !-----------------------------------------------------------------------------
 ! Some common cases can be handled in a special, much faster way :
 !      1) divisor = 0
 !          => cause trap, then return to user. Result is undefined
 !      2) dividend < divisor
 !          => quotient = 0, remainder = dividend
 !      3) divisor < 0x10000 ( i.e. divisor is only 16 bits wide )
 !          => quotient and remainder can be calculated quite fast by repeated
 !             application of 68000 divu operations (ca. 400 cycles)
 !      4) otherwise (due to #2, #3 dividend, divisor both wider than 16 bits)
 !          => do slow division by shift and subtract
 !-----------------------------------------------------------------------------
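 ! calling convention (all three stack items are popped by this routine):
 ! entry : (sp)  return address
 !         4(sp) divisor
 !         8(sp) dividend
 ! exit  : d1 quotient
 !         d2 remainder
 !         a0,a1 : destroyed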
 	.sect .text
 ! .dvu: unsigned 32-bit divide.  Pops the return address, the divisor and the
 ! dividend (in that order) and returns the quotient in d1 and the remainder
 ! in d2.  d3 is parked in a0 while used as scratch.
 .dvu:
 	move.l	d3,a0		! save d3
 	move.l	(sp)+,a1	! return address
-	move.l	(sp)+,d0
+	move.l	(sp)+,d0	! divisor
-	move.l	(sp)+,d1
+	move.l	(sp)+,d2	! dividend
-	move.l	d3,-(sp)	! save d3
+	clr.l	d1		! prepare quotient
-	tst.l	d0
+! === case 1: divisor = 0
-	bne	0f
+	tst.l	d0		! divisor = 0 ?
-	move.l	(sp)+,d3
+	beq	9f		! yes - divide by zero trap
-	move.w	#EIDIVZ,-(sp)
+! === case 2: dividend < divisor
-	jsr	.trp
+	cmp.l	d0,d2		! dividend < divisor ?
-0:
+	bcs	8f		! yes - division already finished
-	clr.l	d2
+! === case 3: divisor <= 0x0ffff
-	move.l	#32,d3
+	cmp.l	#0x0ffff,d0	! is divisor only 16 bits wide ?
-3:
+	bhi	2f
-	lsl.l	#1,d1
+	move.w	d2,d3		! save dividend.l
-	roxl.l	#1,d2
+	clr.w	d2		! prepare dividend.h for divu operation
-	cmp.l	d0,d2
+	swap	d2
-	blt	4f
+	beq	0f		! dividend.h is all zero, no divu necessary
-	sub.l	d0,d2
+	divu	d0,d2
-	add	#1,d1
+0:	move.w	d2,d1		! save quotient.h
 	swap	d1
 	move.w	d3,d2		! divide dividend.l
 	divu	d0,d2		! (d2.h = remainder of prev divu)
 	move.w	d2,d1		! save quotient.l
 	clr.w	d2		! get remainder
 	swap	d2
 	bra	8f
 ! === case 4: divisor and dividend both > 0x0ffff
 2:
 	move	#32-1,d3	! loop count
 4:
-	sub	#1,d3
+	lsl.l	#1,d2		! shift dividend ...
-	bgt	3b
+	roxl.l	#1,d1		!  ... into d1
-	move.l	(sp)+,d3
+	cmp.l	d0,d1		! compare with divisor
 	bcs	5f
 	sub.l	d0,d1		! bigger, subtract divisor
 	add	#1,d2		! note subtraction in result
 5:
 	dbra	d3,4b
 	exg	d1,d2		! get results in the correct registers
 8:
 	move.l	a0,d3		! restore d3
 	jmp	(a1)
 EIDIVZ	= 6
 ! === divide by zero: raise trap EIDIVZ, then return to the caller with an
 ! undefined result.  Without the final branch, execution would run off the
 ! end of this routine as soon as .trp returns.
 ! Assumes .trp pops its word argument and preserves a0 and a1 - TODO confirm.
 9:	move.w	#EIDIVZ,-(sp)
 	jsr	.trp
 	bra	8b		! common exit: restore d3, return
--- a/mach/m68k2/libem/edata.s
+++ b/mach/m68k2/libem/edata.s
@ -0,0 +1,9 @@
 ! edata.s: defines the label _edata (exported with .define) in the .data
 ! section, after aligning that section to 4.
 .sect .text
 .sect .rom
 .sect .data
 .sect .bss
 .define	_edata
 .sect .data
 	.align 4
 	.sect .data
 _edata:
--- a/mach/m68k2/libem/em_end.s
+++ b/mach/m68k2/libem/em_end.s
@ -0,0 +1,22 @@
 ! em_end.s: aligns .text, .rom, .data and .bss to 4 and defines the labels
 ! endtext (in .text), enddata (in .data) and __end / endbss (in .end).
 .sect .text
 .sect .rom
 .sect .data
 .sect .bss
 .define	endtext,enddata,endbss,__end
 .sect .text
 	.align 4
 .sect .rom
 	.align 4
 .sect .data
 	.align 4
 .sect .bss
 	.align 4
 .sect .end ! only for declaration of _end, __end and endbss.
 	.sect .text
 endtext:
 	.sect .data
 enddata:
 	.sect .end
 __end:
 endbss:
--- a/mach/m68k2/libem/end.s
+++ b/mach/m68k2/libem/end.s
@ -1,16 +1,7 @@
 .define	endtext,enddata,endbss,_etext,_edata,_end
 .sect .text
 .sect .rom
 .sect .data
 .sect .bss
-.sect .end ! only for declaration of _end and endbss.
+.define	_end
-
+.sect .end ! only for declaration of _end, __end and endbss.
 	.sect .text
 endtext:
 _etext:
 	.sect .data
 enddata:
 _edata:
 	.sect .end
 _end:
 endbss:
--- a/mach/m68k2/libem/etext.s
+++ b/mach/m68k2/libem/etext.s
@ -0,0 +1,9 @@
 ! etext.s: defines the label _etext (exported with .define) in the .text
 ! section, after aligning that section to 4.
 .sect .text
 .sect .rom
 .sect .data
 .sect .bss
 .define	_etext
 .sect .text
 	.align 4
 	.sect .text
 _etext:
--- a/mach/m68k2/libem/fp68881.s
+++ b/mach/m68k2/libem/fp68881.s
@ -0,0 +1,352 @@
 .define .adf4, .adf8, .sbf4, .sbf8, .mlf4, .mlf8, .dvf4, .dvf8
 .define .ngf4, .ngf8, .fif4, .fif8, .fef4, .fef8
 .define .cif4, .cif8, .cuf4, .cuf8, .cfi, .cfu, .cff4, .cff8
 .define .cmf4, .cmf8
 .sect .text
 .sect .rom
 .sect .data
 .sect .bss
 !	$Header$
 !	Implement interface to floating point package for M68881
 	.sect .text
 ! .adf4: single-precision add.  After the return address is popped into a0,
 ! the floats at (sp) and 4(sp) are added and the sum is stored at 4(sp).
 ! sp is not adjusted any further here.
 .adf4:
 	move.l	(sp)+,a0	! return address
 	fmove.s (sp),fp0	! fp0 = operand on top of the stack
 	fadd.s	4(sp),fp0	! fp0 += operand below it
 	fmove.s	fp0,4(sp)	! sum overwrites the lower operand
 	jmp	(a0)
 ! .adf8: double-precision add.  After the return address is popped into a0,
 ! the doubles at (sp) and 8(sp) are added and the sum is stored at 8(sp).
 ! sp is not adjusted any further here.
 .adf8:
 	move.l	(sp)+,a0	! return address
 	fmove.d (sp),fp0	! fp0 = operand on top of the stack
 	fadd.d	8(sp),fp0	! fp0 += operand below it
 	fmove.d	fp0,8(sp)	! sum overwrites the lower operand
 	jmp	(a0)
 ! .sbf4: single-precision subtract.  Computes 4(sp) - (sp) (offsets taken
 ! after the return address is popped) and stores the difference at 4(sp).
 .sbf4:
 	move.l	(sp)+,a0	! return address
 	fmove.s (sp),fp0	! fp0 = subtrahend (top of stack)
 	fmove.s	4(sp),fp1	! fp1 = minuend
 	fsub	fp0,fp1		! fp1 = fp1 - fp0
 	fmove.s	fp1,4(sp)
 	jmp	(a0)
 ! .sbf8: double-precision subtract.  Computes 8(sp) - (sp) (offsets taken
 ! after the return address is popped) and stores the difference at 8(sp).
 .sbf8:
 	move.l	(sp)+,a0	! return address
 	fmove.d (sp),fp0	! fp0 = subtrahend (top of stack)
 	fmove.d	8(sp),fp1	! fp1 = minuend
 	fsub	fp0,fp1		! fp1 = fp1 - fp0
 	fmove.d	fp1,8(sp)
 	jmp	(a0)
 ! .mlf4: single-precision multiply.  After the return address is popped, the
 ! floats at (sp) and 4(sp) are multiplied and the product is stored at 4(sp).
 .mlf4:
 	move.l	(sp)+,a0	! return address
 	fmove.s (sp),fp0	! fp0 = operand on top of the stack
 	fmul.s	4(sp),fp0	! fp0 *= operand below it
 	fmove.s	fp0,4(sp)	! product overwrites the lower operand
 	jmp	(a0)
 ! .mlf8: double-precision multiply.  After the return address is popped, the
 ! doubles at (sp) and 8(sp) are multiplied and the product is stored at 8(sp).
 .mlf8:
 	move.l	(sp)+,a0	! return address
 	fmove.d (sp),fp0	! fp0 = operand on top of the stack
 	fmul.d	8(sp),fp0	! fp0 *= operand below it
 	fmove.d	fp0,8(sp)	! product overwrites the lower operand
 	jmp	(a0)
 ! .dvf4: single-precision divide.  Computes 4(sp) / (sp) (offsets taken
 ! after the return address is popped) and stores the quotient at 4(sp).
 .dvf4:
 	move.l	(sp)+,a0	! return address
 	fmove.s (sp),fp0	! fp0 = divisor (top of stack)
 	fmove.s	4(sp),fp1	! fp1 = dividend
 	fdiv	fp0,fp1		! fp1 = fp1 / fp0
 	fmove.s	fp1,4(sp)
 	jmp	(a0)
 ! .dvf8: double-precision divide.  Computes 8(sp) / (sp) (offsets taken
 ! after the return address is popped) and stores the quotient at 8(sp).
 .dvf8:
 	move.l	(sp)+,a0	! return address
 	fmove.d (sp),fp0	! fp0 = divisor (top of stack)
 	fmove.d	8(sp),fp1	! fp1 = dividend
 	fdiv	fp0,fp1		! fp1 = fp1 / fp0
 	fmove.d	fp1,8(sp)
 	jmp	(a0)
 ! .ngf4: negates, in place, the single-precision value that lies just above
 ! the return address.  The return address stays on the stack (rts is used).
 .ngf4:
 	fmove.s	4(sp),fp0	! operand sits above the return address
 	fneg	fp0
 	fmove.s	fp0,4(sp)
 	rts
 ! .ngf8: negates, in place, the double-precision value that lies just above
 ! the return address.  The return address stays on the stack (rts is used).
 .ngf8:
 	fmove.d	4(sp),fp0	! operand sits above the return address
 	fneg	fp0
 	fmove.d	fp0,4(sp)
 	rts
 ! .fif4: multiplies the two floats at 4(sp) and 8(sp) and splits the product
 ! into an integer part (rounded toward zero) and the remaining fraction.
 ! The long at (sp) is used as the address of the result area: the integer
 ! part is stored at (a1), the fraction at 4(a1).  Offsets are taken after the
 ! return address is popped.
 .fif4:
 	move.l	(sp)+,a0	! return address
 	move.l	(sp),a1		! a1 = address of the result area
 	fmove.s	4(sp),fp0
 	fmove.s	8(sp),fp1
 	fmul	fp0,fp1		! fp1 = product
 	fintrz	fp1,fp0		! fp0 = product truncated toward zero
 	fsub	fp0,fp1		! fp1 = product - integer part
 	fmove.s	fp1,4(a1)	! fraction
 	fmove.s	fp0,(a1)	! integer part
 	jmp	(a0)
 ! .fif8: multiplies the two doubles at 4(sp) and 12(sp) and splits the
 ! product into an integer part (rounded toward zero) and the remaining
 ! fraction.  The long at (sp) is used as the address of the result area: the
 ! integer part is stored at (a1), the fraction at 8(a1).  Offsets are taken
 ! after the return address is popped.
 .fif8:
 	move.l	(sp)+,a0	! return address
 	move.l	(sp),a1		! a1 = address of the result area
 	fmove.d	4(sp),fp0
 	fmove.d	12(sp),fp1
 	fmul	fp0,fp1		! fp1 = product
 	fintrz	fp1,fp0		! fp0 = product truncated toward zero
 	fsub	fp0,fp1		! fp1 = product - integer part
 	fmove.d	fp1,8(a1)	! fraction
 	fmove.d	fp0,(a1)	! integer part
 	jmp	(a0)
 ! .fef4: splits the float at 4(sp) into an exponent and a mantissa.  The long
 ! at (sp) is used as the address of the result area: a 16-bit exponent is
 ! stored at (a1) and a single-precision mantissa at 2(a1).  Offsets are taken
 ! after the return address is popped.
 !   - FPSR bit 0x2000 set after fgetexp: exponent 129, mantissa +0.5 or -0.5
 !   - mantissa zero: exponent 0, mantissa as returned by fgetman
 !   - otherwise: exponent = fgetexp + 1, mantissa = fgetman / 2
 .fef4:
 	move.l	(sp)+,a0	! return address
 	move.l	(sp),a1		! a1 = address of the result area
 	fmove.s	4(sp),fp0
 	fgetexp	fp0,fp1		! fp1 = unbiased exponent of the operand
 	fmove.l	fpsr,d0
 	and.l	#0x2000,d0	! set if Infinity
 	beq	1f
 	move.w	#129,(a1)
 	fmove.s	4(sp),fp0	! reload to get the sign into the FPU flags
 	fblt	2f
 	move.l	#0x3f000000,2(a1)	! bit pattern of +0.5 (single)
 	jmp	(a0)
 2:
 	move.l	#0xbf000000,2(a1)	! bit pattern of -0.5 (single)
 	jmp	(a0)
 1:
 	fmove.l	fp1,d0
 	add.l	#1,d0		! exponent + 1, matching the halved mantissa below
 	fgetman	fp0
 	fbne	1f
 	clr.l	d0		! zero mantissa: exponent is reported as 0
 	bra	2f
 1:
 	fmove.l	#2,fp1
 	fdiv	fp1,fp0		! mantissa / 2
 2:
 	fmove.s	fp0,2(a1)
 	move.w	d0,(a1)
 	jmp	(a0)
 ! .fef8: splits the double at 4(sp) into an exponent and a mantissa.  The long
 ! at (sp) is used as the address of the result area: a 16-bit exponent is
 ! stored at (a1) and a double-precision mantissa at 2(a1).  Offsets are taken
 ! after the return address is popped.
 !   - FPSR bit 0x2000 set after fgetexp: exponent 1025, mantissa +0.5 or -0.5
 !   - mantissa zero: exponent 0, mantissa as returned by fgetman
 !   - otherwise: exponent = fgetexp + 1, mantissa = fgetman / 2
 .fef8:
 	move.l	(sp)+,a0	! return address
 	move.l	(sp),a1		! a1 = address of the result area
 	fmove.d	4(sp),fp0
 	fgetexp	fp0,fp1		! fp1 = unbiased exponent of the operand
 	fmove.l	fpsr,d0
 	and.l	#0x2000,d0	! set if Infinity
 	beq	1f
 	move.w	#1025,(a1)
 	fmove.d	4(sp),fp0	! reload to get the sign into the FPU flags
 	fblt	2f
 	move.l	#0x3fe00000,2(a1)	! high long of +0.5 (double)
 	clr.l	6(a1)
 	jmp	(a0)
 2:
 	move.l	#0xbfe00000,2(a1)	! high long of -0.5 (double)
 	clr.l	6(a1)
 	jmp	(a0)
 1:
 	fmove.l	fp1,d0
 	add.l	#1,d0		! exponent + 1, matching the halved mantissa below
 	fgetman	fp0
 	fbne	1f
 	clr.l	d0		! zero mantissa: exponent is reported as 0
 	bra	2f
 1:
 	fmove.l	#2,fp1
 	fdiv	fp1,fp0		! mantissa / 2
 2:
 	fmove.d	fp0,2(a1)
 	move.w	d0,(a1)
 	jmp	(a0)
 ! .cif4: signed integer -> single.  After the return address is popped, the
 ! word at (sp) is compared with 2 to choose between a 16-bit and a 32-bit
 ! integer at 2(sp).  The 16-bit case stores the float at (sp), the 32-bit
 ! case at 2(sp); sp itself is not adjusted in either case.
 .cif4:
 	move.l	(sp)+,a0	! return address
 	cmp.w	#2,(sp)		! source size word
 	bne	1f
 	fmove.w	2(sp),fp0	! 16-bit source
 	fmove.s	fp0,(sp)
 	jmp	(a0)
 1:
 	fmove.l	2(sp),fp0	! 32-bit source
 	fmove.s	fp0,2(sp)
 	jmp	(a0)
 ! .cif8: signed integer -> double.  After the return address is popped, the
 ! word at (sp) is compared with 2 to choose between a 16-bit and a 32-bit
 ! integer at 2(sp).  Both cases store the double at (sp); sp is not adjusted.
 ! NOTE(review): 8 bytes are written at (sp) although only 4 or 6 bytes (size
 ! word + integer) are read from there - confirm the caller reserves the room.
 .cif8:
 	move.l	(sp)+,a0	! return address
 	cmp.w	#2,(sp)		! source size word
 	bne	1f
 	fmove.w	2(sp),fp0	! 16-bit source
 	fmove.d	fp0,(sp)
 	jmp	(a0)
 1:
 	fmove.l	2(sp),fp0	! 32-bit source
 	fmove.d	fp0,(sp)
 	jmp	(a0)
 ! .cuf4: unsigned integer -> single.  Same stack layout as the signed
 ! conversion: size word at (sp), integer at 2(sp), after the return address
 ! is popped.  The integer is first converted as if signed; when its top bit
 ! is set the result is corrected by adding 2^16 (16-bit source) or 2^32
 ! (32-bit source).  The 16-bit case stores the float at (sp), the 32-bit
 ! case at 2(sp).
 .cuf4:
 	move.l	(sp)+,a0	! return address
 	cmp.w	#2,(sp)		! source size word
 	bne	2f
 	fmove.w	2(sp),fp0
 	tst.w	2(sp)
 	bge	1f
 	fadd.l	#65536,fp0	! top bit set: add 2^16
 1:
 	fmove.s	fp0,(sp)
 	jmp	(a0)
 2:
 	fmove.l	2(sp),fp0
 	tst.l	2(sp)
 	bge	1f
 	fsub.l	#-2147483648,fp0	! top bit set: add 2^32 as two
 	fsub.l	#-2147483648,fp0	! subtractions of -2^31
 1:
 	fmove.s	fp0,2(sp)
 	jmp	(a0)
 ! .cuf8: unsigned integer -> double.  Size word at (sp), integer at 2(sp),
 ! after the return address is popped.  The integer is first converted as if
 ! signed; when its top bit is set the result is corrected by adding 2^16
 ! (16-bit source) or 2^32 (32-bit source).  Both cases store the double at
 ! (sp); sp is not adjusted.
 ! NOTE(review): 8 bytes are written at (sp) although only 4 or 6 bytes are
 ! read from there - confirm the caller reserves the room.
 .cuf8:
 	move.l	(sp)+,a0	! return address
 	move.w	(sp),d0		! source size word
 	cmp.w	#2,d0
 	bne	2f
 	fmove.w	2(sp),fp0
 	tst.w	2(sp)
 	bge	1f
 	fadd.l	#65536,fp0	! top bit set: add 2^16
 	bra	1f
 2:
 	fmove.l	2(sp),fp0
 	tst.l	2(sp)
 	bge	1f
 	fsub.l	#-2147483648,fp0	! top bit set: add 2^32 as two
 	fsub.l	#-2147483648,fp0	! subtractions of -2^31
 1:
 	fmove.d	fp0,(sp)
 	jmp	(a0)
 ! .cfi: floating point -> signed integer.  After the return address is
 ! popped, the word at (sp) selects the integer result (2 = 16-bit, otherwise
 ! 32-bit) and the word at 2(sp) selects the source (4 = single, otherwise
 ! double); the source value starts at 4(sp).  The integer is stored in the
 ! last 2 or 4 bytes of the source operand; sp is the same on exit as right
 ! after the return address was popped.
 ! The value is truncated toward zero with fintrz before it is stored, as
 ! .cfu does; a plain fmove to an integer destination would round according
 ! to the current FPCR rounding mode instead.
 .cfi:
 	move.l	(sp)+,a0	! return address
 	move.w	(sp),d1		! d1 = selects 16-bit or 32-bit result
 	move.w	2(sp),d0	! d0 = selects single or double source
 	cmp.w	#4,d0
 	bne	1f
 	fmove.s	4(sp),fp0
 	bra	2f
 1:
 	fmove.d	4(sp),fp0
 	add.l	#4,sp		! double: shift offsets so the stores below hit its tail
 2:
 	fintrz	fp0,fp0		! truncate toward zero
 	cmp.w	#2,d1
 	bne	1f
 	fmove.w	fp0,6(sp)
 	bra	2f
 1:
 	fmove.l	fp0,4(sp)
 2:
 	cmp.w	#4,d0
 	beq	1f
 	sub.l	#4,sp		! undo the temporary adjustment for a double source
 1:
 	jmp	(a0)
 ! .cfu: floating point -> unsigned integer.  After the return address is
 ! popped, the word at (sp) selects the integer result (2 = 16-bit, otherwise
 ! 32-bit) and the word at 2(sp) selects the source (4 = single, otherwise
 ! double); the source value starts at 4(sp).  The absolute value is
 ! converted.  The raw high long of the operand is compared (signed) with the
 ! bit pattern of 2^31: below that the value is truncated directly; otherwise
 ! 2^31 is subtracted first and bit 31 of the integer is flipped afterwards.
 ! The integer is stored in the last 2 or 4 bytes of the source operand.
 .cfu:
 	move.l	(sp)+,a0	! return address
 	move.w	(sp),d1		! d1 = selects 16-bit or 32-bit result
 	move.w	2(sp),d2	! d2 = selects single or double source
 	cmp.w	#4,d2
 	bne	1f
 	fmove.s	4(sp),fp0
 	fabs	fp0
 	cmp.l	#0x4f000000,4(sp)	! 0x4f000000 = 2^31 as a single
 	bge	2f
 	fintrz	fp0,fp0
 	fmove.l	fp0,d0
 	bra	3f
 2:
 	fadd.l	#-2147483648,fp0	! bring the value into signed long range
 	fintrz	fp0,fp0
 	fmove.l	fp0,d0
 	bchg	#31,d0		! add the 2^31 back
 	bra	3f
 1:
 	fmove.d	4(sp),fp0
 	add.l	#4,sp		! double: shift offsets so the stores below hit its tail
 	fabs	fp0
 	cmp.l	#0x41e00000,(sp)	! 0x41e00000 = high long of 2^31 as a double
 	bge	1f
 	fintrz	fp0,fp0
 	fmove.l	fp0,d0
 	bra	3f
 1:
 	fadd.l	#-2147483648,fp0	! bring the value into signed long range
 	fintrz	fp0,fp0
 	fmove.l	fp0,d0
 	bchg	#31,d0		! add the 2^31 back
 3:
 	cmp.w	#2,d1
 	bne	1f
 	move.w	d0,6(sp)
 	bra	2f
 1:
 	move.l	d0,4(sp)
 2:
 	cmp.w	#4,d2
 	beq	1f
 	sub.l	#4,sp		! undo the temporary adjustment for a double source
 1:
 	jmp	(a0)
 ! .cff4: double -> single.  After the return address is popped, the double
 ! at (sp) is converted and the float is stored at 4(sp), i.e. in the second
 ! half of the source operand.  sp is not adjusted any further here.
 .cff4:
 	move.l	(sp)+,a0	! return address
 	fmove.d	(sp),fp0
 	fmove.s	fp0,4(sp)
 	jmp	(a0)
 ! .cff8: single -> double.  After the return address is popped, the float at
 ! (sp) is converted and the double is stored at (sp).  sp is not adjusted.
 ! NOTE(review): 8 bytes are written where a 4-byte float was read - confirm
 ! the caller reserves the extra 4 bytes.
 .cff8:
 	move.l	(sp)+,a0	! return address
 	fmove.s	(sp),fp0
 	fmove.d	fp0,(sp)
 	jmp	(a0)
 ! .cmf4: compares the floats at 4(sp) and (sp) (offsets taken after the
 ! return address is popped).  Returns in d0: 0 if they compare equal, -1 if
 ! 4(sp) < (sp), and +1 in every other case.  The operands are left on the
 ! stack.
 .cmf4:
 	move.l	(sp)+,a0	! return address
 	clr.l	d0
 	fmove.s	(sp),fp0
 	fmove.s	4(sp),fp1
 	fcmp	fp0,fp1		! flags reflect fp1 - fp0
 	fbeq	2f
 	fblt	1f
 	add.l	#1,d0
 	jmp	(a0)
 1:
 	sub.l	#1,d0
 2:
 	jmp	(a0)
 ! .cmf8: compares the doubles at 8(sp) and (sp) (offsets taken after the
 ! return address is popped).  Returns in d0: 0 if they compare equal, -1 if
 ! 8(sp) < (sp), and +1 in every other case.  The operands are left on the
 ! stack.
 .cmf8:
 	move.l	(sp)+,a0	! return address
 	clr.l	d0
 	fmove.d	(sp),fp0
 	fmove.d	8(sp),fp1
 	fcmp	fp0,fp1		! flags reflect fp1 - fp0
 	fbeq	2f
 	fblt	1f
 	add.l	#1,d0
 	jmp	(a0)
 1:
 	sub.l	#1,d0
 2:
 	jmp	(a0)
--- a/mach/m68k2/libem/mli.s
+++ b/mach/m68k2/libem/mli.s
@ -4,31 +4,92 @@
 .sect .data
 .sect .bss
 ! signed long multiply
 !-----------------------------------------------------------------------------
 ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
 !   #1  01/12/90  initial revision
 !-----------------------------------------------------------------------------
 !   3 cases worth to recognize :
 !	1) both the upper word of u and v are zero
 !	    => 1 mult : Low*Low
 !	2) only one of the upper words is zero
 !	    => 2 mult : Low*HighLow
 !	3) both upper words are not zero
 !	    => 4 mult : HighLow*HighLow
 !   there are other cases (e.g. lower word is zero but high word is not, or
 !   one operand is all zero). However, this seems not to be very common, so
 !   they are ignored for the price of superfluous multiplications in these
 !   cases.
 !-----------------------------------------------------------------------------
 ! entry : (sp)  return address
 !         4(sp) v (long), 8(sp) u (long) - all popped by this routine
 ! exit  : d0 high order result
 !         d1 low order result
 !         d2,a0,a1 : destroyed
 	.sect .text
 ! .mli: signed 32 x 32 multiply.  Pops the return address and both operands;
 ! movem.w loads them as four sign-extended words: d0:d1 = v (high:low) and
 ! d2:d3 = u (high:low).  Both operands are made non-negative, multiplied
 ! unsigned, and the 64-bit result in d0:d1 is negated when the operand signs
 ! differ.  d3 is parked in a0; the low word of d5 (saved on the stack) holds
 ! the result sign in its top bit.
 ! Only the low words of d0-d3 are meaningful after the negations; the upper
 ! words may still hold sign-extension bits, so d0 is never taken over from
 ! such a register without being rebuilt.
 .mli:
-	move.l	(sp)+,a0
+	move.l	(sp)+,a1	! return address
-	move.l	(sp)+,d1
+	move.l	d3,a0		! save register
-	move.l	(sp)+,d0
+	movem.w	(sp)+,d0-d3	! get v and u
-	move.l	d5,-(sp)
+	move.w	d5,-(sp)	! save sign register
-	clr	d5
+	move.w	d2,d5
-	tst.l	d0
+	bge	0f		! negate u if necessary
-	bpl	1f
+	neg.w	d3		! u = -u: low word first ...
-	neg.l	d0
+	negx.w	d2		! ... then high word with the borrow
-	not	d5
+0:	tst.w	d0
-1:
+	bge	0f		! negate v if necessary
-	tst.l	d1
+	eor.w	d0,d5		! d0 is negative here: flips the sign bit in d5
-	bpl	2f
+	neg.w	d1
-	neg.l	d1
+	negx.w	d0		! leaves Z clear unless both words of v are zero
-	not	d5
+0:	bne	1f		! case 2) or 3)
-2:
+	tst.w	d2
-	move.l	d0,-(sp)
+	bne	2f		! case 2)
-	move.l	d1,-(sp)
+! === case 1: _l x _l ===
-	jsr	.mlu
+	mulu	d3,d1		! r.l = u.l x v.l
-	tst	d5
+9:				! (r.h is already zero)
-	beq	3f
+	tst.w	d5		! negate result if necessary
 	bpl	0f
 	neg.l	d1
 	negx.l	d0
 0:	move.w	(sp)+,d5	! return
 	move.l	a0,d3
 	jmp	(a1)
 ! === possibly case 2) or case 3) ===
 1:
 	tst.w	d2
 	bne	3f		! case 3)
 ! === case 2: _l x hl ===
 	move.w	d0,d2		! exchange u and v: u.h is zero here, so only
 	clr.l	d0		! v.h has to move; d0 becomes a clean zero
 	exg	d1,d3		! (minimizes number of distinct cases)
 2:
 	mulu	d1,d2		! a = v.l x u.h
 	mulu	d3,d1		! r.l = v.l x u.l
 	swap	d2		! a = a << 16
 	clr.l	d3
 	move.w	d2,d3
 	clr.w	d2
 	add.l	d2,d1		! r += a
 	addx.l	d3,d0
 	bra	9b
 ! === case 3: hl x hl ===
 3:
-	move.l	(sp)+,d5
+	move.l	d4,-(sp)	! need more registers
-	jmp	(a0)
+	move.w	d2,d4
 	mulu	d1,d4		! a = v.l x u.h
 	mulu	d3,d1		! r.l = u.l x v.l
 	mulu	d0,d3		! b = v.h x u.l
 	mulu	d2,d0		! r.h = u.h x v.h
 	swap	d1		! (just for simplicity)
 	add.w	d4,d1		! r += a << 16
 	clr.w	d4
 	swap	d4
 	addx.l	d4,d0
 	add.w	d3,d1		! r += b << 16
 	clr.w	d3
 	swap	d3
 	addx.l	d3,d0
 	swap	d1
 	move.l	(sp)+,d4	! return
 	bra	9b
--- a/mach/m68k2/libem/mlu.s
+++ b/mach/m68k2/libem/mlu.s
@ -4,38 +4,79 @@
 .sect .data
 .sect .bss
 ! unsigned long multiply
 !-----------------------------------------------------------------------------
 ! rewritten by Kai-Uwe Bloem (i5110401@dbstu1.bitnet) for speed.
 !   #1  01/12/90  initial revision
 !-----------------------------------------------------------------------------
 !   3 cases worth to recognize :
 !	1) both the upper word of u and v are zero
 !	    => 1 mult : Low*Low
 !	2) only one of the upper words is zero
 !	    => 2 mult : Low*HighLow
 !	3) both upper words are not zero
 !	    => 4 mult : HighLow*HighLow
 !   there are other cases (e.g. lower word is zero but high word is not, or
 !   one operand is all zero). However, this seems not to be very common, so
 !   they are ignored for the price of superfluous multiplications in these
 !   cases.
 !-----------------------------------------------------------------------------
 ! entry : (sp)  return address
 !         4(sp) v (long), 8(sp) u (long) - all popped by this routine
 ! exit  : d0 high order result
 !         d1 low order result
 !         d2,a0,a1 : destroyed
 	.sect .text
 ! .mlu: unsigned 32 x 32 multiply.  Pops the return address and both
 ! operands; movem.w loads them as four words: d0:d1 = v (high:low) and
 ! d2:d3 = u (high:low).  The 64-bit product is returned in d0:d1.  d3 is
 ! parked in a0 while used as scratch; case 3 also saves d4 on the stack.
 .mlu:
-	move.l	(sp)+,a1
+	move.l	(sp)+,a1	! return address
-	move.l	(sp)+,d1
+	move.l	d3,a0		! save register
-	move.l	(sp)+,d0
+	movem.w	(sp)+,d0-d3	! get v and u
-	movem.l	d3/d4/d6,-(sp)
+	tst.w	d0
-	move.l	d1,d3
+	bne	1f		! case 2) or 3)
-	move.l	d1,d2
+	tst.w	d2
-	swap	d2
+	bne	2f		! case 2)
-	move.l	d2,d4
+! === case 1: _l x _l ===
-	mulu	d0,d1
+	mulu	d3,d1		! r.l = u.l x v.l
-	mulu	d0,d2
+	move.l	a0,d3		! (r.h is already zero)
-	swap	d0
+	jmp	(a1)		! return
-	mulu	d0,d3
+! === possibly case 2) or case 3) ===
-	mulu	d4,d0
+1:
-	clr.l	d6
+	tst.w	d2
-	swap	d1
+	bne	3f		! case 3)
-	add	d2,d1
+! === case 2: _l x hl ===
-	addx.l	d6,d0
+	exg	d0,d2		! exchange u and v
-	add	d3,d1
+	exg	d1,d3		! (minimizes number of distinct cases)
-	addx.l	d6,d0
+2:
-	swap	d1
+	mulu	d1,d2		! a = v.l x u.h
-	clr	d2
+	mulu	d3,d1		! r.l = v.l x u.l
-	clr	d3
+	swap	d2		! a = a << 16
-	swap	d2
+	clr.l	d3
-	swap	d3
+	move.w	d2,d3
-	add.l	d2,d0
+	clr.w	d2
-	add.l	d3,d0
+	add.l	d2,d1		! r += a
-	movem.l	(sp)+,d3/d4/d6
+	addx.l	d3,d0
 	move.l	a0,d3		! return
 	jmp	(a1)
 ! === case 3: hl x hl ===
 3:
 	move.l	d4,-(sp)	! need more registers
 	move.w	d2,d4
 	mulu	d1,d4		! a = v.l x u.h
 	mulu	d3,d1		! r.l = u.l x v.l
 	mulu	d0,d3		! b = v.h x u.l
 	mulu	d2,d0		! r.h = u.h x v.h
 	swap	d1		! (just for simplicity)
 	add.w	d4,d1		! r += a << 16
 	clr.w	d4		! clr and swap leave the X flag alone, so the
 	swap	d4		! carry of the add.w above reaches the addx
 	addx.l	d4,d0
 	add.w	d3,d1		! r += b << 16
 	clr.w	d3
 	swap	d3
 	addx.l	d3,d0
 	swap	d1
 	move.l	(sp)+,d4	! return
 	move.l	a0,d3
 	jmp	(a1)