From e867861f6d84947514a90396d426a6472ffc3131 Mon Sep 17 00:00:00 2001
From: George Koehler
Date: Tue, 24 Sep 2019 13:32:17 -0400
Subject: [PATCH] Add 8-byte long long for linux68k.

Add rules for 8-byte integers to m68020 ncg.  Add 8-byte long long to
ACK C on linux68k.  Enable long-long tests for linux68k.  The tests
pass in our emulator using Musashi; I don't have a real 68k processor
and haven't tried other emulators.

Still missing are conversions between 8-byte integers and floats of
any size.  The long-long tests don't cover these conversions, and our
emulator can't do floating point.

Our build always enables TBL68020 and uses word size 4.  Without
TBL68020, 8-byte multiply and divide are missing.  With word size 2,
some conversions between 2-byte and 8-byte integers are missing.

Fix .cii in libem, which didn't work when converting from 1-byte or
2-byte integers.  Now .cii and .cuu work, but also add some rules to
skip .cii and .cuu when converting 8-byte integers.

The new rule for loc 4 loc 8 cii `with test_set4` exposes a bug: the
table may believe that the condition codes test a 4-byte register
when they only test a word or byte, and this incorrect test may
describe an unsigned word or byte as negative.  Another rule `with
exact test_set1+test_set2` works around the bug by ignoring the
negative flag, because a zero-extended word or byte is never
negative.

The old rules for comparison and logic do work with 8-byte integers
and bitsets, but add some specific 8-byte rules to skip libem calls
or loops.  There were no rules for 8-byte arithmetic, shift, or
rotate, so add some.  There is a register shortage: the table
requires preserving d3 to d7, leaving only 3 data registers (d0, d1,
d2) for 8-byte operations.  Because of the shortage, the code may
move data to an address register, or read a memory location more than
once.

The multiplication and division code is translated from the i386
code.  It passes the tests, but might not give the best performance
on a real 68k processor.
---
 mach/m68020/libem/build.lua   |   2 +-
 mach/m68020/libem/cii.s       |  19 +-
 mach/m68020/libem/divrem8.s   |  76 +++++++
 mach/m68020/libem/dvi8.s      |  34 +++
 mach/m68020/libem/dvu8.s      |  20 ++
 mach/m68020/libem/rmi8.s      |  35 +++
 mach/m68020/libem/rmu8.s      |  22 ++
 mach/m68020/ncg/table         | 404 +++++++++++++++++++++++++++++++++-
 plat/linux68k/descr           |   7 +-
 plat/linux68k/tests/build.lua |   1 -
 10 files changed, 608 insertions(+), 12 deletions(-)
 create mode 100644 mach/m68020/libem/divrem8.s
 create mode 100644 mach/m68020/libem/dvi8.s
 create mode 100644 mach/m68020/libem/dvu8.s
 create mode 100644 mach/m68020/libem/rmi8.s
 create mode 100644 mach/m68020/libem/rmu8.s

diff --git a/mach/m68020/libem/build.lua b/mach/m68020/libem/build.lua
index d17adcd92..d5c9af8ad 100644
--- a/mach/m68020/libem/build.lua
+++ b/mach/m68020/libem/build.lua
@@ -2,7 +2,7 @@ for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
 		srcs = {
-			"./*.s",
+			"./*.s", -- added divrem8.s
 			"./*.c"
 		},
 		vars = { plat = plat },
diff --git a/mach/m68020/libem/cii.s b/mach/m68020/libem/cii.s
index 01757dfc5..b3dd8c0be 100644
--- a/mach/m68020/libem/cii.s
+++ b/mach/m68020/libem/cii.s
@@ -13,12 +13,19 @@
 	sub.l	d0, sp		! pop extra bytes
 	jmp	(a0)
 1:
-	clr.l	d1
-	tst.l	(sp)
-	bne	4f
-	not.l	d1		! d1 contains sign of source
-4:
-	asr.l	#2, d0
+	move.l	(sp), d1
+	lsr.l	#1, d0
+	bcs	1f		! branch if source size == 1
+	lsr.l	#1, d0
+	bcs	2f		! branch if source size == 2
+	tst.l	d1
+	bra	4f
+1:	lsr.l	#1, d0		! size difference / 4
+	ext.w	d1
+2:	ext.l	d1
+	move.l	d1, (sp)
+4:	slt	d1
+	extb.l	d1		! d1 contains sign of source
 	sub.l	#1, d0
 2:
 	move.l	d1, -(sp)
diff --git a/mach/m68020/libem/divrem8.s b/mach/m68020/libem/divrem8.s
new file mode 100644
index 000000000..557924098
--- /dev/null
+++ b/mach/m68020/libem/divrem8.s
@@ -0,0 +1,76 @@
+.define .divrem8
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+
+yh=16
+yl=20
+xh=24
+xl=28
+	! This private sub for .dvi8, .dvu8, .rmi8, .rmu8
+	! does unsigned division of x = xh:xl by y = yh:yl,
+	! yields d0:d1 = quotient, d2:d3 = remainder.
+
+.sect .text
+.divrem8:
+	! Caller must set d0, d1 like so:
+	!	mov.l (xh, sp), d0
+	!	mov.l (yh, sp), d1
+	tst.l	d1
+	bne	1f			! branch if y >= 2**32
+
+	! y = yl, so x / y = xh:xl / yl = qh:0 + rh:xl / yl
+	! where qh, rh are quotient, remainder from xh / yl.
+	move.l	(xl, sp), d1
+	move.l	(yl, sp), d2
+	clr.l	d3			! d3:d0 = xh
+	divu.l	d2, d3:d0		! d0 = 0:xh / yl, d3 = rh
+	divu.l	d2, d3:d1		! d1 = rh:xl / yl, so d0:d1 = x / y
+	clr.l	d2			! remainder in d2:d3
+	rts
+
+1:	! Here y >= 2**32.
+	move.l	d0, a0			! save xh
+	move.l	d1, a1			! save yh
+	move.l	d7, a2			! save caller's d7
+
+	! Find y >> right in [2**31, 2**32).
+	move.l	(yl, sp), d2
+	bfffo	d1[0:32], d3		! find highest set bit in yh
+	lsl.l	d3, d1			! shift yh left
+	bset	#5, d3
+	neg.l	d3			! right = (32 - left) modulo 64
+	lsr.l	d3, d2			! shift yl right
+	or.l	d1, d2			! d2 = y >> right
+
+	! Estimate x / y as q = (x / (y >> right)) >> right.
+	move.l	(xl, sp), d1
+	clr.l	d7
+	divu.l	d2, d7:d0
+	divu.l	d2, d7:d1		! d0:d1 = x / (y >> right)
+	lsr.l	d3, d1
+	bset	#5, d3
+	neg.l	d3
+	lsl.l	d3, d0
+	or.l	d0, d1			! d1 = q
+
+	! Calculate the remainder x - y * q.  If the subtraction
+	! overflows, then the correct quotient is q - 1, else it is q.
+	move.l	a1, d3			! yh
+	mulu.l	d1, d3			! yh * q
+	move.l	(yl, sp), d7
+	mulu.l	d1, d0:d7		! yl * q
+	add.l	d3, d0			! d0:d7 = y * q
+	move.l	(xl, sp), d3
+	move.l	a0, d2			! d2:d3 = x
+	sub.l	d7, d3
+	subx.l	d0, d2			! d2:d3 = x - y * q
+	bcc	1f			! branch unless subtraction overflowed
+	sub.l	#1, d1			! fix quotient
+	move.l	a1, d7			! yh
+	add.l	(yl, sp), d3
+	addx.l	d7, d2			! fix remainder
+1:	clr.l	d0			! d0:d1 = quotient
+	move.l	a2, d7			! restore caller's d7
+	rts
diff --git a/mach/m68020/libem/dvi8.s b/mach/m68020/libem/dvi8.s
new file mode 100644
index 000000000..03fc3e985
--- /dev/null
+++ b/mach/m68020/libem/dvi8.s
@@ -0,0 +1,34 @@
+.define .dvi8
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+
+yh=8
+yl=12
+xh=16
+xl=20
+	! .dvi8 yields d0:d1 = quotient from x / y
+
+.sect .text
+.dvi8:
+	move.l	d3, -(sp)	! preserve caller's d3
+	clr.l	d2		! d2 = 0, non-negative result
+	move.l	(xh, sp), d0	! d0 for .divrem8
+	bge	1f
+	move.l	#1, d2		! d2 = 1, negative result
+	neg.l	(xl, sp)
+	negx.l	d0		! x = absolute value
+1:	move.l	(yh, sp), d1	! d1 for .divrem8
+	bge	1f
+	bchg	#0, d2		! flip sign of result
+	neg.l	(yl, sp)
+	negx.l	d1		! y = absolute value
+1:	move.l	d2, -(sp)
+	jsr	(.divrem8)
+	move.l	(sp)+, d2
+	beq	1f		! branch unless result < 0
+	neg.l	d1
+	negx.l	d0		! negate quotient d0:d1
+1:	move.l	(sp)+, d3
+	rtd	#16
diff --git a/mach/m68020/libem/dvu8.s b/mach/m68020/libem/dvu8.s
new file mode 100644
index 000000000..00ec6b552
--- /dev/null
+++ b/mach/m68020/libem/dvu8.s
@@ -0,0 +1,20 @@
+.define .dvu8
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+
+yh=8
+xh=16
+	! .dvu8 yields d0:d1 = quotient from x / y
+
+.sect .text
+.dvu8:
+	move.l	d3, -(sp)	! preserve caller's d3
+	move.l	(xh, sp), d0
+	move.l	(yh, sp), d1
+	sub.l	#4, sp
+	jsr	(.divrem8)
+	add.l	#4, sp
+	move.l	(sp)+, d3
+	rtd	#16
diff --git a/mach/m68020/libem/rmi8.s b/mach/m68020/libem/rmi8.s
new file mode 100644
index 000000000..ffb672b2c
--- /dev/null
+++ b/mach/m68020/libem/rmi8.s
@@ -0,0 +1,35 @@
+.define .rmi8
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+
+yh=8
+yl=12
+xh=16
+xl=20
+	! .rmi8 yields d0:d1 = remainder from x / y
+
+.sect .text
+.rmi8:
+	move.l	d3, -(sp)	! preserve caller's d3
+	clr.l	d2		! d2 = 0, non-negative result
+	move.l	(xh, sp), d0	! d0 for .divrem8
+	bge	1f
+	move.l	#1, d2		! d2 = 1, negative result
+	neg.l	(xl, sp)
+	negx.l	d0		! x = absolute value
+1:	move.l	(yh, sp), d1	! d1 for .divrem8
+	bge	1f
+	neg.l	(yl, sp)
+	negx.l	d1		! y = absolute value
+1:	move.l	d2, -(sp)
+	jsr	(.divrem8)
+	move.l	(sp)+, d0
+	beq	1f		! branch unless result < 0
+	neg.l	d3
+	negx.l	d2		! negate remainder d2:d3
+1:	move.l	d3, d1
+	move.l	d2, d0
+	move.l	(sp)+, d3
+	rtd	#16
diff --git a/mach/m68020/libem/rmu8.s b/mach/m68020/libem/rmu8.s
new file mode 100644
index 000000000..823a2778c
--- /dev/null
+++ b/mach/m68020/libem/rmu8.s
@@ -0,0 +1,22 @@
+.define .rmu8
+.sect .text
+.sect .rom
+.sect .data
+.sect .bss
+
+yh=8
+xh=16
+	! .rmu8 yields d0:d1 = remainder from x / y
+
+.sect .text
+.rmu8:
+	move.l	d3, -(sp)	! preserve caller's d3
+	move.l	(xh, sp), d0
+	move.l	(yh, sp), d1
+	sub.l	#4, sp
+	jsr	(.divrem8)
+	add.l	#4, sp
+	move.l	d3, d1
+	move.l	d2, d0
+	move.l	(sp)+, d3
+	rtd	#16
diff --git a/mach/m68020/ncg/table b/mach/m68020/ncg/table
index 9aede9929..fe1099078 100644
--- a/mach/m68020/ncg/table
+++ b/mach/m68020/ncg/table
@@ -612,6 +612,7 @@ add_l "add.l" conreg4:ro, alterable4:rw:cc cost(2,6).
 add_w "add.w" any2:ro, D_REG+LOCAL:rw:cc cost(2,3).
 add_w "add.w" conreg2:ro, alterable2:rw:cc cost(2,6).
 #endif
+addx_l "addx.l" D_REG4:ro, D_REG4:rw kills :cc cost(2,3).
 and_l "and.l" data4:ro, D_REG4:rw:cc cost(2,3).
 and_l "and.l" D_REG4:ro, memalt4:rw:cc cost(2,6).
 and_l "and.l" consts4:ro, datalt4:rw:cc cost(2,6).
@@ -628,6 +629,7 @@ asr "asr #1," memalt2:rw:cc cost(2,4).
 asl_w "asl.w" shconreg:ro, D_REG:rw:cc cost(2,5).
 asr_w "asr.w" shconreg:ro, D_REG:rw:cc cost(2,4).
 #endif
+bchg const:ro, D_REG:rw kills:cc cost(2,4).
 bclr const:ro, D_REG:rw kills:cc cost(2,4).
 bra label cost(2,5).
 bcc label cost(2,5).
@@ -671,14 +673,15 @@ eor_l "eor.l" conreg4:ro, datalt4:rw:cc cost(2,6).
 #if WORD_SIZE==2
 eor_w "eor.w" conreg2:ro, datalt2:rw:cc cost(2,4).
 #endif
+exg genreg4:rw, genreg4:rw cost(2,3).
 /* in the next two instructions: LOCAL only allowed if register var */
 ext_l "ext.l" D_REG+LOCAL+D_REG4:rw:cc cost(2,2).
 ext_w "ext.w" D_REG+LOCAL+D_REG4:rw:cc cost(2,2).
 jmp address+control4 cost(2,0).
 jsr address+control4 kills :cc d0 d1 d2 a0 a1 cost(2,3).
 lea address+control4:ro, A_REG+areg:wo cost(2,0).
+lsl_l "lsl.l" shconreg:ro, D_REG4:rw:cc cost(2,4).
 /*
-lsl_l "lsl.l" shconreg:ro, D_REG:rw:cc cost(2,4).
 lsl "lsl #1," memalt2:rw:cc cost(2,4).
 */
 lsr_l "lsr.l" shconreg:ro, D_REG4:rw:cc cost(2,4).
@@ -709,6 +712,8 @@ neg_l "neg.l" memory4:rw:cc cost(2,6).
 #if WORD_SIZE==2
 neg_w "neg.w" memory2:rw:cc cost(2,6).
 #endif
+negx_l "negx.l" D_REG4:rw:cc cost(2,3).
+negx_l "negx.l" memory4:rw:cc cost(2,6).
 not_l "not.l" D_REG4:rw:cc cost(2,3).
 not_l "not.l" memory4:rw:cc cost(2,6).
 #if WORD_SIZE==2
@@ -733,6 +738,7 @@ ror_w "ror.w" shconreg:ro, D_REG:rw:cc cost(2,4).
 #endif
 roxl "roxl #1," memalt2:rw:cc cost(2,4).
 roxr "roxr #1," memalt2:rw:cc cost(2,4).
+slt datalt1:rw cost(2,3).
 sne datalt1:rw cost(2,3).
 sub_l "sub.l" any4:ro, D_REG4:rw:cc cost(2,3).
 sub_l "sub.l" any4+areg:ro, A_REG+areg:rw cost(2,3).
@@ -740,6 +746,9 @@ sub_l "sub.l" conreg4:ro, alterable4:rw:cc cost(2,6).
 #if WORD_SIZE==2
 sub_w "sub.w" any2:ro, D_REG+LOCAL:rw:cc cost(2,3).
 sub_w "sub.w" conreg2:ro, alterable2:rw:cc cost(2,6).
+#endif
+subx_l "subx.l" D_REG4:ro, D_REG4:rw kills :cc cost(2,3).
+#if WORD_SIZE==2
 /* On a swap, we only want the lower part of D_REG, so don't set cc */
 swap D_REG:rw kills :cc cost(2,2).
 #endif
@@ -773,6 +782,7 @@ divs_l "divs.l" data4:ro, D_REG4:rw:cc cost(2,90).
 divu_l "divu.l" data4:ro, D_REG4:rw:cc cost(2,78).
 divsl_l "divsl.l" data4:ro, DREG_pair:rw kills :cc cost(2,90).
 divul_l "divul.l" data4:ro, DREG_pair:rw kills :cc cost(2,78).
+mulu_l "mulu.l" data4:ro, DREG_pair:rw kills :cc cost(2,44).
 pea address+control4+regX cost(2,4).
 #if WORD_SIZE==2
 cmp2_w "cmp2.w" address+control2:ro, genreg2:ro kills :cc cost(2,18).
@@ -3796,6 +3806,18 @@ with exact any4 STACK
 	gen	add_l {post_inc4, sp}, %a
 		yields %a
 
+pat adi $1==8
+with exact any4 any4 DD_REG4 DD_REG4
+	uses reusing %1, DD_REG4 = %1
+	gen	add_l %2, %4
+		addx_l %a, %3		yields %4 %3
+with DD_REG4 DD_REG4 D_REG4 any4
+	gen	add_l %4, %2
+		addx_l %3, %1		yields %2 %1
+with DD_REG4 DD_REG4 D_REG4 STACK
+	gen	add_l {post_inc4, sp}, %2
+		addx_l %3, %1		yields %2 %1
+
 #if WORD_SIZE==2
 pat sbi $1==2
 with any2-bconst DD_REG
@@ -3822,6 +3844,12 @@ with exact any4 STACK
 with any4-bconst4 AA_REG
 	gen	sub_l %1, %2	yields %2
 
+pat sbi $1==8
+with D_REG4 any4-D_REG4 DD_REG4 DD_REG4
+	/* only 3 of DD_REG4; may unstack %2 into AA_REG */
+	gen	sub_l %2, %4
+		subx_l %1, %3		yields %4 %3
+
 #if WORD_SIZE==2
 pat loc loc cii ldc mli $1==2 && $2==4 && highw($4)==0 && loww($4)>0 && $5==4
 with any2-pre_post
@@ -3847,6 +3875,34 @@ with STACK
 		yields dl1
 #endif
 
+#ifdef TBL68020
+pat mli $1==8
+with exact data4 data4 DD_REG4 DD_REG4	/* yh yl xh xl */
+	uses DD_REG4 = %4
+	gen	mulu_l %1, %a				/* xl * yh */
+		mulu_l %2, %3				/* xh * yl */
+		add_l %3, %a
+		mulu_l %2, {DREG_pair, %3, %4}		/* xl * yl */
+		add_l %a, %3
+		yields %4 %3
+with DD_REG4 DD_REG4 data4 data4	/* yh yl xh xl */
+	uses DD_REG = %2
+	gen	mulu_l %3, %a				/* yl * xh */
+		mulu_l %4, %1				/* yh * xl */
+		add_l %1, %a
+		mulu_l %4, {DREG_pair, %1, %2}		/* yl * xl */
+		add_l %a, %1
+		yields %2 %1
+with DD_REG4 DD_REG4 STACK		/* yh yl xh xl */
+	uses DD_REG4 = %2
+	gen	mulu_l {post_inc4, sp}, %a		/* yl * xh */
+		mulu_l {indirect4, sp}, %1		/* yh * xl */
+		add_l %1, %a
+		mulu_l {post_inc4, sp}, {DREG_pair, %1, %2}	/* yl * xl */
+		add_l %a, %1
+		yields %2 %1
+#endif /* TBL68020 */
+
 #if WORD_SIZE==2
 pat dvi $1==2
 with data2-sconsts DD_REG
@@ -3866,6 +3922,14 @@ with STACK
 		yields dl1
 #endif /* TBL68020 */
 
+#ifdef TBL68020
+pat dvi $1==8
+with STACK
+	kills ALL
+	gen	jsr {absolute4, ".dvi8"}
+		yields dl1 dl0
+#endif /* TBL68020 */
+
 #if WORD_SIZE==2
 pat rmi $1==2
 with data2-sconsts DD_REG
@@ -3891,6 +3955,14 @@ with STACK
 		yields dl2
 #endif /* TBL68020 */
 
+#ifdef TBL68020
+pat rmi $1==8
+with STACK
+	kills ALL
+	gen	jsr {absolute4, ".rmi8"}
+		yields dl1 dl0
+#endif /* TBL68020 */
+
 #if WORD_SIZE==2
 pat ngi $1==2
 with DD_REG
@@ -3901,6 +3973,11 @@ pat ngi $1==4
 with DD_REG4
 	gen	neg_l %1	yields %1
 
+pat ngi $1==8
+with DD_REG4 DD_REG4
+	gen	neg_l %2
+		negx_l %1	yields %2 %1
+
 #if WORD_SIZE==2
 pat sli $1==2
 with shconreg DD_REG
@@ -3911,6 +3988,43 @@ pat sli $1==4
 with shconreg DD_REG4
	gen	asl_l %1, %2	yields %2
 
+pat sli $1==8
+with DD_REG4 DD_REG4 DD_REG4
+	uses AA_REG = %3		/* no 4th DD_REG */
+	gen	lsl_l %1, %3
+		lsl_l %1, %2		/* shift by %1 modulo 64 */
+		bchg {const, 5}, %1
+		bne {slabel, 1f}	/* jump if shift >= 32 */
+		neg_l %1
+		exg %a, %3
+		lsr_l %1, %3		/* (32 - shift) modulo 64 */
+		or_l %3, %2		/* shift bits from %3 to %2 */
+		move %a, %3
+		bra {slabel, 2f}
+		1:
+		move %a, %2
+		lsl_l %1, %2		/* (shift - 32) modulo 64 */
+		2:	yields %3 %2
+
+pat loc sli ($1&32)==0 && $2==8
+with DD_REG4 DD_REG4
+	uses AA_REG = %2, DD_REG = {bconst, $1&31}
+	gen	lsl_l %b, %2
+		lsl_l %b, %1
+		bset {const, 5}, %b
+		neg_l %b
+		exg %a, %2
+		lsr_l %b, %2
+		or_l %2, %1
+		move %a, %2
+		yields %2 %1
+pat loc sli ($1&63)==32 && $2==8
+with any4 any4		yields {zero_const, 0} %2
+pat loc sli ($1&32)!=0 && $2==8
+with any4 DD_REG4
+	uses reusing %1, DD_REG = {bconst, $1&31}
+	gen	lsl_l %a, %2	yields {zero_const, 0} %2
+
 #if WORD_SIZE==2
 pat sri $1==2
 with shconreg DD_REG
@@ -3921,6 +4035,43 @@ pat sri $1==4
 with shconreg DD_REG4
 	gen	asr_l %1, %2	yields %2
 
+pat sri $1==8
+with DD_REG4 DD_REG4 DD_REG4
+	uses AA_REG = %2		/* no 4th DD_REG */
+	gen	asr_l %1, %2
+		lsr_l %1, %3		/* shift by %1 modulo 64 */
+		bchg {const, 5}, %1
+		bne {slabel, 1f}	/* jump if shift >= 32 */
+		neg_l %1
+		exg %a, %2
+		lsl_l %1, %2		/* (32 - shift) modulo 64 */
+		or_l %2, %3		/* shift bits from %2 to %3 */
+		move %a, %2
+		bra {slabel, 2f}
+		1:
+		move %a, %3
+		asr_l %1, %3		/* (shift - 32) modulo 64 */
+		2:	yields %3 %2
+
+pat loc sri ($1&32)==0 && $2==8
+with DD_REG4 DD_REG4
+	uses AA_REG = %1, DD_REG = {bconst, $1&31}
+	gen	asr_l %b, %1
+		lsr_l %b, %2
+		bset {const, 5}, %b
+		neg_l %b
+		exg %a, %1
+		lsl_l %b, %1
+		or_l %1, %2
+		move %a, %1
+		yields %2 %1
+pat loc sri ($1&63)==32 && $2==8
+with DD_REG4 any4	yields %1	leaving loc 4 loc 8 cii
+pat loc sri ($1&32)!=0 && $2==8
+with DD_REG4 any4
+	uses reusing %2, DD_REG = {bconst, $1&31}
+	gen	asr_l %a, %1	yields %1	leaving loc 4 loc 8 cii
+
 /************************************************
  * Group 4: unsigned arithmetic.                *
  ************************************************/
@@ -3947,6 +4098,8 @@ with STACK
 		yields dl1
 #endif /* TBL68020 */
 
+pat mlu $1==8	leaving mli 8
+
 #if WORD_SIZE==2
 pat dvu $1==2
 with data2-sconsts data2
@@ -3966,6 +4119,14 @@ with STACK
 		yields dl1
 #endif /* TBL68020 */
 
+#ifdef TBL68020
+pat dvu $1==8
+with STACK
+	kills ALL
+	gen	jsr {absolute4, ".dvu8"}
+		yields dl1 dl0
+#endif /* TBL68020 */
+
 #if WORD_SIZE==2
 pat rmu $1==2
 with data2-sconsts data2
@@ -3992,8 +4153,18 @@ with STACK
 		yields dl2
 #endif /* TBL68020 */
 
+#ifdef TBL68020
+pat rmu $1==8
+with STACK
+	kills ALL
+	gen	jsr {absolute4, ".rmu8"}
+		yields dl1 dl0
+#endif /* TBL68020 */
+
 pat slu	leaving sli $1
 
+pat loc slu $2==8	leaving loc $1 sli 8
+
 #if WORD_SIZE==2
 pat sru $1==2
 with shconreg DD_REG
@@ -4004,6 +4175,43 @@ pat sru $1==4
 with shconreg DD_REG4
 	gen	lsr_l %1, %2	yields %2
 
+pat sru $1==8
+with DD_REG4 DD_REG4 DD_REG4
+	uses AA_REG = %2		/* no 4th DD_REG */
+	gen	lsr_l %1, %2
+		lsr_l %1, %3		/* shift by %1 modulo 64 */
+		bchg {const, 5}, %1
+		bne {slabel, 1f}	/* jump if shift >= 32 */
+		neg_l %1
+		exg %a, %2
+		lsl_l %1, %2		/* (32 - shift) modulo 64 */
+		or_l %2, %3		/* shift bits from %2 to %3 */
+		move %a, %2
+		bra {slabel, 2f}
+		1:
+		move %a, %3
+		lsr_l %1, %3		/* (shift - 32) modulo 64 */
+		2:	yields %3 %2
+
+pat loc sru ($1&32)==0 && $2==8
+with DD_REG4 DD_REG4
+	uses AA_REG = %2, DD_REG = {bconst, $1&31}
+	gen	lsr_l %b, %1
+		lsr_l %b, %2
+		bset {const, 5}, %b
+		neg_l %b
+		exg %a, %1
+		lsl_l %b, %1
+		or_l %1, %2
+		move %a, %1
+		yields %2 %1
+pat loc sru ($1&63)==32 && $2==8
+with any4 any4		yields %1 {zero_const, 0}
+pat loc sru ($1&32)!=0 && $2==8
+with DD_REG4 any4
+	uses reusing %2, DD_REG = {bconst, $1&31}
+	gen	lsr_l %a, %1	yields %1 {zero_const, 0}
+
 /************************************************
  * Group 5: floating point arithmetic           *
 ************************************************/
@@ -4753,6 +4961,17 @@ with exact any_int STACK
 	uses reusing %1,DD_REG=%1
 	gen	xxx* {post_inc_int, sp}, %a	yields %a
 
+proc log8 example and
+with exact data4 data4 DD_REG4 DD_REG4
+	gen	xxx* %1, %3
+		xxx* %2, %4	yields %4 %3
+with DD_REG4 DD_REG4 data4 data4
+	gen	xxx* %3, %1
+		xxx* %4, %2	yields %2 %1
+with DD_REG4 DD_REG4 STACK
+	gen	xxx* {post_inc4, sp}, %1
+		xxx* {post_inc4, sp}, %2	yields %2 %1
+
 proc logdef example and
 with STACK
 	uses DD_REG4 = {const, $1/WORD_SIZE -1},
@@ -4813,6 +5032,7 @@ pat and $1==WORD_SIZE	call logw(AND_I)
 #if WORD_SIZE==2
 pat and $1==2*WORD_SIZE	call log2w("and.l")
 #endif
+pat and $1==8		call log8("and.l")
 pat and $1>4 && $1/WORD_SIZE<=65536	call logdef(AND_I)
 pat and defined($1)	call logbdef(AND_I)
 pat and !defined($1)	call logndef(AND_I)
@@ -4821,6 +5041,7 @@ pat ior $1==WORD_SIZE	call logw(OR_I)
 #if WORD_SIZE==2
 pat ior $1==2*WORD_SIZE	call log2w("or.l")
 #endif
+pat ior $1==8		call log8("or.l")
 pat ior $1>2 && $1/WORD_SIZE<=65536	call logdef(OR_I)
 pat ior defined($1)	call logbdef(OR_I)
 pat ior !defined($1)	call logndef(OR_I)
@@ -4835,6 +5056,21 @@ pat xor $1==4
 with DD_REG4 conreg4-bconst4
 	gen	eor_l %2, %1	yields %1
 
+pat xor $1==8
+with exact any4 any4 DD_REG4 DD_REG4
+	uses reusing %1, DD_REG4 = %1
+	gen	eor_l %a, %3
+		move %2, %a
+		eor_l %a, %4	yields %4 %3
+with DD_REG4 DD_REG4 D_REG4 any4
+	gen	eor_l %3, %1
+		move %4, %3
+		eor_l %3, %2	yields %2 %1
+with DD_REG4 DD_REG4 DD_REG4 STACK
+	gen	eor_l %3, %1
+		move_l {post_inc4, sp}, %3
+		eor_l %3, %2	yields %2 %1
+
 pat xor $1>4 && $1/WORD_SIZE<=65536	call logdef(EOR_I)
 pat xor defined($1)	call logbdef(EOR_I)
 pat xor !defined($1)	call logndef(EOR_I)
@@ -4907,6 +5143,50 @@ pat rol $1==4
 with shconreg DD_REG4
 	gen	rol_l %1, %2	yields %2
 
+pat rol $1==8
+with DD_REG4 DD_REG4 DD_REG4
+	uses AA_REG, AA_REG		/* no 4th DD_REG */
+	gen	bclr {const, 5}, %1
+		beq {slabel, 1f}
+		exg %2, %3		/* rotate left 32 */
+		1:
+		move %2, %a
+		move %3, %b
+		lsl_l %1, %2
+		lsl_l %1, %3
+		bset {const, 5}, %1
+		neg_l %1		/* (32 - shift) modulo 64 */
+		exg %a, %2
+		lsr_l %1, %2
+		or_l %2, %3		/* rotate bits from %2 to %3 */
+		move %a, %2
+		exg %b, %3
+		lsr_l %1, %3
+		or_l %3, %2		/* rotate bits from %3 to %2 */
+		move %b, %3
+		yields %3 %2
+
+pat loc rol ($1&32)==0 && $2==8
+with DD_REG4 DD_REG4
+	uses AA_REG, AA_REG, DD_REG = {bconst, $1&31}
+	gen	move %1, %a
+		move %2, %b
+		lsl_l %c, %1
+		lsl_l %c, %2
+		bset {const, 5}, %c
+		neg_l %c
+		exg %a, %1
+		lsr_l %c, %1
+		or_l %1, %2
+		move %a, %1
+		exg %b, %2
+		lsr_l %c, %2
+		or_l %2, %1
+		move %b, %2
+		yields %2 %1
+pat loc rol ($1&63)==32 && $2==8	leaving exg 4
+pat loc rol ($1&32)!=0 && $2==8		leaving loc (0-$1)&31 ror 8
+
 #if WORD_SIZE==2
 pat ror $1==2
 with shconreg DD_REG
@@ -4917,7 +5197,51 @@ pat ror $1==4
 with shconreg DD_REG4
 	gen	ror_l %1, %2	yields %2
-
+pat ror $1==8
+with DD_REG4 DD_REG4 DD_REG4
+	uses AA_REG, AA_REG		/* no 4th DD_REG */
+	gen	bclr {const, 5}, %1
+		beq {slabel, 1f}
+		exg %2, %3		/* rotate right 32 */
+		1:
+		move %2, %a
+		move %3, %b
+		lsr_l %1, %2
+		lsr_l %1, %3
+		bset {const, 5}, %1
+		neg_l %1		/* (32 - shift) modulo 64 */
+		exg %a, %2
+		lsl_l %1, %2
+		or_l %2, %3		/* rotate bits from %2 to %3 */
+		move %a, %2
+		exg %b, %3
+		lsl_l %1, %3
+		or_l %3, %2		/* rotate bits from %3 to %2 */
+		move %b, %3
+		yields %3 %2
+
+pat loc ror ($1&32)==0 && $2==8
+with DD_REG4 DD_REG4
+	uses AA_REG, AA_REG, DD_REG = {bconst, $1&31}
+	gen	move %1, %a
+		move %2, %b
+		lsr_l %c, %1
+		lsr_l %c, %2
+		bset {const, 5}, %c
+		neg_l %c
+		exg %a, %1
+		lsl_l %c, %1
+		or_l %1, %2
+		move %a, %1
+		exg %b, %2
+		lsl_l %c, %2
+		or_l %2, %1
+		move %b, %2
+		yields %2 %1
+pat loc ror ($1&63)==32 && $2==8	leaving exg 4
+pat loc ror ($1&32)!=0 && $2==8		leaving loc (0-$1)&31 rol 8
+
+
 /************************************************
@@ -6391,6 +6715,55 @@ pat cmu zge $1==WORD_SIZE	call cmuzxx("bcc","bls")
 pat cmu zgt $1==WORD_SIZE	call cmuzxx("bhi","bcs")
 
+proc cmx8txx example cmi tlt
+with exact DD_REG4 DD_REG4 any4 any4
+	uses reusing %3, DD_REG4 = %3
+	gen	sub_l %4, %2
+		subx_l %a, %1		/* keep overflow flag */
+		sxx[2] %1
+		neg_b %1	yields {dreg1, %1}
+with D_REG4 any4-D_REG4 DD_REG4 DD_REG4
+	/* only 3 of DD_REG4; may unstack %2 into AA_REG */
+	gen	sub_l %2, %4
+		subx_l %1, %3
+		sxx[1] %3
+		neg_b %3	yields {dreg1, %3}
+
+pat cmi tlt $1==8	call cmx8txx("slt","sgt")
+pat cmi tle $1==8	call cmx8txx("sle","sge")
+pat cmi tge $1==8	call cmx8txx("sge","sle")
+pat cmi tgt $1==8	call cmx8txx("sgt","slt")
+pat cms teq $1==8	call cmx8txx("seq","seq")
+pat cms tne $1==8	call cmx8txx("sne","sne")
+pat cmu tlt $1==8	call cmx8txx("scs","shi")
+pat cmu tle $1==8	call cmx8txx("sls","scc")
+pat cmu tge $1==8	call cmx8txx("scc","sls")
+pat cmu tgt $1==8	call cmx8txx("shi","scs")
+
+proc cmx8zxx example cmi zlt
+with exact DD_REG4 DD_REG4 any4 any4
+	kills ALL
+	uses reusing %3, DD_REG4 = %3
+	gen	sub_l %4, %2
+		subx_l %a, %1
+		bxx[2] {llabel, $2}
+with D_REG4 any4-D_REG4 DD_REG4 DD_REG4 STACK
+	gen	sub_l %2, %4
+		subx_l %1, %3
+		bxx[1] {llabel, $2}
+
+pat cmi zlt $1==8	call cmx8zxx("blt","bgt")
+pat cmi zle $1==8	call cmx8zxx("ble","bge")
+pat cmi zge $1==8	call cmx8zxx("bge","ble")
+pat cmi zgt $1==8	call cmx8zxx("bgt","blt")
+pat cms zeq $1==8	call cmx8zxx("beq","beq")
+pat cms zne $1==8	call cmx8zxx("bne","bne")
+pat cmu zlt $1==8	call cmx8zxx("bcs","bhi")
+pat cmu zle $1==8	call cmx8zxx("bls","bcc")
+pat cmu zge $1==8	call cmx8zxx("bcc","bls")
+pat cmu zgt $1==8	call cmx8zxx("bhi","bcs")
+
+
 #if TBL68881
 proc cmf4zxx example cmf zlt
 with FS_REG FS_REG
@@ -6630,6 +7003,33 @@ uses reusing %1,DD_REG4
 pat loc loc ciu $1==$2	/* skip this */
 pat loc loc cui $1==$2	/* skip this */
 
+pat loc loc cii $1==4 && $2==8
+with exact test_set1+test_set2
+	yields %1 {zero_const, 0}
+with test_set4
+	uses DD_REG4
+	gen	test %1
+		slt {dreg1, %a}
+#ifdef TBL68020
+		extb_l %a
+#else
+		ext_w %a
+		ext_l %a
+#endif
+		yields %1 %a
+
+pat loc loc cii $1<4 && $2==8
+	leaving loc $1 loc 4 cii loc 4 loc 8 cii
+
+pat loc loc ciu $1==4 && $2==8	yields {zero_const, 0}
+pat loc loc cui $1==4 && $2==8	yields {zero_const, 0}
+pat loc loc cuu $1==4 && $2==8	yields {zero_const, 0}
+
+pat loc loc cii $1==8 && $2==4	leaving asp 4
+pat loc loc ciu $1==8 && $2==4	leaving asp 4
+pat loc loc cui $1==8 && $2==4	leaving asp 4
+pat loc loc cuu $1==8 && $2==4	leaving asp 4
+
 /* The following rules should be handled by the peephole optimizer, I think */
diff --git a/plat/linux68k/descr b/plat/linux68k/descr
index a530fffdb..d813f61a8 100644
--- a/plat/linux68k/descr
+++ b/plat/linux68k/descr
@@ -10,6 +10,8 @@ var s=2
 var sa={s}
 var l={w}
 var la={w}
+var q=8
+var qa=4
 var f={w}
 var fa={w}
 var d=8
@@ -19,11 +21,12 @@ var xa={x}
 var ARCH=m68020
 var PLATFORM=linux68k
 var PLATFORMDIR={EM}/share/ack/{PLATFORM}
-var CPP_F=-D__unix -D__mc68020 -D__m68k -D__mc68000 -D__M68020
+var CPP_F=-D__unix -D__mc68020 -D__m68k -D__mc68000 -D__M68020 -D_EM_LLSIZE={q}
 var ALIGN=-a0:4 -a1:4 -a2:4 -a3:4 -b0:0x08000054
 var C_LIB={PLATFORMDIR}/libc-ansi.a
 # bitfields reversed for compatibility with (g)cc.
-var CC_ALIGN=-Vr
+# long long enabled.
+var CC_ALIGN=-Vrq{q}.{qa}
 var OLD_C_LIB={C_LIB}
 var MACHOPT_F=-ml10
 var EGO_PLAT_FLAGS=-M{EM}/share/ack/ego/{ARCH}.descr
diff --git a/plat/linux68k/tests/build.lua b/plat/linux68k/tests/build.lua
index 221abc8d6..37edfaada 100644
--- a/plat/linux68k/tests/build.lua
+++ b/plat/linux68k/tests/build.lua
@@ -6,6 +6,5 @@ plat_testsuite {
 	method = "plat/linux68k/emu+emu68k",
 	skipsets = {
 		"floats", -- FPU instructions not supported by emulator
-		"long-long",
 	},
 }