From f6a1e08218e2dea11505da7456ba0601c40c209d Mon Sep 17 00:00:00 2001
From: George Koehler <kernigh@gmail.com>
Date: Mon, 16 Sep 2019 20:19:36 -0400
Subject: [PATCH] Test long long division and remainder; fix i386.

My i386 code from 893df4b gave the wrong sign to some 8-byte
remainders.  Fix by splitting .dvi8 and .rmi8 so each has its own code
to pick the sign.  They and .dvu8 and .rmu8 share a private sub
.divrem8 for unsigned division.

Improve the i386 code by using instructions like _bsr_ and _shrd_.
Change the helpers to yield a quotient in ebx:eax or a remainder in
ecx:edx; this seems more convenient, because _div_ puts its quotient
in eax and remainder in edx.
---
 mach/i386/libem/build.lua         |   2 +-
 mach/i386/libem/divrem8.s         |  63 ++++++++++++++
 mach/i386/libem/dvi8.s            | 132 ++++++------------------------
 mach/i386/libem/dvu8.s            |  20 +++++
 mach/i386/libem/rmi8.s            |  36 ++++++++
 mach/i386/ncg/table               |   8 +-
 tests/plat/build.lua              |   2 +-
 tests/plat/long-long/lldivrem_e.c |  71 ++++++++++++++++
 8 files changed, 223 insertions(+), 111 deletions(-)
 create mode 100644 mach/i386/libem/divrem8.s
 create mode 100644 mach/i386/libem/dvu8.s
 create mode 100644 mach/i386/libem/rmi8.s
 create mode 100644 tests/plat/long-long/lldivrem_e.c

diff --git a/mach/i386/libem/build.lua b/mach/i386/libem/build.lua
index b92254d96..37e05a45c 100644
--- a/mach/i386/libem/build.lua
+++ b/mach/i386/libem/build.lua
@@ -1,7 +1,7 @@
 for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
-		srcs = { "./*.s" }, -- dvi8.s
+		srcs = { "./*.s" }, -- divrem8.s
 		vars = { plat = plat },
 	}
 end
diff --git a/mach/i386/libem/divrem8.s b/mach/i386/libem/divrem8.s
new file mode 100644
index 000000000..d3334d72d
--- /dev/null
+++ b/mach/i386/libem/divrem8.s
@@ -0,0 +1,79 @@
+.sect .text; .sect .rom; .sect .data; .sect .bss
+.sect .text
+.define .divrem8
+
+yl=12
+yh=16
+xl=20
+xh=24
+	! This private sub for .dvi8, .dvu8, .rmi8, .rmu8
+	! does unsigned division of x = xh:xl by y = yh:yl,
+	! yields ebx:eax = quotient, ecx:edx = remainder.
+	! The offsets assume that the caller pushed one extra word
+	! before its call; the operands stay on the stack at return.
+
+.divrem8:
+	! Caller must set eax, edx, flag z like so:
+	! mov	edx,yh(esp)
+	! test	edx,edx
+	! mov	eax,xh(esp)
+	jnz	1f		! jump if y >= 2**32
+
+	! y = yl, so x / y = xh:xl / yl = qh:0 + (rh:xl) / yl
+	! where qh, rh are quotient, remainder from xh / yl.
+	mov	ecx,yl(esp)
+	xor	edx,edx		! edx:eax = xh
+	div	ecx		! eax = qh, edx = rh
+	mov	ebx,eax
+	mov	eax,xl(esp)	! edx:eax = rh:xl
+	div	ecx		! ebx:eax = qh:ql = quotient
+	xor	ecx,ecx		! ecx:edx =  0:rl = remainder
+	ret
+
+1:	! Here y >= 2**32.  Let s = 1 + index of highest set bit of
+	! yh, so s is in 1..32 and y >> s is in [2**31, 2**32).
+	! Shift by 1, then by cl = s - 1, because a shift count of
+	! 32 gets masked to 0 and would not shift at all.
+	mov	ebx,yl(esp)	! edx:ebx = y
+	bsr	ecx,edx		! cl = highest set bit of yh, 0..31
+	shr	edx,1
+	rcr	ebx,1		! edx:ebx = y >> 1
+	shrd	ebx,edx,cl	! ebx = y >> s
+
+	! Estimate x / y as q = (x / (y >> s)) >> s.
+	xor	edx,edx		! edx:eax = xh
+	div	ebx		! eax = xh / (y >> s)
+	push	eax
+	mov	eax,xl+4(esp)	! push moved xl to xl+4
+	div	ebx
+	pop	edx		! edx:eax = x / (y >> s)
+	shr	edx,1
+	rcr	eax,1		! edx:eax = (x / (y >> s)) >> 1
+	shrd	eax,edx,cl	! eax = q
+
+	! Calculate the remainder x - y * q.  The estimate q is either
+	! correct or 1 too big.  It is too big when y * q > x: either
+	! y * q does not fit in 64 bits, or the subtraction borrows.
+	push	eax		! save q; x and y move up by 4
+	mul	yh+4(esp)	! edx:eax = yh * q
+	mov	ebx,eax		! ebx = low half of yh * q
+	mov	ecx,edx		! ecx = high half of yh * q
+	pop	eax
+	push	eax		! eax = q again, still saved
+	mul	yl+4(esp)	! edx:eax = yl * q
+	add	ebx,edx		! ebx:eax = low 64 bits of y * q
+	adc	ecx,0		! ecx = bits 64 and up of y * q
+	mov	edx,xl+4(esp)
+	sub	edx,eax
+	mov	eax,xh+4(esp)
+	sbb	eax,ebx		! eax:edx = x - y * q modulo 2**64
+	sbb	ebx,ebx		! ebx = -1 if subtraction borrowed
+	or	ebx,ecx		! flag z clear if q is too big
+	mov	ecx,eax		! ecx:edx = remainder
+	pop	eax		! eax = q
+	jz	1f		! jump if q is correct
+	dec	eax		! fix quotient
+	add	edx,yl(esp)
+	adc	ecx,yh(esp)	! fix remainder
+1:	xor	ebx,ebx		! ebx:eax = quotient
+	ret
diff --git a/mach/i386/libem/dvi8.s b/mach/i386/libem/dvi8.s
index 060f85cf1..986572525 100644
--- a/mach/i386/libem/dvi8.s
+++ b/mach/i386/libem/dvi8.s
@@ -1,115 +1,37 @@
 .sect .text; .sect .rom; .sect .data; .sect .bss
 .sect .text
-.define .dvi8, .dvu8
+.define .dvi8
 
-yl=8
-yh=12
-xl=16
-xh=20
-	! .dvi8 and .dvu8 divide x = xh:xl by y = yh:yl,
-	! yield edx:eax = quotient, ecx:ebx = remainder.
-
-.dvu8:
-	! Unsigned division: set di = 0 for non-negative quotient.
-	push	edi
-	xor	di,di
-	mov	eax,xh(esp)
-	mov	edx,yh(esp)
-	and	edx,edx
-	jmp	7f
+yl=4
+yh=8
+xl=12
+xh=16
+	! .dvi8 yields ebx:eax = signed quotient x / y, rounded toward 0
 
 .dvi8:
-	! Signed division: replace x and y with their absolute values.
-	! Set di = 1 for negative quotient, 0 for non-negative.
-	push	edi
-	xor	di,di		! di = 0
-	mov	eax,xh(esp)
-	and	eax,eax
-	jns	1f
-	inc	di		! di = 1
+	xorb	cl,cl		! cl = 0, non-negative result
+	mov	eax,xh(esp)	! eax for .divrem8
+	test	eax,eax
+	jge	1f		! jump unless x < 0
+	incb	cl		! cl = 1, negative result
 	neg	eax
 	neg	xl(esp)
-	sbb	eax,0		! eax:xl = absolute value of x
-1:	mov	edx,yh(esp)
-	and	edx,edx
-	jns	7f
-	xor	di,1		! flip di
+	sbb	eax,0		! borrow from negating xl
+	mov	xh(esp),eax	! x = absolute value
+1:	mov	edx,yh(esp)	! edx for .divrem8
+	test	edx,edx		! flag z for .divrem8 when y >= 0
+	jge	1f		! jump unless y < 0
+	xorb	cl,1		! flip sign of result
 	neg	edx
 	neg	yl(esp)
-	sbb	edx,0		! edx:yl = absolute value of y
-
-7:	! Here .dvu8 joins .dvi8, eax = xh, edx = yh, flags test edx,
-	! the values in xh(esp) and yh(esp) are garbage.
-	jnz	8f		! jump if y >= 2**32
-
-	! x / y = x / yl = xh / yl + xl / yl = qh + (xl + rh) / yl
-	! where qh and rh are quotient, remainder from xh / yl.
-	mov	ebx,yl(esp)
-	xor	edx,edx		! edx:eax = xh
-	div	ebx		! eax = qh, edx = rh
-	mov	ecx,eax
-	mov	eax,xl(esp)
-	div	ebx		! eax = ql, edx = remainder
-	mov	ebx,edx
-	mov	edx,ecx		! edx:eax = quotient qh:ql
-	xor	ecx,ecx		! ecx:ebx = remainder
-
-9:	! Finally, if di != 0 then negate quotient, remainder.
-	and	di,di
-	jz	1f
-	neg	edx
-	neg	eax
-	sbb	edx,0		! negate quotient edx:eax
-	neg	ecx
+	sbb	edx,0		! flag z for .divrem8 when y < 0
+	mov	yh(esp),edx	! y = absolute value
+1:	push	ecx		! save cl; .divrem8 expects this extra word
+	call	.divrem8	! ebx:eax = unsigned quotient
+	pop	ecx		! cl = sign of result
+	testb	cl,cl
+	jz	1f		! jump unless result < 0
 	neg	ebx
-	sbb	ecx,0		! negate remainder ecx:ebx
-1:	pop	edi		! caller's edi
-	ret	16
-
-8:	! We come here if y >= 2**32.
-	mov	xh(esp),eax
-	mov	yh(esp),edx
-	mov	ebx,yl(esp)	! edx:ebx = y
-
-	! Estimate x / y as q = (x / (y >> cl)) >> cl,
-	! where 2**31 <= (y >> cl) < 2**32.
-	xor	cx,cx
-1:	inc	cx
-	shr	edx,1
-	rcr	ebx,1		! edx:ebx = y >> cl
-	and	edx,edx
-	jnz	1b		! loop until y >> cl fits in ebx
-
-	! x / (y >> cl) = qh + (x + rh) / (y >> cl)
-	push	edi
-	xor	edx,edx		! edx:eax = xh
-	div	ebx		! eax = qh, edx = rh
-	mov	edi,eax
-	mov	eax,xl+4(esp)	! push edi moved xl to xl+4
-	div	ebx		! edi:eax = x / (y >> cl)
-
-	! q = (x / (y >> cl)) >> cl = esi:eax >> cl
-	shr	eax,cl
-	neg	cx		! cl = (32 - cl) modulo 32
-	shl	edi,cl
-	or	eax,edi		! eax = q
-
-	! Calculate the remainder x - q * y.  If the subtraction
-	! overflows, then the correct quotient is q - 1, else it is q.
-	mov	ecx,yh+4(esp)
-	imul	ecx,eax		! ecx = q * yh
-	mov	edi,eax
-	mul	yl+4(esp)	! edx:eax = q * yl
-	add	edx,ecx		! edx:eax = q * y
-	mov	ebx,xl+4(esp)
-	mov	ecx,xh+4(esp)	! ecx:ebx = x
-	sub	ebx,eax
-	sbb	ecx,edx		! ecx:ebx = remainder
-	jnc	1f
-	dec	edi		! fix quotient
-	add	ebx,yl+4(esp)
-	adc	ebx,yh+4(esp)	! fix remainder
-1:	mov	eax,edi
-	xor	edx,edx		! edx:eax = quotient
-	pop	edi		! negative flag
-	jmp	9b
+	neg	eax
+	sbb	ebx,0		! negate quotient ebx:eax
+1:	ret	16		! return, popping x and y
diff --git a/mach/i386/libem/dvu8.s b/mach/i386/libem/dvu8.s
new file mode 100644
index 000000000..8a1f00203
--- /dev/null
+++ b/mach/i386/libem/dvu8.s
@@ -0,0 +1,20 @@
+.sect .text; .sect .rom; .sect .data; .sect .bss
+.sect .text
+.define .dvu8, .rmu8
+
+yl=4
+yh=8
+xl=12
+xh=16
+	! Unsigned 8-byte division; one entry point serves both names:
+	! .dvu8 uses ebx:eax = x / y, .rmu8 uses ecx:edx = x % y.
+
+.rmu8:
+.dvu8:
+	mov	eax,xh(esp)	! eax = xh for .divrem8
+	mov	edx,yh(esp)	! edx = yh for .divrem8
+	test	edx,edx		! flag z for .divrem8
+	push	ebp		! extra word, so .divrem8 finds x and y
+	call	.divrem8
+	pop	ebp		! drop the extra word
+	ret	16		! return, popping x and y
diff --git a/mach/i386/libem/rmi8.s b/mach/i386/libem/rmi8.s
new file mode 100644
index 000000000..a52c282b6
--- /dev/null
+++ b/mach/i386/libem/rmi8.s
@@ -0,0 +1,36 @@
+.sect .text; .sect .rom; .sect .data; .sect .bss
+.sect .text
+.define .rmi8
+
+yl=4
+yh=8
+xl=12
+xh=16
+	! .rmi8 yields ecx:edx = remainder from x / y, with the sign of x
+
+.rmi8:
+	xorb	cl,cl		! cl = 0, non-negative result
+	mov	eax,xh(esp)	! eax for .divrem8
+	test	eax,eax
+	jge	1f		! jump unless x < 0
+	incb	cl		! cl = 1, negative result
+	neg	eax
+	neg	xl(esp)		! sets carry unless xl was 0
+	sbb	eax,0		! borrow from negating xl
+	mov	xh(esp),eax	! x = absolute value
+1:	mov	edx,yh(esp)	! edx for .divrem8
+	test	edx,edx		! flag z for .divrem8 when y >= 0
+	jge	1f		! jump unless y < 0
+	neg	edx		! sign of y does not change cl
+	neg	yl(esp)
+	sbb	edx,0		! flag z for .divrem8 when y < 0
+	mov	yh(esp),edx	! y = absolute value
+1:	push	ecx		! save cl; .divrem8 expects this extra word
+	call	.divrem8	! ecx:edx = unsigned remainder
+	pop	eax		! al = saved cl, as ecx holds the remainder
+	testb	al,al
+	jz	1f		! jump unless result < 0
+	neg	ecx
+	neg	edx
+	sbb	ecx,0		! negate remainder ecx:edx
+1:	ret	16		! return, popping x and y
diff --git a/mach/i386/ncg/table b/mach/i386/ncg/table
index 184662a15..789fa5c9a 100644
--- a/mach/i386/ncg/table
+++ b/mach/i386/ncg/table
@@ -1038,7 +1038,7 @@ with noacc ACC
 
 pat dvi $1==8
   kills ALL
-  gen proccall {label,".dvi8"}	yields edx eax
+  gen proccall {label,".dvi8"}	yields ebx eax
 
 /*
 pat dvi !defined($1)
@@ -1055,7 +1055,7 @@ with noacc ACC
 
 pat rmi $1==8
   kills ALL
-  gen proccall {label,".dvi8"}	yields ecx ebx
+  gen proccall {label,".rmi8"}	yields ecx edx
 
 /*
 pat rmi !defined($1)
@@ -1202,7 +1202,7 @@ gen div %1			yields eax
 
 pat dvu $1==8
   kills ALL
-  gen proccall {label,".dvu8"}	yields edx eax
+  gen proccall {label,".dvu8"}	yields ebx eax
 
 /*
 pat dvu !defined($1)
@@ -1218,7 +1218,7 @@ gen div %1			yields edx
 
 pat rmu $1==8
   kills ALL
-  gen proccall {label,".dvu8"}	yields ecx ebx
+  gen proccall {label,".rmu8"}	yields ecx edx
 
 /*
 pat rmu !defined($1)
diff --git a/tests/plat/build.lua b/tests/plat/build.lua
index 117e6a735..1613255be 100644
--- a/tests/plat/build.lua
+++ b/tests/plat/build.lua
@@ -4,7 +4,7 @@ definerule("plat_testsuite",
 	{
 		plat = { type="string" },
 		method = { type="string" },
-		-- added long-long/llbitset_e.c
+		-- added long-long/lldivrem_e.c
 		sets = { type="table", default={"core", "b", "bugs", "m2", "floats", "long-long"}},
 		skipsets = { type="table", default={}},
 		tests = { type="targets", default={} },
diff --git a/tests/plat/long-long/lldivrem_e.c b/tests/plat/long-long/lldivrem_e.c
new file mode 100644
index 000000000..7e75125a6
--- /dev/null
+++ b/tests/plat/long-long/lldivrem_e.c
@@ -0,0 +1,71 @@
+#include "test.h"
+
+/*
+ * Test division and remainder.  Failure code will look like
+ *  - 0x3d = id 0x3, 'd' for division
+ *  - 0x3e = id 0x3, 'e' for remainder
+ */
+
+struct s_divrem {
+	unsigned int id; /* shifted left 4 bits in the failure code */
+	long long a; /* dividend */
+	long long b; /* divisor */
+	long long a_div_b; /* expected a / b */
+	long long a_rem_b; /* expected a % b */
+} s_cases[] = {
+	{0x1,  310LL,  100LL,  3LL,  10LL},
+	{0x2,  310LL, -100LL, -3LL,  10LL},
+	{0x3, -310LL,  100LL, -3LL, -10LL},
+	{0x4, -310LL, -100LL,  3LL, -10LL},
+	{0x5,  3000000000000010LL,  100LL,  30000000000000LL,  10LL},
+	{0x6,  3000000000000010LL, -100LL, -30000000000000LL,  10LL},
+	{0x7, -3000000000000010LL,  100LL, -30000000000000LL, -10LL},
+	{0x8, -3000000000000010LL, -100LL,  30000000000000LL, -10LL},
+	{0x9,  3000000000000010LL,  1000000000000LL,  3000LL,  10LL},
+	{0xa,  3000000000000010LL, -1000000000000LL, -3000LL,  10LL},
+	{0xb, -3000000000000010LL,  1000000000000LL, -3000LL, -10LL},
+	{0xc, -3000000000000010LL, -1000000000000LL,  3000LL, -10LL},
+	/*
+	 * In next 3 cases, i386 tries (a / (b >> 13)) >> 13 = 8,
+	 * may need to correct the quotient from 8 to 7.
+	 */
+	{0x11, 0x864200000000LL, 0x10c840000000LL, 8LL, 0LL},
+	{0x12, 0x864200000000LL, 0x10c840000001LL, 7LL, 0x10c83ffffff9LL},
+	{0x13, 0x864200000000LL, 0x10c840001fffLL, 7LL, 0x10c83fff2007LL},
+};
+
+struct u_divrem {
+	unsigned int id; /* shifted left 4 bits in the failure code */
+	unsigned long long a; /* dividend */
+	unsigned long long b; /* divisor */
+	unsigned long long a_div_b; /* expected a / b */
+	unsigned long long a_rem_b; /* expected a % b */
+} u_cases[] = { /* NOTE(review): no case here has b >= 2**63 */
+	{0x81, 310ULL, 100ULL, 3ULL, 10ULL},
+	{0x82, 3000000000000010ULL, 100ULL, 30000000000000ULL, 10ULL},
+	{0x83, 3000000000000010ULL, 1000000000000ULL, 3000ULL, 10ULL},
+	{0x91, 0x8000000000000000ULL, 3ULL, 0x2aaaaaaaaaaaaaaaULL, 2ULL},
+	{0x92, 0xffffffffffffffffULL, 3ULL, 0x5555555555555555ULL, 0ULL},
+};
+
+#define LEN(ary) (sizeof(ary) / sizeof(ary[0]))
+
+/* Checks every case; a mismatch reports (id << 4) | 0xd or 0xe. */
+void _m_a_i_n(void) {
+	struct s_divrem *sc;
+	struct u_divrem *uc;
+
+	for (sc = s_cases; sc < s_cases + LEN(s_cases); sc++) {
+		if (sc->a / sc->b != sc->a_div_b)
+			fail((sc->id << 4) | 0xd);
+		if (sc->a % sc->b != sc->a_rem_b)
+			fail((sc->id << 4) | 0xe);
+	}
+	for (uc = u_cases; uc < u_cases + LEN(u_cases); uc++) {
+		if (uc->a / uc->b != uc->a_div_b)
+			fail((uc->id << 4) | 0xd);
+		if (uc->a % uc->b != uc->a_rem_b)
+			fail((uc->id << 4) | 0xe);
+	}
+	finished();
+}