From 893df4b79b91e90cddb289c722d0b9ebecfa252e Mon Sep 17 00:00:00 2001
From: George Koehler <kernigh@gmail.com>
Date: Tue, 20 Aug 2019 13:38:18 -0400
Subject: [PATCH] Experiment with 8-byte integers in ncg i386.

This provides adi, sbi, mli, dvi, rmi, ngi, dvu, rmu 8, but is missing
shifts and rotates.  It is also missing conversions between 8-byte
integers and other sizes of integers or floats.  The code might not be
all correct, but works at least some of the time.

I adapted this from how ncg i86 does 4-byte integers, but I use a
different algorithm when dividing by a large value: i86 avoids the div
instruction and uses a shift-and-subtract loop; but I use the div
instruction to estimate a quotient, which is more like how big integer
libraries do division.  My .dvi8 and .dvu8 also set ecx:ebx to the
remainder; this might be a bad idea, because it requires .dvi8 and
.dvu8 to always calculate the remainder, even when the caller only
wants the quotient.

To play with 8-byte integers, I wrote EM procedures like

     mes 2, 4, 4
     exp $ngi
     pro $ngi,0
     ldl 4
     ngi 8
     lol 0
     sti 8
     lol 0
     ret 4
     end
     exp $adi
     pro $adi,0
     ldl 4
     ldl 12
     adi 8
     lol 0
     sti 8
     lol 0
     ret 4
     end

and called them from C like

    typedef struct { int l; int h; } q;
    q ngi(q);
    q adi(q, q);
---
 mach/i386/libem/build.lua |   2 +-
 mach/i386/libem/dvi8.s    | 115 ++++++++++++++++++++++++++++++++++++++
 mach/i386/libem/mli8.s    |  20 +++++++
 mach/i386/ncg/table       |  41 +++++++++++++-
 4 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 mach/i386/libem/dvi8.s
 create mode 100644 mach/i386/libem/mli8.s

diff --git a/mach/i386/libem/build.lua b/mach/i386/libem/build.lua
index ca5a13c65..b92254d96 100644
--- a/mach/i386/libem/build.lua
+++ b/mach/i386/libem/build.lua
@@ -1,7 +1,7 @@
 for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
-		srcs = { "./*.s" },
+		srcs = { "./*.s" }, -- this glob should also match the new dvi8.s and mli8.s
 		vars = { plat = plat },
 	}
 end
diff --git a/mach/i386/libem/dvi8.s b/mach/i386/libem/dvi8.s
new file mode 100644
index 000000000..060f85cf1
--- /dev/null
+++ b/mach/i386/libem/dvi8.s
@@ -0,0 +1,137 @@
+.sect .text; .sect .rom; .sect .data; .sect .bss
+.sect .text
+.define .dvi8, .dvu8
+
+yl=8
+yh=12
+xl=16
+xh=20
+	! .dvi8 and .dvu8 divide x = xh:xl by y = yh:yl,
+	! yield edx:eax = quotient, ecx:ebx = remainder.
+	! x and y are on the stack; the offsets above skip the saved
+	! edi and the return address, and ret 16 pops both operands.
+	! For .dvi8 the quotient truncates toward zero and the
+	! remainder takes the sign of x, as with the idiv instruction.
+	!
+	! edi holds two flags until the end:
+	! bit 0 = negate the quotient, bit 1 = negate the remainder.
+
+.dvu8:
+	! Unsigned division: set edi = 0 to negate nothing.
+	push	edi
+	xor	edi,edi
+	mov	eax,xh(esp)
+	mov	edx,yh(esp)
+	and	edx,edx
+	jmp	7f
+
+.dvi8:
+	! Signed division: replace x and y with their absolute values.
+	! If x < 0, then negate both the quotient and the remainder.
+	! If y < 0, then flip the sign of the quotient only.
+	push	edi
+	xor	edi,edi		! edi = 0
+	mov	eax,xh(esp)
+	and	eax,eax
+	jns	1f
+	mov	edi,3		! x < 0: set both flags
+	neg	eax
+	neg	xl(esp)
+	sbb	eax,0		! eax:xl = absolute value of x
+1:	mov	edx,yh(esp)
+	and	edx,edx
+	jns	7f
+	xor	edi,1		! y < 0: flip the quotient flag
+	neg	edx
+	neg	yl(esp)
+	sbb	edx,0		! edx:yl = absolute value of y
+
+7:	! Here .dvu8 joins .dvi8, eax = xh, edx = yh, flags test edx,
+	! the values in xh(esp) and yh(esp) are garbage.
+	jnz	8f		! jump if y >= 2**32
+
+	! Here y = yl, so divide by yl in two steps: xh / yl gives
+	! qh and rh, then rh:xl / yl gives ql and the remainder.
+	mov	ebx,yl(esp)
+	xor	edx,edx		! edx:eax = xh
+	div	ebx		! eax = qh, edx = rh
+	mov	ecx,eax
+	mov	eax,xl(esp)
+	div	ebx		! eax = ql, edx = remainder
+	mov	ebx,edx
+	mov	edx,ecx		! edx:eax = quotient qh:ql
+	xor	ecx,ecx		! ecx:ebx = remainder
+
+9:	! Finally, negate the quotient and remainder as edi says.
+	shr	edi,1		! carry = bit 0, edi = bit 1
+	jnc	2f
+	neg	edx
+	neg	eax
+	sbb	edx,0		! negate quotient edx:eax
+2:	and	edi,edi
+	jz	1f
+	neg	ecx
+	neg	ebx
+	sbb	ecx,0		! negate remainder ecx:ebx
+1:	pop	edi		! caller's edi
+	ret	16
+
+8:	! We come here if y >= 2**32.
+	mov	xh(esp),eax
+	mov	yh(esp),edx
+	mov	ebx,yl(esp)	! edx:ebx = y
+
+	! Shift y right by n = cl + 1 bits, where n is the least
+	! shift with 2**31 <= (y >> n) < 2**32.  Keeping cl = n - 1
+	! in 0..31 matters because shr eax,cl uses only the low 5
+	! bits of cl, so it could not shift by n = 32.
+	mov	ecx,-1
+1:	inc	ecx
+	shr	edx,1
+	rcr	ebx,1		! edx:ebx = y >> (cl + 1)
+	and	edx,edx
+	jnz	1b		! loop until y >> n fits in ebx
+
+	! Divide x by y >> n in two steps, as for y < 2**32.
+	push	edi
+	xor	edx,edx		! edx:eax = xh
+	div	ebx		! eax = qh, edx = rh
+	mov	edi,eax
+	mov	eax,xl+4(esp)	! push edi moved xl to xl+4
+	div	ebx		! edi:eax = x / (y >> n)
+
+	! Estimate x / y as (x / (y >> n)) >> n.  After the first
+	! 1-bit shift, the value fits in eax, because y >> n >= 2**31.
+	shr	edi,1
+	rcr	eax,1		! eax = (x / (y >> n)) >> 1
+	shr	eax,cl		! eax = estimate
+
+	! The estimate is the quotient or 1 too high.  If it is too
+	! high, then estimate * y might overflow 64 bits, so multiply
+	! by q = estimate - 1 (or 0 if the estimate is 0) instead.
+	! Then q is the quotient or 1 too low, and q * y <= x.
+	sub	eax,1
+	adc	eax,0		! eax = q
+	mov	ecx,yh+4(esp)
+	imul	ecx,eax		! ecx = q * yh
+	mov	edi,eax
+	mul	yl+4(esp)	! edx:eax = q * yl
+	add	edx,ecx		! edx:eax = q * y
+	mov	ebx,xl+4(esp)
+	mov	ecx,xh+4(esp)	! ecx:ebx = x
+	sub	ebx,eax
+	sbb	ecx,edx		! ecx:ebx = x - q * y
+
+	! Try q + 1 by subtracting y again.  If this subtraction
+	! borrows, then the correct quotient is q, else it is q + 1.
+	inc	edi
+	sub	ebx,yl+4(esp)
+	sbb	ecx,yh+4(esp)	! ecx:ebx = x - (q + 1) * y
+	jnc	1f
+	dec	edi		! fix quotient
+	add	ebx,yl+4(esp)
+	adc	ecx,yh+4(esp)	! fix remainder
+1:	mov	eax,edi
+	xor	edx,edx		! edx:eax = quotient
+	pop	edi		! sign flags
+	jmp	9b
diff --git a/mach/i386/libem/mli8.s b/mach/i386/libem/mli8.s
new file mode 100644
index 000000000..c8b306ac0
--- /dev/null
+++ b/mach/i386/libem/mli8.s
@@ -0,0 +1,20 @@
+.sect .text; .sect .rom; .sect .data; .sect .bss
+.sect .text
+.define .mli8
+
+yl=4
+yh=8
+	! On entry edx:eax = x = xh:xl, and y = yh:yl is on the stack.
+	! On return edx:eax = low 64 bits of x * y; ret 8 pops y.
+
+.mli8:
+	! Modulo 2**64, x * y = xl * yl + ((xh * yl + xl * yh) << 32).
+	! Only the low words of the cross terms matter, and the
+	! term xh * yh is shifted out entirely.
+	imul	edx,yl(esp)	! edx = xh * yl
+	mov	ecx,yh(esp)
+	imul	ecx,eax		! ecx = yh * xl
+	add	ecx,edx		! ecx = sum of cross terms
+	mul	yl(esp)		! edx:eax = xl * yl
+	add	edx,ecx		! edx:eax = x * y
+	ret	8
diff --git a/mach/i386/ncg/table b/mach/i386/ncg/table
index b7efb3c12..42a39d15b 100644
--- a/mach/i386/ncg/table
+++ b/mach/i386/ncg/table
@@ -961,6 +961,14 @@ with EXACT rmorconst const
   uses reusing %1,REG=%1
   gen add %a,%2			yields %a
 
+pat adi $1==8			/* 8-byte add; %1 and %3 are low words, %2 and %4 are high words */
+with REG REG rmorconst rmorconst
+  gen add %1,%3			/* add the low words, setting carry */
+      adc %2,%4			yields %2 %1	/* add the high words plus carry */
+with rmorconst rmorconst REG REG
+  gen add %3,%1			/* same sum, accumulated into the other operand's registers */
+      adc %4,%2			yields %4 %3
+
 /*
 pat adi !defined($1)
 with CXREG ACC
@@ -969,13 +977,17 @@ with CXREG ACC
 */
 
 pat sbi $1==4
-
 with rmorconst REG
   gen sub %2,%1			yields %2
 with EXACT REG rmorconst
   gen sub %1,%2
       neg %1			yields %1
 
+pat sbi $1==8			/* 8-byte subtract; computes %4:%3 minus %2:%1 as high:low words */
+with rmorconst rmorconst REG REG
+  gen sub %3,%1			/* subtract the low words, setting borrow */
+      sbb %4,%2			yields %4 %3	/* subtract the high words and the borrow */
+
 /*
 pat sbi !defined($1)
 with CXREG ACC
@@ -995,6 +1007,11 @@ with rm const
   uses reusing %1,REG
   gen imul %a,%1,%2		yields %a
 
+pat mli $1==8			/* 8-byte multiply, done by the libem routine .mli8 */
+with ACC DXREG			/* one operand in edx:eax; .mli8 reads the other from the stack */
+  kills ALL
+  gen proccall {label,".mli8"}	yields edx eax	/* product returned in edx:eax */
+
 /*
 pat mli !defined($1)
 with ACC
@@ -1008,6 +1025,10 @@ with noacc ACC
   gen cdq.
       idiv %1			yields eax
 
+pat dvi $1==8			/* 8-byte signed divide, done by the libem routine .dvi8 */
+  kills ALL
+  gen proccall {label,".dvi8"}	yields edx eax	/* .dvi8 leaves the quotient in edx:eax */
+
 /*
 pat dvi !defined($1)
 with ACC
@@ -1021,6 +1042,10 @@ with noacc ACC
   gen cdq.
       idiv %1			yields edx
 
+pat rmi $1==8			/* 8-byte signed remainder; reuses .dvi8 */
+  kills ALL
+  gen proccall {label,".dvi8"}	yields ecx ebx	/* .dvi8 leaves the remainder in ecx:ebx */
+
 /*
 pat rmi !defined($1)
 with ACC
@@ -1032,6 +1057,12 @@ pat ngi $1==4
 with REG
   gen neg %1			yields %1
 
+pat ngi $1==8			/* 8-byte negate; %1 is the low word, %2 is the high word */
+with REG REG
+  gen neg %2
+      neg %1			/* sets carry when the low word was not zero */
+      sbb %2,{ANYCON,0}		yields %2 %1	/* take that borrow out of the high word */
+
 /*
 pat ngi !defined($1)
 with ACC
@@ -1114,6 +1145,10 @@ with noacc ACC
 uses DXREG={ANYCON,0}
 gen div %1			yields eax
 
+pat dvu $1==8			/* 8-byte unsigned divide, done by the libem routine .dvu8 */
+  kills ALL
+  gen proccall {label,".dvu8"}	yields edx eax	/* .dvu8 leaves the quotient in edx:eax */
+
 /*
 pat dvu !defined($1)
 with ACC STACK
@@ -1126,6 +1161,10 @@ with noacc ACC
 uses DXREG={ANYCON,0}
 gen div %1			yields edx
 
+pat rmu $1==8			/* 8-byte unsigned remainder; reuses .dvu8 */
+  kills ALL
+  gen proccall {label,".dvu8"}	yields ecx ebx	/* .dvu8 leaves the remainder in ecx:ebx */
+
 /*
 pat rmu !defined($1)
 with ACC STACK