From 893df4b79b91e90cddb289c722d0b9ebecfa252e Mon Sep 17 00:00:00 2001
From: George Koehler
Date: Tue, 20 Aug 2019 13:38:18 -0400
Subject: [PATCH] Experiment with 8-byte integers in ncg i386.

This provides adi, sbi, mli, dvi, rmi, ngi, dvu, rmu 8, but is missing
shifts and rotates. It is also missing conversions between 8-byte
integers and other sizes of integers or floats. The code might not be
all correct, but works at least some of the time.

I adapted this from how ncg i86 does 4-byte integers, but I use a
different algorithm when dividing by a large value: i86 avoids the div
instruction and uses a shift-and-subtract loop; but I use the div
instruction to estimate a quotient, which is more like how big integer
libraries do division.

My .dvi8 and .dvu8 also set ecx:ebx to the remainder; this might be a
bad idea, because it requires .dvi8 and .dvu8 to always calculate the
remainder, even when the caller only wants the quotient.

To play with 8-byte integers, I wrote EM procedures like

    mes 2, 4, 4
    exp $ngi
    pro $ngi,0
    ldl 4
    ngi 8
    lol 0
    sti 8
    lol 0
    ret 4
    end
    exp $adi
    pro $adi,0
    ldl 4
    ldl 12
    adi 8
    lol 0
    sti 8
    lol 0
    ret 4
    end

and called them from C like

    typedef struct { int l; int h; } q;
    q ngi(q);
    q adi(q, q);
---
 mach/i386/libem/build.lua |   2 +-
 mach/i386/libem/dvi8.s    | 132 ++++++++++++++++++++++++++++++++++++++
 mach/i386/libem/mli8.s    |  20 +++++++
 mach/i386/ncg/table       |  41 +++++++++++++-
 4 files changed, 193 insertions(+), 2 deletions(-)
 create mode 100644 mach/i386/libem/dvi8.s
 create mode 100644 mach/i386/libem/mli8.s

diff --git a/mach/i386/libem/build.lua b/mach/i386/libem/build.lua
index ca5a13c65..b92254d96 100644
--- a/mach/i386/libem/build.lua
+++ b/mach/i386/libem/build.lua
@@ -1,7 +1,7 @@
 for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
-		srcs = { "./*.s" },
+		srcs = { "./*.s" }, -- dvi8.s
 		vars = { plat = plat },
 	}
 end
diff --git a/mach/i386/libem/dvi8.s b/mach/i386/libem/dvi8.s
new file mode 100644
index 000000000..060f85cf1
--- /dev/null
+++ b/mach/i386/libem/dvi8.s
@@ -0,0 +1,132 @@
+.sect .text; .sect .rom; .sect .data; .sect .bss
+.sect .text
+.define .dvi8, .dvu8
+
+yl=8
+yh=12
+xl=16
+xh=20
+	! .dvi8 and .dvu8 divide x = xh:xl by y = yh:yl,
+	! yield edx:eax = quotient, ecx:ebx = remainder.
+	! The offsets above count from esp after "push edi".
+	! di keeps the signs: bit 0 set = negate the quotient,
+	! bit 1 set = negate the remainder.
+
+.dvu8:
+	! Unsigned division: di = 0, so nothing is negated.
+	push edi
+	xor di,di
+	mov eax,xh(esp)
+	mov edx,yh(esp)
+	and edx,edx		! set the flags that 7f expects
+	jmp 7f
+
+.dvi8:
+	! Signed division: replace x and y with their absolute values.
+	! The quotient is negative if the signs of x and y differ.
+	! The remainder takes the sign of x, as it does for idiv.
+	push edi
+	xor di,di		! di = 0
+	mov eax,xh(esp)
+	and eax,eax
+	jns 1f
+	xor di,3		! x < 0: negate quotient and remainder
+	neg eax
+	neg xl(esp)
+	sbb eax,0		! eax:xl = absolute value of x
+1:	mov edx,yh(esp)
+	and edx,edx
+	jns 7f
+	xor di,1		! y < 0: flip the sign of the quotient
+	neg edx
+	neg yl(esp)
+	sbb edx,0		! edx:yl = absolute value of y
+
+7:	! Here .dvu8 joins .dvi8, eax = xh, edx = yh, flags test edx,
+	! the values in xh(esp) and yh(esp) are garbage.
+	jnz 8f			! jump if y >= 2**32
+
+	! x / y = x / yl = xh / yl + xl / yl = qh + (xl + rh) / yl
+	! where qh and rh are quotient, remainder from xh / yl.
+	mov ebx,yl(esp)
+	xor edx,edx		! edx:eax = xh
+	div ebx			! eax = qh, edx = rh
+	mov ecx,eax
+	mov eax,xl(esp)
+	div ebx			! eax = ql, edx = remainder
+	mov ebx,edx
+	mov edx,ecx		! edx:eax = quotient qh:ql
+	xor ecx,ecx		! ecx:ebx = remainder
+
+9:	! Finally, negate the quotient if bit 0 of di is set,
+	! then the remainder if bit 1 of di is set.
+	shr di,1		! carry = bit 0
+	jnc 1f
+	neg edx
+	neg eax
+	sbb edx,0		! negate quotient edx:eax
+1:	shr di,1		! carry = former bit 1
+	jnc 2f
+	neg ecx
+	neg ebx
+	sbb ecx,0		! negate remainder ecx:ebx
+2:	pop edi			! caller's edi
+	ret 16
+
+8:	! We come here if y >= 2**32.
+	mov xh(esp),eax
+	mov yh(esp),edx
+	mov ebx,yl(esp)		! edx:ebx = y
+
+	! Estimate x / y as q = (x / (y >> cl)) >> cl,
+	! where 2**31 <= (y >> cl) < 2**32 and 1 <= cl <= 32.
+	xor cx,cx
+1:	inc cx
+	shr edx,1
+	rcr ebx,1		! edx:ebx = y >> cl
+	and edx,edx
+	jnz 1b			! loop until y >> cl fits in ebx
+
+	! x / (y >> cl) = qh + (xl + rh) / (y >> cl)
+	push edi
+	xor edx,edx		! edx:eax = xh
+	div ebx			! eax = qh, edx = rh
+	mov edi,eax
+	mov eax,xl+4(esp)	! push edi moved xl to xl+4
+	div ebx			! edi:eax = x / (y >> cl)
+
+	! q = (x / (y >> cl)) >> cl = edi:eax >> cl.  A shift by cl
+	! uses cl modulo 32, so shift eax in two steps to allow cl = 32.
+	dec cx			! cl = shift - 1, from 0 to 31
+	shr eax,cl
+	shr eax,1		! eax = low word >> shift
+	inc cx
+	neg cx			! cl = (32 - shift) modulo 32
+	shl edi,cl
+	or eax,edi		! eax = q
+
+	! q is the quotient or 1 too high.  Step down to q - 1 (but
+	! not below 0) so that q * y <= x fits in 64 bits, then step
+	! back up if the remainder x - q * y is not less than y.
+	sub eax,1
+	adc eax,0		! eax = q - 1, or 0 if q was 0
+	mov edi,eax
+	mov ecx,yh+4(esp)
+	imul ecx,eax		! ecx = q * yh
+	mul yl+4(esp)		! edx:eax = q * yl
+	add edx,ecx		! edx:eax = q * y
+	mov ebx,xl+4(esp)
+	mov ecx,xh+4(esp)	! ecx:ebx = x
+	sub ebx,eax
+	sbb ecx,edx		! ecx:ebx = x - q * y, less than 2 * y
+	inc edi			! guess that the quotient is q + 1
+	sub ebx,yl+4(esp)
+	sbb ecx,yh+4(esp)	! ecx:ebx = remainder for q + 1
+	jnc 1f			! no borrow: the guess was right
+	dec edi			! borrow: the quotient is q
+	add ebx,yl+4(esp)
+	adc ecx,yh+4(esp)	! restore the remainder for q
+1:	mov eax,edi
+	xor edx,edx		! edx:eax = quotient
+	pop edi			! sign flags
+	jmp 9b
diff --git a/mach/i386/libem/mli8.s b/mach/i386/libem/mli8.s
new file mode 100644
index 000000000..c8b306ac0
--- /dev/null
+++ b/mach/i386/libem/mli8.s
@@ -0,0 +1,20 @@
+.sect .text; .sect .rom; .sect .data; .sect .bss
+.sect .text
+.define .mli8
+
+yl=4
+yh=8
+	! xl in eax
+	! xh in edx
+
+.mli8:
+	! x * y = (xh + xl) * (yh + yl)
+	!       = xh * yh + xh * yl + xl * yh + xl * yl
+	! The term xh * yh overflows to zero.
+	mov ecx,eax
+	imul ecx,yh(esp)	! ecx = xl * yh
+	imul edx,yl(esp)	! edx = xh * yl
+	add ecx,edx
+	mul yl(esp)		! edx:eax = xl * yl
+	add edx,ecx		! edx:eax = x * y
+	ret 8
diff --git a/mach/i386/ncg/table b/mach/i386/ncg/table
index b7efb3c12..42a39d15b 100644
--- a/mach/i386/ncg/table
+++ b/mach/i386/ncg/table
@@ -961,6 +961,14 @@ with EXACT rmorconst const
 uses reusing %1,REG=%1
   gen add %a,%2			yields %a
 
+pat adi $1==8			/* %1, %3 = low words; %2, %4 = high words */
+with REG REG rmorconst rmorconst
+  gen add %1,%3
+      adc %2,%4			yields %2 %1
+with rmorconst rmorconst REG REG
+  gen add %3,%1
+      adc %4,%2			yields %4 %3
+
 /*
 pat adi !defined($1)
 with CXREG ACC
@@ -969,13 +977,17 @@ with CXREG ACC
 */
 
 pat sbi $1==4
-
 with rmorconst REG
   gen sub %2,%1			yields %2
 with EXACT REG rmorconst
   gen sub %1,%2
       neg %1			yields %1
 
+pat sbi $1==8			/* computes (%4:%3) - (%2:%1) */
+with rmorconst rmorconst REG REG
+  gen sub %3,%1
+      sbb %4,%2			yields %4 %3
+
 /*
 pat sbi !defined($1)
 with CXREG ACC
@@ -995,6 +1007,11 @@ with rm const
 uses reusing %1,REG
   gen imul %a,%1,%2		yields %a
 
+pat mli $1==8
+with ACC DXREG
+  kills ALL
+  gen proccall {label,".mli8"}	yields edx eax
+
 /*
 pat mli !defined($1)
 with ACC
@@ -1008,6 +1025,10 @@ with noacc ACC
   gen cdq.
       idiv %1			yields eax
 
+pat dvi $1==8			/* .dvi8 leaves the quotient in edx:eax */
+  kills ALL
+  gen proccall {label,".dvi8"}	yields edx eax
+
 /*
 pat dvi !defined($1)
 with ACC
@@ -1021,6 +1042,10 @@ with noacc ACC
   gen cdq.
       idiv %1			yields edx
 
+pat rmi $1==8			/* .dvi8 leaves the remainder in ecx:ebx */
+  kills ALL
+  gen proccall {label,".dvi8"}	yields ecx ebx
+
 /*
 pat rmi !defined($1)
 with ACC
@@ -1032,6 +1057,12 @@ pat ngi $1==4
 with REG
   gen neg %1			yields %1
 
+pat ngi $1==8			/* %1 = low word, %2 = high word */
+with REG REG
+  gen neg %2
+      neg %1
+      sbb %2,{ANYCON,0}		yields %2 %1
+
 /*
 pat ngi !defined($1)
 with ACC
@@ -1114,6 +1145,10 @@ with noacc ACC
 uses DXREG={ANYCON,0}
   gen div %1			yields eax
 
+pat dvu $1==8			/* .dvu8 leaves the quotient in edx:eax */
+  kills ALL
+  gen proccall {label,".dvu8"}	yields edx eax
+
 /*
 pat dvu !defined($1)
 with ACC STACK
@@ -1126,6 +1161,10 @@ with noacc ACC
 uses DXREG={ANYCON,0}
   gen div %1			yields edx
 
+pat rmu $1==8			/* .dvu8 leaves the remainder in ecx:ebx */
+  kills ALL
+  gen proccall {label,".dvu8"}	yields ecx ebx
+
 /*
 pat rmu !defined($1)
 with ACC STACK