From f6a1e08218e2dea11505da7456ba0601c40c209d Mon Sep 17 00:00:00 2001 From: George Koehler Date: Mon, 16 Sep 2019 20:19:36 -0400 Subject: [PATCH] Test long long division and remainder; fix i386. My i386 code from 893df4b gave the wrong sign to some 8-byte remainders. Fix by splitting .dvi8 and .rmi8 so each has its own code to pick the sign. They and .dvu8 and .rmu8 share a private sub .divrem8 for unsigned division. Improve the i386 code by using instructions like _bsr_ and _shrd_. Change the helpers to yield a quotient in ebx:eax or a remainder in ecx:edx; this seems more convenient, because _div_ puts its quotient in eax and remainder in edx. Also handle divisors >= 2**63 in .divrem8: there the shift count would be 32, but _shrd_ takes its count modulo 32, so skip the estimate and try a quotient of 1. --- mach/i386/libem/build.lua | 2 +- mach/i386/libem/divrem8.s | 71 ++++++++++++++ mach/i386/libem/dvi8.s | 132 ++++++------------------------ mach/i386/libem/dvu8.s | 20 +++++ mach/i386/libem/rmi8.s | 36 ++++++++ mach/i386/ncg/table | 8 +- tests/plat/build.lua | 2 +- tests/plat/long-long/lldivrem_e.c | 78 ++++++++++++++++ 8 files changed, 238 insertions(+), 111 deletions(-) create mode 100644 mach/i386/libem/divrem8.s create mode 100644 mach/i386/libem/dvu8.s create mode 100644 mach/i386/libem/rmi8.s create mode 100644 tests/plat/long-long/lldivrem_e.c diff --git a/mach/i386/libem/build.lua b/mach/i386/libem/build.lua index b92254d96..37e05a45c 100644 --- a/mach/i386/libem/build.lua +++ b/mach/i386/libem/build.lua @@ -1,7 +1,7 @@ for _, plat in ipairs(vars.plats) do acklibrary { name = "lib_"..plat, - srcs = { "./*.s" }, -- dvi8.s + srcs = { "./*.s" }, -- divrem8.s vars = { plat = plat }, } end diff --git a/mach/i386/libem/divrem8.s b/mach/i386/libem/divrem8.s new file mode 100644 index 000000000..d3334d72d --- /dev/null +++ b/mach/i386/libem/divrem8.s @@ -0,0 +1,71 @@ +.sect .text; .sect .rom; .sect .data; .sect .bss +.sect .text +.define .divrem8 + +yl=12 +yh=16 +xl=20 +xh=24 + ! This private sub for .dvi8, .dvu8, .rmi8, .rmu8 + ! does unsigned division of x = xh:xl by y = yh:yl, + ! 
yields ebx:eax = quotient, ecx:edx = remainder. + +.divrem8: + ! Caller must set eax, edx, flag z like so: + ! mov edx,yh(esp) + ! test edx,edx + ! mov eax,xh(esp) + jnz 1f ! jump if y >= 2**32 + + ! y = yl, so x / y = xh:xl / yl = qh:0 + rh:xl / yl + ! where qh, rh are quotient, remainder from xh / yl. + mov ecx,yl(esp) + xor edx,edx ! edx:eax = xh + div ecx ! eax = qh, edx = rh + mov ebx,eax + mov eax,xl(esp) ! edx:eax = rh:xl + div ecx ! ebx:eax = qh:ql = quotient + xor ecx,ecx ! ecx:edx = 0:rl = remainder + ret + +1: ! Here y >= 2**32. If y >= 2**63, then x / y is 0 or 1, and + ! y >> cl would need cl = 32, but shrd takes cl modulo 32. + ! So skip the estimate and try q = 1. + test edx,edx + jns 2f ! jump unless y >= 2**63 + mov eax,1 ! eax = q + jmp 3f + +2: ! Find y >> cl in [2**31, 2**32), where 1 <= cl <= 31. + mov ebx,yl(esp) ! edx:ebx = y + bsr ecx,edx ! scan yh for highest set bit + incb cl ! cl bits from cl-1 to 0 + shrd ebx,edx,cl ! ebx = y >> cl + + ! Estimate x / y as q = (x / (y >> cl)) >> cl. + xor edx,edx ! edx:eax = xh + div ebx ! eax = xh / (y >> cl) + push eax + mov eax,xl+4(esp) ! push moved xl to xl+4 + div ebx + pop edx ! edx:eax = x / (y >> cl) + shrd eax,edx,cl ! eax = q + +3: ! Calculate the remainder x - y * q. If the subtraction + ! overflows, then the correct quotient is q - 1, else it is q. + mov ebx,yh(esp) + imul ebx,eax ! ebx = yh * q + push eax + mul yl+4(esp) ! edx:eax = yl * q + add ebx,edx ! ebx:eax = y * q + mov edx,xl+4(esp) + mov ecx,xh+4(esp) + sub edx,eax + sbb ecx,ebx ! ecx:edx = remainder + pop eax ! eax = q + jnc 1f ! jump unless subtraction overflowed + dec eax ! fix quotient + add edx,yl(esp) + adc ecx,yh(esp) ! fix remainder +1: xor ebx,ebx ! ebx:eax = quotient + ret diff --git a/mach/i386/libem/dvi8.s b/mach/i386/libem/dvi8.s index 060f85cf1..986572525 100644 --- a/mach/i386/libem/dvi8.s +++ b/mach/i386/libem/dvi8.s @@ -1,115 +1,37 @@ .sect .text; .sect .rom; .sect .data; .sect .bss .sect .text -.define .dvi8, .dvu8 +.define .dvi8 -yl=8 -yh=12 -xl=16 -xh=20 - ! .dvi8 and .dvu8 divide x = xh:xl by y = yh:yl, - ! yield edx:eax = quotient, ecx:ebx = remainder. - -.dvu8: - ! Unsigned division: set di = 0 for non-negative quotient. 
- push edi - xor di,di - mov eax,xh(esp) - mov edx,yh(esp) - and edx,edx - jmp 7f +yl=4 +yh=8 +xl=12 +xh=16 + ! .dvi8 yields ebx:eax = quotient from x / y .dvi8: - ! Signed division: replace x and y with their absolute values. - ! Set di = 1 for negative quotient, 0 for non-negative. - push edi - xor di,di ! di = 0 - mov eax,xh(esp) - and eax,eax - jns 1f - inc di ! di = 1 + xorb cl,cl ! cl = 0, non-negative result + mov eax,xh(esp) ! eax for .divrem8 + test eax,eax + jge 1f ! jump unless x < 0 + incb cl ! cl = 1, negative result neg eax neg xl(esp) - sbb eax,0 ! eax:xl = absolute value of x -1: mov edx,yh(esp) - and edx,edx - jns 7f - xor di,1 ! flip di + sbb eax,0 + mov xh(esp),eax ! x = absolute value +1: mov edx,yh(esp) ! edx for .divrem8 + test edx,edx ! flag z for .divrem8 when y >= 0 + jge 1f ! jump unless y < 0 + xorb cl,1 ! flip sign of result neg edx neg yl(esp) - sbb edx,0 ! edx:yl = absolute value of y - -7: ! Here .dvu8 joins .dvi8, eax = xh, edx = yh, flags test edx, - ! the values in xh(esp) and yh(esp) are garbage. - jnz 8f ! jump if y >= 2**32 - - ! x / y = x / yl = xh / yl + xl / yl = qh + (xl + rh) / yl - ! where qh and rh are quotient, remainder from xh / yl. - mov ebx,yl(esp) - xor edx,edx ! edx:eax = xh - div ebx ! eax = qh, edx = rh - mov ecx,eax - mov eax,xl(esp) - div ebx ! eax = ql, edx = remainder - mov ebx,edx - mov edx,ecx ! edx:eax = quotient qh:ql - xor ecx,ecx ! ecx:ebx = remainder - -9: ! Finally, if di != 0 then negate quotient, remainder. - and di,di - jz 1f - neg edx - neg eax - sbb edx,0 ! negate quotient edx:eax - neg ecx + sbb edx,0 ! flag z for .divrem8 when y < 0 + mov yh(esp),edx ! y = absolute value +1: push ecx + call .divrem8 + pop ecx + testb cl,cl + jz 1f ! jump unless result < 0 neg ebx - sbb ecx,0 ! negate remainder ecx:ebx -1: pop edi ! caller's edi - ret 16 - -8: ! We come here if y >= 2**32. - mov xh(esp),eax - mov yh(esp),edx - mov ebx,yl(esp) ! edx:ebx = y - - ! Estimate x / y as q = (x / (y >> cl)) >> cl, - ! 
where 2**31 <= (y >> cl) < 2**32. - xor cx,cx -1: inc cx - shr edx,1 - rcr ebx,1 ! edx:ebx = y >> cl - and edx,edx - jnz 1b ! loop until y >> cl fits in ebx - - ! x / (y >> cl) = qh + (x + rh) / (y >> cl) - push edi - xor edx,edx ! edx:eax = xh - div ebx ! eax = qh, edx = rh - mov edi,eax - mov eax,xl+4(esp) ! push edi moved xl to xl+4 - div ebx ! edi:eax = x / (y >> cl) - - ! q = (x / (y >> cl)) >> cl = esi:eax >> cl - shr eax,cl - neg cx ! cl = (32 - cl) modulo 32 - shl edi,cl - or eax,edi ! eax = q - - ! Calculate the remainder x - q * y. If the subtraction - ! overflows, then the correct quotient is q - 1, else it is q. - mov ecx,yh+4(esp) - imul ecx,eax ! ecx = q * yh - mov edi,eax - mul yl+4(esp) ! edx:eax = q * yl - add edx,ecx ! edx:eax = q * y - mov ebx,xl+4(esp) - mov ecx,xh+4(esp) ! ecx:ebx = x - sub ebx,eax - sbb ecx,edx ! ecx:ebx = remainder - jnc 1f - dec edi ! fix quotient - add ebx,yl+4(esp) - adc ebx,yh+4(esp) ! fix remainder -1: mov eax,edi - xor edx,edx ! edx:eax = quotient - pop edi ! negative flag - jmp 9b + neg eax + sbb ebx,0 ! negate quotient ebx:eax +1: ret 16 diff --git a/mach/i386/libem/dvu8.s b/mach/i386/libem/dvu8.s new file mode 100644 index 000000000..8a1f00203 --- /dev/null +++ b/mach/i386/libem/dvu8.s @@ -0,0 +1,20 @@ +.sect .text; .sect .rom; .sect .data; .sect .bss +.sect .text +.define .dvu8, .rmu8 + +yl=4 +yh=8 +xl=12 +xh=16 + ! .dvu8 yields ebx:eax = quotient from x / y + ! .rmu8 yields ecx:edx = remainder from x / y + +.dvu8: +.rmu8: + mov edx,yh(esp) + test edx,edx + mov eax,xh(esp) ! prepare for .divrem8 + push ebp ! move esp + call .divrem8 + pop ebp ! move esp + ret 16 diff --git a/mach/i386/libem/rmi8.s b/mach/i386/libem/rmi8.s new file mode 100644 index 000000000..a52c282b6 --- /dev/null +++ b/mach/i386/libem/rmi8.s @@ -0,0 +1,36 @@ +.sect .text; .sect .rom; .sect .data; .sect .bss +.sect .text +.define .rmi8 + +yl=4 +yh=8 +xl=12 +xh=16 + ! .rmi8 yields ecx:edx = remainder from x / y + +.rmi8: + xorb cl,cl ! 
cl = 0, non-negative result + mov eax,xh(esp) ! eax for .divrem8 + test eax,eax + jge 1f ! jump unless x < 0 + incb cl ! cl = 1, negative result + neg eax + neg xl(esp) + sbb eax,0 + mov xh(esp),eax ! x = absolute value +1: mov edx,yh(esp) ! edx for .divrem8 + test edx,edx ! flag z for .divrem8 when y >= 0 + jge 1f ! jump unless y < 0 + neg edx + neg yl(esp) + sbb edx,0 ! flag z for .divrem8 when y < 0 + mov yh(esp),edx ! y = absolute value +1: push ecx + call .divrem8 + pop eax + testb al,al + jz 1f ! jump unless result < 0 + neg ecx + neg edx + sbb ecx,0 ! negate remainder ecx:edx +1: ret 16 diff --git a/mach/i386/ncg/table b/mach/i386/ncg/table index 184662a15..789fa5c9a 100644 --- a/mach/i386/ncg/table +++ b/mach/i386/ncg/table @@ -1038,7 +1038,7 @@ with noacc ACC pat dvi $1==8 kills ALL - gen proccall {label,".dvi8"} yields edx eax + gen proccall {label,".dvi8"} yields ebx eax /* pat dvi !defined($1) @@ -1055,7 +1055,7 @@ with noacc ACC pat rmi $1==8 kills ALL - gen proccall {label,".dvi8"} yields ecx ebx + gen proccall {label,".rmi8"} yields ecx edx /* pat rmi !defined($1) @@ -1202,7 +1202,7 @@ gen div %1 yields eax pat dvu $1==8 kills ALL - gen proccall {label,".dvu8"} yields edx eax + gen proccall {label,".dvu8"} yields ebx eax /* pat dvu !defined($1) @@ -1218,7 +1218,7 @@ gen div %1 yields edx pat rmu $1==8 kills ALL - gen proccall {label,".dvu8"} yields ecx ebx + gen proccall {label,".rmu8"} yields ecx edx /* pat rmu !defined($1) diff --git a/tests/plat/build.lua b/tests/plat/build.lua index 117e6a735..1613255be 100644 --- a/tests/plat/build.lua +++ b/tests/plat/build.lua @@ -4,7 +4,7 @@ definerule("plat_testsuite", { plat = { type="string" }, method = { type="string" }, - -- added long-long/llbitset_e.c + -- added long-long/lldivrem_e.c sets = { type="table", default={"core", "b", "bugs", "m2", "floats", "long-long"}}, skipsets = { type="table", default={}}, tests = { type="targets", default={} }, diff --git a/tests/plat/long-long/lldivrem_e.c 
b/tests/plat/long-long/lldivrem_e.c new file mode 100644 index 000000000..7e75125a6 --- /dev/null +++ b/tests/plat/long-long/lldivrem_e.c @@ -0,0 +1,78 @@ +#include "test.h" + +/* + * Test division and remainder. Failure code will look like + * - 0x3d = id 0x3, 'd' for division + * - 0x3e = id 0x3, 'e' for remainder + */ + +struct s_divrem { + unsigned int id; + long long a; + long long b; + long long a_div_b; /* a / b */ + long long a_rem_b; /* a % b */ +} s_cases[] = { + {0x1, 310LL, 100LL, 3LL, 10LL}, + {0x2, 310LL, -100LL, -3LL, 10LL}, + {0x3, -310LL, 100LL, -3LL, -10LL}, + {0x4, -310LL, -100LL, 3LL, -10LL}, + {0x5, 3000000000000010LL, 100LL, 30000000000000LL, 10LL}, + {0x6, 3000000000000010LL, -100LL, -30000000000000LL, 10LL}, + {0x7, -3000000000000010LL, 100LL, -30000000000000LL, -10LL}, + {0x8, -3000000000000010LL, -100LL, 30000000000000LL, -10LL}, + {0x9, 3000000000000010LL, 1000000000000LL, 3000LL, 10LL}, + {0xa, 3000000000000010LL, -1000000000000LL, -3000LL, 10LL}, + {0xb, -3000000000000010LL, 1000000000000LL, -3000LL, -10LL}, + {0xc, -3000000000000010LL, -1000000000000LL, 3000LL, -10LL}, + /* + * In next 3 cases, i386 tries (a / (b >> 13)) >> 13 = 8, + * may need to correct the quotient from 8 to 7. 
+ */ + {0x11, 0x864200000000LL, 0x10c840000000LL, 8LL, 0LL}, + {0x12, 0x864200000000LL, 0x10c840000001LL, 7LL, 0x10c83ffffff9LL}, + {0x13, 0x864200000000LL, 0x10c840001fffLL, 7LL, 0x10c83fff2007LL}, +}; + +struct u_divrem { + unsigned int id; + unsigned long long a; + unsigned long long b; + unsigned long long a_div_b; + unsigned long long a_rem_b; +} u_cases[] = { + {0x81, 310ULL, 100ULL, 3ULL, 10ULL}, + {0x82, 3000000000000010ULL, 100ULL, 30000000000000ULL, 10ULL}, + {0x83, 3000000000000010ULL, 1000000000000ULL, 3000ULL, 10ULL}, + {0x91, 0x8000000000000000ULL, 3ULL, 0x2aaaaaaaaaaaaaaaULL, 2ULL}, + {0x92, 0xffffffffffffffffULL, 3ULL, 0x5555555555555555ULL, 0ULL}, + /* + * In next 3 cases, b >= 2**63, so i386 can't find b >> cl + * with cl = 32; the quotient must be 0 or 1. + */ + {0x93, 0xffffffffffffffffULL, 0x8000000000000000ULL, 1ULL, 0x7fffffffffffffffULL}, + {0x94, 0x7fffffffffffffffULL, 0x8000000000000000ULL, 0ULL, 0x7fffffffffffffffULL}, + {0x95, 0xffffffffffffffffULL, 0x8000000000000001ULL, 1ULL, 0x7ffffffffffffffeULL}, +}; + +#define LEN(ary) (sizeof(ary) / sizeof(ary[0])) + +void _m_a_i_n(void) { + int i; + + for (i = 0; i < LEN(s_cases); i++) { + struct s_divrem *s = &s_cases[i]; + if (s->a / s->b != s->a_div_b) + fail((s->id << 4) | 0xd); + if (s->a % s->b != s->a_rem_b) + fail((s->id << 4) | 0xe); + } + for (i = 0; i < LEN(u_cases); i++) { + struct u_divrem *u = &u_cases[i]; + if (u->a / u->b != u->a_div_b) + fail((u->id << 4) | 0xd); + if (u->a % u->b != u->a_rem_b) + fail((u->id << 4) | 0xe); + } + finished(); +}