From 66f93f08c5a0a3af85c1ce5f5278c06882403c37 Mon Sep 17 00:00:00 2001 From: George Koehler Date: Mon, 22 Jan 2018 14:04:15 -0500 Subject: [PATCH] Add fef 4, fif 4. Improve fef 8, fif 8. Other float changes. When I wrote fef 8, I forgot to test denormalized numbers. Oops. Now fix two of my mistakes: - When checking for zero, `extrwi r6, r3, 22, 12` needs to be `extrwi r6, r3, 20, 12`. There are only 20 bits to extract. - After the multiplication by 2**64, I forgot to put the fraction in [0.5, 1) or (-1, -0.5] by setting IEEE exponent = 1022. Teach fif 8 about signed zero and NaN. In ncg/table, change cmf so NaN is not equal to any value, and comment why ordered comparisons don't work with NaN. Also add cost for fctiwz, remove extra `uses REG`. Edit comment in cfu8.s because the conditional branch might be before or after fctiwz. --- mach/powerpc/libem/build.lua | 2 +- mach/powerpc/libem/cfu8.s | 8 +++-- mach/powerpc/libem/fef4.s | 48 +++++++++++++++++++++++++++ mach/powerpc/libem/fef8.s | 46 +++++++++++++------------- mach/powerpc/libem/fif4.s | 64 ++++++++++++++++++++++++++++++++++++ mach/powerpc/libem/fif8.s | 45 ++++++++++++++++--------- mach/powerpc/ncg/table | 30 +++++++++++++---- 7 files changed, 194 insertions(+), 49 deletions(-) create mode 100644 mach/powerpc/libem/fef4.s create mode 100644 mach/powerpc/libem/fif4.s diff --git a/mach/powerpc/libem/build.lua b/mach/powerpc/libem/build.lua index 7a0726b80..2709a4770 100644 --- a/mach/powerpc/libem/build.lua +++ b/mach/powerpc/libem/build.lua @@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do acklibrary { name = "lib_"..plat, srcs = { - "./*.s", -- cfu8.s + "./*.s", -- fif4.s }, vars = { plat = plat }, deps = { diff --git a/mach/powerpc/libem/cfu8.s b/mach/powerpc/libem/cfu8.s index fd69ff521..710d2a65c 100644 --- a/mach/powerpc/libem/cfu8.s +++ b/mach/powerpc/libem/cfu8.s @@ -42,6 +42,8 @@ ! 1: yields r3 = the converted value. ! ! Debian's clang 3.5.0-10 and gcc 4.9.2-10 don't clamp the value -! 
before conversion. They avoid fsel and put the conditional branch -! before fctwiz. PowerPC 601 lacks fsel (but kernel might trap and -! emulate fsel). PowerPC 603, 604, G3, G4, G5 have fsel. +! before conversion. They avoid fsel and use the conditional branch +! to pick between 2 fctwiz instructions. +! +! PowerPC 601 lacks fsel (but kernel might trap and emulate fsel). +! PowerPC 603, 604, G3, G4, G5 have fsel. diff --git a/mach/powerpc/libem/fef4.s b/mach/powerpc/libem/fef4.s new file mode 100644 index 000000000..a338ed0a9 --- /dev/null +++ b/mach/powerpc/libem/fef4.s @@ -0,0 +1,48 @@ +.sect .text + +! Split a single-precision float into fraction and exponent, like +! frexpf(3) in C, http://en.cppreference.com/w/c/numeric/math/frexp +! +! Stack: ( single -- fraction exponent ) + +.define .fef4 +.fef4: + lwz r3, 0(sp) ! r3 = word of float bits + + ! IEEE single = sign * 1.fraction * 2**(exponent - 127) + ! sign exponent fraction + ! 0 1..8 9..31 + ! + ! IEEE exponent = 126 in [0.5, 1) or (-1, -0.5]. + + extrwi. r6, r3, 8, 1 ! r6 = IEEE exponent + beq 3f ! jump if zero or denormalized + cmpwi r6, 255 + addi r5, r6, -126 ! r5 = our exponent + beq 2f ! jump if infinity or NaN + ! fall through if normalized + + ! Put fraction in [0.5, 1) or (-1, -0.5]. +1: li r6, 126 + insrwi r3, r6, 8, 1 ! IEEE exponent = 126 + ! fall through + +2: stw r3, 0(sp) ! push fraction + stwu r5, -4(sp) ! push exponent + blr + + ! Got denormalized number or zero, probably zero. + ! If zero, then exponent must also be zero. +3: extrwi. r6, r3, 23, 9 ! r6 = fraction + bne 4f ! jump if not zero + li r5, 0 ! exponent = 0 + b 2b + + ! Got denormalized number = 0.fraction * 2**-126 +4: cntlzw r5, r6 + addi r5, r5, -8 + slw r6, r6, r5 ! shift left to make 1.fraction + insrwi r3, r6, 23, 9 ! set new fraction + li r6, -126 + 1 + subf r5, r5, r6 ! 
r5 = our exponent + b 1b diff --git a/mach/powerpc/libem/fef8.s b/mach/powerpc/libem/fef8.s index 26a962d8b..aff5ea3b6 100644 --- a/mach/powerpc/libem/fef8.s +++ b/mach/powerpc/libem/fef8.s @@ -3,7 +3,7 @@ .sect .text ! Split a double-precision float into fraction and exponent, like -! frexp(3) in C. +! frexp(3) in C, http://en.cppreference.com/w/c/numeric/math/frexp ! ! Stack: ( double -- fraction exponent ) @@ -12,42 +12,41 @@ lwz r3, 0(sp) ! r3 = high word (bits 0..31) lwz r4, 4(sp) ! r4 = low word (bits 32..63) - ! IEEE double-precision format: + ! IEEE double = sign * 1.fraction * 2**(exponent - 1023) ! sign exponent fraction ! 0 1..11 12..63 ! - ! To get fraction in [0.5, 1) or (-1, -0.5], we subtract 1022 - ! from the IEEE exponent. + ! IEEE exponent = 1022 in [0.5, 1) or (-1, -0.5]. extrwi. r6, r3, 11, 1 ! r6 = IEEE exponent - addi r5, r6, -1022 ! r5 = our exponent - beq 2f ! jump if zero or denormalized + beq 3f ! jump if zero or denormalized cmpwi r6, 2047 - beq 1f ! jump if infinity or NaN + addi r5, r6, -1022 ! r5 = our exponent + beq 2f ! jump if infinity or NaN ! fall through if normalized - ! Put fraction in [0.5, 1) or (-1, -0.5] by setting its - ! IEEE exponent to 1022. - rlwinm r3, r3, 0, 12, 0 ! clear old exponent - oris r3, r3, 1022 << 4 ! set new exponent + ! Put fraction in [0.5, 1) or (-1, -0.5]. +1: li r6, 1022 + insrwi r3, r6, 11, 1 ! IEEE exponent = 1022 ! fall through -1: stw r3, 0(sp) +2: stw r3, 0(sp) stw r4, 4(sp) ! push fraction stwu r5, -4(sp) ! push exponent blr -2: ! Got denormalized number or zero, probably zero. - extrwi r6, r3, 22, 12 + ! Got denormalized number or zero, probably zero. + ! If zero, then exponent must also be zero. +3: extrwi r6, r3, 20, 12 or. r6, r6, r4 ! r6 = high|low fraction - bne 3f ! jump if not zero + bne 4f ! jump if not zero li r5, 0 ! exponent = 0 - b 1b + b 2b -3: ! Got denormalized number, not zero. - lfd f0, 0(sp) - lis r6, ha16[_2_64] - lfd f1, lo16[_2_64](r6) + ! 
Got denormalized number = 0.fraction * 2**-1022 +4: lfd f0, 0(sp) + lis r6, ha16[.fs_2_64] + lfs f1, lo16[.fs_2_64](r6) fmul f0, f0, f1 ! multiply it by 2**64 stfd f0, 0(sp) lwz r3, 0(sp) @@ -57,7 +56,6 @@ b 1b .sect .rom -_2_64: - ! (double) 2**64 - .data4 0x43f00000 - .data4 0x00000000 +.fs_2_64: + !float 1.84467440737095516e+19 sz 4 + .data1 0137,0200,00,00 diff --git a/mach/powerpc/libem/fif4.s b/mach/powerpc/libem/fif4.s new file mode 100644 index 000000000..fc29b178c --- /dev/null +++ b/mach/powerpc/libem/fif4.s @@ -0,0 +1,64 @@ +.sect .text + +! Multiplies two single-precision floats, then splits the product into +! fraction and integer, both as floats, like modff(3) in C, +! http://en.cppreference.com/w/c/numeric/math/modf +! +! Stack: ( a b -- fraction integer ) + +.define .fif4 +.fif4: + lfs f1, 4(sp) + lfs f2, 0(sp) + fmuls f1, f1, f2 ! f1 = a * b + stfs f1, 0(sp) + lwz r3, 0(sp) ! r3 = word of float bits + + ! IEEE single = sign * 1.fraction * 2**(exponent - 127) + ! sign exponent fraction + ! 0 1..8 9..31 + ! + ! Subtract 127 from the IEEE exponent. If the result is from + ! 0 to 23, then the IEEE fraction has that many integer bits. + + extrwi r5, r3, 8, 1 ! r5 = IEEE exponent + addic. r5, r5, -127 ! r5 = nr of integer bits + blt 3f ! branch if no integer + cmpwi r5, 24 + bge 4f ! branch if no fraction + ! fall through if integer with fraction + + ! f1 has r5 = 0 to 23 integer bits in the IEEE fraction. + ! There are 23 - r5 fraction bits. + li r6, 23 + subf r6, r5, r6 + srw r3, r3, r6 + slw r3, r3, r6 ! clear fraction in word + ! fall through + +1: stw r3, 0(sp) + lfs f2, 0(sp) ! integer = high word, low word + fsubs f1, f1, f2 ! fraction = value - integer +2: stfs f1, 4(sp) ! push fraction + stfs f2, 0(sp) ! push integer + blr + + ! f1 is a fraction without integer (or zero). + ! Then integer is zero with same sign. +3: extlwi r3, r3, 1, 0 ! extract sign bit + stfs f1, 4(sp) ! push fraction + stw r3, 0(sp) ! push integer = zero with sign + blr + + ! 
f1 is an integer without fraction (or infinity or NaN). + ! Unless NaN, then fraction is zero with same sign. +4: fcmpu cr0, f1, f1 + bun cr0, 5f + extlwi r3, r3, 1, 0 ! extract sign bit + stw r3, 4(sp) ! push fraction = zero with sign + stfs f1, 0(sp) ! push integer + blr + + ! f1 is NaN, so both fraction and integer are NaN. +5: fmr f2, f1 + b 2b diff --git a/mach/powerpc/libem/fif8.s b/mach/powerpc/libem/fif8.s index bce4f8d24..f93a39ac2 100644 --- a/mach/powerpc/libem/fif8.s +++ b/mach/powerpc/libem/fif8.s @@ -1,7 +1,8 @@ .sect .text ! Multiplies two double-precision floats, then splits the product into -! fraction and integer, like modf(3) in C. On entry: +! fraction and integer, both as floats, like modf(3) in C, +! http://en.cppreference.com/w/c/numeric/math/modf ! ! Stack: ( a b -- fraction integer ) @@ -14,20 +15,18 @@ lwz r3, 0(sp) ! r3 = high word lwz r4, 4(sp) ! r4 = low word - ! IEEE double-precision format: + ! IEEE double = sign * 1.fraction * 2**(exponent - 1023) ! sign exponent fraction ! 0 1..11 12..63 ! ! Subtract 1023 from the IEEE exponent. If the result is from ! 0 to 51, then the IEEE fraction has that many integer bits. - ! (IEEE has an implicit 1 before its fraction. If the IEEE - ! fraction has 0 integer bits, we still have an integer.) extrwi r5, r3, 11, 1 ! r5 = IEEE exponent addic. r5, r5, -1023 ! r5 = nr of integer bits - blt 4f ! branch if no integer + blt 3f ! branch if no integer cmpwi r5, 52 - bge 5f ! branch if no fraction + bge 4f ! branch if no fraction cmpwi r5, 21 bge 6f ! branch if large integer ! fall through if small integer @@ -44,22 +43,38 @@ 1: stw r3, 0(sp) stw r4, 4(sp) lfd f2, 0(sp) ! integer = high word, low word -2: fsub f1, f1, f2 ! fraction = value - integer -3: stfd f1, 8(sp) ! push fraction + fsub f1, f1, f2 ! fraction = value - integer +2: stfd f1, 8(sp) ! push fraction stfd f2, 0(sp) ! push integer blr -4: ! f1 is a fraction without integer. - fsub f2, f1, f1 ! integer = zero - b 3b + ! 
f1 is a fraction without integer (or zero). + ! Then integer is zero with same sign. +3: extlwi r3, r3, 1, 0 ! extract sign bit + li r4, 0 + stfd f1, 8(sp) ! push fraction + stw r4, 4(sp) + stw r3, 0(sp) ! push integer = zero with sign + blr -5: ! f1 is an integer without fraction (or infinity or NaN). - fmr f2, f1 ! integer = f1 + ! f1 is an integer without fraction (or infinity or NaN). + ! Unless NaN, then fraction is zero with same sign. +4: fcmpu cr0, f1, f1 ! integer = f1 + bun cr0, 5f + extlwi r3, r3, 1, 0 ! extract sign bit + li r4, 0 + stw r4, 12(sp) + stw r3, 8(sp) ! push fraction = zero with sign + stfd f1, 0(sp) ! push integer + blr + + ! f1 is NaN, so both fraction and integer are NaN. +5: fmr f2, f1 b 2b -6: ! f1 has r5 = 21 to 51 to integer bits. + ! f1 has r5 = 21 to 51 to integer bits. ! Low word has 52 - r5 fraction bits. - li r6, 52 +6: li r6, 52 subf r6, r5, r6 srw r4, r4, r6 slw r4, r4, r6 ! clear fraction in low word diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table index df06a5d49..1ea0b60ec 100644 --- a/mach/powerpc/ncg/table +++ b/mach/powerpc/ncg/table @@ -310,7 +310,7 @@ INSTRUCTIONS fadds FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 5). fcmpo CR:wo, FREG:ro, FREG:ro cost(4, 5). fcmpo CR:wo, FSREG:ro, FSREG:ro cost(4, 5). - fctiwz FREG:wo, FREG:ro. + fctiwz FREG:wo, FREG:ro cost(4, 5). fdiv FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 35). fdivs FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 21). fmr FPR:wo, FPR:ro cost(4, 5). @@ -2329,10 +2329,20 @@ PATTERNS with FSREG gen fneg {LOCAL, $2}, %1 + /* When a or b is NaN, then a < b, a <= b, a > b, a >= b + * should all be false. We can't make them false, because + * - EM's _cmf_ is only for ordered comparisons. + * - The peephole optimizer assumes (a < b) == !(a >= b). + * + * We do make a == b false and a != b true, by checking the + * eq (equal) bit or un (unordered) bit in cr0. 
+ */ + pat cmf $1==4 /* Compare single */ with FSREG FSREG uses REG={COND_FS, %2, %1} - gen extlwi %a, %a, {C, 2}, {C, 0} + /* Extract lt, gt, un; put lt in sign bit. */ + gen andisX %a, %a, {C, 0xd000} yields %a pat cmf teq $1==4 /* Single second == top */ @@ -2367,7 +2377,6 @@ PATTERNS proc cmf4zxx example cmf zeq with FSREG FSREG STACK - uses REG gen fcmpo cr0, %2, %1 bxx* {LABEL, $2} @@ -2420,6 +2429,13 @@ PATTERNS loc 4 cff + pat fef $1==4 /* Split fraction, exponent */ + leaving cal ".fef4" + + /* Multiply two singles, then split fraction, integer */ + pat fif $1==4 + leaving cal ".fif4" + /* Double-precision floating-point */ @@ -2471,10 +2487,13 @@ PATTERNS with FREG gen fneg {DLOCAL, $2}, %1 + /* To compare NaN, see comment above pat cmf $1==4 */ + pat cmf $1==8 /* Compare double */ with FREG FREG uses REG={COND_FD, %2, %1} - gen extlwi %a, %a, {C, 2}, {C, 0} + /* Extract lt, gt, un; put lt in sign bit. */ + gen andisX %a, %a, {C, 0xd000} yields %a pat cmf teq $1==8 /* Double second == top */ @@ -2482,7 +2501,7 @@ PATTERNS uses REG={COND_FD, %2, %1} yields {XEQ, %a} - pat cmf tne $1==8 /* Single second == top */ + pat cmf tne $1==8 /* Double second == top */ with FREG FREG uses REG={COND_FD, %2, %1} yields {XNE, %a} @@ -2509,7 +2528,6 @@ PATTERNS proc cmf8zxx example cmf zeq with FREG FREG STACK - uses REG gen fcmpo cr0, %2, %1 bxx* {LABEL, $2}