Add fef 4, fif 4. Improve fef 8, fif 8. Other float changes.
When I wrote fef 8, I forgot to test denormalized numbers. Oops. Now fix two of my mistakes: - When checking for zero, `extrwi r6, r3, 22, 12` needs to be `extrwi r6, r3, 20, 12`. There are only 20 bits to extract. - After the multiplication by 2**64, I forgot to put the fraction in [0.5, 1) or (-1, -0.5] by setting IEEE exponent = 1022. Teach fif 8 about signed zero and NaN. In ncg/table, change cmf so NaN is not equal to any value, and comment why ordered comparisons don't work with NaN. Also add cost for fctiwz, remove extra `uses REG`. Edit comment in cfu8.s because the conditional branch might be before or after fctiwz.
This commit is contained in:
parent
f1304e1a3c
commit
66f93f08c5
7 changed files with 194 additions and 49 deletions
|
@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
|
|||
acklibrary {
|
||||
name = "lib_"..plat,
|
||||
srcs = {
|
||||
"./*.s", -- cfu8.s
|
||||
"./*.s", -- fif4.s
|
||||
},
|
||||
vars = { plat = plat },
|
||||
deps = {
|
||||
|
|
|
@ -42,6 +42,8 @@
|
|||
! 1: yields r3 = the converted value.
|
||||
!
|
||||
! Debian's clang 3.5.0-10 and gcc 4.9.2-10 don't clamp the value
|
||||
! before conversion. They avoid fsel and put the conditional branch
|
||||
! before fctiwz. PowerPC 601 lacks fsel (but kernel might trap and
|
||||
! emulate fsel). PowerPC 603, 604, G3, G4, G5 have fsel.
|
||||
! before conversion. They avoid fsel and use the conditional branch
|
||||
! to pick between 2 fctiwz instructions.
|
||||
!
|
||||
! PowerPC 601 lacks fsel (but kernel might trap and emulate fsel).
|
||||
! PowerPC 603, 604, G3, G4, G5 have fsel.
|
||||
|
|
48
mach/powerpc/libem/fef4.s
Normal file
48
mach/powerpc/libem/fef4.s
Normal file
|
@ -0,0 +1,48 @@
|
|||
.sect .text
|
||||
|
||||
! Split a single-precision float into fraction and exponent, like
|
||||
! frexpf(3) in C, http://en.cppreference.com/w/c/numeric/math/frexp
|
||||
!
|
||||
! Stack: ( single -- fraction exponent )
|
||||
|
||||
.define .fef4
|
||||
.fef4:
|
||||
lwz r3, 0(sp) ! r3 = word of float bits
|
||||
|
||||
! IEEE single = sign * 1.fraction * 2**(exponent - 127)
|
||||
! sign exponent fraction
|
||||
! 0 1..8 9..31
|
||||
!
|
||||
! IEEE exponent = 126 in [0.5, 1) or (-1, -0.5].
|
||||
|
||||
extrwi. r6, r3, 8, 1 ! r6 = IEEE exponent
|
||||
beq 3f ! jump if zero or denormalized
|
||||
cmpwi r6, 255
|
||||
addi r5, r6, -126 ! r5 = our exponent
|
||||
beq 2f ! jump if infinity or NaN
|
||||
! fall through if normalized
|
||||
|
||||
! Put fraction in [0.5, 1) or (-1, -0.5].
|
||||
1: li r6, 126
|
||||
insrwi r3, r6, 8, 1 ! IEEE exponent = 126
|
||||
! fall through
|
||||
|
||||
2: stw r3, 0(sp) ! push fraction
|
||||
stwu r5, -4(sp) ! push exponent
|
||||
blr
|
||||
|
||||
! Got denormalized number or zero, probably zero.
|
||||
! If zero, then exponent must also be zero.
|
||||
3: extrwi. r6, r3, 23, 9 ! r6 = fraction
|
||||
bne 4f ! jump if not zero
|
||||
li r5, 0 ! exponent = 0
|
||||
b 2b
|
||||
|
||||
! Got denormalized number = 0.fraction * 2**-126
|
||||
4: cntlzw r5, r6
|
||||
addi r5, r5, -8
|
||||
slw r6, r6, r5 ! shift left to make 1.fraction
|
||||
insrwi r3, r6, 23, 9 ! set new fraction
|
||||
li r6, -126 + 1
|
||||
subf r5, r5, r6 ! r5 = our exponent
|
||||
b 1b
|
|
@ -3,7 +3,7 @@
|
|||
.sect .text
|
||||
|
||||
! Split a double-precision float into fraction and exponent, like
|
||||
! frexp(3) in C.
|
||||
! frexp(3) in C, http://en.cppreference.com/w/c/numeric/math/frexp
|
||||
!
|
||||
! Stack: ( double -- fraction exponent )
|
||||
|
||||
|
@ -12,42 +12,41 @@
|
|||
lwz r3, 0(sp) ! r3 = high word (bits 0..31)
|
||||
lwz r4, 4(sp) ! r4 = low word (bits 32..63)
|
||||
|
||||
! IEEE double-precision format:
|
||||
! IEEE double = sign * 1.fraction * 2**(exponent - 1023)
|
||||
! sign exponent fraction
|
||||
! 0 1..11 12..63
|
||||
!
|
||||
! To get fraction in [0.5, 1) or (-1, -0.5], we subtract 1022
|
||||
! from the IEEE exponent.
|
||||
! IEEE exponent = 1022 in [0.5, 1) or (-1, -0.5].
|
||||
|
||||
extrwi. r6, r3, 11, 1 ! r6 = IEEE exponent
|
||||
addi r5, r6, -1022 ! r5 = our exponent
|
||||
beq 2f ! jump if zero or denormalized
|
||||
beq 3f ! jump if zero or denormalized
|
||||
cmpwi r6, 2047
|
||||
beq 1f ! jump if infinity or NaN
|
||||
addi r5, r6, -1022 ! r5 = our exponent
|
||||
beq 2f ! jump if infinity or NaN
|
||||
! fall through if normalized
|
||||
|
||||
! Put fraction in [0.5, 1) or (-1, -0.5] by setting its
|
||||
! IEEE exponent to 1022.
|
||||
rlwinm r3, r3, 0, 12, 0 ! clear old exponent
|
||||
oris r3, r3, 1022 << 4 ! set new exponent
|
||||
! Put fraction in [0.5, 1) or (-1, -0.5].
|
||||
1: li r6, 1022
|
||||
insrwi r3, r6, 11, 1 ! IEEE exponent = 1022
|
||||
! fall through
|
||||
|
||||
1: stw r3, 0(sp)
|
||||
2: stw r3, 0(sp)
|
||||
stw r4, 4(sp) ! push fraction
|
||||
stwu r5, -4(sp) ! push exponent
|
||||
blr
|
||||
|
||||
2: ! Got denormalized number or zero, probably zero.
|
||||
extrwi r6, r3, 22, 12
|
||||
! Got denormalized number or zero, probably zero.
|
||||
! If zero, then exponent must also be zero.
|
||||
3: extrwi r6, r3, 20, 12
|
||||
or. r6, r6, r4 ! r6 = high|low fraction
|
||||
bne 3f ! jump if not zero
|
||||
bne 4f ! jump if not zero
|
||||
li r5, 0 ! exponent = 0
|
||||
b 1b
|
||||
b 2b
|
||||
|
||||
3: ! Got denormalized number, not zero.
|
||||
lfd f0, 0(sp)
|
||||
lis r6, ha16[_2_64]
|
||||
lfd f1, lo16[_2_64](r6)
|
||||
! Got denormalized number = 0.fraction * 2**-1022
|
||||
4: lfd f0, 0(sp)
|
||||
lis r6, ha16[.fs_2_64]
|
||||
lfs f1, lo16[.fs_2_64](r6)
|
||||
fmul f0, f0, f1 ! multiply it by 2**64
|
||||
stfd f0, 0(sp)
|
||||
lwz r3, 0(sp)
|
||||
|
@ -57,7 +56,6 @@
|
|||
b 1b
|
||||
|
||||
.sect .rom
|
||||
_2_64:
|
||||
! (double) 2**64
|
||||
.data4 0x43f00000
|
||||
.data4 0x00000000
|
||||
.fs_2_64:
|
||||
!float 1.84467440737095516e+19 sz 4
|
||||
.data1 0137,0200,00,00
|
||||
|
|
64
mach/powerpc/libem/fif4.s
Normal file
64
mach/powerpc/libem/fif4.s
Normal file
|
@ -0,0 +1,64 @@
|
|||
.sect .text
|
||||
|
||||
! Multiplies two single-precision floats, then splits the product into
|
||||
! fraction and integer, both as floats, like modff(3) in C,
|
||||
! http://en.cppreference.com/w/c/numeric/math/modf
|
||||
!
|
||||
! Stack: ( a b -- fraction integer )
|
||||
|
||||
.define .fif4
|
||||
.fif4:
|
||||
lfs f1, 4(sp)
|
||||
lfs f2, 0(sp)
|
||||
fmuls f1, f1, f2 ! f1 = a * b
|
||||
stfs f1, 0(sp)
|
||||
lwz r3, 0(sp) ! r3 = word of float bits
|
||||
|
||||
! IEEE single = sign * 1.fraction * 2**(exponent - 127)
|
||||
! sign exponent fraction
|
||||
! 0 1..8 9..31
|
||||
!
|
||||
! Subtract 127 from the IEEE exponent. If the result is from
|
||||
! 0 to 23, then the IEEE fraction has that many integer bits.
|
||||
|
||||
extrwi r5, r3, 8, 1 ! r5 = IEEE exponent
|
||||
addic. r5, r5, -127 ! r5 = nr of integer bits
|
||||
blt 3f ! branch if no integer
|
||||
cmpwi r5, 24
|
||||
bge 4f ! branch if no fraction
|
||||
! fall through if integer with fraction
|
||||
|
||||
! f1 has r5 = 0 to 23 integer bits in the IEEE fraction.
|
||||
! There are 23 - r5 fraction bits.
|
||||
li r6, 23
|
||||
subf r6, r5, r6
|
||||
srw r3, r3, r6
|
||||
slw r3, r3, r6 ! clear fraction in word
|
||||
! fall through
|
||||
|
||||
1: stw r3, 0(sp)
|
||||
lfs f2, 0(sp) ! integer = word with fraction bits cleared
|
||||
fsubs f1, f1, f2 ! fraction = value - integer
|
||||
2: stfs f1, 4(sp) ! push fraction
|
||||
stfs f2, 0(sp) ! push integer
|
||||
blr
|
||||
|
||||
! f1 is a fraction without integer (or zero).
|
||||
! Then integer is zero with same sign.
|
||||
3: extlwi r3, r3, 1, 0 ! extract sign bit
|
||||
stfs f1, 4(sp) ! push fraction
|
||||
stw r3, 0(sp) ! push integer = zero with sign
|
||||
blr
|
||||
|
||||
! f1 is an integer without fraction (or infinity or NaN).
|
||||
! Unless NaN, then fraction is zero with same sign.
|
||||
4: fcmpu cr0, f1, f1
|
||||
bun cr0, 5f
|
||||
extlwi r3, r3, 1, 0 ! extract sign bit
|
||||
stw r3, 4(sp) ! push fraction = zero with sign
|
||||
stfs f1, 0(sp) ! push integer
|
||||
blr
|
||||
|
||||
! f1 is NaN, so both fraction and integer are NaN.
|
||||
5: fmr f2, f1
|
||||
b 2b
|
|
@ -1,7 +1,8 @@
|
|||
.sect .text
|
||||
|
||||
! Multiplies two double-precision floats, then splits the product into
|
||||
! fraction and integer, like modf(3) in C. On entry:
|
||||
! fraction and integer, both as floats, like modf(3) in C,
|
||||
! http://en.cppreference.com/w/c/numeric/math/modf
|
||||
!
|
||||
! Stack: ( a b -- fraction integer )
|
||||
|
||||
|
@ -14,20 +15,18 @@
|
|||
lwz r3, 0(sp) ! r3 = high word
|
||||
lwz r4, 4(sp) ! r4 = low word
|
||||
|
||||
! IEEE double-precision format:
|
||||
! IEEE double = sign * 1.fraction * 2**(exponent - 1023)
|
||||
! sign exponent fraction
|
||||
! 0 1..11 12..63
|
||||
!
|
||||
! Subtract 1023 from the IEEE exponent. If the result is from
|
||||
! 0 to 51, then the IEEE fraction has that many integer bits.
|
||||
! (IEEE has an implicit 1 before its fraction. If the IEEE
|
||||
! fraction has 0 integer bits, we still have an integer.)
|
||||
|
||||
extrwi r5, r3, 11, 1 ! r5 = IEEE exponent
|
||||
addic. r5, r5, -1023 ! r5 = nr of integer bits
|
||||
blt 4f ! branch if no integer
|
||||
blt 3f ! branch if no integer
|
||||
cmpwi r5, 52
|
||||
bge 5f ! branch if no fraction
|
||||
bge 4f ! branch if no fraction
|
||||
cmpwi r5, 21
|
||||
bge 6f ! branch if large integer
|
||||
! fall through if small integer
|
||||
|
@ -44,22 +43,38 @@
|
|||
1: stw r3, 0(sp)
|
||||
stw r4, 4(sp)
|
||||
lfd f2, 0(sp) ! integer = high word, low word
|
||||
2: fsub f1, f1, f2 ! fraction = value - integer
|
||||
3: stfd f1, 8(sp) ! push fraction
|
||||
fsub f1, f1, f2 ! fraction = value - integer
|
||||
2: stfd f1, 8(sp) ! push fraction
|
||||
stfd f2, 0(sp) ! push integer
|
||||
blr
|
||||
|
||||
4: ! f1 is a fraction without integer.
|
||||
fsub f2, f1, f1 ! integer = zero
|
||||
b 3b
|
||||
! f1 is a fraction without integer (or zero).
|
||||
! Then integer is zero with same sign.
|
||||
3: extlwi r3, r3, 1, 0 ! extract sign bit
|
||||
li r4, 0
|
||||
stfd f1, 8(sp) ! push fraction
|
||||
stw r4, 4(sp)
|
||||
stw r3, 0(sp) ! push integer = zero with sign
|
||||
blr
|
||||
|
||||
5: ! f1 is an integer without fraction (or infinity or NaN).
|
||||
fmr f2, f1 ! integer = f1
|
||||
! f1 is an integer without fraction (or infinity or NaN).
|
||||
! Unless NaN, then fraction is zero with same sign.
|
||||
4: fcmpu cr0, f1, f1 ! unordered only if f1 is NaN
|
||||
bun cr0, 5f
|
||||
extlwi r3, r3, 1, 0 ! extract sign bit
|
||||
li r4, 0
|
||||
stw r4, 12(sp)
|
||||
stw r3, 8(sp) ! push fraction = zero with sign
|
||||
stfd f1, 0(sp) ! push integer
|
||||
blr
|
||||
|
||||
! f1 is NaN, so both fraction and integer are NaN.
|
||||
5: fmr f2, f1
|
||||
b 2b
|
||||
|
||||
6: ! f1 has r5 = 21 to 51 integer bits.
|
||||
! f1 has r5 = 21 to 51 integer bits.
|
||||
! Low word has 52 - r5 fraction bits.
|
||||
li r6, 52
|
||||
6: li r6, 52
|
||||
subf r6, r5, r6
|
||||
srw r4, r4, r6
|
||||
slw r4, r4, r6 ! clear fraction in low word
|
||||
|
|
|
@ -310,7 +310,7 @@ INSTRUCTIONS
|
|||
fadds FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 5).
|
||||
fcmpo CR:wo, FREG:ro, FREG:ro cost(4, 5).
|
||||
fcmpo CR:wo, FSREG:ro, FSREG:ro cost(4, 5).
|
||||
fctiwz FREG:wo, FREG:ro.
|
||||
fctiwz FREG:wo, FREG:ro cost(4, 5).
|
||||
fdiv FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 35).
|
||||
fdivs FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 21).
|
||||
fmr FPR:wo, FPR:ro cost(4, 5).
|
||||
|
@ -2329,10 +2329,20 @@ PATTERNS
|
|||
with FSREG
|
||||
gen fneg {LOCAL, $2}, %1
|
||||
|
||||
/* When a or b is NaN, then a < b, a <= b, a > b, a >= b
|
||||
* should all be false. We can't make them false, because
|
||||
* - EM's _cmf_ is only for ordered comparisons.
|
||||
* - The peephole optimizer assumes (a < b) == !(a >= b).
|
||||
*
|
||||
* We do make a == b false and a != b true, by checking the
|
||||
* eq (equal) bit or un (unordered) bit in cr0.
|
||||
*/
|
||||
|
||||
pat cmf $1==4 /* Compare single */
|
||||
with FSREG FSREG
|
||||
uses REG={COND_FS, %2, %1}
|
||||
gen extlwi %a, %a, {C, 2}, {C, 0}
|
||||
/* Extract lt, gt, un; put lt in sign bit. */
|
||||
gen andisX %a, %a, {C, 0xd000}
|
||||
yields %a
|
||||
|
||||
pat cmf teq $1==4 /* Single second == top */
|
||||
|
@ -2367,7 +2377,6 @@ PATTERNS
|
|||
|
||||
proc cmf4zxx example cmf zeq
|
||||
with FSREG FSREG STACK
|
||||
uses REG
|
||||
gen
|
||||
fcmpo cr0, %2, %1
|
||||
bxx* {LABEL, $2}
|
||||
|
@ -2420,6 +2429,13 @@ PATTERNS
|
|||
loc 4
|
||||
cff
|
||||
|
||||
pat fef $1==4 /* Split fraction, exponent */
|
||||
leaving cal ".fef4"
|
||||
|
||||
/* Multiply two singles, then split fraction, integer */
|
||||
pat fif $1==4
|
||||
leaving cal ".fif4"
|
||||
|
||||
|
||||
/* Double-precision floating-point */
|
||||
|
||||
|
@ -2471,10 +2487,13 @@ PATTERNS
|
|||
with FREG
|
||||
gen fneg {DLOCAL, $2}, %1
|
||||
|
||||
/* To compare NaN, see comment above pat cmf $1==4 */
|
||||
|
||||
pat cmf $1==8 /* Compare double */
|
||||
with FREG FREG
|
||||
uses REG={COND_FD, %2, %1}
|
||||
gen extlwi %a, %a, {C, 2}, {C, 0}
|
||||
/* Extract lt, gt, un; put lt in sign bit. */
|
||||
gen andisX %a, %a, {C, 0xd000}
|
||||
yields %a
|
||||
|
||||
pat cmf teq $1==8 /* Double second == top */
|
||||
|
@ -2482,7 +2501,7 @@ PATTERNS
|
|||
uses REG={COND_FD, %2, %1}
|
||||
yields {XEQ, %a}
|
||||
|
||||
pat cmf tne $1==8 /* Single second == top */
|
||||
pat cmf tne $1==8 /* Double second != top */
|
||||
with FREG FREG
|
||||
uses REG={COND_FD, %2, %1}
|
||||
yields {XNE, %a}
|
||||
|
@ -2509,7 +2528,6 @@ PATTERNS
|
|||
|
||||
proc cmf8zxx example cmf zeq
|
||||
with FREG FREG STACK
|
||||
uses REG
|
||||
gen
|
||||
fcmpo cr0, %2, %1
|
||||
bxx* {LABEL, $2}
|
||||
|
|
Loading…
Reference in a new issue