Shrink .cfu8
With my PowerBook G4, a program that converts values from 1.0 to 4000000.0 runs in about 0.32s with the old .cfu8 and 0.29s with this shrunken .cfu8. Leave a comment about other ways to implement .cfu8.
This commit is contained in:
parent
b90c97b00b
commit
64b50b3a45
|
@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
|
||||||
acklibrary {
|
acklibrary {
|
||||||
name = "lib_"..plat,
|
name = "lib_"..plat,
|
||||||
srcs = {
|
srcs = {
|
||||||
"./*.s", -- trp.s
|
"./*.s", -- cfu8.s
|
||||||
},
|
},
|
||||||
vars = { plat = plat },
|
vars = { plat = plat },
|
||||||
deps = {
|
deps = {
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
.sect .text; .sect .rom; .sect .data; .sect .bss
|
||||||
|
|
||||||
.sect .text
|
.sect .text
|
||||||
|
|
||||||
! Converts a 64-bit double into a 32-bit unsigned integer.
|
! Converts a 64-bit double into a 32-bit unsigned integer.
|
||||||
|
@ -6,32 +8,40 @@
|
||||||
|
|
||||||
.define .cfu8
|
.define .cfu8
|
||||||
.cfu8:
|
.cfu8:
|
||||||
lis r3, ha16[.fd_00000000]
|
lfd f1, 0(sp) ! f1 = value to convert
|
||||||
lfd f0, lo16[.fd_00000000](r3) ! f0 = 0.0
|
lis r3, ha16[.fs_80000000]
|
||||||
|
lfs f2, lo16[.fs_80000000](r3) ! f2 = 2**31
|
||||||
lfd f1, 0(sp) ! value to be converted
|
fsub f1, f1, f2
|
||||||
|
fctiwz f1, f1 ! convert value - 2**31
|
||||||
lis r3, ha16[.fd_FFFFFFFF]
|
stfd f1, 0(sp)
|
||||||
lfd f3, lo16[.fd_FFFFFFFF](r3) ! f3 = 0xFFFFFFFF
|
lwz r3, 4(sp)
|
||||||
|
xoris r3, r3, 0x8000 ! add 2**31
|
||||||
lis r3, ha16[.fd_80000000]
|
stw r3, 4(sp)
|
||||||
lfd f4, lo16[.fd_80000000](r3) ! f4 = 0x80000000
|
addi sp, sp, 4
|
||||||
|
|
||||||
fsel f2, f1, f1, f0
|
|
||||||
fsub f5, f3, f1
|
|
||||||
fsel f2, f5, f2, f3
|
|
||||||
fsub f5, f2, f4
|
|
||||||
fcmpu cr0, f2, f4
|
|
||||||
fsel f2, f5, f5, f2
|
|
||||||
fctiwz f2, f2
|
|
||||||
|
|
||||||
stfd f2, 0(sp)
|
|
||||||
addi sp, sp, 4
|
|
||||||
|
|
||||||
bltlr
|
|
||||||
|
|
||||||
lwz r3, 0(sp)
|
|
||||||
xoris r3, r3, 0x8000
|
|
||||||
stw r3, 0(sp)
|
|
||||||
|
|
||||||
blr
|
blr
|
||||||
|
|
||||||
|
.sect .rom
|
||||||
|
.fs_80000000:
|
||||||
|
!float 2.147483648e+9 sz 4
|
||||||
|
.data1 0117,00,00,00
|
||||||
|
|
||||||
|
! Freescale and IBM provide an example using fsel to select value or
|
||||||
|
! value - 2**31 for fctiwz. The following code adapts Freescale's
|
||||||
|
! _Programming Environments Manual for 32-Bit Implementations of the
|
||||||
|
! PowerPC Architecture_, section C.3.2, pdf page 557.
|
||||||
|
!
|
||||||
|
! Given f2 = value clamped from 0 to 2**32 - 1, f4 = 2**31, then
|
||||||
|
! fsub f5, f2, f4
|
||||||
|
! fcmpu cr2, f2, f4
|
||||||
|
! fsel f2, f5, f5, f2
|
||||||
|
! fctiwz f2, f2
|
||||||
|
! stfdu f2, 0(sp)
|
||||||
|
! lwz r3, 4(sp)
|
||||||
|
! blt cr2, 1f
|
||||||
|
! xoris r3, r3, 0x8000
|
||||||
|
! 1: yields r3 = the converted value.
|
||||||
|
!
|
||||||
|
! Debian's clang 3.5.0-10 and gcc 4.9.2-10 don't clamp the value
|
||||||
|
! before conversion. They avoid fsel and put the conditional branch
|
||||||
|
! before fctiwz. PowerPC 601 lacks fsel (but kernel might trap and
|
||||||
|
! emulate fsel). PowerPC 603, 604, G3, G4, G5 have fsel.
|
||||||
|
|
|
@ -1,10 +0,0 @@
|
||||||
.sect .text; .sect .rom; .sect .data; .sect .bss
|
|
||||||
|
|
||||||
.sect .rom
|
|
||||||
|
|
||||||
! Contains a handy double-precision 0x80000000.
|
|
||||||
|
|
||||||
.define .fd_80000000
|
|
||||||
.fd_80000000:
|
|
||||||
!float 2.147483648e+9 sz 8
|
|
||||||
.data1 0101,0340,00,00,00,00,00,00
|
|
|
@ -1,10 +0,0 @@
|
||||||
.sect .text; .sect .rom; .sect .data; .sect .bss
|
|
||||||
|
|
||||||
.sect .rom
|
|
||||||
|
|
||||||
! Contains a handy double-precision 0xFFFFFFFF.
|
|
||||||
|
|
||||||
.define .fd_FFFFFFFF
|
|
||||||
.fd_FFFFFFFF:
|
|
||||||
!float 4.294967295e+9 sz 8
|
|
||||||
.data1 0101,0357,0377,0377,0377,0340,00,00
|
|
Loading…
Reference in a new issue