Shrink .cfu8

On my PowerBook G4, a program that converts values from 1.0 to
4000000.0 runs in about 0.32s with the old .cfu8 and 0.29s with this
shrunken .cfu8.

Leave a comment about other ways to implement .cfu8
This commit is contained in:
George Koehler 2018-01-07 16:03:55 -05:00
parent b90c97b00b
commit 64b50b3a45
4 changed files with 39 additions and 49 deletions

View file

@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
acklibrary {
name = "lib_"..plat,
srcs = {
"./*.s", -- trp.s
"./*.s", -- cfu8.s
},
vars = { plat = plat },
deps = {

View file

@ -1,3 +1,5 @@
.sect .text; .sect .rom; .sect .data; .sect .bss
.sect .text
! Converts a 64-bit double into a 32-bit unsigned integer.
@ -6,32 +8,40 @@
.define .cfu8
.cfu8:
lis r3, ha16[.fd_00000000]
lfd f0, lo16[.fd_00000000](r3) ! f0 = 0.0
lfd f1, 0(sp) ! value to be converted
lis r3, ha16[.fd_FFFFFFFF]
lfd f3, lo16[.fd_FFFFFFFF](r3) ! f3 = 0xFFFFFFFF
lis r3, ha16[.fd_80000000]
lfd f4, lo16[.fd_80000000](r3) ! f4 = 0x80000000
fsel f2, f1, f1, f0
fsub f5, f3, f1
fsel f2, f5, f2, f3
fsub f5, f2, f4
fcmpu cr0, f2, f4
fsel f2, f5, f5, f2
fctiwz f2, f2
stfd f2, 0(sp)
addi sp, sp, 4
bltlr
lwz r3, 0(sp)
xoris r3, r3, 0x8000
stw r3, 0(sp)
lfd f1, 0(sp) ! f1 = value to convert
lis r3, ha16[.fs_80000000]
lfs f2, lo16[.fs_80000000](r3) ! f2 = 2**31
fsub f1, f1, f2
fctiwz f1, f1 ! convert value - 2**31
stfd f1, 0(sp)
lwz r3, 4(sp)
xoris r3, r3, 0x8000 ! add 2**31
stw r3, 4(sp)
addi sp, sp, 4
blr
.sect .rom
.fs_80000000:
!float 2.147483648e+9 sz 4
.data1 0117,00,00,00
! Freescale and IBM provide an example using fsel to select value or
! value - 2**31 for fctiwz. The following code adapts Freescale's
! _Programming Environments Manual for 32-Bit Implementations of the
! PowerPC Architecture_, section C.3.2, pdf page 557.
!
! Given f2 = value clamped from 0 to 2**32 - 1, f4 = 2**31, then
! fsub f5, f2, f4
! fcmpu cr2, f2, f4
! fsel f2, f5, f5, f2
! fctiwz f2, f2
! stfdu f2, 0(sp)
! lwz r3, 4(sp)
! blt cr2, 1f
! xoris r3, r3, 0x8000
! 1: yields r3 = the converted value.
!
! Debian's clang 3.5.0-10 and gcc 4.9.2-10 don't clamp the value
! before conversion. They avoid fsel and put the conditional branch
! before fctiwz. PowerPC 601 lacks fsel (but kernel might trap and
! emulate fsel). PowerPC 603, 604, G3, G4, G5 have fsel.

View file

@ -1,10 +0,0 @@
.sect .text; .sect .rom; .sect .data; .sect .bss
.sect .rom
! Contains a handy double-precision 0x80000000.
.define .fd_80000000
.fd_80000000:
!float 2.147483648e+9 sz 8
.data1 0101,0340,00,00,00,00,00,00

View file

@ -1,10 +0,0 @@
.sect .text; .sect .rom; .sect .data; .sect .bss
.sect .rom
! Contains a handy double-precision 0xFFFFFFFF.
.define .fd_FFFFFFFF
.fd_FFFFFFFF:
!float 4.294967295e+9 sz 8
.data1 0101,0357,0377,0377,0377,0340,00,00