Shrink .cfu8

With my PowerBook G4, a program that converts values from 1.0 to 4000000.0 runs in about 0.32s with the old .cfu8 and 0.29s with this shrunken .cfu8.  Leave a comment about other ways to implement .cfu8.
2018-01-07 16:03:55 -05:00 · 2018-01-07 16:03:55 -05:00 · 64b50b3a45
commit 64b50b3a45
parent b90c97b00b
4 changed files with 39 additions and 49 deletions
--- a/mach/powerpc/libem/build.lua
+++ b/mach/powerpc/libem/build.lua
@@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
 		srcs = {
-			"./*.s", -- trp.s
+			"./*.s", -- cfu8.s
 		},
 		vars = { plat = plat },
 		deps = {
--- a/mach/powerpc/libem/cfu8.s
+++ b/mach/powerpc/libem/cfu8.s
@@ -1,3 +1,5 @@
 .sect .text; .sect .rom; .sect .data; .sect .bss
 .sect .text
 ! Converts a 64-bit double into a 32-bit unsigned integer.
@@ -6,32 +8,40 @@
 .define .cfu8
 .cfu8:
-	lis r3, ha16[.fd_00000000]
+	lfd f1, 0(sp)                   ! f1 = value to convert
-	lfd f0, lo16[.fd_00000000](r3) ! f0 = 0.0
+	lis r3, ha16[.fs_80000000]
-
+	lfs f2, lo16[.fs_80000000](r3)  ! f2 = 2**31
-	lfd f1, 0(sp)            ! value to be converted
+	fsub   f1, f1, f2
-
+	fctiwz f1, f1         ! convert value - 2**31
-	lis r3, ha16[.fd_FFFFFFFF]
+	stfd   f1, 0(sp)
-	lfd f3, lo16[.fd_FFFFFFFF](r3) ! f3 = 0xFFFFFFFF
+	lwz   r3, 4(sp)
-
+	xoris r3, r3, 0x8000  ! add 2**31
-	lis r3, ha16[.fd_80000000]
+	stw   r3, 4(sp)
-	lfd f4, lo16[.fd_80000000](r3) ! f4 = 0x80000000
+	addi  sp, sp, 4
 	fsel f2, f1, f1, f0
 	fsub f5, f3, f1
 	fsel f2, f5, f2, f3
 	fsub f5, f2, f4
 	fcmpu cr0, f2, f4
 	fsel f2, f5, f5, f2
 	fctiwz f2, f2
 	stfd f2, 0(sp)
 	addi sp, sp, 4
 	bltlr
 	lwz r3, 0(sp)
 	xoris r3, r3, 0x8000
 	stw r3, 0(sp)
 	blr
 .sect .rom
 .fs_80000000:
 	!float 2.147483648e+9 sz 4
 	.data1 0117,00,00,00
 ! Freescale and IBM provide an example using fsel to select value or
 ! value - 2**31 for fctiwz.  The following code adapts Freescale's
 ! _Programming Environments Manual for 32-Bit Implementations of the
 ! PowerPC Architecture_, section C.3.2, pdf page 557.
 !
 ! Given f2 = value clamped from 0 to 2**32 - 1, f4 = 2**31, then
 !	fsub	f5, f2, f4
 !	fcmpu	cr2, f2, f4
 !	fsel	f2, f5, f5, f2
 !	fctiwz	f2, f2
 !	stfdu	f2, 0(sp)
 !	lwz	r3, 4(sp)
 !	blt	cr2, 1f
 !	xoris	r3, r3, 0x8000
 ! 1: yields r3 = the converted value.
 !
 ! Debian's clang 3.5.0-10 and gcc 4.9.2-10 don't clamp the value
 ! before conversion.  They avoid fsel and put the conditional branch
 ! before fctiwz.  PowerPC 601 lacks fsel (but kernel might trap and
 ! emulate fsel).  PowerPC 603, 604, G3, G4, G5 have fsel.
--- a/mach/powerpc/libem/fd_80000000.s
+++ b/mach/powerpc/libem/fd_80000000.s
@@ -1,10 +0,0 @@
 .sect .text; .sect .rom; .sect .data; .sect .bss
 .sect .rom
 ! Contains a handy double-precision 0x80000000.
 .define .fd_80000000
 .fd_80000000:
 	!float 2.147483648e+9 sz 8
 	.data1 0101,0340,00,00,00,00,00,00
--- a/mach/powerpc/libem/fd_FFFFFFFF.s
+++ b/mach/powerpc/libem/fd_FFFFFFFF.s
@@ -1,10 +0,0 @@
 .sect .text; .sect .rom; .sect .data; .sect .bss
 .sect .rom
 ! Contains a handy double-precision 0xFFFFFFFF.
 .define .fd_FFFFFFFF
 .fd_FFFFFFFF:
 	!float 4.294967295e+9 sz 8
 	.data1 0101,0357,0377,0377,0377,0340,00,00