Shrink .cfu8

With my PowerBook G4, a program that converts values from 1.0 to 4000000.0 runs in about 0.32s with the old .cfu8 and 0.29s with this shrunken .cfu8. Leave a comment about other ways to implement .cfu8.
2018-01-07 16:03:55 -05:00 · 2018-01-07 16:03:55 -05:00 · 64b50b3a45
commit 64b50b3a45
parent b90c97b00b
4 changed files with 39 additions and 49 deletions
--- a/mach/powerpc/libem/build.lua
+++ b/mach/powerpc/libem/build.lua
@@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
 		srcs = {
-			"./*.s", -- trp.s
+			"./*.s", -- cfu8.s
 		},
 		vars = { plat = plat },
 		deps = {
--- a/mach/powerpc/libem/cfu8.s
+++ b/mach/powerpc/libem/cfu8.s
@@ -1,3 +1,5 @@
+.sect .text; .sect .rom; .sect .data; .sect .bss
+
 .sect .text

 ! Converts a 64-bit double into a 32-bit unsigned integer.
@@ -6,32 +8,40 @@

 .define .cfu8
 .cfu8:
-	lis r3, ha16[.fd_00000000]
-	lfd f0, lo16[.fd_00000000](r3) ! f0 = 0.0
-
-	lfd f1, 0(sp)            ! value to be converted
-
-	lis r3, ha16[.fd_FFFFFFFF]
-	lfd f3, lo16[.fd_FFFFFFFF](r3) ! f3 = 0xFFFFFFFF
-
-	lis r3, ha16[.fd_80000000]
-	lfd f4, lo16[.fd_80000000](r3) ! f4 = 0x80000000
-
-	fsel f2, f1, f1, f0
-	fsub f5, f3, f1
-	fsel f2, f5, f2, f3
-	fsub f5, f2, f4
-	fcmpu cr0, f2, f4
-	fsel f2, f5, f5, f2
-	fctiwz f2, f2
-	
-	stfd f2, 0(sp)
-	addi sp, sp, 4
-
-	bltlr
-
-	lwz r3, 0(sp)
-	xoris r3, r3, 0x8000
-	stw r3, 0(sp)
-
+	lfd f1, 0(sp)                   ! f1 = value to convert
+	lis r3, ha16[.fs_80000000]      ! r3 = high half of the constant's address
+	lfs f2, lo16[.fs_80000000](r3)  ! f2 = 2**31
+	fsub   f1, f1, f2     ! bias the value down by 2**31
+	fctiwz f1, f1         ! convert value - 2**31
+	stfd   f1, 0(sp)      ! overwrite the 8-byte argument with the conversion
+	lwz   r3, 4(sp)       ! r3 = second word of the stored doubleword
+	xoris r3, r3, 0x8000  ! add 2**31
+	stw   r3, 4(sp)       ! write back the unbiased 32-bit result
+	addi  sp, sp, 4       ! pop the first word; result is now at 0(sp)
 	blr
+
+.sect .rom
+.fs_80000000:
+	!float 2.147483648e+9 sz 4
+	.data1 0117,00,00,00	! octal bytes 0x4F,0,0,0: single-precision 2**31
+
+! Freescale and IBM provide an example using fsel to select value or
+! value - 2**31 for fctiwz.  The following code adapts Freescale's
+! _Programming Environments Manual for 32-Bit Implementations of the
+! PowerPC Architecture_, section C.3.2, pdf page 557.
+!
+! Given f2 = value clamped from 0 to 2**32 - 1, f4 = 2**31, then
+!	fsub	f5, f2, f4
+!	fcmpu	cr2, f2, f4
+!	fsel	f2, f5, f5, f2
+!	fctiwz	f2, f2
+!	stfdu	f2, 0(sp)
+!	lwz	r3, 4(sp)
+!	blt	cr2, 1f
+!	xoris	r3, r3, 0x8000
+! 1: yields r3 = the converted value.
+!
+! Debian's clang 3.5.0-10 and gcc 4.9.2-10 don't clamp the value
+! before conversion.  They avoid fsel and put the conditional branch
+! before fctiwz.  PowerPC 601 lacks fsel (but kernel might trap and
+! emulate fsel).  PowerPC 603, 604, G3, G4, G5 have fsel.
--- a/mach/powerpc/libem/fd_80000000.s
+++ b/mach/powerpc/libem/fd_80000000.s
@@ -1,10 +0,0 @@
-.sect .text; .sect .rom; .sect .data; .sect .bss
-
-.sect .rom
-
-! Contains a handy double-precision 0x80000000.
-
-.define .fd_80000000
-.fd_80000000:
-	!float 2.147483648e+9 sz 8
-	.data1 0101,0340,00,00,00,00,00,00
--- a/mach/powerpc/libem/fd_FFFFFFFF.s
+++ b/mach/powerpc/libem/fd_FFFFFFFF.s
@@ -1,10 +0,0 @@
-.sect .text; .sect .rom; .sect .data; .sect .bss
-
-.sect .rom
-
-! Contains a handy double-precision 0xFFFFFFFF.
-
-.define .fd_FFFFFFFF
-.fd_FFFFFFFF:
-	!float 4.294967295e+9 sz 8
-	.data1 0101,0357,0377,0377,0377,0340,00,00