From 66f93f08c5a0a3af85c1ce5f5278c06882403c37 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 22 Jan 2018 14:04:15 -0500
Subject: [PATCH] Add fef 4, fif 4.  Improve fef 8, fif 8.  Other float
 changes.

When I wrote fef 8, I forgot to test denormalized numbers.  Oops.  Now
fix two of my mistakes:

 - When checking for zero, `extrwi r6, r3, 22, 12` needs to be
   `extrwi r6, r3, 20, 12`.  There are only 20 bits to extract.

 - After the multiplication by 2**64, I forgot to put the fraction in
   [0.5, 1) or (-1, -0.5] by setting IEEE exponent = 1022.

Teach fif 8 about signed zero and NaN.

In ncg/table, change cmf so NaN is not equal to any value, and comment
why ordered comparisons don't work with NaN.  Also add cost for
fctiwz, remove extra `uses REG`.

Edit comment in cfu8.s because the conditional branch might be before
or after fctiwz.
---
 mach/powerpc/libem/build.lua |  2 +-
 mach/powerpc/libem/cfu8.s    |  8 +++--
 mach/powerpc/libem/fef4.s    | 48 +++++++++++++++++++++++++++
 mach/powerpc/libem/fef8.s    | 46 +++++++++++++-------------
 mach/powerpc/libem/fif4.s    | 64 ++++++++++++++++++++++++++++++++++++
 mach/powerpc/libem/fif8.s    | 45 ++++++++++++++++---------
 mach/powerpc/ncg/table       | 30 +++++++++++++----
 7 files changed, 194 insertions(+), 49 deletions(-)
 create mode 100644 mach/powerpc/libem/fef4.s
 create mode 100644 mach/powerpc/libem/fif4.s

diff --git a/mach/powerpc/libem/build.lua b/mach/powerpc/libem/build.lua
index 7a0726b80..2709a4770 100644
--- a/mach/powerpc/libem/build.lua
+++ b/mach/powerpc/libem/build.lua
@@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
 		srcs = {
-			"./*.s", -- cfu8.s
+			"./*.s", -- fif4.s
 		},
 		vars = { plat = plat },
 		deps = {
diff --git a/mach/powerpc/libem/cfu8.s b/mach/powerpc/libem/cfu8.s
index fd69ff521..710d2a65c 100644
--- a/mach/powerpc/libem/cfu8.s
+++ b/mach/powerpc/libem/cfu8.s
@@ -42,6 +42,8 @@
 ! 1: yields r3 = the converted value.
 !
 ! Debian's clang 3.5.0-10 and gcc 4.9.2-10 don't clamp the value
-! before conversion.  They avoid fsel and put the conditional branch
-! before fctwiz.  PowerPC 601 lacks fsel (but kernel might trap and
-! emulate fsel).  PowerPC 603, 604, G3, G4, G5 have fsel.
+! before conversion.  They avoid fsel and use the conditional branch
+! to pick between 2 fctiwz instructions.
+!
+! PowerPC 601 lacks fsel (but kernel might trap and emulate fsel).
+! PowerPC 603, 604, G3, G4, G5 have fsel.
diff --git a/mach/powerpc/libem/fef4.s b/mach/powerpc/libem/fef4.s
new file mode 100644
index 000000000..a338ed0a9
--- /dev/null
+++ b/mach/powerpc/libem/fef4.s
@@ -0,0 +1,48 @@
+.sect .text
+
+! Split a single-precision float into fraction and exponent, like
+! frexpf(3) in C, http://en.cppreference.com/w/c/numeric/math/frexp
+! Uses r3, r5, r6 and cr0 as scratch.
+! Stack: ( single -- fraction exponent )
+
+.define .fef4
+.fef4:
+	lwz r3, 0(sp)			! r3 = word of float bits
+
+	! IEEE single = sign * 1.fraction * 2**(exponent - 127)
+	!   sign  exponent  fraction
+	!   0     1..8      9..31
+	!
+	! IEEE exponent = 126 in [0.5, 1) or (-1, -0.5].
+
+	extrwi. r6, r3, 8, 1		! r6 = IEEE exponent
+	beq 3f				! jump if zero or denormalized
+	cmpwi r6, 255			! 255 marks infinity or NaN
+	addi r5, r6, -126		! r5 = our exponent
+	beq 2f				! jump if infinity or NaN
+	! fall through if normalized
+
+	! Put fraction in [0.5, 1) or (-1, -0.5].
+1:	li r6, 126
+	insrwi r3, r6, 8, 1		! IEEE exponent = 126
+	! fall through
+
+2:	stw r3, 0(sp)			! push fraction
+	stwu r5, -4(sp)			! push exponent
+	blr
+
+	! Got denormalized number or zero, probably zero.
+	! If zero, then exponent must also be zero.
+3:	extrwi. r6, r3, 23, 9		! r6 = fraction
+	bne 4f				! jump if not zero
+	li r5, 0			! exponent = 0
+	b 2b				! push zero unchanged, keeping its sign
+
+	! Got denormalized number = 0.fraction * 2**-126
+4:	cntlzw r5, r6			! r5 = leading zero bits, 9 to 31
+	addi r5, r5, -8			! r5 = shift that moves first 1 to bit 8
+	slw r6, r6, r5			! shift left to make 1.fraction
+	insrwi r3, r6, 23, 9		! set new fraction
+	li r6, -126 + 1
+	subf r5, r5, r6			! r5 = our exponent
+	b 1b				! go set IEEE exponent = 126
diff --git a/mach/powerpc/libem/fef8.s b/mach/powerpc/libem/fef8.s
index 26a962d8b..aff5ea3b6 100644
--- a/mach/powerpc/libem/fef8.s
+++ b/mach/powerpc/libem/fef8.s
@@ -3,7 +3,7 @@
 .sect .text
 
 ! Split a double-precision float into fraction and exponent, like
-! frexp(3) in C.
+! frexp(3) in C, http://en.cppreference.com/w/c/numeric/math/frexp
 !
 ! Stack: ( double -- fraction exponent )
 
@@ -12,42 +12,41 @@
 	lwz r3, 0(sp)			! r3 = high word (bits 0..31)
 	lwz r4, 4(sp)			! r4 = low word (bits 32..63)
 
-	! IEEE double-precision format:
+	! IEEE double = sign * 1.fraction * 2**(exponent - 1023)
 	!   sign  exponent  fraction
 	!   0     1..11     12..63
 	!
-	! To get fraction in [0.5, 1) or (-1, -0.5], we subtract 1022
-	! from the IEEE exponent.
+	! IEEE exponent = 1022 in [0.5, 1) or (-1, -0.5].
 
 	extrwi. r6, r3, 11, 1		! r6 = IEEE exponent
-	addi r5, r6, -1022		! r5 = our exponent
-	beq 2f				! jump if zero or denormalized
+	beq 3f				! jump if zero or denormalized
 	cmpwi r6, 2047
-	beq 1f				! jump if infinity or NaN
+	addi r5, r6, -1022		! r5 = our exponent
+	beq 2f				! jump if infinity or NaN
 	! fall through if normalized
 
-	! Put fraction in [0.5, 1) or (-1, -0.5] by setting its
-	! IEEE exponent to 1022.
-	rlwinm r3, r3, 0, 12, 0		! clear old exponent
-	oris r3, r3, 1022 << 4		! set new exponent
+	! Put fraction in [0.5, 1) or (-1, -0.5].
+1:	li r6, 1022
+	insrwi r3, r6, 11, 1		! IEEE exponent = 1022
 	! fall through
 
-1:	stw r3, 0(sp)
+2:	stw r3, 0(sp)
 	stw r4, 4(sp)			! push fraction
 	stwu r5, -4(sp)			! push exponent
 	blr
 
-2:	! Got denormalized number or zero, probably zero.
-	extrwi r6, r3, 22, 12
+	! Got denormalized number or zero, probably zero.
+	! If zero, then exponent must also be zero.
+3:	extrwi r6, r3, 20, 12
 	or. r6, r6, r4			! r6 = high|low fraction
-	bne 3f				! jump if not zero
+	bne 4f				! jump if not zero
 	li r5, 0			! exponent = 0
-	b 1b
+	b 2b
 
-3:	! Got denormalized number, not zero.
-	lfd f0, 0(sp)
-	lis r6, ha16[_2_64]
-	lfd f1, lo16[_2_64](r6)
+	! Got denormalized number = 0.fraction * 2**-1022
+4:	lfd f0, 0(sp)
+	lis r6, ha16[.fs_2_64]
+	lfs f1, lo16[.fs_2_64](r6)
 	fmul f0, f0, f1			! multiply it by 2**64
 	stfd f0, 0(sp)
 	lwz r3, 0(sp)
@@ -57,7 +56,6 @@
 	b 1b
 
 .sect .rom
-_2_64:
-	! (double) 2**64
-	.data4 0x43f00000
-	.data4 0x00000000
+.fs_2_64:
+	!float 1.84467440737095516e+19 sz 4
+	.data1 0137,0200,00,00
diff --git a/mach/powerpc/libem/fif4.s b/mach/powerpc/libem/fif4.s
new file mode 100644
index 000000000..fc29b178c
--- /dev/null
+++ b/mach/powerpc/libem/fif4.s
@@ -0,0 +1,64 @@
+.sect .text
+
+! Multiplies two single-precision floats, then splits the product into
+! fraction and integer, both as floats, like modff(3) in C,
+! http://en.cppreference.com/w/c/numeric/math/modf
+! Uses r3, r5, r6, f1, f2 and cr0 as scratch.
+! Stack: ( a b -- fraction integer )
+
+.define .fif4
+.fif4:
+	lfs f1, 4(sp)			! f1 = a
+	lfs f2, 0(sp)			! f2 = b
+	fmuls f1, f1, f2		! f1 = a * b
+	stfs f1, 0(sp)			! spill product to read its bits
+	lwz r3, 0(sp)			! r3 = word of float bits
+
+	! IEEE single = sign * 1.fraction * 2**(exponent - 127)
+	!   sign  exponent  fraction
+	!   0     1..8      9..31
+	!
+	! Let n = IEEE exponent - 127.  If n is from 0 to 22, then the
+	! IEEE fraction has n integer bits before 23 - n fraction bits.
+
+	extrwi r5, r3, 8, 1		! r5 = IEEE exponent
+	addic. r5, r5, -127		! r5 = nr of integer bits
+	blt 3f				! branch if no integer
+	cmpwi r5, 23			! 23 or more: no fraction bits left
+	bge 4f				! branch if no fraction
+	! fall through if integer with fraction
+
+	! f1 has r5 = 0 to 22 integer bits in the IEEE fraction.
+	! There are 23 - r5 fraction bits.
+	li r6, 23
+	subf r6, r5, r6			! r6 = nr of fraction bits
+	srw r3, r3, r6
+	slw r3, r3, r6			! clear fraction in word
+	! fall through
+
+1:	stw r3, 0(sp)
+	lfs f2, 0(sp)			! f2 = integer part
+	fsubs f1, f1, f2		! fraction = value - integer
+2:	stfs f1, 4(sp)			! push fraction
+	stfs f2, 0(sp)			! push integer
+	blr
+
+	! f1 is a fraction without integer (or zero).
+	! Then integer is zero with same sign.
+3:	extlwi r3, r3, 1, 0		! extract sign bit
+	stfs f1, 4(sp)			! push fraction
+	stw r3, 0(sp)			! push integer = zero with sign
+	blr
+
+	! f1 is an integer without fraction (or infinity or NaN).
+	! Unless NaN, then fraction is zero with same sign.
+4:	fcmpu cr0, f1, f1		! NaN is unordered with itself
+	bun cr0, 5f			! branch if NaN
+	extlwi r3, r3, 1, 0		! extract sign bit
+	stw r3, 4(sp)			! push fraction = zero with sign
+	stfs f1, 0(sp)			! push integer
+	blr
+
+	! f1 is NaN, so both fraction and integer are NaN.
+5:	fmr f2, f1
+	b 2b
diff --git a/mach/powerpc/libem/fif8.s b/mach/powerpc/libem/fif8.s
index bce4f8d24..f93a39ac2 100644
--- a/mach/powerpc/libem/fif8.s
+++ b/mach/powerpc/libem/fif8.s
@@ -1,7 +1,8 @@
 .sect .text
 
 ! Multiplies two double-precision floats, then splits the product into
-! fraction and integer, like modf(3) in C.  On entry:
+! fraction and integer, both as floats, like modf(3) in C,
+! http://en.cppreference.com/w/c/numeric/math/modf
 !
 ! Stack: ( a b -- fraction integer )
 
@@ -14,20 +15,18 @@
 	lwz r3, 0(sp)			! r3 = high word
 	lwz r4, 4(sp)			! r4 = low word
 
-	! IEEE double-precision format:
+	! IEEE double = sign * 1.fraction * 2**(exponent - 1023)
 	!   sign  exponent  fraction
 	!   0     1..11     12..63
 	!
 	! Subtract 1023 from the IEEE exponent.  If the result is from
 	! 0 to 51, then the IEEE fraction has that many integer bits.
-	! (IEEE has an implicit 1 before its fraction.  If the IEEE
-	! fraction has 0 integer bits, we still have an integer.)
 
 	extrwi r5, r3, 11, 1		! r5 = IEEE exponent
 	addic. r5, r5, -1023		! r5 = nr of integer bits
-	blt 4f				! branch if no integer
+	blt 3f				! branch if no integer
 	cmpwi r5, 52
-	bge 5f				! branch if no fraction
+	bge 4f				! branch if no fraction
 	cmpwi r5, 21
 	bge 6f				! branch if large integer
 	! fall through if small integer
@@ -44,22 +43,38 @@
 1:	stw r3, 0(sp)
 	stw r4, 4(sp)
 	lfd f2, 0(sp)			! integer = high word, low word
-2:	fsub f1, f1, f2			! fraction = value - integer
-3:	stfd f1, 8(sp)			! push fraction
+	fsub f1, f1, f2			! fraction = value - integer
+2:	stfd f1, 8(sp)			! push fraction
 	stfd f2, 0(sp)			! push integer
 	blr
 
-4:	! f1 is a fraction without integer.
-	fsub f2, f1, f1			! integer = zero
-	b 3b
+	! f1 is a fraction without integer (or zero).
+	! Then integer is zero with same sign.
+3:	extlwi r3, r3, 1, 0		! extract sign bit
+	li r4, 0
+	stfd f1, 8(sp)			! push fraction
+	stw r4, 4(sp)
+	stw r3, 0(sp)			! push integer = zero with sign
+	blr
 
-5:	! f1 is an integer without fraction (or infinity or NaN).
-	fmr f2, f1			! integer = f1
+	! f1 is an integer without fraction (or infinity or NaN).
+	! Unless NaN, then fraction is zero with same sign.
+4:	fcmpu cr0, f1, f1		! NaN is unordered with itself
+	bun cr0, 5f
+	extlwi r3, r3, 1, 0		! extract sign bit
+	li r4, 0
+	stw r4, 12(sp)
+	stw r3, 8(sp)			! push fraction = zero with sign
+	stfd f1, 0(sp)			! push integer
+	blr
+
+	! f1 is NaN, so both fraction and integer are NaN.
+5:	fmr f2, f1
 	b 2b
 
-6:	! f1 has r5 = 21 to 51 to integer bits.
+	! f1 has r5 = 21 to 51 integer bits.
 	! Low word has 52 - r5 fraction bits.
-	li r6, 52
+6:	li r6, 52
 	subf r6, r5, r6
 	srw r4, r4, r6
 	slw r4, r4, r6			! clear fraction in low word
diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index df06a5d49..1ea0b60ec 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -310,7 +310,7 @@ INSTRUCTIONS
   fadds           FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 5).
   fcmpo           CR:wo, FREG:ro, FREG:ro cost(4, 5).
   fcmpo           CR:wo, FSREG:ro, FSREG:ro cost(4, 5).
-  fctiwz          FREG:wo, FREG:ro.
+  fctiwz          FREG:wo, FREG:ro cost(4, 5).
   fdiv            FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 35).
   fdivs           FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 21).
   fmr             FPR:wo, FPR:ro cost(4, 5).
@@ -2329,10 +2329,20 @@ PATTERNS
 		with FSREG
 			gen fneg {LOCAL, $2}, %1
 
+	/* When a or b is NaN, then a < b, a <= b, a > b, a >= b
+	 * should all be false.  We can't make them false, because
+	 *  - EM's _cmf_ is only for ordered comparisons.
+	 *  - The peephole optimizer assumes (a < b) == !(a >= b).
+	 *
+	 * We do make a == b false and a != b true, by checking the
+	 * eq (equal) bit or un (unordered) bit in cr0.
+	 */
+
 	pat cmf $1==4                      /* Compare single */
 		with FSREG FSREG
 			uses REG={COND_FS, %2, %1}
-			gen extlwi %a, %a, {C, 2}, {C, 0}
+			/* Extract lt, gt, un; put lt in sign bit. */
+			gen andisX %a, %a, {C, 0xd000}
 			yields %a
 
 	pat cmf teq $1==4                  /* Single second == top */
@@ -2367,7 +2377,6 @@ PATTERNS
 
 	proc cmf4zxx example cmf zeq
 		with FSREG FSREG STACK
-			uses REG
 			gen
 				fcmpo cr0, %2, %1
 				bxx* {LABEL, $2}
@@ -2420,6 +2429,13 @@ PATTERNS
 			loc 4
 			cff
 
+	pat fef $1==4                      /* Split fraction, exponent */
+		leaving cal ".fef4"
+
+	/* Multiply two singles, then split fraction, integer */
+	pat fif $1==4
+		leaving cal ".fif4"
+
 
 /* Double-precision floating-point */
 
@@ -2471,10 +2487,13 @@ PATTERNS
 		with FREG
 			gen fneg {DLOCAL, $2}, %1
 
+	/* To compare NaN, see comment above pat cmf $1==4 */
+
 	pat cmf $1==8                      /* Compare double */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
-			gen extlwi %a, %a, {C, 2}, {C, 0}
+			/* Extract lt, gt, un; put lt in sign bit. */
+			gen andisX %a, %a, {C, 0xd000}
 			yields %a
 
 	pat cmf teq $1==8                  /* Double second == top */
@@ -2482,7 +2501,7 @@ PATTERNS
 			uses REG={COND_FD, %2, %1}
 			yields {XEQ, %a}
 
-	pat cmf tne $1==8                  /* Single second == top */
+	pat cmf tne $1==8                  /* Double second != top */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
 			yields {XNE, %a}
@@ -2509,7 +2528,6 @@ PATTERNS
 
 	proc cmf8zxx example cmf zeq
 		with FREG FREG STACK
-			uses REG
 			gen
 				fcmpo cr0, %2, %1
 				bxx* {LABEL, $2}