Add fef 4, fif 4. Improve fef 8, fif 8. Other float changes.

When I wrote fef 8, I forgot to test denormalized numbers. Oops. Now fix two of my mistakes:
- When checking for zero, `extrwi r6, r3, 22, 12` needs to be `extrwi r6, r3, 20, 12`. There are only 20 bits to extract.
- After the multiplication by 2**64, I forgot to put the fraction in [0.5, 1) or (-1, -0.5] by setting IEEE exponent = 1022.
Teach fif 8 about signed zero and NaN. In ncg/table, change cmf so NaN is not equal to any value, and comment why ordered comparisons don't work with NaN. Also add cost for fctiwz, remove extra `uses REG`. Edit comment in cfu8.s because the conditional branch might be before or after fctiwz.
2018-01-22 14:04:15 -05:00 · 2018-01-22 14:04:15 -05:00 · 66f93f08c5
commit 66f93f08c5
parent f1304e1a3c
7 changed files with 194 additions and 49 deletions
--- a/mach/powerpc/libem/build.lua
+++ b/mach/powerpc/libem/build.lua
@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
 		srcs = {
-			"./*.s", -- cfu8.s
+			"./*.s", -- fif4.s
 		},
 		vars = { plat = plat },
 		deps = {
--- a/mach/powerpc/libem/cfu8.s
+++ b/mach/powerpc/libem/cfu8.s
@ -42,6 +42,8 @@
 ! 1: yields r3 = the converted value.
 !
 ! Debian's clang 3.5.0-10 and gcc 4.9.2-10 don't clamp the value
-! before conversion.  They avoid fsel and put the conditional branch
+! before conversion.  They avoid fsel and use the conditional branch
-! before fctwiz.  PowerPC 601 lacks fsel (but kernel might trap and
+! to pick between 2 fctiwz instructions.
-! emulate fsel).  PowerPC 603, 604, G3, G4, G5 have fsel.
+!
 ! PowerPC 601 lacks fsel (but kernel might trap and emulate fsel).
 ! PowerPC 603, 604, G3, G4, G5 have fsel.
--- a/mach/powerpc/libem/fef4.s
+++ b/mach/powerpc/libem/fef4.s
@ -0,0 +1,48 @@
 .sect .text
 ! Split a single-precision float into fraction and exponent, like
 ! frexpf(3) in C, http://en.cppreference.com/w/c/numeric/math/frexp
 !
 ! Stack: ( single -- fraction exponent )
 !
 ! The float at 0(sp) is overwritten in place by the fraction, then
 ! the exponent (one word) is pushed on top, so sp moves down by 4.
 ! Zero yields exponent 0.  Infinity and NaN are stored back
 ! unchanged as the fraction, with exponent 255 - 126 = 129.
 ! Uses r3 (float bits), r5 (our exponent), r6 (scratch) and cr0.
 .define .fef4
 .fef4:
 	lwz r3, 0(sp)			! r3 = word of float bits
 	! IEEE single = sign * 1.fraction * 2**(exponent - 127)
 	!   sign  exponent  fraction
 	!   0     1..8      9..31
 	!
 	! IEEE exponent = 126 in [0.5, 1) or (-1, -0.5].
 	extrwi. r6, r3, 8, 1		! r6 = IEEE exponent
 	beq 3f				! jump if zero or denormalized
 	cmpwi r6, 255
 	! addi does not touch cr0, so the beq below still tests the
 	! cmpwi above.
 	addi r5, r6, -126		! r5 = our exponent
 	beq 2f				! jump if infinity or NaN
 	! fall through if normalized
 	! Put fraction in [0.5, 1) or (-1, -0.5].
 	! The sign bit and fraction bits of r3 are kept as they are.
 1:	li r6, 126
 	insrwi r3, r6, 8, 1		! IEEE exponent = 126
 	! fall through
 2:	stw r3, 0(sp)			! push fraction
 	stwu r5, -4(sp)			! push exponent
 	blr
 	! Got denormalized number or zero, probably zero.
 	! If zero, then exponent must also be zero.
 3:	extrwi. r6, r3, 23, 9		! r6 = fraction
 	bne 4f				! jump if not zero
 	li r5, 0			! exponent = 0
 	b 2b				! store r3 (signed zero) unchanged
 	! Got denormalized number = 0.fraction * 2**-126
 	!
 	! r6 holds the fraction in bits 9..31, so cntlzw gives at
 	! least 9.  Subtracting 8 gives the shift that moves the
 	! leading 1 to bit 8; insrwi then copies only bits 9..31, so
 	! that leading 1 becomes the implicit 1 of 1.fraction.
 4:	cntlzw r5, r6
 	addi r5, r5, -8			! r5 = shift count, at least 1
 	slw r6, r6, r5			! shift left to make 1.fraction
 	insrwi r3, r6, 23, 9		! set new fraction
 	li r6, -126 + 1
 	subf r5, r5, r6			! r5 = our exponent
 					!    = (-126 + 1) - shift count
 	b 1b				! go set IEEE exponent = 126
--- a/mach/powerpc/libem/fef8.s
+++ b/mach/powerpc/libem/fef8.s
@ -3,7 +3,7 @@
 .sect .text
 ! Split a double-precision float into fraction and exponent, like
-! frexp(3) in C.
+! frexp(3) in C, http://en.cppreference.com/w/c/numeric/math/frexp
 !
 ! Stack: ( double -- fraction exponent )
@ -12,42 +12,41 @@
 	lwz r3, 0(sp)			! r3 = high word (bits 0..31)
 	lwz r4, 4(sp)			! r4 = low word (bits 32..63)
-	! IEEE double-precision format:
+	! IEEE double = sign * 1.fraction * 2**(exponent - 1023)
 	!   sign  exponent  fraction
 	!   0     1..11     12..63
 	!
-	! To get fraction in [0.5, 1) or (-1, -0.5], we subtract 1022
+	! IEEE exponent = 1022 in [0.5, 1) or (-1, -0.5].
 	! from the IEEE exponent.
 	extrwi. r6, r3, 11, 1		! r6 = IEEE exponent
-	addi r5, r6, -1022		! r5 = our exponent
+	beq 3f				! jump if zero or denormalized
 	beq 2f				! jump if zero or denormalized
 	cmpwi r6, 2047
-	beq 1f				! jump if infinity or NaN
+	addi r5, r6, -1022		! r5 = our exponent
 	beq 2f				! jump if infinity or NaN
 	! fall through if normalized
-	! Put fraction in [0.5, 1) or (-1, -0.5] by setting its
+	! Put fraction in [0.5, 1) or (-1, -0.5].
-	! IEEE exponent to 1022.
+1:	li r6, 1022
-	rlwinm r3, r3, 0, 12, 0		! clear old exponent
+	insrwi r3, r6, 11, 1		! IEEE exponent = 1022
 	oris r3, r3, 1022 << 4		! set new exponent
 	! fall through
-1:	stw r3, 0(sp)
+2:	stw r3, 0(sp)
 	stw r4, 4(sp)			! push fraction
 	stwu r5, -4(sp)			! push exponent
 	blr
-2:	! Got denormalized number or zero, probably zero.
+	! Got denormalized number or zero, probably zero.
-	extrwi r6, r3, 22, 12
+	! If zero, then exponent must also be zero.
 3:	extrwi r6, r3, 20, 12
 	or. r6, r6, r4			! r6 = high|low fraction
-	bne 3f				! jump if not zero
+	bne 4f				! jump if not zero
 	li r5, 0			! exponent = 0
-	b 1b
+	b 2b
-3:	! Got denormalized number, not zero.
+	! Got denormalized number = 0.fraction * 2**-1022
-	lfd f0, 0(sp)
+4:	lfd f0, 0(sp)
-	lis r6, ha16[_2_64]
+	lis r6, ha16[.fs_2_64]
-	lfd f1, lo16[_2_64](r6)
+	lfs f1, lo16[.fs_2_64](r6)
 	fmul f0, f0, f1			! multiply it by 2**64
 	stfd f0, 0(sp)
 	lwz r3, 0(sp)
@ -57,7 +56,6 @@
 	b 1b
 .sect .rom
-_2_64:
+.fs_2_64:
-	! (double) 2**64
+	!float 1.84467440737095516e+19 sz 4
-	.data4 0x43f00000
+	.data1 0137,0200,00,00
 	.data4 0x00000000
--- a/mach/powerpc/libem/fif4.s
+++ b/mach/powerpc/libem/fif4.s
@ -0,0 +1,64 @@
 .sect .text
 ! Multiplies two single-precision floats, then splits the product into
 ! fraction and integer, both as floats, like modff(3) in C,
 ! http://en.cppreference.com/w/c/numeric/math/modf
 !
 ! Stack: ( a b -- fraction integer )
 !
 ! The operands are read from 4(sp) and 0(sp); the results are written
 ! back to the same two slots (fraction at 4(sp), integer at 0(sp)),
 ! so sp does not move.  Uses f1, f2, r3, r5, r6 and cr0.
 .define .fif4
 .fif4:
 	lfs f1, 4(sp)
 	lfs f2, 0(sp)
 	fmuls f1, f1, f2		! f1 = a * b
 	stfs f1, 0(sp)			! 0(sp) is scratch from here on
 	lwz r3, 0(sp)			! r3 = word of float bits
 	! IEEE single = sign * 1.fraction * 2**(exponent - 127)
 	!   sign  exponent  fraction
 	!   0     1..8      9..31
 	!
 	! Subtract 127 from the IEEE exponent.  If the result is from
 	! 0 to 23, then the IEEE fraction has that many integer bits.
 	extrwi r5, r3, 8, 1		! r5 = IEEE exponent
 	addic. r5, r5, -127		! r5 = nr of integer bits
 	blt 3f				! branch if no integer
 	cmpwi r5, 24
 	bge 4f				! branch if no fraction
 	! fall through if integer with fraction
 	! f1 has r5 = 0 to 23 integer bits in the IEEE fraction.
 	! There are 23 - r5 fraction bits.
 	! (When r5 = 23 the shift count is 0 and r3 is left unchanged.)
 	li r6, 23
 	subf r6, r5, r6			! r6 = 23 - r5 = nr of fraction bits
 	srw r3, r3, r6
 	slw r3, r3, r6			! clear fraction in word
 	! fall through
 1:	stw r3, 0(sp)
 	lfs f2, 0(sp)			! f2 = integer (fraction bits cleared)
 	fsubs f1, f1, f2		! fraction = value - integer
 2:	stfs f1, 4(sp)			! push fraction
 	stfs f2, 0(sp)			! push integer
 	blr
 	! f1 is a fraction without integer (or zero).
 	! Then integer is zero with same sign.
 3:	extlwi r3, r3, 1, 0		! extract sign bit
 	stfs f1, 4(sp)			! push fraction
 	stw r3, 0(sp)			! push integer = zero with sign
 	blr
 	! f1 is an integer without fraction (or infinity or NaN).
 	! Unless NaN, then fraction is zero with same sign.
 4:	fcmpu cr0, f1, f1		! unordered only if f1 is NaN
 	bun cr0, 5f
 	extlwi r3, r3, 1, 0		! extract sign bit
 	stw r3, 4(sp)			! push fraction = zero with sign
 	stfs f1, 0(sp)			! push integer
 	blr
 	! f1 is NaN, so both fraction and integer are NaN.
 5:	fmr f2, f1
 	b 2b
--- a/mach/powerpc/libem/fif8.s
+++ b/mach/powerpc/libem/fif8.s
@ -1,7 +1,8 @@
 .sect .text
 ! Multiplies two double-precision floats, then splits the product into
-! fraction and integer, like modf(3) in C.  On entry:
+! fraction and integer, both as floats, like modf(3) in C,
 ! http://en.cppreference.com/w/c/numeric/math/modf
 !
 ! Stack: ( a b -- fraction integer )
@ -14,20 +15,18 @@
 	lwz r3, 0(sp)			! r3 = high word
 	lwz r4, 4(sp)			! r4 = low word
-	! IEEE double-precision format:
+	! IEEE double = sign * 1.fraction * 2**(exponent - 1023)
 	!   sign  exponent  fraction
 	!   0     1..11     12..63
 	!
 	! Subtract 1023 from the IEEE exponent.  If the result is from
 	! 0 to 51, then the IEEE fraction has that many integer bits.
 	! (IEEE has an implicit 1 before its fraction.  If the IEEE
 	! fraction has 0 integer bits, we still have an integer.)
 	extrwi r5, r3, 11, 1		! r5 = IEEE exponent
 	addic. r5, r5, -1023		! r5 = nr of integer bits
-	blt 4f				! branch if no integer
+	blt 3f				! branch if no integer
 	cmpwi r5, 52
-	bge 5f				! branch if no fraction
+	bge 4f				! branch if no fraction
 	cmpwi r5, 21
 	bge 6f				! branch if large integer
 	! fall through if small integer
@ -44,22 +43,38 @@
 1:	stw r3, 0(sp)
 	stw r4, 4(sp)
 	lfd f2, 0(sp)			! integer = high word, low word
-2:	fsub f1, f1, f2			! fraction = value - integer
+	fsub f1, f1, f2			! fraction = value - integer
-3:	stfd f1, 8(sp)			! push fraction
+2:	stfd f1, 8(sp)			! push fraction
 	stfd f2, 0(sp)			! push integer
 	blr
-4:	! f1 is a fraction without integer.
+	! f1 is a fraction without integer (or zero).
-	fsub f2, f1, f1			! integer = zero
+	! Then integer is zero with same sign.
-	b 3b
+3:	extlwi r3, r3, 1, 0		! extract sign bit
 	li r4, 0
 	stfd f1, 8(sp)			! push fraction
 	stw r4, 4(sp)
 	stw r3, 0(sp)			! push integer = zero with sign
 	blr
-5:	! f1 is an integer without fraction (or infinity or NaN).
+	! f1 is an integer without fraction (or infinity or NaN).
-	fmr f2, f1			! integer = f1
+	! Unless NaN, then fraction is zero with same sign.
 4:	fcmpu cr0, f1, f1		! integer = f1
 	bun cr0, 5f
 	extlwi r3, r3, 1, 0		! extract sign bit
 	li r4, 0
 	stw r4, 12(sp)
 	stw r3, 8(sp)			! push fraction = zero with sign
 	stfd f1, 0(sp)			! push integer
 	blr
 	! f1 is NaN, so both fraction and integer are NaN.
 5:	fmr f2, f1
 	b 2b
-6:	! f1 has r5 = 21 to 51 to integer bits.
+	! f1 has r5 = 21 to 51 integer bits.
 	! Low word has 52 - r5 fraction bits.
-	li r6, 52
+6:	li r6, 52
 	subf r6, r5, r6
 	srw r4, r4, r6
 	slw r4, r4, r6			! clear fraction in low word
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@ -310,7 +310,7 @@ INSTRUCTIONS
  fadds           FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 5).
  fcmpo           CR:wo, FREG:ro, FREG:ro cost(4, 5).
  fcmpo           CR:wo, FSREG:ro, FSREG:ro cost(4, 5).
-  fctiwz          FREG:wo, FREG:ro.
+  fctiwz          FREG:wo, FREG:ro cost(4, 5).
  fdiv            FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 35).
  fdivs           FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 21).
  fmr             FPR:wo, FPR:ro cost(4, 5).
@ -2329,10 +2329,20 @@ PATTERNS
 		with FSREG
 			gen fneg {LOCAL, $2}, %1
 	/* When a or b is NaN, then a < b, a <= b, a > b, a >= b
 	 * should all be false.  We can't make them false, because
 	 *  - EM's _cmf_ is only for ordered comparisons.
 	 *  - The peephole optimizer assumes (a < b) == !(a >= b).
 	 *
 	 * We do make a == b false and a != b true, by checking the
 	 * eq (equal) bit or un (unordered) bit in cr0.
 	 */
 	pat cmf $1==4                      /* Compare single */
 		with FSREG FSREG
 			uses REG={COND_FS, %2, %1}
-			gen extlwi %a, %a, {C, 2}, {C, 0}
+			/* Extract lt, gt, un; put lt in sign bit. */
 			gen andisX %a, %a, {C, 0xd000}
 			yields %a
 	pat cmf teq $1==4                  /* Single second == top */
@ -2367,7 +2377,6 @@ PATTERNS
 	proc cmf4zxx example cmf zeq
 		with FSREG FSREG STACK
 			uses REG
 			gen
 				fcmpo cr0, %2, %1
 				bxx* {LABEL, $2}
@ -2420,6 +2429,13 @@ PATTERNS
 			loc 4
 			cff
 	pat fef $1==4                      /* Split fraction, exponent */
 		leaving cal ".fef4"
 	/* Multiply two singles, then split fraction, integer */
 	pat fif $1==4
 		leaving cal ".fif4"
 /* Double-precision floating-point */
@ -2471,10 +2487,13 @@ PATTERNS
 		with FREG
 			gen fneg {DLOCAL, $2}, %1
 	/* To compare NaN, see comment above pat cmf $1==4 */
 	pat cmf $1==8                      /* Compare double */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
-			gen extlwi %a, %a, {C, 2}, {C, 0}
+			/* Extract lt, gt, un; put lt in sign bit. */
 			gen andisX %a, %a, {C, 0xd000}
 			yields %a
 	pat cmf teq $1==8                  /* Double second == top */
@ -2482,7 +2501,7 @@ PATTERNS
 			uses REG={COND_FD, %2, %1}
 			yields {XEQ, %a}
-	pat cmf tne $1==8                  /* Single second == top */
+	pat cmf tne $1==8                  /* Double second != top */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
 			yields {XNE, %a}
@ -2509,7 +2528,6 @@ PATTERNS
 	proc cmf8zxx example cmf zeq
 		with FREG FREG STACK
 			uses REG
 			gen
 				fcmpo cr0, %2, %1
 				bxx* {LABEL, $2}