From 88207db63853eef2c86ae9a63f7f6f58c7a5b89e Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Wed, 6 Dec 2017 17:09:12 -0500
Subject: [PATCH 01/55] Use <stdarg.h> in util/misc/convert.c

I made a syntax error in some .e file, and em_encode dumped core
because a 64-bit pointer didn't fit in a 32-bit int.  Now use stdarg
to pass pointers to error() and fatal().

Stop using the number of errors as the exit status.  Many systems use
only the low 8 bits of the exit status, so 256 errors would become 0.

Also change modules/src/print to accept const char *fmt.
---
 modules/src/print/doprnt.c |  2 +-
 modules/src/print/format.c |  2 +-
 modules/src/print/fprint.c |  2 +-
 modules/src/print/print.c  |  2 +-
 modules/src/print/print.h  | 10 +++++-----
 modules/src/print/sprint.c |  2 +-
 util/misc/convert.c        | 28 +++++++++++++++++++---------
 7 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/modules/src/print/doprnt.c b/modules/src/print/doprnt.c
index a77b7d2c1..1d888e570 100644
--- a/modules/src/print/doprnt.c
+++ b/modules/src/print/doprnt.c
@@ -16,7 +16,7 @@
 	%d = int
 $ */
 void
-doprnt(File *fp, char *fmt, va_list argp)
+doprnt(File *fp, const char *fmt, va_list argp)
 {
 	char buf[SSIZE];
 
diff --git a/modules/src/print/format.c b/modules/src/print/format.c
index 2ad920bc8..e03717918 100644
--- a/modules/src/print/format.c
+++ b/modules/src/print/format.c
@@ -35,7 +35,7 @@ integral(int c)
 	%d = int
 $ */
 int
-_format(char *buf, char *fmt, va_list argp)
+_format(char *buf, const char *fmt, va_list argp)
 {
 	register char *pf = fmt;
 	register char *pb = buf;
diff --git a/modules/src/print/fprint.c b/modules/src/print/fprint.c
index c401858a9..6b5b8a389 100644
--- a/modules/src/print/fprint.c
+++ b/modules/src/print/fprint.c
@@ -17,7 +17,7 @@
 $ */
 /*VARARGS*/
 void
-fprint(File *fp, char *fmt, ...)
+fprint(File *fp, const char *fmt, ...)
 {
 	va_list args;
 	char buf[SSIZE];
diff --git a/modules/src/print/print.c b/modules/src/print/print.c
index cd9346e98..2e1256a54 100644
--- a/modules/src/print/print.c
+++ b/modules/src/print/print.c
@@ -17,7 +17,7 @@
 $ */
 /*VARARGS*/
 void
-print(char *fmt, ...)
+print(const char *fmt, ...)
 {
 	va_list args;
 	char buf[SSIZE];
diff --git a/modules/src/print/print.h b/modules/src/print/print.h
index 56372376a..974e4bf1b 100644
--- a/modules/src/print/print.h
+++ b/modules/src/print/print.h
@@ -9,10 +9,10 @@
 
 #include <stdarg.h>
 
-void print(char *fmt, ...);
-void fprint(File *f, char *fmt, ...);
-void doprnt(File *f, char *fmt, va_list ap);
-int _format(char *buf, char *fmt, va_list ap);
-char *sprint(char *buf, char *fmt, ...);
+void print(const char *fmt, ...);
+void fprint(File *f, const char *fmt, ...);
+void doprnt(File *f, const char *fmt, va_list ap);
+int _format(char *buf, const char *fmt, va_list ap);
+char *sprint(char *buf, const char *fmt, ...);
 
 #endif /* __PRINT_INCLUDED__ */
diff --git a/modules/src/print/sprint.c b/modules/src/print/sprint.c
index d88b47e69..7c9dbf9b0 100644
--- a/modules/src/print/sprint.c
+++ b/modules/src/print/sprint.c
@@ -17,7 +17,7 @@
 $ */
 /*VARARGS*/
 char *
-sprint(char *buf, char *fmt, ...)
+sprint(char *buf, const char *fmt, ...)
 {
 	va_list args;
 
diff --git a/util/misc/convert.c b/util/misc/convert.c
index ec38761fa..9bdc12011 100644
--- a/util/misc/convert.c
+++ b/util/misc/convert.c
@@ -16,8 +16,10 @@ static char rcsid[] = "$Id$";
 	linked.
 */
 
+#include <stdarg.h>
 #include <stdlib.h>
 #include "system.h"
+#include "print.h"
 #include "em_pseu.h"
 #include "em_mnem.h"
 #include "em_spec.h"
@@ -30,8 +32,11 @@ char *filename;			/* Name of input file */
 int errors;			/* Number of errors */
 extern char *C_error;
 
-main(argc,argv)
-	char **argv;
+void error(const char *, ...);
+void fatal(const char *, ...);
+
+int
+main(int argc, char **argv)
 {
 	struct e_instr buf;
 	register struct e_instr *p = &buf;
@@ -66,27 +71,39 @@ main(argc,argv)
 	}
 	C_close();
 	EM_close();
-	exit(errors);
+	exit(errors ? 1 : 0);
 }
 
 /* VARARGS */
-error(s,a1,a2,a3,a4)
-	char *s;
+void
+error(const char *s, ...)
 {
+	va_list ap;
+	va_start(ap, s);
 	fprint(STDERR,
 		"%s, line %d: ",
 		filename ? filename : "standard input",
 		EM_lineno);
-	fprint(STDERR,s,a1,a2,a3,a4);
+	doprnt(STDERR, s, ap);
 	fprint(STDERR, "\n");
 	errors++;
+	va_end(ap);
 }
 
 /* VARARGS */
-fatal(s,a1,a2,a3,a4)
-	char *s;
+void
+fatal(const char *s, ...)
 {
+	va_list ap;
+	va_start(ap, s);
 	if (C_busy()) C_close();
-	error(s,a1,a2,a3,a4);
+	/* Print here: a va_list can't be forwarded through error()'s "...". */
+	fprint(STDERR,
+		"%s, line %d: ",
+		filename ? filename : "standard input",
+		EM_lineno);
+	doprnt(STDERR, s, ap);
+	fprint(STDERR, "\n");
+	va_end(ap);
 	exit(1);
 }

From 34cf0c8b63dc709254fbdc4523e5990a2c4714e6 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Wed, 6 Dec 2017 22:14:00 -0500
Subject: [PATCH 02/55] Kill registers a, de, when i80 ncg does Call libem.

I compiled tests/plat/lib/test.c with ack -mcpm, but i80 ncg emitted
wrong code in writehex(uint32_t) for

    "0123456789abcdef"[code & 0xf]

The code called '.and' to evaluate `code & 0xf`, then tried to call
'.cii' to narrow the result from 4 to 2 bytes, but it passed garbage
instead of 4 to '.cii'.  The rule for '.and' was

    pat and defined($1)
    kills ALL
    uses dereg={const2,$1}
    gen Call {label,".and"}

This failed to kill register de={const2,4}, so ncg pushed de,
expecting to push 4, but actually pushing garbage.

Fix such rules using `mvi a,...` or `lxi de,...` so ncg doesn't track
the token in the register.  This is like the i86 table.  A different
fix would use a dummy instruction `killreg a` or `killreg de` like the
m68020 table.

Also correct 1 to $1 when calling '.exg'.
---
 mach/i80/ncg/table | 122 +++++++++++++++++++++++----------------------
 1 file changed, 62 insertions(+), 60 deletions(-)

diff --git a/mach/i80/ncg/table b/mach/i80/ncg/table
index 575820c81..e6d7e02f6 100644
--- a/mach/i80/ncg/table
+++ b/mach/i80/ncg/table
@@ -385,8 +385,9 @@ gen dad de
 
 pat loi $1>=512
 kills ALL
-uses dereg={const2,$1}
-gen Call {label,".loi"}
+/* 'uses dereg={const2,$1}' fails to kill de. */
+gen lxi de,{const2,$1}
+    Call {label,".loi"}
 
 pat los $1==2
 with dereg
@@ -597,8 +598,8 @@ gen 1:
 
 pat sti
 kills ALL
-uses dereg={const2,$1}
-gen Call {label,".sti"}
+gen lxi de,{const2,$1}
+    Call {label,".sti"}
 
 pat sts $1==2
 with dereg
@@ -702,23 +703,24 @@ gen Call {label,".mli4"}
 
 pat dvi $1==2
 kills ALL
-uses areg={const1,129}
-gen Call {label,".dvi2"}		yields de
+/* 'uses areg={const1,129}' fails to kill a. */
+gen mvi a,{const1,129}
+    Call {label,".dvi2"}		yields de
 
 pat dvi $1==4
 kills ALL
-uses areg={const1,129}
-gen Call {label,".dvi4"}
+gen mvi a,{const1,129}
+    Call {label,".dvi4"}
 
 pat rmi $1==2
 kills ALL
-uses areg={const1,128}
-gen Call {label,".dvi2"}		yields de
+gen mvi a,{const1,128}
+    Call {label,".dvi2"}		yields de
 
 pat rmi $1==4
 kills ALL
-uses areg={const1,128}
-gen Call {label,".dvi4"}
+gen mvi a,{const1,128}
+    Call {label,".dvi4"}
 
 pat ngi $1==2
 with hl_or_de
@@ -738,7 +740,7 @@ pat loc sli ($1 == 8) && ($2 == 2)
 with hl_or_de
 gen move %1.2, %1.1
     mvi %1.2, {const1,0}        yields %1
-    
+
 pat sli $1==2
 kills ALL
 gen Call {label,".sli2"}		yields de
@@ -749,13 +751,13 @@ gen Call {label,".sli4"}
 
 pat sri $1==2
 kills ALL
-uses areg={const1,1}
-gen Call {label,".sri2"}		yields de
+gen mvi a,{const1,1}
+    Call {label,".sri2"}		yields de
 
 pat sri $1==4
 kills ALL
-uses areg={const1,1}
-gen Call {label,".sri4"}
+gen mvi a,{const1,1}
+    Call {label,".sri4"}
 
 /********************************************/
 /* Group 4: Unsigned arithmetic		    */
@@ -775,23 +777,23 @@ gen Call {label,".mli4"}
 
 pat dvu $1==2
 kills ALL
-uses areg={const1,1}
-gen Call {label,".dvi2"}		yields de
+gen mvi a,{const1,1}
+    Call {label,".dvi2"}		yields de
 
 pat dvu $1==4
 kills ALL
-uses areg={const1,1}
-gen Call {label,".dvi4"}
+gen mvi a,{const1,1}
+    Call {label,".dvi4"}
 
 pat rmu $1==2
 kills ALL
-uses areg={const1,0}
-gen Call {label,".dvi2"}		yields de
+gen mvi a,{const1,0}
+    Call {label,".dvi2"}		yields de
 
 pat rmu $1==4
 kills ALL
-uses areg={const1,0}
-gen Call {label,".dvi4"}
+gen mvi a,{const1,0}
+    Call {label,".dvi4"}
 
 pat slu						leaving sli $1
 
@@ -799,16 +801,16 @@ pat loc sru ($1 == 8) && ($2 == 2)
 with hl_or_de
 gen move %1.1, %1.2
     mvi %1.1, {const1,0}        yields %1
-    
+
 pat sru $1==2
 kills ALL
-uses areg={const1,0}
-gen Call {label,".sri2"}		yields de
+gen mvi a,{const1,0}
+    Call {label,".sri2"}		yields de
 
 pat sru $1==4
 kills ALL
-uses areg={const1,0}
-gen Call {label,".sri4"}
+gen mvi a,{const1,0}
+    Call {label,".sri4"}
 
 
 /********************************************/
@@ -1047,8 +1049,8 @@ with hlreg
 
 pat cii
 kills ALL
-uses areg={const1,1}
-gen Call {label,".cii"}
+gen mvi a,{const1,1}
+    Call {label,".cii"}
 
 pat loc loc ciu					leaving loc $1 loc $2 cuu
 pat loc loc cui					leaving loc $1 loc $2 cuu
@@ -1081,8 +1083,8 @@ with hl_or_de
 
 pat cuu
 kills ALL
-uses areg={const1,0}
-gen Call {label,".cii"}
+gen mvi a,{const1,0}
+    Call {label,".cii"}
 
 pat cfi
 kills ALL
@@ -1128,8 +1130,8 @@ gen mov a,%1.2
 
 pat and defined($1)
 kills ALL
-uses dereg={const2,$1}
-gen Call {label,".and"}
+gen lxi de,{const2,$1}
+    Call {label,".and"}
 
 pat and !defined($1)
 with dereg
@@ -1156,8 +1158,8 @@ gen mov a,%1.2
 
 pat ior defined($1)
 kills ALL
-uses dereg={const2,$1}
-gen Call {label,".ior"}
+gen lxi de,{const2,$1}
+    Call {label,".ior"}
 
 pat ior !defined($1)
 with dereg
@@ -1184,8 +1186,8 @@ gen mov a,%1.2
 
 pat xor defined($1)
 kills ALL
-uses dereg={const2,$1}
-gen Call {label,".xor"}
+gen lxi de,{const2,$1}
+    Call {label,".xor"}
 
 pat xor !defined($1)
 with dereg
@@ -1204,8 +1206,8 @@ gen mov a,%1.2
 
 pat com defined($1)
 kills ALL
-uses dereg={const2,$1}
-gen Call {label,".com"}
+gen lxi de,{const2,$1}
+    Call {label,".com"}
 
 pat com !defined($1)
 with dereg
@@ -1269,8 +1271,8 @@ gen Call {label,".inn2"}		yields de
 
 pat inn defined($1)
 kills ALL
-uses dereg={const2,$1}
-gen Call {label,".inn"}			yields de
+gen lxi de,{const2,$1}
+    Call {label,".inn"}			yields de
 
 pat inn !defined($1)
 with dereg
@@ -1284,8 +1286,8 @@ gen Call {label,".set2"}		yields de
 
 pat set defined($1)
 kills ALL
-uses dereg={const2,$1}
-gen Call {label,".set"}
+gen lxi de,{const2,$1}
+    Call {label,".set"}
 
 pat set !defined($1)
 with dereg
@@ -1402,8 +1404,8 @@ pat cmi $1==2					leaving sbi 2
 
 pat cmi $1==4
 kills ALL
-uses areg={const1,1}
-gen Call {label,".cmi4"}		yields de
+gen mvi a,{const1,1}
+    Call {label,".cmi4"}		yields de
 
 pat cmf $1==4
 kills ALL
@@ -1412,14 +1414,14 @@ gen Call {label,".cmf4"}
 pat cmf $1==8
 kills ALL
 gen Call {label,".cmf8"}
- 
+
 pat cmu $1==2
 with hl_or_de hl_or_de
 uses areg
 gen mov a,%2.1
     cmp %1.1
     jz {label,2f}
-    jc {label,1f}  
+    jc {label,1f}
     0:
     lxi %2,{const2,1}
     jmp {label,3f}
@@ -1436,15 +1438,15 @@ gen mov a,%2.1
 
 pat cmu $1==4
 kills ALL
-uses areg={const1,0}
-gen Call {label,".cmi4"}		yields de
+gen mvi a,{const1,0}
+    Call {label,".cmi4"}		yields de
 
 pat cms $1==2					leaving cmi 2
 
 pat cms defined($1)
 kills ALL
-uses dereg={const2,$1}
-gen Call {label,".cms"}			yields de
+gen lxi de,{const2,$1}
+    Call {label,".cms"}			yields de
 
 pat cms !defined($1)
 with dereg
@@ -1936,8 +1938,8 @@ gen dad sp
 
 pat blm
 kills ALL
-uses dereg={const2,$1}
-gen Call {label,".blm"}
+gen lxi de,{const2,$1}
+    Call {label,".blm"}
 
 pat bls
 with dereg
@@ -1962,8 +1964,8 @@ with src1or2 src1or2			yields %2 %1 %2 %1
 
 pat dup
 kills ALL
-uses dereg={const2,$1}
-gen Call {label,".dup"}
+gen lxi de,{const2,$1}
+    Call {label,".dup"}
 
 pat dus $1==2
 with dereg
@@ -1975,8 +1977,8 @@ with src1or2 src1or2			yields %1 %2
 
 pat exg defined($1)
 kills ALL
-uses dereg={const2,1}
-gen Call {label,".exg"}
+gen lxi de,{const2,$1}
+    Call {label,".exg"}
 
 pat fil
 uses hlreg={label,$1}

From c95bcac91de056e6b804fabb22b15caec62fbee0 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 7 Dec 2017 15:39:41 -0500
Subject: [PATCH 03/55] Correct the stack pointer when i80 shrinks an integer.

The code used `sphl` to set the stack pointer, but the correct value
was in de, not hl.  Fix by swapping the values of de and hl, so `sphl`
is now correct.  When we shrink an integer from 4 to 2 bytes, both
registers de and hl point to copies of the result, but only one
register preserves the stack below the result.

This fixes writehex() in tests/plat/lib/test.c, when I compile it with
ack -mcpm, so it preserves the pointer to "0123456789abcdef", so it
writes hexadecimal digits and not garbage.

This bug goes back to commit 157b243 of Mar 18, 1985, so the bug is
32 years old, and probably the oldest bug that I ever fixed.
---
 mach/i80/libem/cii.s | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mach/i80/libem/cii.s b/mach/i80/libem/cii.s
index 7d091da5e..bf4e7efb8 100644
--- a/mach/i80/libem/cii.s
+++ b/mach/i80/libem/cii.s
@@ -65,19 +65,19 @@
 	jmp 3f		! done
 
 !if destination size < source size only:
-shrink:	mov l,c		! load source size in hl
+shrink:	mov l,b		! load destination size in hl
 	mvi h,0
 	dad sp
 	mov d,h
-	mov e,l		! de points just above source
-	mov l,b		! load destination size in hl
+	mov e,l		! de points just above lowest bytes of source
+	mov l,c		! load source size in hl
 	mvi h,0
 	dad sp		! hl points just above "destination"
 
 1:	dcx d		! move upwards
 	dcx h
-	mov a,m
-	stax d
+	ldax d
+	mov m,a
 	dcr b
 	jnz 1b
 	sphl

From a1d1f3869151c8022578d46e41969f95baec94a4 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 7 Dec 2017 17:16:21 -0500
Subject: [PATCH 04/55] Add test for EM rol, ror.  Fix i80, i86, powerpc.

EM instructions _rol_ and _ror_ rotate an integer left or right.
Our compilers and optimizers never emit _rol_ nor _ror_, but I might
want to use them in the future.

Add _rol_ and _ror_ to powerpc.  Fix `rol 4` and `ror 4` in both i80
and i86, where the rules for `rol 4` and `ror 4` seem to have never
been tested until now.
---
 mach/i80/libem/rol4.s  |   4 +-
 mach/i80/libem/ror4.s  |   4 +-
 mach/i86/ncg/table     |   4 +-
 mach/powerpc/ncg/table |  37 +++++++
 tests/plat/build.lua   |   3 +-
 tests/plat/rotate_e.e  | 219 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 264 insertions(+), 7 deletions(-)
 create mode 100644 tests/plat/rotate_e.e

diff --git a/mach/i80/libem/rol4.s b/mach/i80/libem/rol4.s
index e5bb1a83d..41219ea0c 100644
--- a/mach/i80/libem/rol4.s
+++ b/mach/i80/libem/rol4.s
@@ -25,8 +25,8 @@
 	mov e,a
 
 	mov a,b
-	ral
-1:	mov a,l
+1:	ral
+	mov a,l
 	ral
 	mov l,a
 	mov a,h
diff --git a/mach/i80/libem/ror4.s b/mach/i80/libem/ror4.s
index e77d8a74c..490c75abc 100644
--- a/mach/i80/libem/ror4.s
+++ b/mach/i80/libem/ror4.s
@@ -25,8 +25,8 @@
 	mov e,a
 
 	mov a,l
-	rar
-1:	mov a,b
+1:	rar
+	mov a,b
 	rar
 	mov b,a
 	mov a,c
diff --git a/mach/i86/ncg/table b/mach/i86/ncg/table
index ffbd7101e..ce2ac7b87 100644
--- a/mach/i86/ncg/table
+++ b/mach/i86/ncg/table
@@ -2292,7 +2292,7 @@ with CXREG REG REG
       rcl %3,{ANYCON,1}
       adc %2,{ANYCON,0}
       loop {label, 2b}
-      1:
+      1:				yields %3 %2
 
 pat loc ror $1==1 && $2==2
 with REG
@@ -2311,7 +2311,7 @@ with CXREG REG REG
       rcl %3,{ANYCON,1}
       adc %2,{ANYCON,0}
       loop {label, 2b}
-      1:
+      1:				yields %3 %2
 
 /*******************************************************************
  *  Group 10 : Set Instructions                                    *
diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 62e8f62af..851578f64 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -307,7 +307,11 @@ INSTRUCTIONS
   rlwinm          GPR:wo, GPR:ro, CONST:ro, CONST:ro, CONST:ro.
     extlwi        GPR:wo, GPR:ro, CONST:ro, CONST:ro.
     extrwi        GPR:wo, GPR:ro, CONST:ro, CONST:ro.
+    rotlwi        GPR:wo, GPR:ro, CONST:ro.
+    rotrwi        GPR:wo, GPR:ro, CONST:ro.
     srwi          GPR:wo, GPR:ro, CONST:ro.
+  rlwnm           GPR:wo, GPR:ro, GPR:ro, CONST:ro, CONST:ro.
+    rotlw         GPR:wo, GPR:ro, GPR:ro.
   slw             GPR:wo, GPR:ro, GPR:ro.
   subf            GPR:wo, GPR:ro, GPR:ro.
   sraw            GPR:wo, GPR:ro, GPR:ro cost(4, 2).
@@ -1232,6 +1236,9 @@ PATTERNS
 				subf %a, %a, %2
 			yields %a
 
+
+/* Bitwise logic */
+
 	pat and $1==4                      /* AND word */
 		with REG NOT_R
 			uses reusing %1, REG
@@ -1381,6 +1388,9 @@ PATTERNS
 			loc $1
 			cal ".zer"
 
+
+/* Shifts and rotations */
+
 	pat sli $1==4                      /* Shift left (second << top) */
 		with CONST_STACK REG
 			uses reusing %2, REG
@@ -1417,6 +1427,33 @@ PATTERNS
 				srw %a, %2, %1
 			yields %a
 
+	pat rol $1==4                      /* Rotate left word */
+		with CONST_STACK REG
+			uses reusing %2, REG
+			gen rotlwi %a, %2, {CONST, %1.val & 0x1F}
+			yields %a
+		with REG REG
+			uses reusing %2, REG
+			gen rotlw %a, %2, %1
+			yields %a
+
+	/*
+	 * ror 4 -> ngi 4, rol 4
+	 *   because to rotate right by n bits is to rotate left by
+	 *   (32 - n), which is to rotate left by -n.  PowerPC rotlw
+	 *   handles -n as (-n & 0x1F).
+	 */
+
+	pat ror $1==4                      /* Rotate right word */
+		with CONST_STACK REG
+			uses reusing %2, REG
+			gen rotrwi %a, %2, {CONST, %1.val & 0x1F}
+			yields %a
+		with /* anything */
+			leaving
+				ngi 4
+				rol 4
+
 
 /* Arrays */
 
diff --git a/tests/plat/build.lua b/tests/plat/build.lua
index 0d3091559..cbd39468e 100644
--- a/tests/plat/build.lua
+++ b/tests/plat/build.lua
@@ -10,7 +10,8 @@ definerule("plat_testsuite",
 		-- target names will resolve there.
 		local testfiles = filenamesof(
 			"tests/plat/*.c",
-			"tests/plat/*.e",
+			"tests/plat/inn_e.e",
+			"tests/plat/rotate_e.e",
 			"tests/plat/*.p",
 			"tests/plat/b/*.b",
 			"tests/plat/bugs/bug-22-inn_mod.mod",
diff --git a/tests/plat/rotate_e.e b/tests/plat/rotate_e.e
new file mode 100644
index 000000000..a6f8f28dd
--- /dev/null
+++ b/tests/plat/rotate_e.e
@@ -0,0 +1,219 @@
+#
+    mes 2, EM_WSIZE, EM_PSIZE
+
+/*
+ * Test _rol_ (rotate left) and _ror_ (rotate right).
+ *
+ * By tradition, _rol_ and _ror_ can't rotate values shorter than the
+ * word size, or longer than 4 bytes.
+ *  - If word size is 2, then try rotating 2-byte and 4-byte values.
+ *  - If word size is 4, then try rotating 4-byte values.
+ */
+
+#if EM_WSIZE == 2
+#define LEN2  4
+    exa table2
+    exa left2
+    exa right2
+table2         /* left, right */
+    con 12715U2  /*  0,  0 */
+    con 25430U2  /*  1, 15 */
+    con 43825U2  /*  8,  8 */
+    con 39125U2  /* 15,  1 */
+left2
+    con 0I2, 1I2, 8I2, 15I2
+right2
+    con 0I2, 15I2, 8I2, 1I2
+#endif
+
+#define LEN4  4
+    exa table4
+    exa left4
+    exa right4
+table4              /* left, right */
+    con  437223536U4  /*  0,  0 */
+    con  874447072U4  /*  1, 31 */
+    con 2154830351U4  /* 16, 16 */
+    con  218611768U4  /* 31,  1 */
+left4
+    con 0I2, 1I2, 16I2, 31I2
+right4
+    con 0I2, 31I2, 16I2, 1I2
+
+    exa val4
+    exa val4left7
+    exa val4right11
+val4
+    con 4283808839U4
+val4left7
+    con 2866684927U4
+val4right11
+    con 2298473143U4
+
+    exp $_m_a_i_n
+    pro $_m_a_i_n, EM_WSIZE
+#define i -EM_WSIZE
+
+#if EM_WSIZE == 2
+    /*
+     * Loop for LEN2 items in table2.
+     */
+    loc 0
+    stl i
+1
+    lae table2
+    loi 2         /* value to rotate */
+    lae left2
+    lol i
+    loc 1
+    sli EM_WSIZE
+    ads EM_WSIZE
+    loi 2         /* left distance */
+    rol 2         /* rotate left */
+    lae table2
+    lol i
+    loc 1
+    sli EM_WSIZE
+    ads EM_WSIZE
+    loi 2         /* expected result */
+    cmu 2
+    zeq *2
+    loc __LINE__
+    loc EM_WSIZE
+    loc 4
+    cuu
+    cal $fail
+    asp 4
+2
+    lae table2
+    loi 2         /* value to rotate */
+    lae right2
+    lol i
+    loc 1
+    sli EM_WSIZE
+    ads EM_WSIZE
+    loi 2         /* right distance */
+    ror 2         /* rotate right */
+    lae table2
+    lol i
+    loc 1
+    sli EM_WSIZE
+    ads EM_WSIZE
+    loi 2         /* expected result */
+    cmu 2
+    zeq *3
+    loc __LINE__
+    loc EM_WSIZE
+    loc 4
+    cuu
+    cal $fail
+    asp 4
+3
+    inl i         /* loop LEN2 times */
+    lol i
+    loc LEN2
+    blt *1
+#endif /* EM_WSIZE == 2 */
+
+    /*
+     * Loop for LEN4 items in table4.
+     */
+    loc 0
+    stl i
+4
+    lae table4
+    loi 4         /* value to rotate */
+    lae left4
+    lol i
+    loc 1
+    sli EM_WSIZE
+    ads EM_WSIZE
+    loi 2         /* left distance */
+    loc 2
+    loc EM_WSIZE
+    cii
+    rol 4         /* rotate left */
+    lae table4
+    lol i
+    loc 2
+    sli EM_WSIZE
+    ads EM_WSIZE
+    loi 4         /* expected result */
+    cmu 4
+    zeq *5
+    loc __LINE__
+    loc EM_WSIZE
+    loc 4
+    cuu
+    cal $fail
+    asp 4
+5
+    lae table4
+    loi 4         /* value to rotate */
+    lae right4
+    lol i
+    loc 1
+    sli EM_WSIZE
+    ads EM_WSIZE
+    loi 2         /* right distance */
+    loc 2
+    loc EM_WSIZE
+    cii
+    ror 4         /* rotate right */
+    lae table4
+    lol i
+    loc 2
+    sli EM_WSIZE
+    ads EM_WSIZE
+    loi 4         /* expected result */
+    cmu 4
+    zeq *6
+    loc __LINE__
+    loc EM_WSIZE
+    loc 4
+    cuu
+    cal $fail
+    asp 4
+6
+    inl i         /* loop LEN4 times */
+    lol i
+    loc LEN4
+    blt *4
+
+    /*
+     * Rotate 4-byte values by a constant distance, because this uses
+     * different rules in PowerPC ncg.
+     */
+    lae val4
+    loi 4
+    loc 7
+    rol 4         /* rotate left by 7 bits */
+    lae val4left7
+    loi 4
+    cmu 4
+    zeq *7
+    loc __LINE__
+    loc EM_WSIZE
+    loc 4
+    cuu
+    cal $fail
+    asp 4
+7
+    lae val4
+    loi 4
+    loc 11
+    ror 4         /* rotate right by 11 bits */
+    lae val4right11
+    loi 4
+    cmu 4
+    zeq *8
+    loc __LINE__
+    loc EM_WSIZE
+    loc 4
+    cuu
+    cal $fail
+    asp 4
+8
+
+    cal $finished
+    end

From 6b933db90b93822f2817a78dd017d909f2e100aa Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 7 Dec 2017 19:24:09 -0500
Subject: [PATCH 05/55] Split C from CONST.

Rename token CONST to C.  Define set CONST = C + CONST_STACK.  The
instructions with CONST operands can now accept CONST_STACK tokens;
some cases of {CONST, %1.val} become %1.

Also simplify two uses of _rlwinm_ into _slwi_ and _srwi_.
---
 mach/powerpc/ncg/table | 151 ++++++++++++++++++++---------------------
 1 file changed, 74 insertions(+), 77 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 851578f64..251c83e71 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -96,7 +96,7 @@ TOKENS
 
 /* Primitives */
 
-	CONST              = { INT val; }             4    val.
+	C /* constant */   = { INT val; }             4    val.
 	LABEL              = { ADDR adr; }            4    adr.
 	LABEL_HI           = { ADDR adr; }            4    "hi16[" adr "]".
 	LABEL_HA           = { ADDR adr; }            4    "ha16[" adr "]".
@@ -112,13 +112,13 @@ TOKENS
 
 /* Constants on the stack */
 
-	CONST_N8000        = { INT val; }             4.
-	CONST_N7FFF_N0001  = { INT val; }             4.
-	CONST_0000_7FFF    = { INT val; }             4.
-	CONST_8000         = { INT val; }             4.
-	CONST_8001_FFFF    = { INT val; }             4.
-	CONST_HZ           = { INT val; }             4.
-	CONST_HL           = { INT val; }             4.
+	CONST_N8000        = { INT val; }             4    val.
+	CONST_N7FFF_N0001  = { INT val; }             4    val.
+	CONST_0000_7FFF    = { INT val; }             4    val.
+	CONST_8000         = { INT val; }             4    val.
+	CONST_8001_FFFF    = { INT val; }             4    val.
+	CONST_HZ           = { INT val; }             4    val.
+	CONST_HL           = { INT val; }             4    val.
 
 /* Expression partial results */
 
@@ -183,6 +183,8 @@ SETS
 	CONST_STACK     = CONST_N8000 + CONST_N7FFF_N0001 + CONST_0000_7FFF +
 	                  CONST_8000 + CONST_8001_FFFF + CONST_HZ + CONST_HL.
 
+	CONST           = C + CONST_STACK.
+
 	SUM_ALL            = SUM_RC + SUM_RL + SUM_RR.
 
 	SEX_ALL            = SEX_B + SEX_H.
@@ -309,6 +311,7 @@ INSTRUCTIONS
     extrwi        GPR:wo, GPR:ro, CONST:ro, CONST:ro.
     rotlwi        GPR:wo, GPR:ro, CONST:ro.
     rotrwi        GPR:wo, GPR:ro, CONST:ro.
+    slwi          GPR:wo, GPR:ro, CONST:ro.
     srwi          GPR:wo, GPR:ro, CONST:ro.
   rlwnm           GPR:wo, GPR:ro, GPR:ro, CONST:ro, CONST:ro.
     rotlw         GPR:wo, GPR:ro, GPR:ro.
@@ -351,22 +354,22 @@ MOVES
 
 /* Constants */
 
-	from CONST + CONST_STACK smalls(%val) to GPR
+	from CONST smalls(%val) to GPR
 		gen
 			COMMENT("move CONST->GPR smalls")
-			li %2, {CONST, %1.val}
+			li %2, %1
 
-	from CONST + CONST_STACK lo(%val)==0 to GPR
+	from CONST lo(%val)==0 to GPR
 		gen
 			COMMENT("move CONST->GPR shifted")
-			lis %2, {CONST, hi(%1.val)}
+			lis %2, {C, hi(%1.val)}
 
-	from CONST + CONST_STACK to GPR
+	from CONST to GPR
 		gen
 			COMMENT("move CONST->GPR")
-			lis %2, {CONST, hi(%1.val)}
-			ori %2, %2, {CONST, lo(%1.val)}
-			/* Can't use addi %2, %2, {CONST, los(%1.val)}
+			lis %2, {C, hi(%1.val)}
+			ori %2, %2, {C, lo(%1.val)}
+			/* Can't use addi %2, %2, {C, los(%1.val)}
 			 * because %2 might be R0. */
 
 	from LABEL to GPR
@@ -389,10 +392,10 @@ MOVES
 /* Register + something */
 
 	from SUM_RIS to GPR
-		gen addis %2, %1.reg, {CONST, %1.offhi}
+		gen addis %2, %1.reg, {C, %1.offhi}
 
 	from SUM_RC to GPR
-		gen addi %2, %1.reg, {CONST, %1.off}
+		gen addi %2, %1.reg, {C, %1.off}
 
 	from SUM_RL to GPR
 		gen addi %2, %1.reg, {LABEL_LO, %1.adr}
@@ -494,19 +497,19 @@ MOVES
 		gen or %2, %1.reg1, %1.reg2
 
 	from OR_RIS to GPR
-		gen oris %2, %1.reg, {CONST, %1.valhi}
+		gen oris %2, %1.reg, {C, %1.valhi}
 
 	from OR_RC to GPR
-		gen ori %2, %1.reg, {CONST, %1.val}
+		gen ori %2, %1.reg, {C, %1.val}
 
 	from XOR_RR to GPR
 		gen xor %2, %1.reg1, %1.reg2
 
 	from XOR_RIS to GPR
-		gen xoris %2, %1.reg, {CONST, %1.valhi}
+		gen xoris %2, %1.reg, {C, %1.valhi}
 
 	from XOR_RC to GPR
-		gen xori %2, %1.reg, {CONST, %1.val}
+		gen xori %2, %1.reg, {C, %1.val}
 
 /* Conditions */
 
@@ -514,7 +517,7 @@ MOVES
 
 	from COND_RC to GPR
 		gen
-			cmpwi %1.reg, {CONST, %1.val}
+			cmpwi %1.reg, {C, %1.val}
 			mfcr %2
 
 	from COND_RR to GPR
@@ -524,7 +527,7 @@ MOVES
 
 	from CONDL_RC to GPR
 		gen
-			cmplwi %1.reg, {CONST, %1.val}
+			cmplwi %1.reg, {C, %1.val}
 			mfcr %2
 
 	from CONDL_RR to GPR
@@ -548,30 +551,30 @@ MOVES
 
 	from XEQ to GPR
 		gen
-			extrwi %2, %1.reg, {CONST, 1}, {CONST, 2}
+			extrwi %2, %1.reg, {C, 1}, {C, 2}
 
 	from XNE to GPR
 		gen
-			extrwi %2, %1.reg, {CONST, 1}, {CONST, 2}
-			xori %2, %2, {CONST, 1}
+			extrwi %2, %1.reg, {C, 1}, {C, 2}
+			xori %2, %2, {C, 1}
 
 	from XGT to GPR
 		gen
-			extrwi %2, %1.reg, {CONST, 1}, {CONST, 1}
+			extrwi %2, %1.reg, {C, 1}, {C, 1}
 
 	from XGE to GPR
 		gen
-			extrwi %2, %1.reg, {CONST, 1}, {CONST, 0}
-			xori %2, %2, {CONST, 1}
+			extrwi %2, %1.reg, {C, 1}, {C, 0}
+			xori %2, %2, {C, 1}
 
 	from XLT to GPR
 		gen
-			extrwi %2, %1.reg, {CONST, 1}, {CONST, 0}
+			extrwi %2, %1.reg, {C, 1}, {C, 0}
 
 	from XLE to GPR
 		gen
-			extrwi %2, %1.reg, {CONST, 1}, {CONST, 1}
-			xori %2, %2, {CONST, 1}
+			extrwi %2, %1.reg, {C, 1}, {C, 1}
+			xori %2, %2, {C, 1}
 
 /* GPR_EXPR exists solely to allow us to use regvar() (which can only
    be used in an expression) as a register constant.  We can then use
@@ -649,7 +652,7 @@ COERCIONS
 		gen
 			COMMENT("coerce STACK->REG")
 			lwz %a, {IND_RC_W, sp, 0}
-			addi sp, sp, {CONST, 4}
+			addi sp, sp, {C, 4}
 		yields %a
 
 	from STACK
@@ -657,7 +660,7 @@ COERCIONS
 		gen
 			COMMENT("coerce STACK->FREG")
 			lfd %a, {IND_RC_D, sp, 0}
-			addi sp, sp, {CONST, 8}
+			addi sp, sp, {C, 8}
 		yields %a
 
 	from STACK
@@ -665,7 +668,7 @@ COERCIONS
 		gen
 			COMMENT("coerce STACK->FSREG")
 			lfs %a, {IND_RC_W, sp, 0}
-			addi sp, sp, {CONST, 4}
+			addi sp, sp, {C, 4}
 		yields %a
 
 	from ANY_BHW
@@ -824,7 +827,7 @@ PATTERNS
 		with STACK
 			gen
 				lwz {LOCAL, $1}, {IND_RC_W, sp, 0}
-				addi sp, sp, {CONST, 4}
+				addi sp, sp, {C, 4}
 	pat stl inreg($1)==reg_float
 		with exact FSREG+IND_ALL_W
 			kills regvar_w($1, reg_float)
@@ -832,7 +835,7 @@ PATTERNS
 		with STACK
 			gen
 				lfs {LOCAL, $1}, {IND_RC_W, sp, 0}
-				addi sp, sp, {CONST, 4}
+				addi sp, sp, {C, 4}
 	pat stl
 		leaving
 			lal $1
@@ -846,7 +849,7 @@ PATTERNS
 		with STACK
 			gen
 				lfd {DLOCAL, $1}, {IND_RC_D, sp, 0}
-				addi sp, sp, {CONST, 8}
+				addi sp, sp, {C, 8}
 	pat sdl
 		leaving
 			lal $1
@@ -1255,22 +1258,22 @@ PATTERNS
 		with REG UCONST2
 			uses reusing %1, REG
 			gen
-				andiX %a, %1, {CONST, %2.val}
+				andiX %a, %1, %2
 			yields %a
 		with UCONST2 REG
 			uses reusing %2, REG
 			gen
-				andiX %a, %2, {CONST, %1.val}
+				andiX %a, %2, %1
 			yields %a
 		with REG CONST_HZ
 			uses reusing %1, REG
 			gen
-				andisX %a, %1, {CONST, hi(%2.val)}
+				andisX %a, %1, {C, hi(%2.val)}
 			yields %a
 		with CONST_HZ REG
 			uses reusing %2, REG
 			gen
-				andisX %a, %2, {CONST, hi(%1.val)}
+				andisX %a, %2, {C, hi(%1.val)}
 			yields %a
 
 	pat and defined($1)                /* AND set */
@@ -1394,43 +1397,37 @@ PATTERNS
 	pat sli $1==4                      /* Shift left (second << top) */
 		with CONST_STACK REG
 			uses reusing %2, REG
-			gen
-				rlwinm %a, %2, {CONST, (%1.val & 0x1F)}, {CONST, 0}, {CONST, 31-(%1.val & 0x1F)}
+			gen slwi %a, %2, {C, %1.val & 0x1F}
 			yields %a
 		with REG REG
 			uses reusing %2, REG
-			gen
-				slw %a, %2, %1
+			gen slw %a, %2, %1
 			yields %a
 
 	pat sri $1==4                      /* Shift right signed (second >> top) */
 		with CONST_STACK REG
 			uses reusing %2, REG
-			gen
-				srawi %a, %2, {CONST, %1.val & 0x1F}
+			gen srawi %a, %2, {C, %1.val & 0x1F}
 			yields %a
 		with REG REG
 			uses reusing %2, REG
-			gen
-				sraw %a, %2, %1
+			gen sraw %a, %2, %1
 			yields %a
 
 	pat sru $1==4                      /* Shift right unsigned (second >> top) */
 		with CONST_STACK REG
 			uses reusing %2, REG
-			gen
-				rlwinm %a, %2, {CONST, 32-(%1.val & 0x1F)}, {CONST, (%1.val & 0x1F)}, {CONST, 31}
+			gen srwi %a, %2, {C, %1.val & 0x1F}
 			yields %a
 		with REG REG
 			uses reusing %2, REG
-			gen
-				srw %a, %2, %1
+			gen srw %a, %2, %1
 			yields %a
 
 	pat rol $1==4                      /* Rotate left word */
 		with CONST_STACK REG
 			uses reusing %2, REG
-			gen rotlwi %a, %2, {CONST, %1.val & 0x1F}
+			gen rotlwi %a, %2, {C, %1.val & 0x1F}
 			yields %a
 		with REG REG
 			uses reusing %2, REG
@@ -1447,7 +1444,7 @@ PATTERNS
 	pat ror $1==4                      /* Rotate right word */
 		with CONST_STACK REG
 			uses reusing %2, REG
-			gen rotrwi %a, %2, {CONST, %1.val & 0x1F}
+			gen rotrwi %a, %2, {C, %1.val & 0x1F}
 			yields %a
 		with /* anything */
 			leaving
@@ -1761,11 +1758,11 @@ PATTERNS
 	proc bxx example beq
 		with REG CONST2 STACK
 			gen
-				cmpwi %1, {CONST, %2.val}
+				cmpwi %1, %2
 				bxx[2] {LABEL, $1}
 		with CONST2 REG STACK
 			gen
-				cmpwi %2, {CONST, %1.val}
+				cmpwi %2, %1
 				bxx[1] {LABEL, $1}
 		with REG REG STACK
 			gen
@@ -1783,11 +1780,11 @@ PATTERNS
 	proc cmu4zxx example cmu zeq
 		with REG CONST2 STACK
 			gen
-				cmplwi %1, {CONST, %2.val}
+				cmplwi %1, %2
 				bxx[2] {LABEL, $2}
 		with CONST2 REG STACK
 			gen
-				cmplwi %2, {CONST, %1.val}
+				cmplwi %2, %1
 				bxx[1] {LABEL, $2}
 		with REG REG STACK
 			gen
@@ -1816,29 +1813,29 @@ PATTERNS
 	pat cmi $1==INT32                  /* Signed tristate compare */
 		with REG CONST2
 			uses reusing %1, REG={COND_RC, %1, %2.val}
-			gen rlwinm %a, %a, {CONST, 1}, {CONST, 31}, {CONST, 0}
+			gen rlwinm %a, %a, {C, 1}, {C, 31}, {C, 0}
 			yields %a
 		with CONST2 REG
 			uses reusing %2, REG={COND_RC, %2, %1.val}
-			gen extlwi %a, %a, {CONST, 2}, {CONST, 0}
+			gen extlwi %a, %a, {C, 2}, {C, 0}
 			yields %a
 		with REG REG
 			uses reusing %1, REG={COND_RR, %2, %1}
-			gen extlwi %a, %a, {CONST, 2}, {CONST, 0}
+			gen extlwi %a, %a, {C, 2}, {C, 0}
 			yields %a
 
 	pat cmu $1==INT32                  /* Unsigned tristate compare */
 		with REG UCONST2
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
-			gen rlwinm %a, %a, {CONST, 1}, {CONST, 31}, {CONST, 0}
+			gen rlwinm %a, %a, {C, 1}, {C, 31}, {C, 0}
 			yields %a
 		with UCONST2 REG
 			uses reusing %2, REG={CONDL_RC, %2, %1.val}
-			gen extlwi %a, %a, {CONST, 2}, {CONST, 0}
+			gen extlwi %a, %a, {C, 2}, {C, 0}
 			yields %a
 		with REG REG
 			uses reusing %1, REG={CONDL_RR, %2, %1}
-			gen extlwi %a, %a, {CONST, 2}, {CONST, 0}
+			gen extlwi %a, %a, {C, 2}, {C, 0}
 			yields %a
 
 	pat cmp                            /* Compare pointers */
@@ -1921,7 +1918,7 @@ PATTERNS
 			mtspr lr, r0
 			lwz r0, {IND_RC_W, fp, 0}
 			/* Free our stack frame. */
-			addi sp, fp, {CONST, 8}
+			addi sp, fp, {C, 8}
 			mr fp, r0
 			blr.
 
@@ -1949,10 +1946,10 @@ PATTERNS
 			/* ( src%3 dst%2 len%1 -- ) */
 			uses reusing %1, REG, REG, REG
 			gen
-				srwi %a, %1, {CONST, 2}
+				srwi %a, %1, {C, 2}
 				mtspr ctr, %a
-				addi %b, %3, {CONST, 0-4}
-				addi %c, %2, {CONST, 0-4}
+				addi %b, %3, {C, 0-4}
+				addi %c, %2, {C, 0-4}
 			1:	lwzu %a, {IND_RC_W, %b, 4}
 				stwu %a, {IND_RC_W, %c, 4}
 				bdnz {LABEL, "1b"}
@@ -2084,7 +2081,7 @@ PATTERNS
 			/* nop */
 		with STACK
 			gen
-				addi sp, sp, {CONST, 4}
+				addi sp, sp, {C, 4}
 
 	pat ass $1==4                      /* Adjust stack by variable amount */
 		with CONST2 STACK
@@ -2110,9 +2107,9 @@ PATTERNS
 		with REG
 			kills ALL
 			gen
-				cmpwi %1, {CONST, rom($1, 1)}
+				cmpwi %1, {C, rom($1, 1)}
 				blt {LABEL, ".trap_erange"}
-				cmpwi %1, {CONST, rom($1, 2)}
+				cmpwi %1, {C, rom($1, 2)}
 				bgt {LABEL, ".trap_erange"}
 			yields %1
 
@@ -2176,7 +2173,7 @@ PATTERNS
 	pat cmf $1==INT32                  /* Compare single */
 		with FSREG FSREG
 			uses REG={COND_FS, %2, %1}
-			gen extlwi %a, %a, {CONST, 2}, {CONST, 0}
+			gen extlwi %a, %a, {C, 2}, {C, 0}
 			yields %a
 
 	pat cmf teq $1==4                  /* Single second == top */
@@ -2334,7 +2331,7 @@ PATTERNS
 	pat cmf $1==INT64                  /* Compare double */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
-			gen extlwi %a, %a, {CONST, 2}, {CONST, 0}
+			gen extlwi %a, %a, {C, 2}, {C, 0}
 			yields %a
 
 	pat cmf teq $1==8                  /* Double second == top */
@@ -2402,7 +2399,7 @@ PATTERNS
 			gen
 				fctiwz %a, %1
 				stfdu %a, {IND_RC_D, sp, 0-8}
-				addi sp, sp, {CONST, 4}
+				addi sp, sp, {C, 4}
 
 	/* Convert double to unsigned int */
 	pat loc loc cfu $1==8 && $2==4

From 48788287b80ad378bb638fd10f9ffcc68c256585 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Fri, 8 Dec 2017 17:19:26 -0500
Subject: [PATCH 06/55] Add more chances to put results in register variables.

When a rule `uses REG ... yields %a`, the result %a is always a
temporary, never a regvar.  If the EM code uses _stl_ to put the
result in a regvar, then ncg emits _mr_ to move %a to the regvar.

There are two ways to put the result in the regvar without %a:

  1. Yield a token, as in `yields {MUL_RR, %2, %1}`, so that _stl_
     can move the token to the regvar without using %a.

  2. Provide a pattern, like `sli stl`, that just puts the result
     in `{LOCAL, $2}` and not %a.

Allow some tokens, like SUM_RIS and XEQ, onto the stack; and add
tokens like MUL_RR, and patterns like `sli stl`.

Delete patterns for `stl lol` and `sdl ldl` to avoid an extra
temporary %a when the local is a regvar.  Delete `lal sti lal loi`
because it would emit wrong code.
---
 mach/powerpc/ncg/table | 476 +++++++++++++++++++----------------------
 1 file changed, 215 insertions(+), 261 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 251c83e71..5768c4382 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -122,13 +122,19 @@ TOKENS
 
 /* Expression partial results */
 
+	SEX_B       = { GPR reg; }             4.   /* sign extension */
+	SEX_H       = { GPR reg; }             4.
+
 	SUM_RIS     = { GPR reg; INT offhi; }  4.   /* reg + (offhi << 16) */
 	SUM_RC      = { GPR reg; INT off; }    4.   /* reg + off */
 	SUM_RL      = { GPR reg; ADDR adr; }   4.   /* reg + lo16[adr] */
 	SUM_RR      = { GPR reg1; GPR reg2; }  4.   /* reg1 + reg2 */
 
-	SEX_B              = { GPR reg; }             4.
-	SEX_H              = { GPR reg; }             4.
+	SUB_RR      = { GPR reg1; GPR reg2; }  4.   /* reg1 - reg2 */
+	NEG_R       = { GPR reg; }             4.   /* -reg */
+	MUL_RR      = { GPR reg1; GPR reg2; }  4.   /* reg1 * reg2 */
+	DIV_RR      = { GPR reg1; GPR reg2; }  4.   /* reg1 / reg2 signed */
+	DIV_RR_U    = { GPR reg1; GPR reg2; }  4.   /* reg1 / reg2 unsigned */
 
 	IND_RC_B    = { GPR reg; INT off; }    4    off "(" reg ")".
 	IND_RL_B    = { GPR reg; ADDR adr; }   4    "lo16[" adr "](" reg ")".
@@ -146,15 +152,21 @@ TOKENS
 	IND_RL_D    = { GPR reg; ADDR adr; }   8    "lo16[" adr "](" reg ")".
 	IND_RR_D    = { GPR reg1; GPR reg2; }  8.
 
-	NOT_R              = { GPR reg; }             4.
-
-	AND_RR             = { GPR reg1; GPR reg2; }  4.
-	OR_RR              = { GPR reg1; GPR reg2; }  4.
-	OR_RIS             = { GPR reg; INT valhi; }  4.
-	OR_RC              = { GPR reg; INT val; }    4.
-	XOR_RR             = { GPR reg1; GPR reg2; }  4.
-	XOR_RIS            = { GPR reg; INT valhi; }  4.
-	XOR_RC             = { GPR reg; INT val; }    4.
+	NOT_R       = { GPR reg; }             4.   /* ~reg */
+	AND_RIS     = { GPR reg; INT valhi; }  4.
+	AND_RC      = { GPR reg; INT val; }    4.
+	AND_RR      = { GPR reg1; GPR reg2; }  4.
+	ANDC_RR     = { GPR reg1; GPR reg2; }  4.   /* reg1 & ~reg2 */
+	OR_RIS      = { GPR reg; INT valhi; }  4.
+	OR_RC       = { GPR reg; INT val; }    4.
+	OR_RR       = { GPR reg1; GPR reg2; }  4.
+	ORC_RR      = { GPR reg1; GPR reg2; }  4.   /* reg1 | ~reg2 */
+	XOR_RIS     = { GPR reg; INT valhi; }  4.
+	XOR_RC      = { GPR reg; INT val; }    4.
+	XOR_RR      = { GPR reg1; GPR reg2; }  4.
+	NAND_RR     = { GPR reg1; GPR reg2; }  4.   /* ~(reg1 & reg2) */
+	NOR_RR      = { GPR reg1; GPR reg2; }  4.   /* ~(reg1 | reg2) */
+	EQV_RR      = { GPR reg1; GPR reg2; }  4.   /* ~(reg1 ^ reg2) */
 
 	COND_RC            = { GPR reg; INT val; }    4.
 	COND_RR            = { GPR reg1; GPR reg2; }  4.
@@ -185,13 +197,6 @@ SETS
 
 	CONST           = C + CONST_STACK.
 
-	SUM_ALL            = SUM_RC + SUM_RL + SUM_RR.
-
-	SEX_ALL            = SEX_B + SEX_H.
-
-	LOGICAL_ALL        = NOT_R + AND_RR + OR_RR + OR_RC + XOR_RR +
-	                     XOR_RC.
-
 	IND_ALL_B          = IND_RC_B + IND_RL_B + IND_RR_B.
 	IND_ALL_H          = IND_RC_H + IND_RL_H + IND_RR_H +
 	                     IND_RC_H_S + IND_RL_H_S + IND_RR_H_S.
@@ -203,8 +208,14 @@ SETS
 	MEMORY             = IND_ALL_BHW + IND_ALL_D.
 
 	/* any stack token that we can easily move to GPR */
-	ANY_BHW            = REG + CONST_STACK + SEX_ALL +
-	                     SUM_ALL + IND_ALL_BHW + LOGICAL_ALL.
+	ANY_BHW = REG + CONST_STACK + SEX_B + SEX_H +
+	          SUM_RIS + SUM_RC + SUM_RL + SUM_RR +
+	          SUB_RR + NEG_R + MUL_RR + DIV_RR + DIV_RR_U +
+	          IND_ALL_BHW +
+	          NOT_R + AND_RIS + AND_RC + AND_RR + ANDC_RR +
+	          OR_RIS + OR_RC + OR_RR + ORC_RR +
+	          XOR_RIS + XOR_RC + XOR_RR + NAND_RR + NOR_RR + EQV_RR +
+	          XEQ + XNE + XGT + XGE + XLT + XLE.
 
 
 INSTRUCTIONS
@@ -274,7 +285,7 @@ INSTRUCTIONS
   fmuls           FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 5).
   fneg            FREG+DLOCAL:wo, FREG:ro cost(4, 5).
   fneg            FSREG+LOCAL:wo, FSREG:ro cost(4, 5).
-  frsp            FSREG:wo, FREG:ro cost(4, 5).
+  frsp            FSREG+LOCAL:wo, FREG:ro cost(4, 5).
   fsub            FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 5).
   fsubs           FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 5).
   lbz             GPR:wo, IND_RC_B+IND_RL_B:ro cost(4, 3).
@@ -292,13 +303,13 @@ INSTRUCTIONS
   lwzu            GPR:wo, IND_RC_W:rw cost(4, 3).
   lwzx            GPR:wo, GPR:ro, GPR:ro cost(4, 3).
   lwz             GPR+LOCAL:wo, IND_RC_W+IND_RL_W:ro cost(4, 3).
+  mfcr            GPR:wo cost(4,2).
+  mfspr           GPR:wo, SPR:ro cost(4, 3).
+  mtspr           SPR:wo, GPR:ro cost(4, 2).
+  mullw           GPR:wo, GPR:ro, GPR:ro cost(4, 4).
   nand            GPR:wo, GPR:ro, GPR:ro.
   neg             GPR:wo, GPR:ro.
   nor             GPR:wo, GPR:ro, GPR:ro.
-  mfcr            GPR:wo cost(4,2).
-  mullw           GPR:wo, GPR:ro, GPR:ro cost(4, 4).
-  mfspr           GPR:wo, SPR:ro cost(4, 3).
-  mtspr           SPR:wo, GPR:ro cost(4, 2).
   or              GPR:wo, GPR:ro, GPR:ro.
     mr            GPR:wo, GPR:ro.
   orX "or."       GPR:wo:cc, GPR:ro, GPR:ro.
@@ -309,17 +320,17 @@ INSTRUCTIONS
   rlwinm          GPR:wo, GPR:ro, CONST:ro, CONST:ro, CONST:ro.
     extlwi        GPR:wo, GPR:ro, CONST:ro, CONST:ro.
     extrwi        GPR:wo, GPR:ro, CONST:ro, CONST:ro.
-    rotlwi        GPR:wo, GPR:ro, CONST:ro.
-    rotrwi        GPR:wo, GPR:ro, CONST:ro.
-    slwi          GPR:wo, GPR:ro, CONST:ro.
-    srwi          GPR:wo, GPR:ro, CONST:ro.
+    rotlwi        GPR+LOCAL:wo, GPR:ro, CONST:ro.
+    rotrwi        GPR+LOCAL:wo, GPR:ro, CONST:ro.
+    slwi          GPR+LOCAL:wo, GPR:ro, CONST:ro.
+    srwi          GPR+LOCAL:wo, GPR:ro, CONST:ro.
   rlwnm           GPR:wo, GPR:ro, GPR:ro, CONST:ro, CONST:ro.
-    rotlw         GPR:wo, GPR:ro, GPR:ro.
-  slw             GPR:wo, GPR:ro, GPR:ro.
+    rotlw         GPR+LOCAL:wo, GPR:ro, GPR:ro.
+  slw             GPR+LOCAL:wo, GPR:ro, GPR:ro.
   subf            GPR:wo, GPR:ro, GPR:ro.
-  sraw            GPR:wo, GPR:ro, GPR:ro cost(4, 2).
-  srawi           GPR:wo, GPR:ro, CONST:ro cost(4, 2).
-  srw             GPR:wo, GPR:ro, GPR:ro.
+  sraw            GPR+LOCAL:wo, GPR:ro, GPR:ro cost(4, 2).
+  srawi           GPR+LOCAL:wo, GPR:ro, CONST:ro cost(4, 2).
+  srw             GPR+LOCAL:wo, GPR:ro, GPR:ro.
   stb             GPR:ro, IND_RC_B+IND_RL_B:rw cost(4, 3).
   stbx            GPR:ro, GPR:ro, GPR:ro cost(4, 3).
   stfd            FPR:ro, IND_RC_D+IND_RL_D:rw cost(4, 4).
@@ -403,6 +414,24 @@ MOVES
 	from SUM_RR to GPR
 		gen add %2, %1.reg1, %1.reg2
 
+/* Other arithmetic */
+
+	from SUB_RR to GPR
+		/* reg1 - reg2 -> subtract reg2 from reg1 */
+		gen subf %2, %1.reg2, %1.reg1
+
+	from NEG_R to GPR
+		gen neg %2, %1.reg
+
+	from MUL_RR to GPR
+		gen mullw %2, %1.reg1, %1.reg2
+
+	from DIV_RR to GPR
+		gen divw %2, %1.reg1, %1.reg2
+
+	from DIV_RR_U to GPR
+		gen divwu %2, %1.reg1, %1.reg2
+
 /* Read byte */
 
 	from IND_RC_B+IND_RL_B to GPR
@@ -490,11 +519,17 @@ MOVES
 	from NOT_R to GPR
 		gen nor %2, %1.reg, %1.reg
 
+	from AND_RIS to GPR
+		gen andisX %2, %1.reg, {C, %1.valhi}
+
+	from AND_RC to GPR
+		gen andiX %2, %1.reg, {C, %1.val}
+
 	from AND_RR to GPR
 		gen and %2, %1.reg1, %1.reg2
 
-	from OR_RR to GPR
-		gen or %2, %1.reg1, %1.reg2
+	from ANDC_RR to GPR
+		gen andc %2, %1.reg1, %1.reg2
 
 	from OR_RIS to GPR
 		gen oris %2, %1.reg, {C, %1.valhi}
@@ -502,8 +537,11 @@ MOVES
 	from OR_RC to GPR
 		gen ori %2, %1.reg, {C, %1.val}
 
-	from XOR_RR to GPR
-		gen xor %2, %1.reg1, %1.reg2
+	from OR_RR to GPR
+		gen or %2, %1.reg1, %1.reg2
+
+	from ORC_RR to GPR
+		gen orc %2, %1.reg1, %1.reg2
 
 	from XOR_RIS to GPR
 		gen xoris %2, %1.reg, {C, %1.valhi}
@@ -511,6 +549,18 @@ MOVES
 	from XOR_RC to GPR
 		gen xori %2, %1.reg, {C, %1.val}
 
+	from XOR_RR to GPR
+		gen xor %2, %1.reg1, %1.reg2
+
+	from NAND_RR to GPR
+		gen nand %2, %1.reg1, %1.reg2
+
+	from NOR_RR to GPR
+		gen nor %2, %1.reg1, %1.reg2
+
+	from EQV_RR to GPR
+		gen eqv %2, %1.reg1, %1.reg2
+
 /* Conditions */
 
 	/* Compare values, then copy cr0 to GPR. */
@@ -739,22 +789,6 @@ PATTERNS
 		with REG REG
 			yields %1 %2
 
-	pat stl lol $1==$2                 /* Store then load local */
-		leaving
-			dup 4
-			stl $1
-
-	pat sdl ldl $1==$2                 /* Store then load double local */
-		leaving
-			dup 8
-			sdl $1
-
-	pat lal sti lal loi $1==$3 && $2==$4 /* Store then load local, of a different size */
-		leaving
-			dup INT32
-			lal $1
-			sti $2
-
 	pat ste loe $1==$2                 /* Store then load external */
 		leaving
 			dup 4
@@ -1166,11 +1200,9 @@ PATTERNS
 		with REG CONST2
 			yields {SUM_RC, %1, %2.val}
 		with CONST_HZ REG
-			uses reusing %2, REG={SUM_RIS, %2, his(%1.val)}
-			yields %a
+			yields {SUM_RIS, %2, his(%1.val)}
 		with REG CONST_HZ
-			uses reusing %1, REG={SUM_RIS, %1, his(%2.val)}
-			yields %a
+			yields {SUM_RIS, %1, his(%2.val)}
 		with CONST_STACK-CONST2-CONST_HZ REG
 			uses reusing %2, REG={SUM_RIS, %2, his(%1.val)}
 			yields {SUM_RC, %a, los(%1.val)}
@@ -1181,100 +1213,63 @@ PATTERNS
 	pat sbi $1==4                      /* Subtract word (second - top) */
 		with REG REG
 			uses reusing %2, REG
-			gen
-				subf %a, %1, %2
-			yields %a
+			yields {SUB_RR, %2, %1}
 		with CONST2_WHEN_NEG REG
 			yields {SUM_RC, %2, 0-%1.val}
 		with CONST_HZ REG
-			uses reusing %2, REG={SUM_RIS, %2, his(0-%1.val)}
-			yields %a
+			yields {SUM_RIS, %2, his(0-%1.val)}
 		with CONST_STACK-CONST2_WHEN_NEG-CONST_HZ REG
 			uses reusing %2, REG={SUM_RIS, %2, his(0-%1.val)}
 			yields {SUM_RC, %a, los(0-%1.val)}
 
 	pat ngi $1==4                      /* Negate word */
 		with REG
-			uses reusing %1, REG
-			gen
-				neg %a, %1
-			yields %a
+			yields {NEG_R, %1}
 
 	pat mli $1==4                      /* Multiply word (second * top) */
 		with REG REG
-			uses reusing %2, REG
-			gen
-				mullw %a, %2, %1
-			yields %a
+			yields {MUL_RR, %2, %1}
 
 	pat dvi $1==4                      /* Divide word (second / top) */
 		with REG REG
-			uses reusing %2, REG
-			gen
-				divw %a, %2, %1
-			yields %a
+			yields {DIV_RR, %2, %1}
 
-	pat dvu $1==4                      /* Divide unsigned word (second / top) */
+	pat dvu $1==4             /* Divide unsigned word (second / top) */
 		with REG REG
-			uses reusing %2, REG
-			gen
-				divwu %a, %2, %1
-			yields %a
+			yields {DIV_RR_U, %2, %1}
+
+	/* To calculate a remainder:  a % b = a - (a / b * b) */
 
 	pat rmi $1==4                      /* Remainder word (second % top) */
 		with REG REG
-			uses REG
-			gen
-				divw %a, %2, %1
-				mullw %a, %a, %1
-				subf %a, %a, %2
-			yields %a
+			uses REG={DIV_RR, %2, %1}, REG
+			gen move {MUL_RR, %a, %1}, %b
+			yields {SUB_RR, %2, %b}
 
-	pat rmu $1==4                      /* Remainder unsigned word (second % top) */
+	pat rmu $1==4             /* Remainder unsigned word (second % top) */
 		with REG REG
-			uses REG
-			gen
-				divwu %a, %2, %1
-				mullw %a, %a, %1
-				subf %a, %a, %2
-			yields %a
+			uses REG={DIV_RR_U, %2, %1}, REG
+			gen move {MUL_RR, %a, %1}, %b
+			yields {SUB_RR, %2, %b}
 
 
 /* Bitwise logic */
 
 	pat and $1==4                      /* AND word */
 		with REG NOT_R
-			uses reusing %1, REG
-			gen
-				andc %a, %1, %2.reg
-			yields %a
+			yields {ANDC_RR, %1, %2.reg}
 		with NOT_R REG
-			uses reusing %1, REG
-			gen
-				andc %a, %2, %1.reg
-			yields %a
+			yields {ANDC_RR, %2, %1.reg}
 		with REG REG
 			yields {AND_RR, %1, %2}
 		with REG UCONST2
-			uses reusing %1, REG
-			gen
-				andiX %a, %1, %2
-			yields %a
+			yields {AND_RC, %1, %2.val}
 		with UCONST2 REG
-			uses reusing %2, REG
-			gen
-				andiX %a, %2, %1
-			yields %a
+			yields {AND_RC, %2, %1.val}
 		with REG CONST_HZ
-			uses reusing %1, REG
-			gen
-				andisX %a, %1, {C, hi(%2.val)}
-			yields %a
+			yields {AND_RIS, %1, hi(%2.val)}
 		with CONST_HZ REG
-			uses reusing %2, REG
-			gen
-				andisX %a, %2, {C, hi(%1.val)}
-			yields %a
+			yields {AND_RIS, %2, hi(%1.val)}
 
 	pat and defined($1)                /* AND set */
 		leaving
@@ -1287,15 +1282,9 @@ PATTERNS
 
 	pat ior $1==4                      /* OR word */
 		with REG NOT_R
-			uses reusing %1, REG
-			gen
-				orc %a, %1, %2.reg
-			yields %a
+			yields {ORC_RR, %1, %2.reg}
 		with NOT_R REG
-			uses reusing %2, REG
-			gen
-				orc %a, %2, %1.reg
-			yields %a
+			yields {ORC_RR, %2, %1.reg}
 		with REG REG
 			yields {OR_RR, %1, %2}
 		with REG UCONST2
@@ -1303,11 +1292,9 @@ PATTERNS
 		with UCONST2 REG
 			yields {OR_RC, %2, %1.val}
 		with REG CONST_HZ
-			uses reusing %1, REG={OR_RIS, %1, hi(%2.val)}
-			yields %a
+			yields {OR_RIS, %1, hi(%2.val)}
 		with CONST_HZ REG
-			uses reusing %2, REG={OR_RIS, %2, hi(%1.val)}
-			yields %a
+			yields {OR_RIS, %2, hi(%1.val)}
 		with REG CONST_STACK-UCONST2-CONST_HZ
 			uses reusing %1, REG={OR_RIS, %1, hi(%2.val)}
 			yields {OR_RC, %1, lo(%2.val)}
@@ -1333,11 +1320,9 @@ PATTERNS
 		with UCONST2 REG
 			yields {XOR_RC, %2, %1.val}
 		with REG CONST_HZ
-			uses reusing %1, REG={XOR_RIS, %1, hi(%2.val)}
-			yields %a
+			yields {XOR_RIS, %1, hi(%2.val)}
 		with CONST_HZ REG
-			uses reusing %2, REG={XOR_RIS, %2, hi(%1.val)}
-			yields %a
+			yields {XOR_RIS, %2, hi(%1.val)}
 		with REG CONST_STACK-UCONST2-CONST_HZ
 			uses reusing %1, REG={XOR_RIS, %1, hi(%2.val)}
 			yields {XOR_RC, %1, lo(%2.val)}
@@ -1355,21 +1340,12 @@ PATTERNS
 			cal ".xor"
 
 	pat com $1==INT32                  /* NOT word */
-		with AND_RR
-			uses REG
-			gen
-				nand %a, %1.reg1, %1.reg2
-			yields %a
-		with OR_RR
-			uses REG
-			gen
-				nor %a, %1.reg1, %1.reg2
-			yields %a
-		with XOR_RR
-			uses REG
-			gen
-				eqv %a, %1.reg1, %1.reg2
-			yields %a
+		with exact AND_RR
+			yields {NAND_RR, %1.reg1, %1.reg2}
+		with exact OR_RR
+			yields {NOR_RR, %1.reg1, %1.reg2}
+		with exact XOR_RR
+			yields {EQV_RR, %1.reg1, %1.reg2}
 		with REG
 			yields {NOT_R, %1}
 
@@ -1403,8 +1379,13 @@ PATTERNS
 			uses reusing %2, REG
 			gen slw %a, %2, %1
 			yields %a
+	pat sli stl $1==4 && inreg($2)==reg_any
+		with CONST_STACK REG
+			gen slwi {LOCAL, $2}, %2, {C, %1.val & 0x1F}
+		with REG REG
+			gen slw {LOCAL, $2}, %2, %1
 
-	pat sri $1==4                      /* Shift right signed (second >> top) */
+	pat sri $1==4               /* Shift right signed (second >> top) */
 		with CONST_STACK REG
 			uses reusing %2, REG
 			gen srawi %a, %2, {C, %1.val & 0x1F}
@@ -1413,8 +1394,13 @@ PATTERNS
 			uses reusing %2, REG
 			gen sraw %a, %2, %1
 			yields %a
+	pat sri stl $1==4 && inreg($2)==reg_any
+		with CONST_STACK REG
+			gen srawi {LOCAL, $2}, %2, {C, %1.val & 0x1F}
+		with REG REG
+			gen sraw {LOCAL, $2}, %2, %1
 
-	pat sru $1==4                      /* Shift right unsigned (second >> top) */
+	pat sru $1==4               /* Shift right unsigned (second >> top) */
 		with CONST_STACK REG
 			uses reusing %2, REG
 			gen srwi %a, %2, {C, %1.val & 0x1F}
@@ -1423,6 +1409,11 @@ PATTERNS
 			uses reusing %2, REG
 			gen srw %a, %2, %1
 			yields %a
+	pat sru stl $1==4 && inreg($2)==reg_any
+		with CONST_STACK REG
+			gen srwi {LOCAL, $2}, %2, {C, %1.val & 0x1F}
+		with REG REG
+			gen srw {LOCAL, $2}, %2, %1
 
 	pat rol $1==4                      /* Rotate left word */
 		with CONST_STACK REG
@@ -1433,6 +1424,11 @@ PATTERNS
 			uses reusing %2, REG
 			gen rotlw %a, %2, %1
 			yields %a
+	pat rol stl $1==4 && inreg($2)==reg_any
+		with CONST_STACK REG
+			gen rotlwi {LOCAL, $2}, %2, {C, %1.val & 0x1F}
+		with REG REG
+			gen rotlw {LOCAL, $2}, %2, %1
 
 	/*
 	 * ror 4 -> ngi 4, rol 4
@@ -1450,6 +1446,14 @@ PATTERNS
 			leaving
 				ngi 4
 				rol 4
+	pat ror stl $1==4 && inreg($2)==reg_any
+		with CONST_STACK REG
+			gen rotrwi {LOCAL, $2}, %2, {C, %1.val & 0x1F}
+		with /* anything */
+			leaving
+				ngi 4
+				rol 4
+				stl $2
 
 
 /* Arrays */
@@ -1517,8 +1521,7 @@ PATTERNS
 			gen
 				test %1
 				mfcr %a
-				move {XEQ, %a}, %a
-			yields %a
+			yields {XEQ, %a}
 
 	pat tne                            /* top = (top != 0) */
 		with REG
@@ -1526,8 +1529,7 @@ PATTERNS
 			gen
 				test %1
 				mfcr %a
-				move {XNE, %a}, %a
-			yields %a
+			yields {XNE, %a}
 
 	pat tlt                            /* top = (top < 0) */
 		with REG
@@ -1535,8 +1537,7 @@ PATTERNS
 			gen
 				test %1
 				mfcr %a
-				move {XLT, %a}, %a
-			yields %a
+			yields {XLT, %a}
 
 	pat tle                            /* top = (top <= 0) */
 		with REG
@@ -1544,8 +1545,7 @@ PATTERNS
 			gen
 				test %1
 				mfcr %a
-				move {XLE, %a}, %a
-			yields %a
+			yields {XLE, %a}
 
 	pat tgt                            /* top = (top > 0) */
 		with REG
@@ -1553,8 +1553,7 @@ PATTERNS
 			gen
 				test %1
 				mfcr %a
-				move {XGT, %a}, %a
-			yields %a
+			yields {XGT, %a}
 
 	pat tge                            /* top = (top >= 0) */
 		with REG
@@ -1562,176 +1561,139 @@ PATTERNS
 			gen
 				test %1
 				mfcr %a
-				move {XGE, %a}, %a
-			yields %a
+			yields {XGE, %a}
 
 	pat cmi teq $1==4                  /* Signed second == top */
 		with REG CONST2
 			uses reusing %1, REG={COND_RC, %1, %2.val}
-			gen move {XEQ, %a}, %a
-			yields %a
+			yields {XEQ, %a}
 		with CONST2 REG
 			uses reusing %1, REG={COND_RC, %2, %1.val}
-			gen move {XEQ, %a}, %a
-			yields %a
+			yields {XEQ, %a}
 		with REG REG
 			uses reusing %1, REG={COND_RR, %2, %1}
-			gen move {XEQ, %a}, %a
-			yields %a
+			yields {XEQ, %a}
 
 	pat cmi tne $1==4                  /* Signed second != top */
 		with REG CONST2
 			uses reusing %1, REG={COND_RC, %1, %2.val}
-			gen move {XNE, %a}, %a
-			yields %a
+			yields {XNE, %a}
 		with CONST2 REG
 			uses reusing %1, REG={COND_RC, %2, %1.val}
-			gen move {XNE, %a}, %a
-			yields %a
+			yields {XNE, %a}
 		with REG REG
 			uses reusing %1, REG={COND_RR, %2, %1}
-			gen move {XNE, %a}, %a
-			yields %a
+			yields {XNE, %a}
 
 	pat cmi tgt $1==4                  /* Signed second > top */
 		with REG CONST2
 			uses reusing %1, REG={COND_RC, %1, %2.val}
-			gen move {XLT, %a}, %a
-			yields %a
+			yields {XLT, %a}
 		with CONST2 REG
 			uses reusing %1, REG={COND_RC, %2, %1.val}
-			gen move {XGT, %a}, %a
-			yields %a
+			yields {XGT, %a}
 		with REG REG
 			uses reusing %1, REG={COND_RR, %2, %1}
-			gen move {XGT, %a}, %a
-			yields %a
+			yields {XGT, %a}
 
 	pat cmi tge $1==4                  /* Signed second >= top */
 		with REG CONST2
 			uses reusing %1, REG={COND_RC, %1, %2.val}
-			gen move {XLE, %a}, %a
-			yields %a
+			yields {XLE, %a}
 		with CONST2 REG
 			uses reusing %1, REG={COND_RC, %2, %1.val}
-			gen move {XGE, %a}, %a
-			yields %a
+			yields {XGE, %a}
 		with REG REG
 			uses reusing %1, REG={COND_RR, %2, %1}
-			gen move {XGE, %a}, %a
-			yields %a
+			yields {XGE, %a}
 
 	pat cmi tlt $1==4                  /* Signed second < top */
 		with REG CONST2
 			uses reusing %1, REG={COND_RC, %1, %2.val}
-			gen move {XGT, %a}, %a
-			yields %a
+			yields {XGT, %a}
 		with CONST2 REG
 			uses reusing %1, REG={COND_RC, %2, %1.val}
-			gen move {XLT, %a}, %a
-			yields %a
+			yields {XLT, %a}
 		with REG REG
 			uses reusing %1, REG={COND_RR, %2, %1}
-			gen move {XLT, %a}, %a
-			yields %a
+			yields {XLT, %a}
 
 	pat cmi tle $1==4                  /* Signed second <= top */
 		with REG CONST2
 			uses reusing %1, REG={COND_RC, %1, %2.val}
-			gen move {XGE, %a}, %a
-			yields %a
+			yields {XGE, %a}
 		with CONST2 REG
 			uses reusing %1, REG={COND_RC, %2, %1.val}
-			gen move {XLE, %a}, %a
-			yields %a
+			yields {XLE, %a}
 		with REG REG
 			uses reusing %1, REG={COND_RR, %2, %1}
-			gen move {XLE, %a}, %a
-			yields %a
+			yields {XLE, %a}
 
 	pat cmu teq $1==4                  /* Unsigned second == top */
 		with REG UCONST2
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
-			gen move {XEQ, %a}, %a
-			yields %a
+			yields {XEQ, %a}
 		with UCONST2 REG
 			uses reusing %1, REG={CONDL_RC, %2, %1.val}
-			gen move {XEQ, %a}, %a
-			yields %a
+			yields {XEQ, %a}
 		with REG REG
 			uses reusing %1, REG={CONDL_RR, %2, %1}
-			gen move {XEQ, %a}, %a
-			yields %a
+			yields {XEQ, %a}
 
 	pat cmu tne $1==4                  /* Unsigned second != top */
 		with REG UCONST2
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
-			gen move {XNE, %a}, %a
-			yields %a
+			yields {XNE, %a}
 		with UCONST2 REG
 			uses reusing %1, REG={CONDL_RC, %2, %1.val}
-			gen move {XNE, %a}, %a
-			yields %a
+			yields {XNE, %a}
 		with REG REG
 			uses reusing %1, REG={CONDL_RR, %2, %1}
-			gen move {XNE, %a}, %a
-			yields %a
+			yields {XNE, %a}
 
 	pat cmu tgt $1==4                  /* Unsigned second > top */
 		with REG UCONST2
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
-			gen move {XLT, %a}, %a
-			yields %a
+			yields {XLT, %a}
 		with UCONST2 REG
 			uses reusing %1, REG={CONDL_RC, %2, %1.val}
-			gen move {XGT, %a}, %a
-			yields %a
+			yields {XGT, %a}
 		with REG REG
 			uses reusing %1, REG={CONDL_RR, %2, %1}
-			gen move {XGT, %a}, %a
-			yields %a
+			yields {XGT, %a}
 
 	pat cmu tge $1==4                  /* Unsigned second >= top */
 		with REG UCONST2
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
-			gen move {XLE, %a}, %a
-			yields %a
+			yields {XLE, %a}
 		with UCONST2 REG
 			uses reusing %1, REG={CONDL_RC, %2, %1.val}
-			gen move {XGE, %a}, %a
-			yields %a
+			yields {XGE, %a}
 		with REG REG
 			uses reusing %1, REG={CONDL_RR, %2, %1}
-			gen move {XGE, %a}, %a
-			yields %a
+			yields {XGE, %a}
 
 	pat cmu tlt $1==4                  /* Unsigned second < top */
 		with REG UCONST2
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
-			gen move {XGT, %a}, %a
-			yields %a
+			yields {XGT, %a}
 		with UCONST2 REG
 			uses reusing %1, REG={CONDL_RC, %2, %1.val}
-			gen move {XLT, %a}, %a
-			yields %a
+			yields {XLT, %a}
 		with REG REG
 			uses reusing %1, REG={CONDL_RR, %2, %1}
-			gen move {XLT, %a}, %a
-			yields %a
+			yields {XLT, %a}
 
 	pat cmu tle $1==4                  /* Unsigned second <= top */
 		with REG UCONST2
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
-			gen move {XGE, %a}, %a
-			yields %a
+			yields {XGE, %a}
 		with UCONST2 REG
 			uses reusing %1, REG={CONDL_RC, %2, %1.val}
-			gen move {XLE, %a}, %a
-			yields %a
+			yields {XLE, %a}
 		with REG REG
 			uses reusing %1, REG={CONDL_RR, %2, %1}
-			gen move {XLE, %a}, %a
-			yields %a
+			yields {XLE, %a}
 
 
 /* Simple branches */
@@ -2179,38 +2141,32 @@ PATTERNS
 	pat cmf teq $1==4                  /* Single second == top */
 		with FSREG FSREG
 			uses REG={COND_FS, %2, %1}
-			gen move {XEQ, %a}, %a
-			yields %a
+			yields {XEQ, %a}
 
 	pat cmf tne $1==4                  /* Single second == top */
 		with FSREG FSREG
 			uses REG={COND_FS, %2, %1}
-			gen move {XNE, %a}, %a
-			yields %a
+			yields {XNE, %a}
 
 	pat cmf tgt $1==4                  /* Single second > top */
 		with FSREG FSREG
 			uses REG={COND_FS, %2, %1}
-			gen move {XGT, %a}, %a
-			yields %a
+			yields {XGT, %a}
 
 	pat cmf tge $1==4                  /* Single second >= top */
 		with FSREG FSREG
 			uses REG={COND_FS, %2, %1}
-			gen move {XGE, %a}, %a
-			yields %a
+			yields {XGE, %a}
 
 	pat cmf tlt $1==4                  /* Single second < top */
 		with FSREG FSREG
 			uses REG={COND_FS, %2, %1}
-			gen move {XLT, %a}, %a
-			yields %a
+			yields {XLT, %a}
 
 	pat cmf tle $1==4                  /* Single second <= top */
 		with FSREG FSREG
 			uses REG={COND_FS, %2, %1}
-			gen move {XLE, %a}, %a
-			yields %a
+			yields {XLE, %a}
 
 	proc cmf4zxx example cmf zeq
 		with FSREG FSREG STACK
@@ -2337,38 +2293,32 @@ PATTERNS
 	pat cmf teq $1==8                  /* Double second == top */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
-			gen move {XEQ, %a}, %a
-			yields %a
+			yields {XEQ, %a}
 
 	pat cmf tne $1==8                  /* Single second == top */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
-			gen move {XNE, %a}, %a
-			yields %a
+			yields {XNE, %a}
 
 	pat cmf tgt $1==8                  /* Double second > top */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
-			gen move {XGT, %a}, %a
-			yields %a
+			yields {XGT, %a}
 
 	pat cmf tge $1==8                  /* Double second >= top */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
-			gen move {XGE, %a}, %a
-			yields %a
+			yields {XGE, %a}
 
 	pat cmf tlt $1==8                  /* Double second < top */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
-			gen move {XLT, %a}, %a
-			yields %a
+			yields {XLT, %a}
 
 	pat cmf tle $1==8                  /* Double second <= top */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
-			gen move {XLE, %a}, %a
-			yields %a
+			yields {XLE, %a}
 
 	proc cmf8zxx example cmf zeq
 		with FREG FREG STACK
@@ -2385,11 +2335,15 @@ PATTERNS
 	pat cmf zlt $1==8    call cmf8zxx("blt")
 	pat cmf zle $1==8    call cmf8zxx("ble")
 
-	pat loc loc cff $1==INT64 && $2==INT32 /* Convert double to single */
+	/* Convert double to single */
+	/*   reg_float pattern must be first, or it goes unused! */
+	pat loc loc cff stl $1==8 && $2==4 && inreg($4)==reg_float
+		with FREG
+			gen frsp {LOCAL, $4}, %1
+	pat loc loc cff $1==8 && $2==4
 		with FREG
 			uses reusing %1, FSREG
-			gen
-				frsp %a, %1
+			gen frsp %a, %1
 			yields %a
 
 	/* Convert double to signed int */

From 504d2aa34e606b6a51643b2fcdd49312d8ece1d0 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Sat, 9 Dec 2017 17:21:06 -0500
Subject: [PATCH 07/55] Revise stack shuffles and integer conversions in
 PowerPC ncg.

Allow asp 4, exg 4 to shuffle tokens without coercing them into
registers; but comment why dup 4, dup 8 coerce tokens into registers.

Allow dup, dus, exg with larger sizes; and add tests dup_e.e and
exg_e.e to check that dup 20, dus, exg 20 work as well in powerpc as
in i80 and i86.

Then powerpc failed to compile loc 2 loc 4 cuu in dup_e.e.  Revise the
integer conversions, so powerpc can compile and pass the test.
---
 mach/powerpc/libem/build.lua |   2 +-
 mach/powerpc/libem/exg.s     |  22 ++++++
 mach/powerpc/ncg/table       | 134 +++++++++++++++++++-------------
 tests/plat/build.lua         |   2 +
 tests/plat/dup_e.e           | 145 +++++++++++++++++++++++++++++++++++
 tests/plat/exg_e.e           |  86 +++++++++++++++++++++
 6 files changed, 338 insertions(+), 53 deletions(-)
 create mode 100644 mach/powerpc/libem/exg.s
 create mode 100644 tests/plat/dup_e.e
 create mode 100644 tests/plat/exg_e.e

diff --git a/mach/powerpc/libem/build.lua b/mach/powerpc/libem/build.lua
index 16a03147e..cb5efd281 100644
--- a/mach/powerpc/libem/build.lua
+++ b/mach/powerpc/libem/build.lua
@@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
 		srcs = {
-			"./*.s", -- rm ret.s
+			"./*.s", -- exg.s
 		},
 		vars = { plat = plat },
 		deps = {
diff --git a/mach/powerpc/libem/exg.s b/mach/powerpc/libem/exg.s
new file mode 100644
index 000000000..eb631b697
--- /dev/null
+++ b/mach/powerpc/libem/exg.s
@@ -0,0 +1,22 @@
+.sect .text
+
+! Exchange top two values on stack; size is a byte count (size / 4 words).
+!   Stack: ( a b size -- b a )
+
+.define .exg
+.exg:
+	lwz	r3, 0(sp)		! r3 = size
+	srwi	r7, r3, 2		! r7 = size / 4
+	mtspr	ctr, r7			! ctr = size / 4
+	mr	r4, sp			! r4 = pointer before value b
+	add	r5, r4, r3		! r5 = pointer before value a
+
+	! Loop to swap each pair of words.
+1:	lwzu	r6, 4(r4)		! r6 = next word of b, r4 advances to it
+	lwzu	r7, 4(r5)		! r7 = next word of a, r5 advances to it
+	stw	r6, 0(r5)		! word of b goes into a's slot
+	stw	r7, 0(r4)		! word of a goes into b's slot
+	bdnz	1b			! loop ctr times
+
+	addi	sp, sp, 4		! drop size from stack
+	blr
diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 5768c4382..90fd9448d 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -217,6 +217,9 @@ SETS
 	          XOR_RIS + XOR_RC + XOR_RR + NAND_RR + NOR_RR + EQV_RR +
 	          XEQ + XNE + XGT + XGE + XLT + XLE.
 
+	/* any register or token of each size */
+	ANY4 = ANY_BHW + FSREG.
+	ANY8 = IND_ALL_D + FREG.
 
 INSTRUCTIONS
 
@@ -756,7 +759,7 @@ COERCIONS
 
 PATTERNS
 
-/* Intrinsics */
+/* Constants */
 
 	pat loc $1==(0-0x8000)             /* Load constant */
 		yields {CONST_N8000, $1}
@@ -773,22 +776,78 @@ PATTERNS
 	pat loc
 		yields {CONST_HL, $1}
 
-	pat dup $1==INT32                  /* Duplicate word on top of stack */
-		with REG
-			yields %1 %1
-		with FSREG
+
+/* Stack shuffles */
+
+	/* The peephole optimizer does:  loc $1 ass 4 -> asp $1
+	 * To optimize multiplication, it uses:  dup 8 asp 4
+	 */
+
+	pat asp $1==4                      /* Adjust stack by constant */
+		with exact ANY4
+			/* drop %1 */
+		with STACK
+			gen addi sp, sp, {C, 4}
+	pat asp smalls($1)
+		with STACK
+			gen addi sp, sp, {C, $1}
+	pat asp lo($1)==0
+		with STACK
+			gen addi sp, sp, {C, hi($1)}
+	pat asp
+		with STACK
+			gen
+				addis sp, sp, {C, his($1)}
+				addi sp, sp, {C, los($1)}
+
+	pat ass $1==4                      /* Adjust stack by variable */
+		with REG STACK
+			gen add sp, sp, %1
+
+	/* To duplicate a token, we coerce the token into a register,
+	 * then duplicate the register.  This decreases code size.
+	 */
+
+	pat dup $1==4                      /* Duplicate word on top of stack */
+		with REG+FSREG
 			yields %1 %1
 
-	pat dup $1==INT64                  /* Duplicate double-word on top of stack */
-		with REG REG
+	pat dup $1==8                      /* Duplicate double-word */
+		with REG+FSREG REG+FSREG
 			yields %2 %1 %2 %1
 		with FREG
 			yields %1 %1
 
-	pat exg $1==INT32                  /* Exchange top two words on stack */
-		with REG REG
+	pat dup                            /* Duplicate other size */
+		leaving
+			loc $1
+			dus 4
+
+	pat dus $1==4                      /* Duplicate variable size */
+		with REG STACK
+			/* ( a size%1 -- a a ) */
+			uses REG, REG
+			gen
+				srwi %a, %1, {C, 2}
+				mtspr ctr, %a
+				add %b, sp, %1
+			1:	lwzu %a, {IND_RC_W, %b, 0-4}
+				stwu %a, {IND_RC_W, sp, 0-4}
+				bdnz {LABEL, "1b"}
+
+	pat exg $1==4                      /* Exchange top two words */
+		with ANY4 ANY4
 			yields %1 %2
 
+	pat exg defined($1)                /* Exchange other size */
+		leaving
+			loc $1
+			cal ".exg"
+
+	pat exg !defined($1)
+		leaving
+			cal ".exg"
+
 	pat ste loe $1==$2                 /* Store then load external */
 		leaving
 			dup 4
@@ -797,32 +856,30 @@ PATTERNS
 
 /* Type conversions */
 
-	pat loc loc ciu                    /* signed X -> unsigned X */
+	pat loc loc ciu                    /* signed -> unsigned */
 		leaving
 			loc $1
 			loc $2
 			cuu
 
-	pat loc loc cuu $1==$2             /* unsigned X -> unsigned X */
+	pat loc loc cui                    /* unsigned -> signed */
+		leaving
+			loc $1
+			loc $2
+			cuu
+
+	pat loc loc cuu $1<=4 && $2<=4     /* unsigned -> unsigned */
 		/* nop */
 
-	pat loc loc cii $1==$2             /* signed X -> signed X */
-		/* nop */
+	pat loc loc cii $1<=4 && $2<=$1
+		/* signed -> signed of smaller or same size,
+		 * no sign extension */
 
-	pat loc loc cui $1==$2             /* unsigned X -> signed X */
-		/* nop */
-
-	pat loc loc cui $1==INT8 && $2==INT32 /* unsigned char -> signed int */
-		/* nop */
-
-	pat loc loc cui $1==INT16 && $2==INT32 /* unsigned short -> signed int */
-		/* nop */
-
-	pat loc loc cii $1==INT8 && $2==INT32 /* signed char -> signed int */
+	pat loc loc cii $1==1 && $2<=4     /* sign-extend char */
 		with REG
 			yields {SEX_B, %1}
 
-	pat loc loc cii $1==2 && $2==4     /* signed char -> signed short */
+	pat loc loc cii $1==2 && $2<=4     /* sign-extend short */
 		with REG
 			yields {SEX_H, %1}
 
@@ -1362,7 +1419,7 @@ PATTERNS
 		leaving
 			loc 0
 
-	pat zer defined($1)	   	           /* Create empty set */
+	pat zer defined($1)                /* Create empty set */
 		leaving
 			loc $1
 			cal ".zer"
@@ -2038,33 +2095,6 @@ PATTERNS
 			gen
 				move %1, sp
 
-	pat loc ass $1==4 && $2==4         /* Drop 4 bytes from stack */
-		with exact REG
-			/* nop */
-		with STACK
-			gen
-				addi sp, sp, {C, 4}
-
-	pat ass $1==4                      /* Adjust stack by variable amount */
-		with CONST2 STACK
-			gen
-				move {SUM_RC, sp, %1.val}, sp
-		with CONST_HZ STACK
-			gen
-				move {SUM_RC, sp, his(%1.val)}, sp
-		with CONST_STACK-CONST2-CONST_HZ STACK
-			gen
-				move {SUM_RC, sp, his(%1.val)}, sp
-				move {SUM_RC, sp, los(%1.val)}, sp
-		with REG STACK
-			gen
-				move {SUM_RR, sp, %1}, sp
-
-	pat asp                            /* Adjust stack by constant amount */
-		leaving
-			loc $1
-			ass 4
-
 	pat lae rck $2==4                  /* Range check */
 		with REG
 			kills ALL
diff --git a/tests/plat/build.lua b/tests/plat/build.lua
index cbd39468e..609771ed1 100644
--- a/tests/plat/build.lua
+++ b/tests/plat/build.lua
@@ -10,6 +10,8 @@ definerule("plat_testsuite",
 		-- target names will resolve there.
 		local testfiles = filenamesof(
 			"tests/plat/*.c",
+			"tests/plat/dup_e.e",
+			"tests/plat/exg_e.e",
 			"tests/plat/inn_e.e",
 			"tests/plat/rotate_e.e",
 			"tests/plat/*.p",
diff --git a/tests/plat/dup_e.e b/tests/plat/dup_e.e
new file mode 100644
index 000000000..600161be4
--- /dev/null
+++ b/tests/plat/dup_e.e
@@ -0,0 +1,145 @@
+#
+    mes 2, EM_WSIZE, EM_PSIZE
+
+/*
+ * Test _dup_ and _dus_ by loading 20 bytes from _src_, then making
+ * and checking some duplicates.
+ */
+
+    exa src
+    exa size
+src
+    con 3593880729I4, 782166578I4, 4150666996I4, 2453272937I4, 3470523049I4
+size
+    con 20I2
+
+    exp $check
+    exp $_m_a_i_n
+    pro $_m_a_i_n, 0
+
+    /* Push 3 copies of src on stack. */
+    lae src
+    loi 20        /* 1st copy */
+    dup 20        /* 2nd copy */
+    lae size
+    loi 2
+    loc 2
+    loc EM_WSIZE
+    cuu
+    dus EM_WSIZE  /* 3rd copy */
+
+    cal $check
+    cal $finished
+    end /* $_m_a_i_n */
+
+    pro $check, 4 * EM_PSIZE + 2 * EM_WSIZE
+#define p1    (-1 * EM_PSIZE)
+#define p2    (-2 * EM_PSIZE)
+#define p3    (-3 * EM_PSIZE)
+#define p4    (-4 * EM_PSIZE)
+#define b     (p4 - 1 * EM_WSIZE)
+#define i     (p4 - 2 * EM_WSIZE)
+
+    /* Set pointers to all 4 copies. */
+    lae src
+    lal p4
+    sti EM_PSIZE  /* p4 = src */
+    lal 0
+    lal p3
+    sti EM_PSIZE  /* p3 = 3rd copy */
+    lal 20
+    lal p2
+    sti EM_PSIZE  /* p2 = 2nd copy */
+    lal 40
+    lal p1
+    sti EM_PSIZE  /* p1 = 1st copy */
+
+    /* Loop 20 times to verify each byte. */
+    loc 0
+    stl i
+4
+    lal p4
+    loi EM_PSIZE
+    loi 1
+    loc 1
+    loc EM_WSIZE
+    cii
+    stl b         /* b = byte from src */
+    lol b
+    lal p3
+    loi EM_PSIZE
+    loi 1         /* byte from 3rd copy */
+    loc 1
+    loc EM_WSIZE
+    cii
+    beq *3
+    loc (3 * 256)
+    lol i
+    adi EM_WSIZE
+    loc EM_WSIZE
+    loc 4
+    cuu
+    cal $fail
+    asp 4
+3
+    lol b
+    lal p2
+    loi EM_PSIZE
+    loi 1         /* byte from 2nd copy */
+    loc 1
+    loc EM_WSIZE
+    cii
+    beq *2
+    loc (2 * 256)
+    lol i
+    adi EM_WSIZE
+    loc EM_WSIZE
+    loc 4
+    cuu
+    cal $fail
+    asp 4
+2
+    lol b
+    lal p1
+    loi EM_PSIZE
+    loi 1         /* byte from 1st copy */
+    loc 1
+    loc EM_WSIZE
+    cii
+    beq *1
+    loc (1 * 256)
+    lol i
+    adi EM_WSIZE
+    loc EM_WSIZE
+    loc 4
+    cuu
+    cal $fail
+    asp 4
+1
+    lal p4
+    loi EM_PSIZE
+    adp 1
+    lal p4
+    sti EM_PSIZE  /* increment p4 */
+    lal p3
+    loi EM_PSIZE
+    adp 1
+    lal p3
+    sti EM_PSIZE  /* increment p3 */
+    lal p2
+    loi EM_PSIZE
+    adp 1
+    lal p2
+    sti EM_PSIZE  /* increment p2 */
+    lal p1
+    loi EM_PSIZE
+    adp 1
+    lal p1
+    sti EM_PSIZE  /* increment p1 */
+    inl i
+    lol i
+    loc 20
+    blt *4        /* loop 20 times */
+
+    ret 0
+    end /* $check */
diff --git a/tests/plat/exg_e.e b/tests/plat/exg_e.e
new file mode 100644
index 000000000..3a1f06d3b
--- /dev/null
+++ b/tests/plat/exg_e.e
@@ -0,0 +1,86 @@
+#
+    mes 2, EM_WSIZE, EM_PSIZE
+
+/*
+ * Test _exg_ by loading 40 bytes from _src_, then exchanging 20 and
+ * 20 bytes, and checking the result.
+ */
+
+    exa src
+src
+    con 1539465570I4, 1344465418I4, 1317578918I4, 1163467696I4, 2645261331I4
+    con 3981585269I4, 1433968975I4, 4256886989I4, 4114909542I4, 1817334375I4
+
+    exp $check
+    exp $_m_a_i_n
+    pro $_m_a_i_n, 0
+
+    lae src
+    loi 40
+    exg 20
+    cal $check
+    cal $finished
+    end /* $_m_a_i_n */
+
+    pro $check, 2 * EM_PSIZE + EM_WSIZE
+#define p1    (-1 * EM_PSIZE)
+#define p2    (-2 * EM_PSIZE)
+#define i     (p2 - EM_WSIZE)
+
+    lae src
+    lal p2
+    sti EM_PSIZE  /* p2 = src */
+    lal 0
+    adp 20
+    lal p1
+    sti EM_PSIZE  /* p1 = exchanged copy + 20 */
+
+    /* Loop 40 times to verify each byte. */
+    loc 0
+    stl i
+1
+    lal p2
+    loi EM_PSIZE
+    loi 1         /* byte from src */
+    loc 1
+    loc EM_WSIZE
+    cii
+    lal p1
+    loi EM_PSIZE
+    loi 1         /* byte from exchanged copy */
+    loc 1
+    loc EM_WSIZE
+    cii
+    beq *2
+    lol i
+    loc EM_WSIZE
+    loc 4
+    cuu
+    cal $fail
+    asp 4
+2
+    lal p2
+    loi EM_PSIZE
+    adp 1
+    lal p2
+    sti EM_PSIZE  /* increment p2 */
+    lal p1
+    loi EM_PSIZE  /* p1 */
+    inl i
+    /* When i reaches 20, p1 would reach end of exchanged copy. */
+    lol i
+    loc 20
+    beq *3
+    adp 1         /* p1 + 1 */
+    bra *4
+3
+    adp -39       /* p1 - 39, beginning of exchanged copy */
+4
+    lal p1
+    sti EM_PSIZE  /* move p1 */
+    lol i
+    loc 40
+    blt *1
+
+    ret 0
+    end /* $check */
\ No newline at end of file

From 11a54e0a7c0e7f4947986fee96d3465ac68e7ab0 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Sun, 10 Dec 2017 14:01:14 -0500
Subject: [PATCH 08/55] These instructions write to the CR.

---
 mach/powerpc/ncg/table | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 90fd9448d..cc5d8a0c0 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -262,13 +262,13 @@ INSTRUCTIONS
   bclr            CONST:ro, CONST:ro, CONST:ro.
     blr.
   bl              LABEL:ro.
-  cmp             CR:ro, CONST:ro, GPR:ro, GPR:ro kills :cc.
+  cmp             CR:wo, CONST:ro, GPR:ro, GPR:ro kills :cc.
     cmpw          GPR:ro, GPR:ro kills :cc.
-  cmpi            CR:ro, CONST:ro, GPR:ro, CONST:ro kills :cc.
+  cmpi            CR:wo, CONST:ro, GPR:ro, CONST:ro kills :cc.
     cmpwi         GPR:ro, CONST:ro kills :cc.
-  cmpl            CR:ro, CONST:ro, GPR:ro, GPR:ro kills :cc.
+  cmpl            CR:wo, CONST:ro, GPR:ro, GPR:ro kills :cc.
     cmplw         GPR:ro, GPR:ro kills :cc.
-  cmpli           CR:ro, CONST:ro, GPR:ro, CONST:ro kills :cc.
+  cmpli           CR:wo, CONST:ro, GPR:ro, CONST:ro kills :cc.
     cmplwi        GPR:ro, CONST:ro kills :cc.
   divw            GPR:wo, GPR:ro, GPR:ro cost(4, 23).
   divwu           GPR:wo, GPR:ro, GPR:ro cost(4, 23).

From 5ba83100d654303f448bc0041044cbd9dfa1d74c Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Tue, 12 Dec 2017 13:36:43 -0500
Subject: [PATCH 09/55] Delete rules for sti 8 with REG IND_RC_D, with REG
 IND_RR_D.

Prefer the rule with REG FREG, by coercing IND_RC_D or IND_RR_D to
FREG.  This rule looks better to ncg.  When ncg chose between coercion
to REG IND_RC_D or coercion to REG FREG, it chose REG FREG.  It only
chose REG IND_RC_D if the stack had exact REG IND_RC_D.
---
 mach/powerpc/ncg/table | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index cc5d8a0c0..f5b3817cc 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -1174,26 +1174,6 @@ PATTERNS
 			gen
 				move %2, {IND_RC_W, %1, 0}
 				move %3, {IND_RC_W, %1, 4}
-		/*
-		 * Next 2 patterns exist because there is no coercion
-		 * from IND_ALL_D to REG REG.
-		 */
-		with REG IND_RC_D
-			kills MEMORY
-			uses REG={SUM_RC, %2.reg, %2.off}, REG, REG
-			gen
-				move {IND_RC_W, %a, 0}, %b
-				move {IND_RC_W, %a, 4}, %c
-				move %b, {IND_RC_W, %1, 0}
-				move %c, {IND_RC_W, %1, 4}
-		with REG IND_RR_D
-			kills MEMORY
-			uses REG={SUM_RR, %2.reg1, %2.reg2}, REG, REG
-			gen
-				move {IND_RC_W, %a, 0}, %b
-				move {IND_RC_W, %a, 4}, %c
-				move %b, {IND_RC_W, %1, 0}
-				move %c, {IND_RC_W, %1, 4}
 
 	pat sti                            /* Store arbitrary size */
 		leaving

From b0d75fed37c49f61992310c0e62525d6305cbe13 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 14 Dec 2017 16:26:19 -0500
Subject: [PATCH 10/55] Rename ANY_BHW to INT_W; add FLOAT_W, FLOAT_D.

INT_W, the integer set, continues to exclude FSREG, because we can't
easily move FSREG to GPR.

ANY4 becomes INT_W+FLOAT_W and ANY8 becomes FLOAT_D.
---
 mach/powerpc/ncg/table | 49 +++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index f5b3817cc..ae1620cfd 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -65,7 +65,7 @@ REGISTERS
 	f0          : FPR.
 
 	f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13
-          : FPR, FREG.
+	  : FPR, FREG.
 
 	f14, f15, f16, f17, f18, f19, f20, f21, f22, f23, f24, f25,
 	f26, f27, f28, f29, f30, f31
@@ -202,24 +202,23 @@ SETS
 	                     IND_RC_H_S + IND_RL_H_S + IND_RR_H_S.
 	IND_ALL_W          = IND_RC_W + IND_RL_W + IND_RR_W.
 	IND_ALL_D          = IND_RC_D + IND_RL_D + IND_RR_D.
-	IND_ALL_BHW        = IND_ALL_B + IND_ALL_H + IND_ALL_W.
 
 	/* anything killed by sti (store indirect) */
-	MEMORY             = IND_ALL_BHW + IND_ALL_D.
+	MEMORY  = IND_ALL_B + IND_ALL_H + IND_ALL_W + IND_ALL_D.
 
-	/* any stack token that we can easily move to GPR */
-	ANY_BHW = REG + CONST_STACK + SEX_B + SEX_H +
+	/* any integer from stack that we can easily move to GPR */
+	INT_W   = REG + CONST_STACK + SEX_B + SEX_H +
 	          SUM_RIS + SUM_RC + SUM_RL + SUM_RR +
 	          SUB_RR + NEG_R + MUL_RR + DIV_RR + DIV_RR_U +
-	          IND_ALL_BHW +
+	          IND_ALL_B + IND_ALL_H + IND_ALL_W +
 	          NOT_R + AND_RIS + AND_RC + AND_RR + ANDC_RR +
 	          OR_RIS + OR_RC + OR_RR + ORC_RR +
 	          XOR_RIS + XOR_RC + XOR_RR + NAND_RR + NOR_RR + EQV_RR +
 	          XEQ + XNE + XGT + XGE + XLT + XLE.
 
-	/* any register or token of each size */
-	ANY4 = ANY_BHW + FSREG.
-	ANY8 = IND_ALL_D + FREG.
+	FLOAT_D = FREG + IND_ALL_D.
+	FLOAT_W = FSREG + IND_ALL_W.
+
 
 INSTRUCTIONS
 
@@ -634,13 +633,13 @@ MOVES
    our moves to GPR to set register variables.  We define no moves to
    LOCAL, so we avoid confusion between GPR and FSREG in LOCAL. */
 
-	from ANY_BHW to GPR_EXPR
+	from INT_W to GPR_EXPR
 		gen move %1, %2.reg
 
-	from FPR+IND_ALL_D to FPR_EXPR
+	from FLOAT_D to FPR_EXPR
 		gen move %1, %2.reg
 
-	from FSREG+IND_ALL_W to FSREG_EXPR
+	from FLOAT_W to FSREG_EXPR
 		gen move %1, %2.reg
 
 
@@ -664,15 +663,15 @@ STACKINGRULES
 			COMMENT("stack REG")
 			stwu %1, {IND_RC_W, sp, 0-4}
 
-	from ANY_BHW-REG to STACK
+	from INT_W-REG to STACK
 		gen
-			COMMENT("stack ANY_BHW-REG")
+			COMMENT("stack INT_W-REG")
 			move %1, RSCRATCH
 			stwu RSCRATCH, {IND_RC_W, sp, 0-4}
 
-	from IND_ALL_D to STACK
+	from FLOAT_D-FREG to STACK
 		gen
-			COMMENT("stack IND_ALL_D")
+			COMMENT("stack FLOAT_D-FREG")
 			move %1, FSCRATCH
 			stfdu FSCRATCH, {IND_RC_D, sp, 0-8}
 
@@ -724,10 +723,10 @@ COERCIONS
 			addi sp, sp, {C, 4}
 		yields %a
 
-	from ANY_BHW
+	from INT_W
 		uses REG
 		gen
-			COMMENT("coerce ANY_BHW->REG")
+			COMMENT("coerce INT_W->REG")
 			move %1, %a
 		yields %a
 
@@ -741,17 +740,17 @@ COERCIONS
 	 * but %1.off+4 might overflow a signed 16-bit integer.
 	 */
 
-	from FREG+IND_ALL_D
+	from FLOAT_D
 		uses FREG
 		gen
-			COMMENT("coerce FREG+IND_ALL_D->FREG")
+			COMMENT("coerce FLOAT_D->FREG")
 			move %1, %a
 		yields %a
 
-	from FSREG+IND_ALL_W
+	from FLOAT_W
 		uses FSREG
 		gen
-			COMMENT("coerce FSREG+IND_ALL_W->FREG")
+			COMMENT("coerce FLOAT_W->FREG")
 			move %1, %a
 		yields %a
 
@@ -784,7 +783,7 @@ PATTERNS
 	 */
 
 	pat asp $1==4                      /* Adjust stack by constant */
-		with exact ANY4
+		with exact INT_W+FLOAT_W
 			/* drop %1 */
 		with STACK
 			gen addi sp, sp, {C, 4}
@@ -836,7 +835,7 @@ PATTERNS
 				bdnz {LABEL, "1b"}
 
 	pat exg $1==4                      /* Exchange top two words */
-		with ANY4 ANY4
+		with INT_W+FLOAT_W INT_W+FLOAT_W
 			yields %1 %2
 
 	pat exg defined($1)                /* Exchange other size */
@@ -911,7 +910,7 @@ PATTERNS
 
 	/* Store word to local */
 	pat stl inreg($1)==reg_any
-		with exact ANY_BHW
+		with exact INT_W
 			/* ncg fails to infer that regvar($1) is dead! */
 			kills regvar($1)
 			gen move %1, {GPR_EXPR, regvar($1)}

From d8fa9d1b2aa5e2387b0e12dc6babf4efa4344c54 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Sun, 17 Dec 2017 12:45:27 -0500
Subject: [PATCH 11/55] In coercions, try to reuse a register with the same
 token.

This reduces code size.
---
 mach/powerpc/ncg/table | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index ae1620cfd..3611a1809 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -723,11 +723,13 @@ COERCIONS
 			addi sp, sp, {C, 4}
 		yields %a
 
+	/* "uses REG=%1" may find and reuse a register containing the
+	 * same token.  For contrast, "uses REG gen move %1, %a" would
+	 * pick a different register before doing the move.
+         */
+
 	from INT_W
-		uses REG
-		gen
-			COMMENT("coerce INT_W->REG")
-			move %1, %a
+		uses REG=%1
 		yields %a
 
 	/*
@@ -741,17 +743,11 @@ COERCIONS
 	 */
 
 	from FLOAT_D
-		uses FREG
-		gen
-			COMMENT("coerce FLOAT_D->FREG")
-			move %1, %a
+		uses FREG=%1
 		yields %a
 
 	from FLOAT_W
-		uses FSREG
-		gen
-			COMMENT("coerce FLOAT_W->FREG")
-			move %1, %a
+		uses FSREG=%1
 		yields %a
 
 

From 5e99baabdfd899a950bc184f745f3ed874744120 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 18 Dec 2017 12:36:10 -0500
Subject: [PATCH 12/55] Rename two tokens.  CONST_HZ was not hertz (Hz).

---
 mach/powerpc/ncg/table | 43 +++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 3611a1809..86da2a895 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -117,8 +117,8 @@ TOKENS
 	CONST_0000_7FFF    = { INT val; }             4    val.
 	CONST_8000         = { INT val; }             4    val.
 	CONST_8001_FFFF    = { INT val; }             4    val.
-	CONST_HZ           = { INT val; }             4    val.
-	CONST_HL           = { INT val; }             4    val.
+	CONST_HI_ZR        = { INT val; }             4    val.
+	CONST_HI_LO        = { INT val; }             4    val.
 
 /* Expression partial results */
 
@@ -193,7 +193,8 @@ SETS
 	UCONST2         = CONST_0000_7FFF + CONST_8000 + CONST_8001_FFFF.
 	/* any constant on stack */
 	CONST_STACK     = CONST_N8000 + CONST_N7FFF_N0001 + CONST_0000_7FFF +
-	                  CONST_8000 + CONST_8001_FFFF + CONST_HZ + CONST_HL.
+	                  CONST_8000 + CONST_8001_FFFF +
+			  CONST_HI_ZR + CONST_HI_LO.
 
 	CONST           = C + CONST_STACK.
 
@@ -767,9 +768,9 @@ PATTERNS
 	pat loc $1>=0x8001 && $1<=0xFFFF
 		yields {CONST_8001_FFFF, $1}
 	pat loc lo($1)==0
-		yields {CONST_HZ, $1}
+		yields {CONST_HI_ZR, $1}
 	pat loc
-		yields {CONST_HL, $1}
+		yields {CONST_HI_LO, $1}
 
 
 /* Stack shuffles */
@@ -1231,14 +1232,14 @@ PATTERNS
 			yields {SUM_RC, %2, %1.val}
 		with REG CONST2
 			yields {SUM_RC, %1, %2.val}
-		with CONST_HZ REG
+		with CONST_HI_ZR REG
 			yields {SUM_RIS, %2, his(%1.val)}
-		with REG CONST_HZ
+		with REG CONST_HI_ZR
 			yields {SUM_RIS, %1, his(%2.val)}
-		with CONST_STACK-CONST2-CONST_HZ REG
+		with CONST_STACK-CONST2-CONST_HI_ZR REG
 			uses reusing %2, REG={SUM_RIS, %2, his(%1.val)}
 			yields {SUM_RC, %a, los(%1.val)}
-		with REG CONST_STACK-CONST2-CONST_HZ
+		with REG CONST_STACK-CONST2-CONST_HI_ZR
 			uses reusing %1, REG={SUM_RIS, %1, his(%2.val)}
 			yields {SUM_RC, %a, los(%2.val)}
 
@@ -1248,9 +1249,9 @@ PATTERNS
 			yields {SUB_RR, %2, %1}
 		with CONST2_WHEN_NEG REG
 			yields {SUM_RC, %2, 0-%1.val}
-		with CONST_HZ REG
+		with CONST_HI_ZR REG
 			yields {SUM_RIS, %2, his(0-%1.val)}
-		with CONST_STACK-CONST2_WHEN_NEG-CONST_HZ REG
+		with CONST_STACK-CONST2_WHEN_NEG-CONST_HI_ZR REG
 			uses reusing %2, REG={SUM_RIS, %2, his(0-%1.val)}
 			yields {SUM_RC, %a, los(0-%1.val)}
 
@@ -1298,9 +1299,9 @@ PATTERNS
 			yields {AND_RC, %1, %2.val}
 		with UCONST2 REG
 			yields {AND_RC, %2, %1.val}
-		with REG CONST_HZ
+		with REG CONST_HI_ZR
 			yields {AND_RIS, %1, hi(%2.val)}
-		with CONST_HZ REG
+		with CONST_HI_ZR REG
 			yields {AND_RIS, %2, hi(%1.val)}
 
 	pat and defined($1)                /* AND set */
@@ -1323,14 +1324,14 @@ PATTERNS
 			yields {OR_RC, %1, %2.val}
 		with UCONST2 REG
 			yields {OR_RC, %2, %1.val}
-		with REG CONST_HZ
+		with REG CONST_HI_ZR
 			yields {OR_RIS, %1, hi(%2.val)}
-		with CONST_HZ REG
+		with CONST_HI_ZR REG
 			yields {OR_RIS, %2, hi(%1.val)}
-		with REG CONST_STACK-UCONST2-CONST_HZ
+		with REG CONST_STACK-UCONST2-CONST_HI_ZR
 			uses reusing %1, REG={OR_RIS, %1, hi(%2.val)}
 			yields {OR_RC, %1, lo(%2.val)}
-		with CONST_STACK-UCONST2-CONST_HZ REG
+		with CONST_STACK-UCONST2-CONST_HI_ZR REG
 			uses reusing %2, REG={OR_RIS, %2, hi(%1.val)}
 			yields {OR_RC, %2, lo(%1.val)}
 
@@ -1351,14 +1352,14 @@ PATTERNS
 			yields {XOR_RC, %1, %2.val}
 		with UCONST2 REG
 			yields {XOR_RC, %2, %1.val}
-		with REG CONST_HZ
+		with REG CONST_HI_ZR
 			yields {XOR_RIS, %1, hi(%2.val)}
-		with CONST_HZ REG
+		with CONST_HI_ZR REG
 			yields {XOR_RIS, %2, hi(%1.val)}
-		with REG CONST_STACK-UCONST2-CONST_HZ
+		with REG CONST_STACK-UCONST2-CONST_HI_ZR
 			uses reusing %1, REG={XOR_RIS, %1, hi(%2.val)}
 			yields {XOR_RC, %1, lo(%2.val)}
-		with CONST_STACK-UCONST2-CONST_HZ REG
+		with CONST_STACK-UCONST2-CONST_HI_ZR REG
 			uses reusing %2, REG={XOR_RIS, %2, hi(%1.val)}
 			yields {XOR_RC, %2, lo(%1.val)}
 

From 24abaf6a25fb8449a918ad8caa626da835e95521 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 18 Dec 2017 20:39:56 -0500
Subject: [PATCH 13/55] Enable conditional expressions in splitting coercions.

ncgg has parsed the optional conditional expression (optexpr) of each
splitting coercion since commit 72b83cc in 1985; but for almost 33
years, ncg has ignored the expression in c2_expr.

Few tables had conditional coercions (I only found them in arm and
m68020), and no tables had conditional splitting coercions, so this
only becomes a problem now as I try to add a conditional splitting
coercion to powerpc.
---
 mach/proto/ncg/subr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mach/proto/ncg/subr.c b/mach/proto/ncg/subr.c
index 0feb54f30..0dc045973 100644
--- a/mach/proto/ncg/subr.c
+++ b/mach/proto/ncg/subr.c
@@ -518,7 +518,7 @@ int split(token_p tp, int *ip, int ply, int toplevel) {
 	int tpl;
 
 	for (cp=c2coercs;cp->c2_texpno>=0; cp++) {
-		if (!match(tp,&machsets[cp->c2_texpno],0))
+		if (!match(tp,&machsets[cp->c2_texpno],cp->c2_expr))
 			continue;
 		ok=1;
 		for (i=0; ok && i<cp->c2_nsplit;i++) {

From ad47fa5fe302c0ff217ce846363708e7cbf024a4 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 18 Dec 2017 20:59:04 -0500
Subject: [PATCH 14/55] Add splitting coercions for IND_ALL_D.

Delete my wrong comment (from commits cfbc537, a8f62f4, 5432bd0) which
claimed that such coercions are not possible.
---
 mach/powerpc/ncg/table | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 86da2a895..28ef4b40e 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -52,11 +52,9 @@ REGISTERS
 	 *   r13, r14, ..., r31: GPR, REG regvar(reg_any).
 	 */
 
-	r0, sp, fp  : GPR.
-	r3          : GPR, REG, REG3.
-
-	r4, r5, r6, r7, r8, r9, r10, r11, r12
-	  : GPR, REG.
+	r0, sp, fp, r12                   : GPR.
+	r3                                : GPR, REG, REG3.
+	r4, r5, r6, r7, r8, r9, r10, r11  : GPR, REG.
 
 	r13, r14, r15, r16, r17, r18, r19, r20, r21, r22, r23, r24,
 	r25, r26, r27, r28, r29, r30, r31
@@ -88,6 +86,10 @@ REGISTERS
 	lr, ctr     : SPR.
 	cr0         : CR.
 
+	/* The stacking rules and the splitting coercions can't
+	 * allocate registers.  We use r12 in the splitting coercions,
+	 * and these scratch registers in the stacking rules.
+	 */
 #define RSCRATCH r0
 #define FSCRATCH f0
 
@@ -697,7 +699,6 @@ STACKINGRULES
 		gen bug {LABEL, "STACKING DLOCAL"}
 
 
-
 COERCIONS
 
 	from STACK
@@ -733,16 +734,6 @@ COERCIONS
 		uses REG=%1
 		yields %a
 
-	/*
-	 * There is no coercion from IND_ALL_D to REG REG, because
-	 * coercions can't allocate registers for intermediate values.
-	 *
-	 * A coercion to split IND_RC_D into two IND_RC_W, without
-	 * allocating an intermediate register, would yield
-	 *   {IND_RC_W, %1.val, %1.off+4}
-	 * but %1.off+4 might overflow a signed 16-bit integer.
-	 */
-
 	from FLOAT_D
 		uses FREG=%1
 		yields %a
@@ -751,6 +742,23 @@ COERCIONS
 		uses FSREG=%1
 		yields %a
 
+	/* Splitting coercions can't allocate registers.
+	 * PowerPC can't add r0 + constant.  Use r12.
+	 */
+
+	from IND_RC_D %off<=0x7FFA
+		yields
+			{IND_RC_W, %1.reg, %1.off+4}
+			{IND_RC_W, %1.reg, %1.off}
+
+	from IND_RC_D
+		/* Don't move to %1.reg; it might be a regvar. */
+		gen move {SUM_RC, %1.reg, %1.off}, r12
+		yields {IND_RC_W, r12, 4} {IND_RC_W, r12, 0}
+
+	from IND_RR_D
+		gen move {SUM_RR, %1.reg1, %1.reg2}, r12
+		yields {IND_RC_W, r12, 4} {IND_RC_W, r12, 0}
 
 
 PATTERNS

From a4e6595032838a7260425f35f0dd045a1126a9af Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 18 Dec 2017 21:17:42 -0500
Subject: [PATCH 15/55] Remove '\0' from output.  Fix a compiler warning.

Don't output '\0' in "@@FINISHED\0".

Cast code to unsigned int.  This helps platforms with 16-bit int, by
doing the bitwise-and on only the low 16 bits.  It also removes the
"(warning) conversion of long to pointer loses accuracy".
---
 tests/plat/lib/test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/plat/lib/test.c b/tests/plat/lib/test.c
index 426f9944a..6df3ee7d5 100644
--- a/tests/plat/lib/test.c
+++ b/tests/plat/lib/test.c
@@ -5,7 +5,7 @@
 void finished(void)
 {
     static const char s[] = "@@FINISHED\n";
-    write(1, s, sizeof(s));
+    write(1, s, sizeof(s)-1);
     _exit(0);
 }
 
@@ -16,7 +16,7 @@ void writehex(uint32_t code)
 
     do
     {
-        *--p = "0123456789abcdef"[code & 0xf];
+        *--p = "0123456789abcdef"[(unsigned int)code & 0xf];
         code >>= 4;
     }
     while (code > 0);

From a5e8dc8a069d373936732c09c58abb5f20840f53 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 18 Dec 2017 21:52:13 -0500
Subject: [PATCH 16/55] Simplify code by using cms EM_WSIZE to compare bytes.

This should work because the C compiler does it.
---
 tests/plat/dup_e.e | 44 ++++++++++++++++++--------------------------
 tests/plat/exg_e.e |  9 ++-------
 2 files changed, 20 insertions(+), 33 deletions(-)

diff --git a/tests/plat/dup_e.e b/tests/plat/dup_e.e
index 600161be4..c0e0001b3 100644
--- a/tests/plat/dup_e.e
+++ b/tests/plat/dup_e.e
@@ -32,13 +32,12 @@ size
     cal $finished
     end /* $_m_a_i_n */
 
-    pro $check, 4 * EM_PSIZE + 2 * EM_WSIZE
+    pro $check, 4 * EM_PSIZE + EM_WSIZE
 #define p1    (-1 * EM_PSIZE)
 #define p2    (-2 * EM_PSIZE)
 #define p3    (-3 * EM_PSIZE)
 #define p4    (-4 * EM_PSIZE)
-#define b     (p4 - 1 * EM_WSIZE)
-#define i     (p4 - 2 * EM_WSIZE)
+#define i     (p4 - EM_WSIZE)
 
     /* Set pointers to all 4 copies. */
     lae src
@@ -60,56 +59,49 @@ size
 4
     lal p4
     loi EM_PSIZE
-    loi 1
-    loc 1
-    loc EM_WSIZE
-    cii
-    stl b         /* b = byte from src */
-    lol b
+    loi 1         /* byte from src */
     lal p3
     loi EM_PSIZE
     loi 1         /* byte from 3rd copy */
-    loc 1
-    loc EM_WSIZE
-    cii
-    beq *3
+    cms EM_WSIZE
+    zeq *3
     loc (3 * 256)
     lol i
-    adi EM_WSIZE
+    adi EM_WSIZE  /* 0x300 + i */
     loc EM_WSIZE
     loc 4
     cuu
     cal $fail
     asp 4
 3
-    lol b
+    lal p4
+    loi EM_PSIZE
+    loi 1         /* byte from src */
     lal p2
     loi EM_PSIZE
     loi 1         /* byte from 2nd copy */
-    loc 1
-    loc EM_WSIZE
-    cii
-    beq *2
+    cms EM_WSIZE
+    zeq *2
     loc (2 * 256)
     lol i
-    adi EM_WSIZE
+    adi EM_WSIZE  /* 0x200 + i */
     loc EM_WSIZE
     loc 4
     cuu
     cal $fail
     asp 4
 2
-    lol b
+    lal p4
+    loi EM_PSIZE
+    loi 1         /* byte from src */
     lal p1
     loi EM_PSIZE
     loi 1         /* byte from 1st copy */
-    loc 1
-    loc EM_WSIZE
-    cii
-    beq *1
+    cms EM_WSIZE
+    zeq *1
     loc (1 * 256)
     lol i
-    adi EM_WSIZE
+    adi EM_WSIZE  /* 0x100 + i */
     loc EM_WSIZE
     loc 4
     cuu
diff --git a/tests/plat/exg_e.e b/tests/plat/exg_e.e
index 3a1f06d3b..617f07104 100644
--- a/tests/plat/exg_e.e
+++ b/tests/plat/exg_e.e
@@ -42,16 +42,11 @@ src
     lal p2
     loi EM_PSIZE
     loi 1         /* byte from src */
-    loc 1
-    loc EM_WSIZE
-    cii
     lal p1
     loi EM_PSIZE
     loi 1         /* byte from exchanged copy */
-    loc 1
-    loc EM_WSIZE
-    cii
-    beq *2
+    cms EM_WSIZE
+    zeq *2
     lol i
     loc EM_WSIZE
     loc 4

From aa9418c029c36c952c78e5243043e13001d01c14 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 18 Dec 2017 21:58:57 -0500
Subject: [PATCH 17/55] Pass 4 bytes to fail(uint32_t)

This would become necessary if something failed on a platform with
16-bit int (EM_WSIZE == 2).

Remove unreachable `ret 0`.  If reached, it wouldn't work to return
from _m_a_i_n.
---
 tests/plat/inn_e.e | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tests/plat/inn_e.e b/tests/plat/inn_e.e
index a5aee02f5..543623b3f 100644
--- a/tests/plat/inn_e.e
+++ b/tests/plat/inn_e.e
@@ -14,6 +14,9 @@
     zeq *1
 
     loc __LINE__
+    loc EM_WSIZE
+    loc 4
+    cuu
     cal $fail
     ass EM_WSIZE
 1
@@ -31,6 +34,9 @@
     zne *2
 
     loc __LINE__
+    loc EM_WSIZE
+    loc 4
+    cuu
     cal $fail
     ass EM_WSIZE
 2
@@ -49,6 +55,9 @@
     zeq *3
 
     loc __LINE__
+    loc EM_WSIZE
+    loc 4
+    cuu
     cal $fail
     ass EM_WSIZE
 3
@@ -67,11 +76,12 @@
     zne *4
 
     loc __LINE__
+    loc EM_WSIZE
+    loc 4
+    cuu
     cal $fail
     ass EM_WSIZE
 4
 
     cal $finished
-    ret 0
-    
     end

From 787fdeaaa9330e267dd49aa26818a4e1bd02eded Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 21 Dec 2017 17:44:03 -0500
Subject: [PATCH 18/55] Add some tests for Modula-2.

---
 tests/plat/build.lua             |   7 +-
 tests/plat/m2/ConvTest_mod.mod   |  36 +++++++++
 tests/plat/m2/NestProc_mod.mod   | 132 +++++++++++++++++++++++++++++++
 tests/plat/m2/OpenArray_mod.mod  |  59 ++++++++++++++
 tests/plat/m2/Set100_mod.mod     |  61 ++++++++++++++
 tests/plat/m2/StringTest_mod.mod |  55 +++++++++++++
 6 files changed, 349 insertions(+), 1 deletion(-)
 create mode 100644 tests/plat/m2/ConvTest_mod.mod
 create mode 100644 tests/plat/m2/NestProc_mod.mod
 create mode 100644 tests/plat/m2/OpenArray_mod.mod
 create mode 100644 tests/plat/m2/Set100_mod.mod
 create mode 100644 tests/plat/m2/StringTest_mod.mod

diff --git a/tests/plat/build.lua b/tests/plat/build.lua
index 609771ed1..666af7d95 100644
--- a/tests/plat/build.lua
+++ b/tests/plat/build.lua
@@ -17,7 +17,12 @@ definerule("plat_testsuite",
 			"tests/plat/*.p",
 			"tests/plat/b/*.b",
 			"tests/plat/bugs/bug-22-inn_mod.mod",
-			"tests/plat/bugs/bug-62-notvar_var_e.c"
+			"tests/plat/bugs/bug-62-notvar_var_e.c",
+			"tests/plat/m2/ConvTest_mod.mod",
+			"tests/plat/m2/NestProc_mod.mod",
+			"tests/plat/m2/OpenArray_mod.mod",
+			"tests/plat/m2/Set100_mod.mod",
+			"tests/plat/m2/StringTest_mod.mod"
 		)
 
 		acklibrary {
diff --git a/tests/plat/m2/ConvTest_mod.mod b/tests/plat/m2/ConvTest_mod.mod
new file mode 100644
index 000000000..9fa828af0
--- /dev/null
+++ b/tests/plat/m2/ConvTest_mod.mod
@@ -0,0 +1,36 @@
+MODULE ConvTest;
+FROM Conversions IMPORT
+  ConvertOctal, ConvertHex, ConvertCardinal, ConvertInteger;
+FROM Strings IMPORT CompareStr;
+FROM Test IMPORT fail, finished;
+
+(* Asserts a = b, or fails with code. *)
+PROCEDURE A(a, b: ARRAY OF CHAR; code: INTEGER);
+BEGIN
+  IF (CompareStr(a, b) # 0) OR (CompareStr(a, "wrong string") = 0) THEN
+    fail(code)
+  END
+END A;
+
+VAR
+  str: ARRAY [0..15] OF CHAR;
+BEGIN
+  ConvertOctal(  9, 6, str); A("    11", str, 1);
+  ConvertOctal( 59, 6, str); A("    73", str, 2);
+  ConvertOctal(278, 6, str); A("   426", str, 3);
+
+  ConvertHex(  9, 6, str); A("     9", str, 11H);
+  ConvertHex( 59, 6, str); A("    3B", str, 12H);
+  ConvertHex(278, 6, str); A("   116", str, 13H);
+
+  ConvertCardinal(  9, 6, str); A("     9", str, 21H);
+  ConvertCardinal( 59, 6, str); A("    59", str, 22H);
+  ConvertCardinal(278, 6, str); A("   278", str, 23H);
+
+  ConvertInteger(   9, 6, str); A("     9", str, 31H);
+  ConvertInteger(  59, 6, str); A("    59", str, 32H);
+  ConvertInteger( 278, 6, str); A("   278", str, 33H);
+  ConvertInteger(-424, 6, str); A("  -424", str, 34H);
+
+  finished;
+END ConvTest.
diff --git a/tests/plat/m2/NestProc_mod.mod b/tests/plat/m2/NestProc_mod.mod
new file mode 100644
index 000000000..d46731f55
--- /dev/null
+++ b/tests/plat/m2/NestProc_mod.mod
@@ -0,0 +1,132 @@
+(*
+ * Calls nested procedures.  The compiler emits the EM instructions
+ * _lxl_ and _lxa_ to access the variables in the statically enclosing
+ * procedures.
+ *
+ * You can cheat this test if a = b is TRUE for any a, b.
+ *)
+MODULE NestProc;
+FROM Test IMPORT fail, finished;
+
+(* Asserts cond, or fails with code. *)
+PROCEDURE A(cond: BOOLEAN; code: INTEGER);
+BEGIN
+  IF NOT cond THEN fail(code) END
+END A;
+
+TYPE
+  Set8 = SET OF [0..63];
+  (* Box has fields of size 8, 4, and 1. *)
+  Box = RECORD
+    huge: Set8;
+    big: LONGINT;
+    small: CHAR;
+    tiny: CHAR;
+  END;
+
+PROCEDURE First(a, b: INTEGER; in: Box): Box;
+  VAR c, d: INTEGER;
+      out: Box;
+
+  PROCEDURE Second(e: INTEGER);
+    VAR f: INTEGER;
+
+    PROCEDURE Third(g: INTEGER);
+      VAR h: INTEGER;
+
+      PROCEDURE CheckThird;
+      BEGIN
+        A(a = 1354, 31H);   (* lxa 3 *)
+        A(b = 3385, 32H);
+        A(c = 14349, 33H);  (* lxl 3 *)
+        A(d = 30989, 34H);
+        A(e = 28935, 35H);  (* lxa 2 *)
+        A(f = 13366, 36H);  (* lxl 2 *)
+        A(g = 7988, 37H);   (* lxa 1 *)
+        A(h = 11711, 38H);  (* lxl 1 *)
+      END CheckThird;
+
+      PROCEDURE Fourth(i: INTEGER);
+        VAR j: INTEGER;
+
+        PROCEDURE Fifth(k: INTEGER);
+          VAR l: INTEGER;
+
+          PROCEDURE Sixth(): INTEGER;
+          BEGIN
+            A(e = 2, 61H);      (* lxa 4 *)
+            A(f = 11703, 62H);  (* lxl 4 *)
+
+            b := 3385;   (* lxa 5 *)
+            d := 30989;  (* lxl 5 *)
+            e := 28935;  (* lxa 4 *)
+            f := 13366;  (* lxl 4 *)
+            CheckThird;
+
+            (* lxa 5 *)
+            A(in.huge = Set8{11, 12, 40, 40, 43, 56}, 63H);
+            A(in.big = 2130020019D, 64H);
+            A(in.small = 300C, 65H);
+            A(in.tiny = 175C, 66H);
+
+            (* lxl 5 *)
+            out.huge := Set8{8, 19, 36, 41, 47, 62};
+            out.big := 385360915D;
+            out.small := 366C;
+            out.tiny := 131C;
+
+            j := k;  (* lxl 2, lxa 1 *)
+            l := i;  (* lxl 1, lxa 2 *)
+            RETURN 5217;
+          END Sixth;
+
+          PROCEDURE TwiceSixth(): INTEGER;
+          BEGIN
+            (* lxa and lxl must follow the static chain from Sixth to
+             * Fifth, not dynamic chain from Sixth to TwiceSixth. *)
+            RETURN 2 * Sixth();
+          END TwiceSixth;
+
+        BEGIN (* Fifth *)
+          A(TwiceSixth() = 10434, 51H);
+          A(k = 11567, 52H);
+          A(l = 32557, 53H);
+        END Fifth;
+
+      BEGIN (* Fourth *)
+        Fifth(11567);  (* k *)
+        A(i = 32557, 41H);
+        A(j = 11567, 42H);
+      END Fourth;
+
+    BEGIN (* Third *)
+      h := 11711;
+      Fourth(32557);  (* i *)
+    END Third;
+
+  BEGIN (* Second *)
+    f := 11703;
+    Third(7988);  (* g *)
+  END Second;
+
+BEGIN (* First *)
+  c := 14349;
+  d := 17850;
+  Second(2);  (* e *)
+  RETURN out
+END First;
+
+VAR
+  x: Box;
+BEGIN
+  x.huge := Set8{11, 12, 40, 40, 43, 56};
+  x.big := 2130020019D;
+  x.small := 300C;
+  x.tiny := 175C;
+  x := First(1354, 19516, x);  (* a, b, in *)
+  A(x.huge = Set8{8, 19, 36, 41, 47, 62}, 71H);
+  A(x.big = 385360915D, 72H);
+  A(x.small = 366C, 73H);
+  A(x.tiny = 131C, 74H);
+  finished;
+END NestProc.
diff --git a/tests/plat/m2/OpenArray_mod.mod b/tests/plat/m2/OpenArray_mod.mod
new file mode 100644
index 000000000..1aa219a55
--- /dev/null
+++ b/tests/plat/m2/OpenArray_mod.mod
@@ -0,0 +1,59 @@
+(*
+ * Passes an open array to a procedure.  The back end must implement
+ * some EM instructions for accessing arrays.
+ *)
+MODULE OpenArray;
+FROM Test IMPORT fail, finished;
+
+(* Asserts condition or fails with code. *)
+PROCEDURE A(cond: BOOLEAN; code: INTEGER);
+BEGIN
+  IF NOT cond THEN fail(code) END
+END A;
+
+(* Called as Modify(ary1, 1) or Modify(ary2, 2). *)
+PROCEDURE Modify(VAR ary: ARRAY OF INTEGER; what: INTEGER);
+  VAR hi: INTEGER;
+BEGIN
+  hi := what * 100H;
+
+  (* Indices must be from 0 to HIGH(ary). *)
+  A((what = 1) = (HIGH(ary) = 3), hi + 1);
+  A((what = 2) = (HIGH(ary) = 9), hi + 2);
+
+  (* ary[2] must equal ary1[3] or ary2[3]. *)
+  A((what = 1) = (ary[2] = 13), hi + 3);
+  A((what = 2) = (ary[2] = 37), hi + 4);
+
+  (* Modify some values. *)
+  IF HIGH(ary) >= 3 THEN ary[3] := 20 END;
+  IF HIGH(ary) >= 6 THEN ary[6] := 40 END;
+  IF HIGH(ary) >= 9 THEN ary[9] := 12 END;
+END Modify;
+
+VAR
+  ary1: ARRAY [1..4] OF INTEGER;
+  ary2: ARRAY [1..10] OF INTEGER;
+BEGIN
+  (* Initialize the arrays. *)
+  ary1[1] :=  6; ary1[2] :=  9; ary1[3] := 13; ary1[4] := 49;
+
+  ary2[1] := 56; ary2[2] := 79; ary2[3] := 37; ary2[4] :=  0;
+  ary2[5] := 70; ary2[6] := 62; ary2[7] := 64; ary2[8] := 92;
+  ary2[9] := 29; ary2[10] := 90;
+
+  (* Pass them as open arrays. *)
+  Modify(ary1, 1);
+  Modify(ary2, 2);
+
+  (* Check that ary1[4], ary2[4, 7, 10] have been modified. *)
+  A(ary1[1] =  6, 301H); A(ary1[2] =  9, 302H); A(ary1[3] = 13, 303H);
+  A(ary1[4] = 20, 304H);
+
+  A(ary2[1] = 56, 401H); A(ary2[2] = 79, 402H); A(ary2[3] = 37, 403H);
+  A(ary2[4] = 20, 404H); A(ary2[5] = 70, 405H); A(ary2[6] = 62, 406H);
+  A(ary2[7] = 40, 407H); A(ary2[8] = 92, 408H); A(ary2[9] = 29, 409H);
+  A(ary2[10] = 12, 40AH);
+
+  finished;
+END OpenArray.
diff --git a/tests/plat/m2/Set100_mod.mod b/tests/plat/m2/Set100_mod.mod
new file mode 100644
index 000000000..3b200d318
--- /dev/null
+++ b/tests/plat/m2/Set100_mod.mod
@@ -0,0 +1,61 @@
+(*
+ * Operates on sets of 100 integers.  The compiler emits, and the back
+ * end must implement, the EM instructions for large sets.
+ *)
+MODULE Set100;
+FROM Test IMPORT fail, finished;
+
+(* Asserts condition or fails with code. *)
+PROCEDURE A(cond: BOOLEAN; code: INTEGER);
+BEGIN
+  IF NOT cond THEN fail(code) END
+END A;
+
+TYPE
+  Num = [1..100];
+  NumSet = SET OF Num;
+VAR
+  (* VAR, not CONST, so compiler can't do constant operations. *)
+  primes, teens, lowevens, eighties, nineties: NumSet;
+CONST
+  (* These are the expected results of some set operations. *)
+  primeteen = NumSet{13, 17, 19};
+  compeighties = NumSet{80..82, 84..88};
+  teenxoreven = NumSet{2, 4, 6, 8, 10, 12, 13, 15, 17, 19, 20};
+  eightiesnineties = NumSet{80..99};
+
+(* Checks that some set is equal to the expected result.  Also checks
+ * that the set is not equal to the other sets. *)
+PROCEDURE Check(set: NumSet; what: INTEGER);
+  VAR hi: INTEGER;
+BEGIN
+  hi := what * 100H;
+
+  (* The compiler uses cms in EM to check set equality. *)
+  A((what = 1) = (set = primeteen), hi + 1);
+  A((what = 2) = (set = compeighties), hi + 2);
+  A((what = 3) = (set = teenxoreven), hi + 3);
+  A((what = 4) = (set = eightiesnineties), hi + 4);
+END Check;
+
+PROCEDURE Range(min: Num; max: Num): NumSet;
+BEGIN
+  (* The compiler calls LtoUset in lang/m2/libm2/LtoUset.e *)
+  RETURN NumSet{min..max}
+END Range;
+
+BEGIN
+  primes := NumSet{2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43,
+                   47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97};
+  teens := NumSet{13, 14, 15, 16, 17, 18, 19};
+  lowevens := NumSet{2, 4, 6, 8, 10, 12, 14, 16, 18, 20};
+  eighties := Range(80, 89);
+  nineties := Range(90, 99);
+
+  Check(primes * teens, 1);
+  Check(eighties - primes, 2);
+  Check(teens / lowevens, 3);
+  Check(eighties + nineties, 4);
+
+  finished;
+END Set100.
diff --git a/tests/plat/m2/StringTest_mod.mod b/tests/plat/m2/StringTest_mod.mod
new file mode 100644
index 000000000..41552aa7a
--- /dev/null
+++ b/tests/plat/m2/StringTest_mod.mod
@@ -0,0 +1,55 @@
+MODULE StringTest;
+FROM Strings IMPORT
+  Assign, Insert, Delete, Pos, Copy, Concat, Length, CompareStr;
+FROM Test IMPORT fail, finished;
+
+(* Asserts condition or fails with code. *)
+PROCEDURE A(cond: BOOLEAN; code: INTEGER);
+BEGIN
+  IF NOT cond THEN fail(code) END
+END A;
+
+VAR
+  small: ARRAY [0..3] OF CHAR;
+  big: ARRAY [0..99] OF CHAR;
+BEGIN
+  (* CompareStr *)
+  A(CompareStr("ablaze", "ablaze") = 0, 1);
+  A(CompareStr("ablaze", "abloom") < 0, 2);
+  A(CompareStr("abloom", "ablaze") > 0, 3);
+  A(CompareStr("abloom", "abloom") = 0, 4);
+
+  (* Assign, Insert, Delete *)
+  Assign("obsequiosity", small);
+  A(CompareStr("obsequiosity", small) > 0, 11H);
+  Assign("obsequiosity", big);
+  A(CompareStr("obsequiosity", big) = 0, 12H);
+  A(big[11] = 'y', 13H);
+  A(big[11] # 0C, 14H);
+  A(big[12] # 'y', 15H);
+  A(big[12] = 0C, 16H);
+  Insert(" omnihuman", big, 9);
+  A(CompareStr("obsequios omnihumanity", big) = 0, 17H);
+  Delete(big, 6, 15);
+  A(CompareStr("obsequy", big) = 0, 18H);
+
+  (* Pos, Copy *)
+  Assign("Now is the time for all good men to come...", big);
+  A(Pos("w", big) = 2, 21H);
+  A(Pos("t", big) = 7, 22H);
+  A(Pos("ti", big) = 11, 23H);
+  A(Pos("men", big) = 29, 24H);
+  A(Pos("women", big) > 42, 25H);
+  Copy(big, 29, 2, small);
+  A(CompareStr("me", small) = 0, 26H);
+
+  (* Concat, Length *)
+  Concat("pictorial", "ist", big);
+  A(CompareStr("pictorialist", big) = 0, 31H);
+  A(Length(big) = 12, 32H);
+  Concat("zit", "her", small);
+  A(CompareStr("zither", small) > 0, 33H);
+  A(Length(small) < 5, 34H);
+
+  finished;
+END StringTest.

From 4bb31c296dbd9f176a84918e5508d22505519569 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 21 Dec 2017 18:19:26 -0500
Subject: [PATCH 19/55] Revise the comments in the EM tests.

You can cheat these tests if _cms_ and _cmu_ always push zero.
---
 tests/plat/dup_e.e    | 6 ++++--
 tests/plat/exg_e.e    | 6 ++++--
 tests/plat/rotate_e.e | 6 +++++-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tests/plat/dup_e.e b/tests/plat/dup_e.e
index c0e0001b3..649589d84 100644
--- a/tests/plat/dup_e.e
+++ b/tests/plat/dup_e.e
@@ -2,8 +2,10 @@
     mes 2, EM_WSIZE, EM_PSIZE
 
 /*
- * Test _dup_ and _dus_ by loading 20 bytes from _src_, then making
- * and checking some duplicates.
+ * Tests _dup_ and _dus_ by loading 20 bytes from _src_, then making
+ * and checking some duplicates.  The compilers might never _dup_ or
+ * _dus_ with large sizes, so the compilers might work even if this
+ * test fails.  You can cheat this test if _cms_ always pushes zero.
  */
 
     exa src
diff --git a/tests/plat/exg_e.e b/tests/plat/exg_e.e
index 617f07104..455256483 100644
--- a/tests/plat/exg_e.e
+++ b/tests/plat/exg_e.e
@@ -2,8 +2,10 @@
     mes 2, EM_WSIZE, EM_PSIZE
 
 /*
- * Test _exg_ by loading 40 bytes from _src_, then exchanging 20 and
- * 20 bytes, and checking the result.
+ * Tests _exg_ by loading 40 bytes from _src_, then exchanging 20 and
+ * 20 bytes, and checking the result.  The compilers might never _exg_
+ * large sizes, so the compilers might work even if this test fails.
+ * You can cheat this test if _cms_ always pushes zero.
  */
 
     exa src
diff --git a/tests/plat/rotate_e.e b/tests/plat/rotate_e.e
index a6f8f28dd..0698c58a0 100644
--- a/tests/plat/rotate_e.e
+++ b/tests/plat/rotate_e.e
@@ -2,12 +2,16 @@
     mes 2, EM_WSIZE, EM_PSIZE
 
 /*
- * Test _rol_ (rotate left) and _ror_ (rotate right).
+ * Tests _rol_ (rotate left) and _ror_ (rotate right).  Several back
+ * ends provide _rol_ and _ror_, but as of year 2017, the compilers
+ * and optimizers had never emitted _rol_ or _ror_.
  *
  * By tradition, _rol_ and _ror_ can't rotate values shorter than the
  * word size, or longer than 4 bytes.
  *  - If word size is 2, then try rotating 2-byte and 4-byte values.
  *  - If word size is 4, then try rotating 4-byte values.
+ *
+ * You can cheat this test if _cmu_ always pushes zero.
  */
 
 #if EM_WSIZE == 2

From 2eeee36f7870679eed0fa68b53bc57902b4dc63a Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Fri, 22 Dec 2017 17:04:16 -0500
Subject: [PATCH 20/55] Add FRAME_V tokens for local variables.

When storing to a local, stop killing the tokens of other locals,
unless they might overlap with the stored local.  This helps some
procedures that juggle locals when the locals aren't in registers.

Also use FRAME_V tokens for locals in statically enclosing procedures.
Rewrite _lxa_ as _lxl_, to skip the `addi ?,?,8` if we can add 8 to
the next constant.  The PowerPC code from _lxl_ is now sometimes
better, sometimes worse than before.

The i386 table provided the idea to use %size to find overlapping
locals.
---
 mach/powerpc/ncg/table | 329 ++++++++++++++++++++++++++++++-----------
 1 file changed, 243 insertions(+), 86 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 28ef4b40e..3f4794fc8 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -9,7 +9,6 @@ INT64 = 8
 
 FP_OFFSET = 0   /* Offset of saved FP relative to our FP */
 PC_OFFSET = 4   /* Offset of saved PC relative to our FP */
-SL_OFFSET = 8   /* Offset of static link */
 
 #define COMMENT(n) /* comment {LABEL, n} */
 
@@ -19,6 +18,12 @@ SL_OFFSET = 8   /* Offset of static link */
 #define smalls(n) sfit(n, 16)
 #define smallu(n) ufit(n, 16)
 
+/* Finds FRAME_V tokens that overlap myoff, mysize. */
+#define fover(myoff, mysize) (%off+%size>(myoff) && %off<((myoff)+(mysize)))
+
+/* Checks if we can use {LXFRAME, x}. */
+#define nicelx(x) ((x)>=1 && (x)<=0x8000)
+
 #define lo(n) ((n) & 0xFFFF)
 #define hi(n) (((n)>>16) & 0xFFFF)
 
@@ -138,6 +143,8 @@ TOKENS
 	DIV_RR      = { GPR reg1; GPR reg2; }  4.   /* reg1 / reg2 signed */
 	DIV_RR_U    = { GPR reg1; GPR reg2; }  4.   /* reg1 / reg2 unsigned */
 
+/* Indirect loads and stores */
+
 	IND_RC_B    = { GPR reg; INT off; }    4    off "(" reg ")".
 	IND_RL_B    = { GPR reg; ADDR adr; }   4    "lo16[" adr "](" reg ")".
 	IND_RR_B    = { GPR reg1; GPR reg2; }  4.
@@ -154,6 +161,23 @@ TOKENS
 	IND_RL_D    = { GPR reg; ADDR adr; }   8    "lo16[" adr "](" reg ")".
 	IND_RR_D    = { GPR reg1; GPR reg2; }  8.
 
+/* Local variables in frame */
+
+	FRAME_B     = { INT level; GPR reg; INT off; INT size; }
+	                                       4    off "(" reg ")".
+	FRAME_H     = { INT level; GPR reg; INT off; INT size; }
+	                                       4    off "(" reg ")".
+	FRAME_H_S   = { INT level; GPR reg; INT off; INT size; }
+	                                       4    off "(" reg ")".
+	FRAME_W     = { INT level; GPR reg; INT off; INT size; }
+	                                       4    off "(" reg ")".
+	FRAME_D     = { INT level; GPR reg; INT off; INT size; }
+	                                       8    off "(" reg ")".
+
+	LXFRAME     = { INT level; }           4.
+
+/* Bitwise logic */
+
 	NOT_R       = { GPR reg; }             4.   /* ~reg */
 	AND_RIS     = { GPR reg; INT valhi; }  4.
 	AND_RC      = { GPR reg; INT val; }    4.
@@ -170,6 +194,8 @@ TOKENS
 	NOR_RR      = { GPR reg1; GPR reg2; }  4.   /* ~(reg1 | reg2) */
 	EQV_RR      = { GPR reg1; GPR reg2; }  4.   /* ~(reg1 ^ reg2) */
 
+/* Comparisons */
+
 	COND_RC            = { GPR reg; INT val; }    4.
 	COND_RR            = { GPR reg1; GPR reg2; }  4.
 	CONDL_RC           = { GPR reg; INT val; }    4.
@@ -200,27 +226,37 @@ SETS
 
 	CONST           = C + CONST_STACK.
 
-	IND_ALL_B          = IND_RC_B + IND_RL_B + IND_RR_B.
-	IND_ALL_H          = IND_RC_H + IND_RL_H + IND_RR_H +
-	                     IND_RC_H_S + IND_RL_H_S + IND_RR_H_S.
-	IND_ALL_W          = IND_RC_W + IND_RL_W + IND_RR_W.
-	IND_ALL_D          = IND_RC_D + IND_RL_D + IND_RR_D.
+	SET_RC_B        = IND_RC_B + IND_RL_B + FRAME_B.
+	SET_RC_H        = IND_RC_H + IND_RL_H + FRAME_H.
+	SET_RC_H_S      = IND_RC_H_S + IND_RL_H_S + FRAME_H_S.
+	SET_RC_W        = IND_RC_W + IND_RL_W + FRAME_W.
+	SET_RC_D        = IND_RC_D + IND_RL_D + FRAME_D.
+
+	IND_ALL_B       = IND_RC_B + IND_RL_B + IND_RR_B.
+	IND_ALL_H       = IND_RC_H + IND_RL_H + IND_RR_H +
+	                  IND_RC_H_S + IND_RL_H_S + IND_RR_H_S.
+	IND_ALL_W       = IND_RC_W + IND_RL_W + IND_RR_W.
+	IND_ALL_D       = IND_RC_D + IND_RL_D + IND_RR_D.
+	IND_V           = IND_ALL_B + IND_ALL_H + IND_ALL_W + IND_ALL_D.
+
+	FRAME_V         = FRAME_B + FRAME_H + FRAME_H_S + FRAME_W + FRAME_D.
 
 	/* anything killed by sti (store indirect) */
-	MEMORY  = IND_ALL_B + IND_ALL_H + IND_ALL_W + IND_ALL_D.
+	MEMORY          = IND_V + FRAME_V.
 
 	/* any integer from stack that we can easily move to GPR */
 	INT_W   = REG + CONST_STACK + SEX_B + SEX_H +
 	          SUM_RIS + SUM_RC + SUM_RL + SUM_RR +
 	          SUB_RR + NEG_R + MUL_RR + DIV_RR + DIV_RR_U +
 	          IND_ALL_B + IND_ALL_H + IND_ALL_W +
+		  FRAME_B + FRAME_H + FRAME_H_S + FRAME_W +
 	          NOT_R + AND_RIS + AND_RC + AND_RR + ANDC_RR +
 	          OR_RIS + OR_RC + OR_RR + ORC_RR +
 	          XOR_RIS + XOR_RC + XOR_RR + NAND_RR + NOR_RR + EQV_RR +
 	          XEQ + XNE + XGT + XGE + XLT + XLE.
 
-	FLOAT_D = FREG + IND_ALL_D.
-	FLOAT_W = FSREG + IND_ALL_W.
+	FLOAT_D = FREG + IND_ALL_D + FRAME_D.
+	FLOAT_W = FSREG + IND_ALL_W + FRAME_W.
 
 
 INSTRUCTIONS
@@ -293,21 +329,21 @@ INSTRUCTIONS
   frsp            FSREG+LOCAL:wo, FREG:ro cost(4, 5).
   fsub            FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 5).
   fsubs           FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 5).
-  lbz             GPR:wo, IND_RC_B+IND_RL_B:ro cost(4, 3).
+  lbz             GPR:wo, SET_RC_B:ro cost(4, 3).
   lbzx            GPR:wo, GPR:ro, GPR:ro cost(4, 3).
-  lfd             FPR+DLOCAL:wo, IND_RC_D+IND_RL_D:ro cost(4, 5).
+  lfd             FPR+DLOCAL:wo, SET_RC_D:ro cost(4, 5).
   lfdu            FPR:wo, IND_RC_D:ro cost(4, 5).
   lfdx            FPR:wo, GPR:ro, GPR:ro cost(4, 5).
-  lfs             FSREG+LOCAL:wo, IND_RC_W+IND_RL_W:ro cost(4, 4).
+  lfs             FSREG+LOCAL:wo, SET_RC_W:ro cost(4, 4).
   lfsu            FSREG:wo, IND_RC_W:rw cost(4, 4).
   lfsx            FSREG:wo, GPR:ro, GPR:ro cost(4, 4).
-  lha             GPR:wo, IND_RC_H_S+IND_RL_H_S:ro cost(4, 3).
+  lha             GPR:wo, SET_RC_H_S:ro cost(4, 3).
   lhax            GPR:wo, GPR:ro, GPR:ro cost(4, 3).
-  lhz             GPR:wo, IND_RC_H+IND_RL_H:ro cost(4, 3).
+  lhz             GPR:wo, SET_RC_H:ro cost(4, 3).
   lhzx            GPR:wo, GPR:ro, GPR:ro cost(4, 3).
   lwzu            GPR:wo, IND_RC_W:rw cost(4, 3).
   lwzx            GPR:wo, GPR:ro, GPR:ro cost(4, 3).
-  lwz             GPR+LOCAL:wo, IND_RC_W+IND_RL_W:ro cost(4, 3).
+  lwz             GPR+LOCAL:wo, SET_RC_W:ro cost(4, 3).
   mfcr            GPR:wo cost(4,2).
   mfspr           GPR:wo, SPR:ro cost(4, 3).
   mtspr           SPR:wo, GPR:ro cost(4, 2).
@@ -336,17 +372,17 @@ INSTRUCTIONS
   sraw            GPR+LOCAL:wo, GPR:ro, GPR:ro cost(4, 2).
   srawi           GPR+LOCAL:wo, GPR:ro, CONST:ro cost(4, 2).
   srw             GPR+LOCAL:wo, GPR:ro, GPR:ro.
-  stb             GPR:ro, IND_RC_B+IND_RL_B:rw cost(4, 3).
+  stb             GPR:ro, SET_RC_B:rw cost(4, 3).
   stbx            GPR:ro, GPR:ro, GPR:ro cost(4, 3).
-  stfd            FPR:ro, IND_RC_D+IND_RL_D:rw cost(4, 4).
+  stfd            FPR:ro, SET_RC_D:rw cost(4, 4).
   stfdu           FPR:ro, IND_RC_D:rw cost(4, 4).
   stfdx           FPR:ro, GPR:ro, GPR:ro cost(4, 4).
-  stfs            FSREG:ro, IND_RC_W+IND_RL_W:rw cost(4, 3).
+  stfs            FSREG:ro, SET_RC_W:rw cost(4, 3).
   stfsu           FSREG:ro, IND_RC_W:rw cost(4, 3).
   stfsx           FSREG:ro, GPR:ro, GPR:ro cost(4, 3).
-  sth             GPR:ro, IND_RC_H+IND_RL_H:rw cost(4, 3).
+  sth             GPR:ro, SET_RC_H:rw cost(4, 3).
   sthx            GPR:ro, GPR:ro, GPR:ro cost(4, 3).
-  stw             GPR:ro, IND_RC_W+IND_RL_W:rw cost(4, 3).
+  stw             GPR:ro, SET_RC_W:rw cost(4, 3).
   stwx            GPR:ro, GPR:ro, GPR:ro cost(4, 3).
   stwu            GPR:ro, IND_RC_W:rw cost(4, 3).
   xor             GPR:wo, GPR:ro, GPR:ro.
@@ -439,7 +475,7 @@ MOVES
 
 /* Read byte */
 
-	from IND_RC_B+IND_RL_B to GPR
+	from SET_RC_B to GPR
 		gen lbz %2, %1
 
 	from IND_RR_B to GPR
@@ -447,7 +483,7 @@ MOVES
 
 /* Write byte */
 
-	from GPR to IND_RC_B+IND_RL_B
+	from GPR to SET_RC_B
 		gen stb %1, %2
 
 	from GPR to IND_RR_B
@@ -455,13 +491,13 @@ MOVES
 
 /* Read halfword (short) */
 
-	from IND_RC_H+IND_RL_H to GPR
+	from SET_RC_H to GPR
 		gen lhz %2, %1
 
 	from IND_RR_H to GPR
 		gen lhzx %2, %1.reg1, %1.reg2
 
-	from IND_RC_H_S+IND_RL_H_S to GPR
+	from SET_RC_H_S to GPR
 		gen lha %2, %1
 
 	from IND_RR_H_S to GPR
@@ -469,7 +505,7 @@ MOVES
 
 /* Write halfword */
 
-	from GPR to IND_RC_H+IND_RL_H
+	from GPR to SET_RC_H
 		gen sth %1, %2
 
 	from GPR to IND_RR_H
@@ -477,13 +513,13 @@ MOVES
 
 /* Read word */
 
-	from IND_RC_W+IND_RL_W to GPR
+	from SET_RC_W to GPR
 		gen lwz %2, %1
 
 	from IND_RR_W to GPR
 		gen lwzx %2, %1.reg1, %1.reg2
 
-	from IND_RC_W+IND_RL_W to FSREG
+	from SET_RC_W to FSREG
 		gen lfs %2, %1
 
 	from IND_RR_W to FSREG
@@ -491,13 +527,13 @@ MOVES
 
 /* Write word */
 
-	from GPR to IND_RC_W+IND_RL_W
+	from GPR to SET_RC_W
 		gen stw %1, %2
 
 	from GPR to IND_RR_W
 		gen stwx %1, %2.reg1, %2.reg2
 
-	from FSREG to IND_RC_W+IND_RL_W
+	from FSREG to SET_RC_W
 		gen stfs %1, %2
 
 	from FSREG to IND_RR_W
@@ -505,7 +541,7 @@ MOVES
 
 /* Read double */
 
-	from IND_RC_D+IND_RL_D to FPR
+	from SET_RC_D to FPR
 		gen lfd %2, %1
 
 	from IND_RR_D to FPR
@@ -513,12 +549,41 @@ MOVES
 
 /* Write double */
 
-	from FPR to IND_RC_D+IND_RL_D
+	from FPR to SET_RC_D
 		gen stfd %1, %2
 
 	from FPR to IND_RR_D
 		gen stfdx %1, %2.reg1, %2.reg2
 
+/* LXFRAME is a lexical frame from the static chain.  We define a move
+   so "uses REG={LXFRAME, $1}" may find a register with the same
+   frame, and not repeat the move.  This move can't search for a REG
+   with {LXFRAME, $1-1}, but must always start from fp.  The static
+   chain, if it exists, is the argument at fp + EM_BSIZE. */
+
+	from LXFRAME %level==1 to REG
+		gen	lwz %2, {IND_RC_W, fp, EM_BSIZE}
+	from LXFRAME %level==2 to REG
+		gen	lwz %2, {IND_RC_W, fp, EM_BSIZE}
+			/* PowerPC can't add r0 + EM_BSIZE,
+			 * so %2 must not be r0. */
+			lwz %2, {IND_RC_W, %2, EM_BSIZE}
+	from LXFRAME %level==3 to REG
+		gen	lwz %2, {IND_RC_W, fp, EM_BSIZE}
+			lwz %2, {IND_RC_W, %2, EM_BSIZE}
+			lwz %2, {IND_RC_W, %2, EM_BSIZE}
+	from LXFRAME %level==4 to REG
+		gen	lwz %2, {IND_RC_W, fp, EM_BSIZE}
+			lwz %2, {IND_RC_W, %2, EM_BSIZE}
+			lwz %2, {IND_RC_W, %2, EM_BSIZE}
+			lwz %2, {IND_RC_W, %2, EM_BSIZE}
+	from LXFRAME to REG  /* assuming %level in 2 to 0x8000 */
+		gen	li %2, {C, %1.level-1}
+			mtspr ctr, %2
+			lwz %2, {IND_RC_W, fp, EM_BSIZE}
+		1:	lwz %2, {IND_RC_W, %2, EM_BSIZE}
+			bdnz {LABEL, "1b"}
+
 /* Logicals */
 
 	from NOT_R to GPR
@@ -661,6 +726,11 @@ TESTS
 
 STACKINGRULES
 
+	/* We don't allow GPR-REG on the stack.  The intent is to ban
+	 * r0 from the stack, but this also bans fp from the stack.
+	 * This is odd because most other tables for ncg allow the
+	 * frame pointer on the stack.
+	 */
 	from REG to STACK
 		gen
 			COMMENT("stack REG")
@@ -760,6 +830,11 @@ COERCIONS
 		gen move {SUM_RR, %1.reg1, %1.reg2}, r12
 		yields {IND_RC_W, r12, 4} {IND_RC_W, r12, 0}
 
+	from FRAME_D %off<=0x7FFA
+		yields
+			{FRAME_W, %1.level, %1.reg, %1.off+4, 4}
+			{FRAME_W, %1.level, %1.reg, %1.off, 4}
+
 
 PATTERNS
 
@@ -897,24 +972,47 @@ PATTERNS
 		uses REG={SUM_RIS, fp, his($1)}
 		yields {SUM_RC, %a, los($1)}
 
+	pat lal loi smalls($1) && $2==1    /* Load byte from local */
+		yields {FRAME_B, 0, fp, $1, 1}
+
+	/* Load half-word from local and sign-extend */
+	pat lal loi loc loc cii smalls($1) && $2==2 && $3==2 && $4==4
+		yields {FRAME_H_S, 0, fp, $1, 2}
+
+	pat lal loi smalls($1) && $2==2    /* Load half-word from local */
+		yields {FRAME_H, 0, fp, $1, 2}
+
 	/* Load word from local */
 	pat lol inreg($1)==reg_any || inreg($1)==reg_float
 		yields {LOCAL, $1}
+	pat lol smalls($1)
+		yields {FRAME_W, 0, fp, $1, 4}
 	pat lol
 		leaving
 			lal $1
 			loi 4
 
-	/* Load double-word from local */
-	pat ldl inreg($1)==reg_float
+	pat ldl inreg($1)==reg_float       /* Load double-word from local */
 		yields {DLOCAL, $1}
+	pat ldl smalls($1) && smalls($1+4)
+		/* smalls($1+4) implies FRAME_D %off<=0xFFFA */
+		yields {FRAME_D, 0, fp, $1, 8}
 	pat ldl
 		leaving
 			lal $1
 			loi 8
 
-	/* Store word to local */
-	pat stl inreg($1)==reg_any
+	pat lal sti smalls($1) && $2==1    /* Store byte to local */
+		with REG
+			kills IND_V, FRAME_V %level==0 && fover($1, 1)
+			gen move %1, {FRAME_B, 0, fp, $1, 1}
+
+	pat lal sti smalls($1) && $2==2    /* Store half-word to local */
+		with REG
+			kills IND_V, FRAME_V %level==0 && fover($1, 2)
+			gen move %1, {FRAME_H, 0, fp, $1, 2}
+
+	pat stl inreg($1)==reg_any         /* Store word to local */
 		with exact INT_W
 			/* ncg fails to infer that regvar($1) is dead! */
 			kills regvar($1)
@@ -924,58 +1022,69 @@ PATTERNS
 				lwz {LOCAL, $1}, {IND_RC_W, sp, 0}
 				addi sp, sp, {C, 4}
 	pat stl inreg($1)==reg_float
-		with exact FSREG+IND_ALL_W
+		with exact FLOAT_W
 			kills regvar_w($1, reg_float)
 			gen move %1, {FSREG_EXPR, regvar_w($1, reg_float)}
 		with STACK
 			gen
 				lfs {LOCAL, $1}, {IND_RC_W, sp, 0}
 				addi sp, sp, {C, 4}
+	pat stl smalls($1)
+		with REG+FSREG
+			kills IND_V, FRAME_V %level==0 && fover($1, 4)
+			gen move %1, {FRAME_W, 0, fp, $1, 4}
 	pat stl
 		leaving
 			lal $1
 			sti 4
 
-	/* Store double-word to local */
-	pat sdl inreg($1)==reg_float
-		with exact FREG+IND_ALL_D
+	pat sdl inreg($1)==reg_float       /* Store double-word to local */
+		with exact FLOAT_D
 			kills regvar_d($1, reg_float)
 			gen move %1, {FPR_EXPR, regvar_d($1, reg_float)}
 		with STACK
 			gen
 				lfd {DLOCAL, $1}, {IND_RC_D, sp, 0}
 				addi sp, sp, {C, 8}
+	pat sdl smalls($1) && smalls($1+4)
+		with REG REG
+			kills IND_V, FRAME_V %level==0 && fover($1, 8)
+			gen
+				move %1, {FRAME_W, 0, fp, $1, 4}
+				move %2, {FRAME_W, 0, fp, $1+4, 4}
+		with FREG
+			kills IND_V, FRAME_V %level==0 && fover($1, 8)
+			gen move %1, {FRAME_D, 0, fp, $1, 8}
 	pat sdl
 		leaving
 			lal $1
 			sti 8
 
-	/* Load indirect from local */
-	pat lil inreg($1)==reg_any
+	pat lil inreg($1)==reg_any         /* Load indirect from local */
 		yields {IND_RC_W, regvar($1), 0}
 	pat lil
 		leaving
 			lol $1
 			loi 4
 
-	pat sil                            /* Save to indirected local */
+	pat sil                            /* Store indirect to local */
 		leaving
 			lol $1
 			sti 4
 
-	pat zrl                             /* Zero local */
+	pat zrl                            /* Zero local */
 		leaving
 			loc 0
 			stl $1
 
-	pat inl                             /* Increment local */
+	pat inl                            /* Increment local */
 		leaving
 			lol $1
 			loc 1
 			adi 4
 			stl $1
 
-	pat del                             /* Decrement local */
+	pat del                            /* Decrement local */
 		leaving
 			lol $1
 			loc 1
@@ -983,9 +1092,87 @@ PATTERNS
 			stl $1
 
 
+/* Local variables of procedures on static chain */
+
+	/* lxa (lexical argument base) -> lxl (lexical local base) */
+	pat lxa adp nicelx($1)
+		leaving lxl $1 adp $2+EM_BSIZE
+	pat lxa lof nicelx($1)
+		leaving lxl $1 lof $2+EM_BSIZE
+	pat lxa ldf nicelx($1)
+		leaving lxl $1 ldf $2+EM_BSIZE
+	pat lxa stf nicelx($1)
+		leaving lxl $1 stf $2+EM_BSIZE
+	pat lxa sdf nicelx($1)
+		leaving lxl $1 sdf $2+EM_BSIZE
+	pat lxa $1==0 || nicelx($1)
+		leaving lxl $1 adp EM_BSIZE
+
+	/* Load locals in statically enclosing procedures */
+	pat lxl adp loi nicelx($1) && smalls($2) && $3==1
+		uses REG={LXFRAME, $1}
+		yields {FRAME_B, $1, %a, $2, 1}
+	pat lxl adp loi loc loc cii nicelx($1) && smalls($2) &&
+	                            $3==2 && $4==2 && $5==4
+		uses REG={LXFRAME, $1}
+		yields {FRAME_H_S, $1, %a, $2, 2}
+	pat lxl adp loi nicelx($1) && smalls($2) && $3==2
+		uses REG={LXFRAME, $1}
+		yields {FRAME_H, $1, %a, $2, 2}
+	pat lxl lof nicelx($1) && smalls($2)
+		uses REG={LXFRAME, $1}
+		yields {FRAME_W, $1, %a, $2, 4}
+	pat lxl ldf nicelx($1) && smalls($2) && smalls($2+4)
+		uses REG={LXFRAME, $1}
+		/* smalls($2+4) implies FRAME_D %off<=0xFFFA */
+		yields {FRAME_D, $1, %a, $2, 8}
+
+	/* Store locals in statically enclosing procedures */
+	pat lxl adp sti nicelx($1) && smalls($2) && $3==1
+		with REG
+			kills IND_V, FRAME_V %level==$1 && fover($2, 1)
+			uses REG={LXFRAME, $1}
+			gen move %1, {FRAME_B, $1, %a, $2, 1}
+	pat lxl adp sti nicelx($1) && smalls($2) && $3==2
+		with REG
+			kills IND_V, FRAME_V %level==$1 && fover($2, 2)
+			uses REG={LXFRAME, $1}
+			gen move %1, {FRAME_H, $1, %a, $2, 2}
+	pat lxl stf nicelx($1) && smalls($2)
+		with REG+FSREG
+			kills IND_V, FRAME_V %level==$1 && fover($2, 4)
+			uses REG={LXFRAME, $1}
+			gen move %1, {FRAME_W, $1, %a, $2, 4}
+	pat lxl sdf nicelx($1) && smalls($2) && smalls($2+4)
+		with REG REG
+			kills IND_V, FRAME_V %level==$1 && fover($2, 8)
+			uses REG={LXFRAME, $1}
+			gen
+				move %1, {FRAME_W, $1, %a, $2, 4}
+				move %2, {FRAME_W, $1, %a, $2+4, 4}
+		with FREG
+			kills IND_V, FRAME_V %level==$1 && fover($2, 8)
+			uses REG={LXFRAME, $1}
+			gen move %1, {FRAME_D, $1, %a, $2, 8}
+
+	/* Programs use "lxl cal" to pass the static chain and call a
+	 * nested procedure.  This must push a token LXFRAME or the
+	 * register fp to the real stack. */
+
+	/* Local base of procedure on static chain */
+	pat lxl nicelx($1)
+		uses REG={LXFRAME, $1}
+		yields %a  /* Can't yield LXFRAME. */
+
+	pat lxl $1==0                      /* Our local base */
+		with STACK
+			gen stwu fp, {IND_RC_W, sp, 0-4}
+			/* Can't yield fp. */
+
+
 /* Global variables */
 
-	pat lpi                            /* Load address of external function */
+	pat lpi                            /* Load address of function */
 		leaving
 			lae $1
 
@@ -2008,30 +2195,11 @@ PATTERNS
 		leaving
 			ret 0
 
-	/*
-	 * Lexical local base: lxl 0 yields our fp, lxl n yields the
-	 * fp of the nth statically enclosing procedure.
+	/* Our caller's local base, "lxl 0 dch", appears in
+	 * lang/cem/libcc.ansi/setjmp/setjmp.e, lang/m2/libm2/par_misc.e
 	 */
-	pat lxl $1==0
-		leaving
-			lor 0
-	pat lxl $1==1
-		yields {IND_RC_W, fp, SL_OFFSET}
-	pat lxl $1==2
-		uses REG={IND_RC_W, fp, SL_OFFSET}
-		yields {IND_RC_W, %a, SL_OFFSET}
-	pat lxl $1==3
-		uses REG={IND_RC_W, fp, SL_OFFSET}, reusing %a, REG
-		gen move {IND_RC_W, %a, SL_OFFSET}, %b
-		yields {IND_RC_W, %b, SL_OFFSET}
-	pat lxl $1>=4 && $1<=0x8000
-		uses REG={IND_RC_W, fp, SL_OFFSET},
-		     REG={CONST_0000_7FFF, $1-1}
-		gen
-			mtspr ctr, %b
-		1:	lwz %a, {IND_RC_W, %a, SL_OFFSET}
-			bdnz {LABEL, "1b"}
-		yields %a
+	pat lxl dch $1==0
+		yields {IND_RC_W, fp, FP_OFFSET}
 
 	pat dch               /* Dynamic chain: LB -> caller's LB */
 		with REG
@@ -2041,11 +2209,6 @@ PATTERNS
 		leaving
 			adp EM_BSIZE
 
-	pat lxa                            /* Lexical argument base */
-		leaving
-			lxl $1
-			lpb
-
 	pat gto                            /* longjmp */
 		with STACK
 			uses REG
@@ -2058,26 +2221,20 @@ PATTERNS
 				bctr.
 
 	pat lor $1==0                      /* Load local base */
-		uses REG
-		gen
-			move fp, %a
-		yields %a
+		leaving lxl 0
 
 	pat lor $1==1                      /* Load stack pointer */
-		uses REG
-		gen
-			move sp, %a
-		yields %a
+		with STACK
+			uses REG=sp
+			yields %a  /* Can't yield sp. */
 
 	pat str $1==0                      /* Store local base */
 		with REG
-			gen
-				move %1, fp
+			gen move %1, fp
 
 	pat str $1==1                      /* Store stack pointer */
 		with REG
-			gen
-				move %1, sp
+			gen move %1, sp
 
 	pat lae rck $2==4                  /* Range check */
 		with REG

From 5867ca2f2c88e74f5a92210dc784ddef8a072512 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Fri, 22 Dec 2017 19:57:42 -0500
Subject: [PATCH 21/55] Remove two obsolete patterns.

These patterns seem to have no effect on the generated code.
---
 mach/powerpc/ncg/table | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 3f4794fc8..129a09a41 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -927,11 +927,6 @@ PATTERNS
 		leaving
 			cal ".exg"
 
-	pat ste loe $1==$2                 /* Store then load external */
-		leaving
-			dup 4
-			ste $1
-
 
 /* Type conversions */
 
@@ -1060,9 +1055,7 @@ PATTERNS
 			lal $1
 			sti 8
 
-	pat lil inreg($1)==reg_any         /* Load indirect from local */
-		yields {IND_RC_W, regvar($1), 0}
-	pat lil
+	pat lil                            /* Load indirect from local */
 		leaving
 			lol $1
 			loi 4

From f96f918a2952c4b5928d6c5f1920985f87e0def7 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Fri, 22 Dec 2017 20:37:39 -0500
Subject: [PATCH 22/55] Generate shorter code for ret 4 and ret 8.

---
 mach/powerpc/ncg/table | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 129a09a41..04619aa71 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -2089,7 +2089,7 @@ PATTERNS
 	pat lfr $1==INT32                  /* Load function result, word */
 		yields r3
 
-	pat lfr $1==INT64                  /* Load function result, double-word */
+	pat lfr $1==INT64           /* Load function result, double-word */
 		yields r4 r3
 
 	pat ret $1==0                      /* Return from procedure */
@@ -2108,11 +2108,22 @@ PATTERNS
 	pat ret $1==4                      /* Return from procedure, word */
 		with REG3
 			leaving ret 0
+		with STACK
+			gen lwz r3, {IND_RC_W, sp, 0}
+			leaving ret 0
 
 	pat ret $1==8                      /* Return from proc, double-word */
-		with REG3 REG
+		with REG3 INT_W
 			gen move %2, r4
 			leaving ret 0
+		with REG3 STACK
+			gen lwz r4, {IND_RC_W, sp, 0}
+			leaving ret 0
+		with STACK
+			gen
+				lwz r3, {IND_RC_W, sp, 0}
+				lwz r4, {IND_RC_W, sp, 4}
+			leaving ret 0
 
 	/*
 	 * These rules for blm/bls are wrong if length is zero.

From c964eeddba8458ac5795f3f0cc013e1f94b6e99a Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Fri, 22 Dec 2017 21:18:58 -0500
Subject: [PATCH 23/55] Remove INT32 and such.  Adjust indentation.

I understand `loi 4` more easily than `loi INT32`, because `loi 4`
appears in .e files.  So remove INT8, INT16, INT32, INT64.

Add a comment to explain r3 during unconditional jumps.
---
 mach/powerpc/ncg/table | 223 ++++++++++++++++-------------------------
 1 file changed, 89 insertions(+), 134 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 04619aa71..e566c51b9 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -2,18 +2,12 @@ EM_WSIZE = 4
 EM_PSIZE = 4
 EM_BSIZE = 8    /* two words saved in call frame */
 
-INT8 = 1        /* Size of values */
-INT16 = 2
-INT32 = 4
-INT64 = 8
-
 FP_OFFSET = 0   /* Offset of saved FP relative to our FP */
 PC_OFFSET = 4   /* Offset of saved PC relative to our FP */
 
 #define COMMENT(n) /* comment {LABEL, n} */
 
-
-#define nicesize(x) ((x)==INT8 || (x)==INT16 || (x)==INT32 || (x)==INT64)
+#define nicesize(x) ((x)==1 || (x)==2 || (x)==4 || (x)==8)
 
 #define smalls(n) sfit(n, 16)
 #define smallu(n) ufit(n, 16)
@@ -1176,35 +1170,35 @@ PATTERNS
 	pat loe                            /* Load word external */
 		leaving
 			lae $1
-			loi INT32
+			loi 4
 
 	pat ste                            /* Store word external */
 		leaving
 			lae $1
-			sti INT32
+			sti 4
 
 	pat lde                            /* Load double-word external */
 		leaving
 			lae $1
-			loi INT64
+			loi 8
 
 	pat sde                            /* Store double-word external */
 		leaving
 			lae $1
-			sti INT64
+			sti 8
 
-	pat zre                             /* Zero external */
+	pat zre                            /* Zero external */
 		leaving
 			loc 0
 			ste $1
 
-	pat ine                             /* Increment external */
+	pat ine                            /* Increment external */
 		leaving
 			loe $1
 			inc
 			ste $1
 
-	pat dee                             /* Decrement external */
+	pat dee                            /* Decrement external */
 		leaving
 			loe $1
 			dec
@@ -1216,27 +1210,27 @@ PATTERNS
 	pat lof                            /* Load word offsetted */
 		leaving
 			adp $1
-			loi INT32
+			loi 4
 
 	pat ldf                            /* Load double-word offsetted */
 		leaving
 			adp $1
-			loi INT64
+			loi 8
 
 	pat stf                            /* Store word offsetted */
 		leaving
 			adp $1
-			sti INT32
+			sti 4
 
 	pat sdf                            /* Store double-word offsetted */
 		leaving
 			adp $1
-			sti INT64
+			sti 8
 
 
 /* Loads and stores */
 
-	pat loi $1==INT8                   /* Load byte indirect */
+	pat loi $1==1                      /* Load byte indirect */
 		with REG
 			yields {IND_RC_B, %1, 0}
 		with exact SUM_RC
@@ -1246,8 +1240,8 @@ PATTERNS
 		with exact SUM_RR
 			yields {IND_RR_B, %1.reg1, %1.reg2}
 
-	pat loi loc loc cii $1==INT16 && $2==INT16 && $3==INT32
-	/* Load half-word indirect and sign extend */
+	/* Load half-word indirect and sign-extend */
+	pat loi loc loc cii $1==2 && $2==2 && $3==4
 		with REG
 			yields {IND_RC_H_S, %1, 0}
 		with exact SUM_RC
@@ -1257,7 +1251,7 @@ PATTERNS
 		with exact SUM_RR
 			yields {IND_RR_H_S, %1.reg1, %1.reg2}
 
-	pat loi $1==INT16                  /* Load half-word indirect */
+	pat loi $1==2                      /* Load half-word indirect */
 		with REG
 			yields {IND_RC_H, %1, 0}
 		with exact SUM_RC
@@ -1267,7 +1261,7 @@ PATTERNS
 		with exact SUM_RR
 			yields {IND_RR_H, %1.reg1, %1.reg2}
 
-	pat loi $1==INT32                  /* Load word indirect */
+	pat loi $1==4                      /* Load word indirect */
 		with REG
 			yields {IND_RC_W, %1, 0}
 		with exact SUM_RC
@@ -1277,7 +1271,7 @@ PATTERNS
 		with exact SUM_RR
 			yields {IND_RR_W, %1.reg1, %1.reg2}
 
-	pat loi $1==INT64                  /* Load double-word indirect */
+	pat loi $1==8                      /* Load double-word indirect */
 		with REG
 			yields {IND_RC_D, %1, 0}
 		with exact SUM_RC
@@ -1295,10 +1289,9 @@ PATTERNS
 	pat los $1==4                      /* Load arbitrary size */
 		with REG3 STACK
 			kills ALL
-			gen
-				bl {LABEL, ".los4"}
+			gen bl {LABEL, ".los4"}
 
-	pat sti $1==INT8                   /* Store byte indirect */
+	pat sti $1==1                      /* Store byte indirect */
 		with REG REG
 			kills MEMORY
 			gen move %2, {IND_RC_B, %1, 0}
@@ -1312,7 +1305,7 @@ PATTERNS
 			kills MEMORY
 			gen move %2, {IND_RR_B, %1.reg1, %1.reg2}
 
-	pat sti $1==INT16                  /* Store half-word indirect */
+	pat sti $1==2                      /* Store half-word indirect */
 		with REG REG
 			kills MEMORY
 			gen move %2, {IND_RC_H, %1, 0}
@@ -1326,7 +1319,7 @@ PATTERNS
 			kills MEMORY
 			gen move %2, {IND_RR_H, %1.reg1, %1.reg2}
 
-	pat sti $1==INT32                  /* Store word indirect */
+	pat sti $1==4                      /* Store word indirect */
 		with REG REG+FSREG
 			kills MEMORY
 			gen move %2, {IND_RC_W, %1, 0}
@@ -1340,7 +1333,7 @@ PATTERNS
 			kills MEMORY
 			gen move %2, {IND_RR_W, %1.reg1, %1.reg2}
 
-	pat sti $1==INT64                  /* Store double-word indirect */
+	pat sti $1==8                      /* Store double-word indirect */
 		with REG FREG
 			kills MEMORY
 			gen move %2, {IND_RC_D, %1, 0}
@@ -1367,8 +1360,7 @@ PATTERNS
 	pat sts $1==4                      /* Store arbitrary size */
 		with REG3 STACK
 			kills ALL
-			gen
-				bl {LABEL, ".sts4"}
+			gen bl {LABEL, ".sts4"}
 
 
 /* Arithmetic wrappers */
@@ -1560,7 +1552,7 @@ PATTERNS
 		leaving
 			cal ".xor"
 
-	pat com $1==INT32                  /* NOT word */
+	pat com $1==4                      /* NOT word */
 		with exact AND_RR
 			yields {NAND_RR, %1.reg1, %1.reg2}
 		with exact OR_RR
@@ -1680,8 +1672,7 @@ PATTERNS
 /* Arrays */
 
 	pat aar $1==4                      /* Address of array element */
-		leaving
-			cal ".aar4"
+		leaving cal ".aar4"
 
 	pat lar $1==4                      /* Load from array */
 		with STACK
@@ -1993,7 +1984,7 @@ PATTERNS
 	 * puts gt in the sign bit, to reverse the comparison.
 	 */
 
-	pat cmi $1==INT32                  /* Signed tristate compare */
+	pat cmi $1==4                      /* Signed tristate compare */
 		with REG CONST2
 			uses reusing %1, REG={COND_RC, %1, %2.val}
 			gen rlwinm %a, %a, {C, 1}, {C, 31}, {C, 0}
@@ -2007,7 +1998,7 @@ PATTERNS
 			gen extlwi %a, %a, {C, 2}, {C, 0}
 			yields %a
 
-	pat cmu $1==INT32                  /* Unsigned tristate compare */
+	pat cmu $1==4                      /* Unsigned tristate compare */
 		with REG UCONST2
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
 			gen rlwinm %a, %a, {C, 1}, {C, 31}, {C, 0}
@@ -2023,11 +2014,11 @@ PATTERNS
 
 	pat cmp                            /* Compare pointers */
 		leaving
-			cmu INT32
+			cmu 4
 
-	pat cms $1==INT32                  /* Compare blocks (word sized) */
+	pat cms $1==4                      /* Compare blocks (word sized) */
 		leaving
-			cmi INT32
+			cmi 4
 
 	pat cms defined($1)
 		leaving
@@ -2041,34 +2032,32 @@ PATTERNS
 
 /* Other branching and labelling */
 
+	/* During an unconditional jump, if the top element on the
+	 * stack has 4 bytes, then we hold it in register r3.
+	 */
 	pat lab topeltsize($1)==4 && !fallthrough($1)
 		kills ALL
-		gen
-			labeldef $1
-			yields r3
+		gen labeldef $1
+		yields r3
 
 	pat lab topeltsize($1)==4 && fallthrough($1)
 		with REG3 STACK
-		kills ALL
-		gen
-			labeldef $1
-		yields r3
+			kills ALL
+			gen labeldef $1
+			yields r3
 
-	pat lab topeltsize($1)!=4
+	pat lab topeltsize($1)!=4          /* Label without r3 */
 		with STACK
-		kills ALL
-		gen
-			labeldef $1
+			kills ALL
+			gen labeldef $1
 
-	pat bra topeltsize($1)==4          /* Unconditional jump with TOS GPRister */
+	pat bra topeltsize($1)==4          /* Branch with r3 */
 		with REG3 STACK
-		gen
-			b {LABEL, $1}
+			gen b {LABEL, $1}
 
-	pat bra topeltsize($1)!=4          /* Unconditional jump without TOS GPRister */
+	pat bra topeltsize($1)!=4          /* Branch without r3 */
 		with STACK
-		gen
-			b {LABEL, $1}
+			gen b {LABEL, $1}
 
 
 /* Miscellaneous */
@@ -2076,8 +2065,7 @@ PATTERNS
 	pat cal                            /* Call procedure */
 		with STACK
 			kills ALL
-			gen
-				bl {LABEL, $1}
+			gen bl {LABEL, $1}
 
 	pat cai                            /* Call procedure indirect */
 		with REG STACK
@@ -2086,10 +2074,10 @@ PATTERNS
 				mtspr ctr, %1
 				bctrl.
 
-	pat lfr $1==INT32                  /* Load function result, word */
+	pat lfr $1==4                      /* Load function result, word */
 		yields r3
 
-	pat lfr $1==INT64           /* Load function result, double-word */
+	pat lfr $1==8               /* Load function result, double-word */
 		yields r4 r3
 
 	pat ret $1==0                      /* Return from procedure */
@@ -2151,14 +2139,12 @@ PATTERNS
 	pat csa                            /* Array-lookup switch */
 		with STACK
 			kills ALL
-			gen
-				b {LABEL, ".csa"}
+			gen b {LABEL, ".csa"}
 
 	pat csb                            /* Table-lookup switch */
 		with STACK
 			kills ALL
-			gen
-				b {LABEL, ".csb"}
+			gen b {LABEL, ".csb"}
 
 
 /* EM specials */
@@ -2174,30 +2160,24 @@ PATTERNS
 			ste "hol0"
 
 	pat lni                            /* Increment line number */
-		leaving
-			ine "hol0"
+		leaving ine "hol0"
 
 	pat lim                            /* Load EM trap ignore mask */
-		leaving
-			lde ".ignmask"
+		leaving lde ".ignmask"
 
 	pat sim                            /* Store EM trap ignore mask */
-		leaving
-			ste ".ignmask"
+		leaving ste ".ignmask"
 
 	pat trp                            /* Raise EM trap */
 		with REG3
 			kills ALL
-			gen
-				bl {LABEL, ".trap"}
+			gen bl {LABEL, ".trap"}
 
 	pat sig                            /* Set trap handler */
-		leaving
-			ste ".trppc"
+		leaving ste ".trppc"
 
 	pat rtt                            /* Return from trap */
-		leaving
-			ret 0
+		leaving ret 0
 
 	/* Our caller's local base, "lxl 0 dch", appears in
 	 * lang/cem/libcc.ansi/setjmp/setjmp.e, lang/m2/libm2/par_misc.e
@@ -2210,8 +2190,7 @@ PATTERNS
 			yields {IND_RC_W, %1, FP_OFFSET}
 
 	pat lpb                            /* LB -> argument base */
-		leaving
-			adp EM_BSIZE
+		leaving adp EM_BSIZE
 
 	pat gto                            /* longjmp */
 		with STACK
@@ -2253,15 +2232,14 @@ PATTERNS
 
 /* Single-precision floating-point */
 
-	pat zrf $1==INT32                  /* Push zero */
+	pat zrf $1==4                      /* Push zero */
 		leaving
 			loe ".fs_00000000"
 
 	pat adf $1==4                      /* Add single */
 		with FSREG FSREG
 			uses reusing %1, FSREG
-			gen
-				fadds %a, %2, %1
+			gen fadds %a, %2, %1
 			yields %a
 	pat adf stl $1==4 && inreg($2)==reg_float
 		with FSREG FSREG
@@ -2270,8 +2248,7 @@ PATTERNS
 	pat sbf $1==4                      /* Subtract single */
 		with FSREG FSREG
 			uses reusing %1, FSREG
-			gen
-				fsubs %a, %2, %1
+			gen fsubs %a, %2, %1
 			yields %a
 	pat sbf stl $1==4 && inreg($2)==reg_float
 		with FSREG FSREG
@@ -2280,34 +2257,31 @@ PATTERNS
 	pat mlf $1==4                      /* Multiply single */
 		with FSREG FSREG
 			uses reusing %1, FSREG
-			gen
-				fmuls %a, %2, %1
+			gen fmuls %a, %2, %1
 			yields %a
 	pat mlf stl $1==4 && inreg($2)==reg_float
 		with FSREG FSREG
 			gen fmuls {LOCAL, $2}, %2, %1
 
-	pat dvf $1==INT32                  /* Divide single */
+	pat dvf $1==4                      /* Divide single */
 		with FSREG FSREG
 			uses reusing %1, FSREG
-			gen
-				fdivs %a, %2, %1
+			gen fdivs %a, %2, %1
 			yields %a
 	pat dvf stl $1==4 && inreg($2)==reg_float
 		with FSREG FSREG
 			gen fdivs {LOCAL, $2}, %2, %1
 
-	pat ngf $1==INT32                  /* Negate single */
+	pat ngf $1==4                      /* Negate single */
 		with FSREG
 			uses reusing %1, FSREG
-			gen
-				fneg %a, %1
+			gen fneg %a, %1
 			yields %a
 	pat ngf stl $1==4 && inreg($2)==reg_float
 		with FSREG
 			gen fneg {LOCAL, $2}, %1
 
-	pat cmf $1==INT32                  /* Compare single */
+	pat cmf $1==4                      /* Compare single */
 		with FSREG FSREG
 			uses REG={COND_FS, %2, %1}
 			gen extlwi %a, %a, {C, 2}, {C, 0}
@@ -2358,12 +2332,11 @@ PATTERNS
 	pat cmf zlt $1==4    call cmf4zxx("blt")
 	pat cmf zle $1==4    call cmf4zxx("ble")
 
-	pat loc loc cff $1==INT32 && $2==INT64 /* Convert single to double */
+	pat loc loc cff $1==4 && $2==8     /* Convert single to double */
 		with FSREG
 			yields %1.1
 
-	/* Convert single to signed int */
-	pat loc loc cfi $1==4 && $2==4
+	pat loc loc cfi $1==4 && $2==4     /* Single to signed int */
 		leaving
 			loc 4
 			loc 8
@@ -2372,8 +2345,7 @@ PATTERNS
 			loc 4
 			cfi
 
-	/* Convert single to unsigned int */
-	pat loc loc cfu $1==4 && $2==4
+	pat loc loc cfu $1==4 && $2==4     /* Single to unsigned int */
 		leaving
 			loc 4
 			loc 8
@@ -2382,8 +2354,7 @@ PATTERNS
 			loc 4
 			cfu
 
-	/* Convert signed int to single */
-	pat loc loc cif $1==4 && $2==4
+	pat loc loc cif $1==4 && $2==4     /* Signed int to single */
 		leaving
 			loc 4
 			loc 8
@@ -2392,8 +2363,7 @@ PATTERNS
 			loc 4
 			cff
 
-	/* Convert unsigned int to single */
-	pat loc loc cuf $1==4 && $2==4
+	pat loc loc cuf $1==4 && $2==4     /* Unsigned int to single */
 		leaving
 			loc 4
 			loc 8
@@ -2405,15 +2375,13 @@ PATTERNS
 
 /* Double-precision floating-point */
 
-	pat zrf $1==INT64                  /* Push zero */
-		leaving
-			lde ".fd_00000000"
+	pat zrf $1==8                      /* Push zero */
+		leaving lde ".fd_00000000"
 
 	pat adf $1==8                      /* Add double */
 		with FREG FREG
 			uses reusing %1, FREG
-			gen
-				fadd %a, %2, %1
+			gen fadd %a, %2, %1
 			yields %a
 	pat adf sdl $1==8 && inreg($2)==reg_float
 		with FREG FREG
@@ -2422,8 +2390,7 @@ PATTERNS
 	pat sbf $1==8                      /* Subtract double */
 		with FREG FREG
 			uses reusing %1, FREG
-			gen
-				fsub %a, %2, %1
+			gen fsub %a, %2, %1
 			yields %a
 	pat sbf sdl $1==8 && inreg($2)==reg_float
 		with FREG FREG
@@ -2432,8 +2399,7 @@ PATTERNS
 	pat mlf $1==8                      /* Multiply double */
 		with FREG FREG
 			uses reusing %1, FREG
-			gen
-				fmul %a, %2, %1
+			gen fmul %a, %2, %1
 			yields %a
 	pat mlf sdl $1==8 && inreg($2)==reg_float
 		with FREG FREG
@@ -2442,8 +2408,7 @@ PATTERNS
 	pat dvf $1==8                      /* Divide double */
 		with FREG FREG
 			uses reusing %1, FREG
-			gen
-				fdiv %a, %2, %1
+			gen fdiv %a, %2, %1
 			yields %a
 	pat dvf sdl $1==8 && inreg($2)==reg_float
 		with FREG FREG
@@ -2452,14 +2417,13 @@ PATTERNS
 	pat ngf $1==8                      /* Negate double */
 		with FREG
 			uses reusing %1, FREG
-			gen
-				fneg %a, %1
+			gen fneg %a, %1
 			yields %a
 	pat ngf sdl $1==8 && inreg($2)==reg_float
 		with FREG
 			gen fneg {DLOCAL, $2}, %1
 
-	pat cmf $1==INT64                  /* Compare double */
+	pat cmf $1==8                      /* Compare double */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
 			gen extlwi %a, %a, {C, 2}, {C, 0}
@@ -2521,8 +2485,7 @@ PATTERNS
 			gen frsp %a, %1
 			yields %a
 
-	/* Convert double to signed int */
-	pat loc loc cfi $1==8 && $2==4
+	pat loc loc cfi $1==8 && $2==4     /* Double to signed int */
 		with FREG STACK
 			uses reusing %1, FREG
 			gen
@@ -2530,26 +2493,18 @@ PATTERNS
 				stfdu %a, {IND_RC_D, sp, 0-8}
 				addi sp, sp, {C, 4}
 
-	/* Convert double to unsigned int */
-	pat loc loc cfu $1==8 && $2==4
-		leaving
-			cal ".cfu8"
+	pat loc loc cfu $1==8 && $2==4     /* Double to unsigned int */
+		leaving cal ".cfu8"
 
-	/* Convert signed int to double */
-	pat loc loc cif $1==4 && $2==8
-		leaving
-			cal ".cif8"
+	pat loc loc cif $1==4 && $2==8     /* Signed int to double */
+		leaving cal ".cif8"
 
-	/* Convert unsigned int to double */
-	pat loc loc cuf $1==4 && $2==8
-		leaving
-			cal ".cuf8"
+	pat loc loc cuf $1==4 && $2==8     /* Unsigned int to double */
+		leaving cal ".cuf8"
 
 	pat fef $1==8                      /* Split fraction, exponent */
-		leaving
-			cal ".fef8"
+		leaving cal ".fef8"
 
 	/* Multiply two doubles, then split fraction, integer */
 	pat fif $1==8
-		leaving
-			cal ".fif8"
+		leaving cal ".fif8"

From 5f2a7b260fd8af26294062f69599db4d0d698293 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Fri, 22 Dec 2017 22:32:16 -0500
Subject: [PATCH 24/55] Optimize `mr. X, X` after some instructions.

For example, when ncg emits
    slw r9,r8,r5
    mr. r9,r9
then top simplifies the code to
    slw. r9,r8,r5
---
 mach/powerpc/ncg/table | 11 ++++-------
 mach/powerpc/top/table | 27 +++++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index e566c51b9..e66ae855f 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -348,7 +348,7 @@ INSTRUCTIONS
   or              GPR:wo, GPR:ro, GPR:ro.
     mr            GPR:wo, GPR:ro.
   orX "or."       GPR:wo:cc, GPR:ro, GPR:ro.
-    orX_readonly "or." GPR:ro:cc, GPR:ro, GPR:ro.
+    mrX_readonly "mr." GPR:ro:cc, GPR:ro.
   orc             GPR:wo, GPR:ro, GPR:ro.
   ori             GPR:wo, GPR:ro, CONST+LABEL_LO:ro.
   oris            GPR:wo, GPR:ro, CONST:ro.
@@ -707,15 +707,12 @@ MOVES
 
 TESTS
 
-	/* Given orX %1, %1, %1, ncgg says, "Instruction destroys %1,
-	 * not allowed here".  We use orX_readonly to trick ncgg.
-	 *
-	 * Using "or." and not "mr." because mach/powerpc/top/table
-	 * was optimizing "or." and not "mr.".
+	/* Given "mrX %1, %1", ncgg would say, "Instruction destroys
+	 * %1, not allowed here".  We use mrX_readonly to trick ncgg.
 	 */
 	to test GPR
 		gen
-			orX_readonly %1, %1, %1
+			mrX_readonly %1, %1
 
 
 STACKINGRULES
diff --git a/mach/powerpc/top/table b/mach/powerpc/top/table
index fdec03b2e..b3f5b3a31 100644
--- a/mach/powerpc/top/table
+++ b/mach/powerpc/top/table
@@ -1,5 +1,5 @@
 
-/* PowerPC desciptor table for ACK target optimizer */
+/* PowerPC table for ACK target optimizer */
 
 MAXOP 3;
 LABEL_STARTER '.';
@@ -16,10 +16,33 @@ X, Y, Z             { TRUE };
 addi  RNZ, RNZ, 0            -> ;
 addis RNZ, RNZ, 0            -> ;
 
+or X, Y, Y                   -> mr X, Y ;
+or. X, Y, Y                  -> mr. X, Y ;
+
 mr X, X                      -> ;
 fmr X, X                     -> ;
 
-or X, Y, Z : or. X, X, X     -> or. X, Y, Z ;
+add X, Y, Z   : mr. X, X     -> add. X, Y, Z ;
+and X, Y, Z   : mr. X, X     -> and. X, Y, Z ;
+andc X, Y, Z  : mr. X, X     -> andc. X, Y, Z ;
+divw X, Y, Z  : mr. X, X     -> divw. X, Y, Z ;
+divwu X, Y, Z : mr. X, X     -> divwu. X, Y, Z ;
+extsb X, Y, Z : mr. X, X     -> extsb. X, Y, Z ;
+extsh X, Y, Z : mr. X, X     -> extsh. X, Y, Z ;
+eqv X, Y, Z   : mr. X, X     -> eqv. X, Y, Z ;
+mullw X, Y, Z : mr. X, X     -> mullw. X, Y, Z ;
+nand X, Y, Z  : mr. X, X     -> nand. X, Y, Z ;
+nor X, Y, Z   : mr. X, X     -> nor. X, Y, Z ;
+or X, Y, Z    : mr. X, X     -> or. X, Y, Z ;
+orc X, Y, Z   : mr. X, X     -> orc. X, Y, Z ;
+slw X, Y, Z   : mr. X, X     -> slw. X, Y, Z ;
+slwi X, Y, Z  : mr. X, X     -> slwi. X, Y, Z ;
+subf X, Y, Z  : mr. X, X     -> subf. X, Y, Z ;
+sraw X, Y, Z  : mr. X, X     -> sraw. X, Y, Z ;
+srawi X, Y, Z : mr. X, X     -> srawi. X, Y, Z ;
+srw X, Y, Z   : mr. X, X     -> srw. X, Y, Z ;
+srwi X, Y, Z  : mr. X, X     -> srwi. X, Y, Z ;
+xor X, Y, Z   : mr. X, X     -> xor. X, Y, Z ;
 
 b X : labdef X               -> labdef X ;
 

From 26de4c1ab18df825607d7882924f57c673af46cf Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Sun, 24 Dec 2017 22:37:52 -0500
Subject: [PATCH 25/55] Add test for EM _rck_.  Fix traps in PowerPC ncg.

The new test rck_e.e segfaults on PowerPC unless I make some changes.
The inline code for _rck_ was wrong because it didn't allow the trap
handler to return.  _sig_ forgot to push the old trap handler.

Move plat/linuxppc/libsys/trap.s to mach/powerpc/libem/trp.s and
rewrite it with simplified/extended mnemonics.  Remove .trap alias for
.trp procedure.  Add a missing `mtspr lr, r0` so we can return from
the trap handler.  Call write() and _exit() so trp.s works with both
linuxppc and osxppc.  Previously, Mac OS X was wrongly using the trap.s
written for Linux.

In powerpc/libem, simplify .aar4; teach .csa and .csb to raise the
ECASE trap if the default target is zero.

C programs don't need these changes.  You may relink your C programs
with the changed .csa and .csb, but C code doesn't raise the trap.
Modula-2 code can raise traps, so you may want to relink your Modula-2
programs with the changed libem, but you might keep your old .o files
from Modula-2.  You may need to recompile your Pascal programs (delete
old .o files from Pascal) because the Pascal compiler might use _rck_.
---
 mach/powerpc/libem/aar4.s      |  12 +--
 mach/powerpc/libem/build.lua   |   3 +-
 mach/powerpc/libem/csa.s       |  23 ++--
 mach/powerpc/libem/csb.s       |  25 ++---
 mach/powerpc/libem/rck.s       |   4 +
 mach/powerpc/libem/trp.s       |  56 ++++++++++
 mach/powerpc/ncg/table         |  23 ++--
 plat/linuxppc/libsys/build.lua |   1 -
 plat/linuxppc/libsys/trap.s    | 112 --------------------
 plat/osxppc/libsys/build.lua   |   1 -
 tests/plat/build.lua           |   1 +
 tests/plat/rck_e.e             | 186 +++++++++++++++++++++++++++++++++
 12 files changed, 285 insertions(+), 162 deletions(-)
 create mode 100644 mach/powerpc/libem/trp.s
 delete mode 100644 plat/linuxppc/libsys/trap.s
 create mode 100644 tests/plat/rck_e.e

diff --git a/mach/powerpc/libem/aar4.s b/mach/powerpc/libem/aar4.s
index fc8620d02..08390b081 100644
--- a/mach/powerpc/libem/aar4.s
+++ b/mach/powerpc/libem/aar4.s
@@ -8,21 +8,17 @@
 
 .define .aar4
 .aar4:
-	lis r0, hi16[.trap_earray]
-	ori r0, r0, lo16[.trap_earray]
-	mtspr ctr, r0            ! load CTR with trap address
-
 	lwz r4, 0(sp)            ! r4 = address of descriptor
 	lwz r5, 4(sp)            ! r5 = index
 	lwz r6, 8(sp)            ! r6 = address of array
 
 	lwz r0, 0(r4)
 	subf. r5, r0, r5         ! subtract lower bound from index
-	bltctr                   ! check lower bound
+	blt .trap_earray         ! check lower bound
 
 	lwz r0, 4(r4)
 	cmplw r5, r0
-	bgtctr                   ! check upper bound
+	bgt .trap_earray         ! check upper bound
 
 	lwz r3, 8(r4)            ! r3 = size of element
 	mullw r5, r5, r3         ! scale index by size
@@ -30,3 +26,7 @@
 	stw r6, 8(sp)            ! push address of element
 	addi sp, sp, 8
 	blr
+
+.trap_earray:
+	li r3, 0                 ! EARRAY = 0 in h/em_abs.h
+	b .trp
diff --git a/mach/powerpc/libem/build.lua b/mach/powerpc/libem/build.lua
index cb5efd281..ac84e3b0f 100644
--- a/mach/powerpc/libem/build.lua
+++ b/mach/powerpc/libem/build.lua
@@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
 		srcs = {
-			"./*.s", -- exg.s
+			"./*.s", -- trp.s
 		},
 		vars = { plat = plat },
 		deps = {
@@ -15,4 +15,3 @@ for _, plat in ipairs(vars.plats) do
 		}
 	}
 end
-
diff --git a/mach/powerpc/libem/csa.s b/mach/powerpc/libem/csa.s
index 3898241c4..86d792554 100644
--- a/mach/powerpc/libem/csa.s
+++ b/mach/powerpc/libem/csa.s
@@ -13,22 +13,21 @@
 	lwz r4, 4(sp)
 	addi sp, sp, 8
 
-	lwz r5, 0(r3)            ! load default
-	mtspr ctr, r5
-	
-	lwz r5, 4(r3)            ! fetch lower bound
-	subf. r4, r5, r4         ! adjust value
-	bltctr                   ! jump to default if out of range
+	lwz r5, 0(r3)            ! r5 = default target
 
-	lwz r5, 8(r3)            ! fetch range
-	cmplw r4, r5
-	bgtctr                   ! jump to default if out of range
+	lwz r6, 4(r3)            ! fetch lower bound
+	subf. r4, r6, r4         ! adjust value
+	blt 1f                   ! jump to default if out of range
+
+	lwz r6, 8(r3)            ! fetch range
+	cmplw r4, r6
+	bgt 1f                   ! jump to default if out of range
 
 	addi r3, r3, 12          ! skip header
 	slwi r4, r4, 2           ! scale value (<<2)
-	lwzx r5, r3, r4          ! load target
-	mtspr ctr, r5
+	lwzx r5, r3, r4          ! r5 = new target
 
-	or. r5, r5, r5           ! test it
+1:	mtspr ctr, r5
+	mr. r5, r5               ! test it
 	bnectr                   ! jump to target if non-zero
 	b .trap_ecase            ! otherwise trap
diff --git a/mach/powerpc/libem/csb.s b/mach/powerpc/libem/csb.s
index 571bfc210..92c6d096d 100644
--- a/mach/powerpc/libem/csb.s
+++ b/mach/powerpc/libem/csb.s
@@ -13,23 +13,20 @@
 	lwz r4, 4(sp)
 	addi sp, sp, 8
 
-	lwz r5, 0(r3)            ! load default
-	mtspr ctr, r5
+	lwz r5, 0(r3)            ! r5 = default target
 
 	lwz r6, 4(r3)            ! fetch count
-
-1:
-	or. r6, r6, r6           ! test count
-	beqctr                   ! exit if zero
-	addi r6, r6, -1          ! otherwise decrement
-
-	lwzu r7, 8(r3)           ! fetch target index, increment pointer
+	mr. r6, r6               ! skip loop if count is zero
+	beq 3f                   !   (needed by Modula-2 "CASE i OF END")
+	mtspr ctr, r6
+1:	lwzu r7, 8(r3)           ! fetch target index, increment pointer
 	cmpw r4, r7              ! compare with value
-	bne 1b                   ! if not equal, go again
+	beq 2f
+	bdnz 1b                  ! if not equal, go again
+	b 3f
 
-	lwz r7, 4(r3)            ! fetch target address
-	mtspr ctr, r7
-
-	or. r7, r7, r7           ! test it
+2:	lwz r5, 4(r3)            ! r5 = new target
+3:	mtspr ctr, r5
+	mr. r5, r5               ! test target
 	bnectr                   ! jump to target if non-zero
 	b .trap_ecase            ! otherwise trap
diff --git a/mach/powerpc/libem/rck.s b/mach/powerpc/libem/rck.s
index 9008be610..f1cf7f848 100644
--- a/mach/powerpc/libem/rck.s
+++ b/mach/powerpc/libem/rck.s
@@ -18,3 +18,7 @@
     bgt .trap_erange
 
     blr
+
+.trap_erange:
+    li r3, 1       ! ERANGE = 1 in h/em_abs.h
+    b .trp
diff --git a/mach/powerpc/libem/trp.s b/mach/powerpc/libem/trp.s
new file mode 100644
index 000000000..b07afb929
--- /dev/null
+++ b/mach/powerpc/libem/trp.s
@@ -0,0 +1,56 @@
+.sect .text
+
+.define .trap_ecase
+.trap_ecase:
+	li	r3, 20			! ECASE = 20 in h/em_abs.h
+	! FALLTHROUGH to .trp
+
+! Raises an EM trap.
+! Expects r3 = trap number.
+
+.define .trp
+.trp:
+	cmplwi	r3, 15			! traps > 15 can't be ignored
+	bgt	1f
+
+	lis	r4, ha16[.ignmask]
+	lwz	r4, lo16[.ignmask](r4)	! load ignore mask
+	srw	r4, r4, r3
+	andi.	r4, r4, 1
+	bnelr				! return if ignoring trap
+
+1:	lis	r4, ha16[.trppc]
+	lwz	r5, lo16[.trppc](r4)	! r5 = user trap routine
+	mr.	r5, r5
+	beq	2f			! if no user trap routine, bail out
+
+	mtspr	ctr, r5
+	mfspr	r6, lr
+	li	r0, 0
+	stwu	r3, -8(sp)		! push trap number
+	stw	r0, lo16[.trppc](r4)	! reset trap routine
+	stw	r6, 4(sp)		! save old lr
+	bctrl				! call trap routine
+
+	lwz	r0, 4(sp)
+	mtspr	lr, r0
+	addi	sp, sp, 8		! retract over stack usage
+	blr
+
+2:	! No trap handler.  Write error message, exit.
+	li	r3, 2
+	stwu	r3, -12(sp)
+	lis	r4, ha16[message]
+	addi	r4, r4, lo16[message]
+	li	r5, 6
+	stw	r4, 4(sp)
+	stw	r5, 8(sp)
+	bl	_write			! write(2, message, 6)
+
+	li	r3, 1
+	stw	r3, 0(sp)
+	bl	__exit			! _exit(1)
+
+.sect .rom
+message:
+	.ascii "TRAP!\n"
diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index e66ae855f..10ffadb16 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -2168,10 +2168,13 @@ PATTERNS
 	pat trp                            /* Raise EM trap */
 		with REG3
 			kills ALL
-			gen bl {LABEL, ".trap"}
+			gen bl {LABEL, ".trp"}
 
-	pat sig                            /* Set trap handler */
-		leaving ste ".trppc"
+	pat sig                            /* Set trap handler, yield old */
+		leaving
+			loe ".trppc"
+			exg 4
+			ste ".trppc"
 
 	pat rtt                            /* Return from trap */
 		leaving ret 0
@@ -2216,22 +2219,14 @@ PATTERNS
 		with REG
 			gen move %1, sp
 
-	pat lae rck $2==4                  /* Range check */
-		with REG
-			kills ALL
-			gen
-				cmpwi %1, {C, rom($1, 1)}
-				blt {LABEL, ".trap_erange"}
-				cmpwi %1, {C, rom($1, 2)}
-				bgt {LABEL, ".trap_erange"}
-			yields %1
+	pat rck $1==4                      /* Range check */
+		leaving cal ".rck"
 
 
 /* Single-precision floating-point */
 
 	pat zrf $1==4                      /* Push zero */
-		leaving
-			loe ".fs_00000000"
+		leaving loe ".fs_00000000"
 
 	pat adf $1==4                      /* Add single */
 		with FSREG FSREG
diff --git a/plat/linuxppc/libsys/build.lua b/plat/linuxppc/libsys/build.lua
index f7b16b378..696c62d42 100644
--- a/plat/linuxppc/libsys/build.lua
+++ b/plat/linuxppc/libsys/build.lua
@@ -4,7 +4,6 @@ acklibrary {
 		"./_syscall.s",
 		"./sigaction.s",
 		"./signal.c",
-		"./trap.s",
 		"plat/linux/libsys/_exit.c",
 		"plat/linux/libsys/_hol0.s",
 		"plat/linux/libsys/close.c",
diff --git a/plat/linuxppc/libsys/trap.s b/plat/linuxppc/libsys/trap.s
deleted file mode 100644
index 93c5189a4..000000000
--- a/plat/linuxppc/libsys/trap.s
+++ /dev/null
@@ -1,112 +0,0 @@
-#
-! $Source: /cvsroot/tack/Ack/plat/linux386/libsys/_syscall.s,v $
-! $State: Exp $
-! $Revision: 1.1 $
-
-! Declare segments (the order is important).
-
-.sect .text
-.sect .rom
-.sect .data
-.sect .bss
-
-.sect .text
-
-#define IFFALSE 4
-#define IFTRUE 12
-#define ALWAYS 20
-
-#define LT 0
-#define GT 1
-#define EQ 2
-#define OV 3
-
-EARRAY	=  0
-ERANGE	=  1
-ESET	=  2
-EIOVFL	=  3
-EFOVFL	=  4
-EFUNFL	=  5
-EIDIVZ	=  6
-EFDIVZ	=  7
-EIUND	=  8
-EFUND	=  9
-ECONV	= 10
-ESTACK  = 16
-EHEAP	= 17
-EILLINS = 18
-EODDZ	= 19
-ECASE	= 20
-EMEMFLT	= 21
-EBADPTR = 22
-EBADPC  = 23
-EBADLAE = 24
-EBADMON = 25
-EBADLIN = 26
-EBADGTO = 27
-EUNIMPL = 63		! unimplemented em-instruction called
-
-! EM trap handling.
-
-.define .trap_ecase
-.trap_ecase:
-	addi r3, r0, ECASE
-	b .trap
-
-.define .trap_earray
-.trap_earray:
-	addi r3, r0, EARRAY
-	b .trap
-
-.define .trap_erange
-.trap_erange:
-	addi r3, r0, ERANGE
-	b .trap
-
-.define .trp
-.define .trap
-.trp:
-.trap:
-	cmpi cr0, 0, r3, 15      ! traps >15 can't be ignored
-	bc IFTRUE, LT, 1f
-
-	addi r4, r0, 1
-	rlwnm r4, r4, r3, 0, 31  ! calculate trap bit
-	li32 r5, .ignmask
-	lwz r5, 0(r5)            ! load ignore mask
-	and. r4, r4, r5          ! compare
-	bclr IFFALSE, EQ, 0      ! return if non-zero
-
-1:
-	li32 r4, .trppc
-	lwz r5, 0(r4)            ! load user trap routine
-	or. r5, r5, r5           ! test
-	bc IFTRUE, EQ, fatal     ! if no user trap routine, bail out
-
-	addi r0, r0, 0
-	stw r0, 0(r4)            ! reset trap routine
-
-	mfspr r0, lr
-	stwu r0, -4(sp)          ! save old lr
-
-	stwu r3, -4(sp)
-	mtspr ctr, r5
-	bcctrl ALWAYS, 0, 0      ! call trap routine
-
-	lwz r0, 4(sp)            ! load old lr again
-	addi sp, sp, 8           ! retract over stack usage
-	bclr ALWAYS, 0, 0        ! return
-
-fatal:
-	addi r3, r0, 1
-	li32 r4, message
-	addi r5, r0, 6
-	addi r0, r0, 4           ! write()
-	sc 0
-
-	addi r0, r0, 1           ! exit()
-	sc 0
-
-.sect .rom
-message:
-	.ascii "TRAP!\n"
diff --git a/plat/osxppc/libsys/build.lua b/plat/osxppc/libsys/build.lua
index 072730b7a..49fc0c934 100644
--- a/plat/osxppc/libsys/build.lua
+++ b/plat/osxppc/libsys/build.lua
@@ -19,7 +19,6 @@ acklibrary {
 		"./sigaction.s",
 		"./stat.s",
 		"./write.s",
-		"plat/linuxppc/libsys/trap.s",
 		"plat/osx/libsys/brk.c",
 		"plat/osx/libsys/creat.c",
 		"plat/osx/libsys/isatty.c",
diff --git a/tests/plat/build.lua b/tests/plat/build.lua
index 666af7d95..fdac9bae3 100644
--- a/tests/plat/build.lua
+++ b/tests/plat/build.lua
@@ -13,6 +13,7 @@ definerule("plat_testsuite",
 			"tests/plat/dup_e.e",
 			"tests/plat/exg_e.e",
 			"tests/plat/inn_e.e",
+			"tests/plat/rck_e.e",
 			"tests/plat/rotate_e.e",
 			"tests/plat/*.p",
 			"tests/plat/b/*.b",
diff --git a/tests/plat/rck_e.e b/tests/plat/rck_e.e
new file mode 100644
index 000000000..cd5c581df
--- /dev/null
+++ b/tests/plat/rck_e.e
@@ -0,0 +1,186 @@
+#
+    mes 2, EM_WSIZE, EM_PSIZE
+
+/*
+ * Uses _rck_ for range checks.  Catches the EM trap if a value is out
+ * of range, and continues with the next instruction after _rck_.
+ *
+ * Some back ends, like i80, ignore _rck_, so this test fails.
+ */
+
+testnr
+    con 1         ; test number
+caught
+    con 0         ; number of caught traps
+
+    inp $next
+    inp $catch
+    inp $never
+    exp $_m_a_i_n
+    pro $_m_a_i_n,0
+
+    lim           ; load ignore mask
+    loc 2
+    and EM_WSIZE  ; check bit 1 << ERANGE
+    zeq *1        ; fail if ignoring ERANGE
+.1
+    rom 1I4
+    lae .1
+    loi 4
+    cal $fail
+    asp 4
+1
+
+    cal $next     ; increment testnr, catch next trap
+    loc 10125
+.2
+    rom 4283, 13644
+    lae .2
+    rck EM_WSIZE  ; testnr 2 in range
+    asp EM_WSIZE
+
+    cal $next
+    loc 4282
+    lae .2
+    rck EM_WSIZE  ; testnr 3 out of range
+    asp EM_WSIZE
+
+    cal $next
+    loc 4283
+    lae .2
+    rck EM_WSIZE  ; testnr 4 in range
+    asp EM_WSIZE
+
+    cal $next
+    loc 13644
+    lae .2
+    rck EM_WSIZE  ; testnr 5 in range
+    asp EM_WSIZE
+
+    cal $next
+    loc 13655
+    lae .2
+    rck EM_WSIZE  ; testnr 6 out of range
+    asp EM_WSIZE
+
+    cal $next
+    loc -13015
+.7
+    rom -31344, -1898
+    lae .7
+    rck EM_WSIZE  ; testnr 7 in range
+    asp EM_WSIZE
+
+    cal $next
+    loc 8580
+.8
+    rom -26315, 4588
+    lae .8
+    rck EM_WSIZE  ; testnr 8 out of range
+    asp EM_WSIZE
+
+    ; The last test raised a trap, so now there is no trap handler.
+    lpi $never
+    sig           ; push old trap handler
+    loc 0
+    loc EM_WSIZE
+    loc EM_PSIZE
+    cuu           ; push NULL pointer
+    cmp
+    zeq *17       ; fail unless old handler is NULL
+.17
+    rom 17I4
+    lae .17
+    loi 4
+    cal $fail
+    asp 4
+17
+    ; Change the trap handler from $never to $catch.
+    lpi $catch
+    sig
+    lpi $never
+    cmp
+    zeq *18
+.18
+    rom 18I4
+    lae .18
+    loi 4
+    cal $fail
+    asp 4
+18
+    ; Begin ignoring range traps.
+    loc 2         ; 1 << ERANGE
+    sim
+    loc 18
+    ste testnr
+    loc 8580
+    lae .8
+    rck EM_WSIZE  ; testnr 18 out of range but ignored
+
+    ; Fail if we caught the wrong number of traps.
+    loe caught
+    loc 3
+    beq *20
+.20
+    rom 20I4
+    lae .20
+    loi 4
+    cal $fail
+    asp 4
+20
+    cal $finished
+    end
+
+    pro $next,0
+    ine testnr    ; next test
+    lpi $catch
+    sig           ; catch next EM trap (only one trap)
+    asp EM_PSIZE
+    ret 0
+    end
+
+    pro $catch,0
+    ine caught    ; count this trap
+
+    lol 0         ; load trap number
+    loc 1
+    beq *1        ; fail if trap != ERANGE
+.101
+    rom 257I4
+    lae .101
+    loi 4
+    cal $fail
+    ; Wrong type of trap.  _rtt_ might not work, so exit now.
+    cal $finished
+1
+    ; Fail if the wrong test raised this trap.
+    loe testnr
+    loc 3
+    beq *2
+    loe testnr
+    loc 6
+    beq *2
+    loe testnr
+    loc 8
+    beq *2
+    loc 256
+    loe testnr
+    adi EM_WSIZE  ; 0x100 + testnr
+    loc EM_WSIZE
+    loc 4
+    cuu
+    cal $fail
+    asp 4
+2
+    rtt           ; return from trap handler
+    end
+
+    pro $never,0
+.200
+    rom 200I4
+    lae .200
+    loi 4
+    cal $fail
+    asp 4
+    rtt
+    end

From d6938108a6fda7cb9ef47de7b20587769503add1 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Wed, 3 Jan 2018 14:51:14 -0500
Subject: [PATCH 26/55] Add tests for C <setjmp.h> and Modula-2 Semaphores.

Fix PowerPC ncg so setjmp() returns the correct value.  I got unlucky
when ncg picked r3 for "uses REG"; this destroyed the return value in
r3 and caused the new test to fail.
---
 mach/powerpc/ncg/table         |  23 +++--
 tests/plat/build.lua           |   1 +
 tests/plat/m2/SemaTest_mod.mod | 157 +++++++++++++++++++++++++++++++++
 tests/plat/setjmp_c.c          |  58 ++++++++++++
 4 files changed, 230 insertions(+), 9 deletions(-)
 create mode 100644 tests/plat/m2/SemaTest_mod.mod
 create mode 100644 tests/plat/setjmp_c.c

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 10ffadb16..a35ace230 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -85,9 +85,8 @@ REGISTERS
 	lr, ctr     : SPR.
 	cr0         : CR.
 
-	/* The stacking rules and the splitting coercions can't
-	 * allocate registers.  We use r12 in the splitting coercions,
-	 * and these scratch registers in the stacking rules.
+	/* The stacking rules can't allocate registers.  We use these
+	 * scratch registers to stack tokens.
 	 */
 #define RSCRATCH r0
 #define FSCRATCH f0
@@ -2192,15 +2191,21 @@ PATTERNS
 	pat lpb                            /* LB -> argument base */
 		leaving adp EM_BSIZE
 
+	/* "gto" must preserve the function result for "lfr", so
+	 * longjmp() can pass the return value to setjmp().
+	 *  - See lang/cem/libcc.ansi/setjmp/setjmp.e
+	 *
+	 * Must preserve r3 and r4, so no "uses REG".
+	 * PowerPC can't add r0 + constant.  Use r12.
+	 */
 	pat gto                            /* longjmp */
 		with STACK
-			uses REG
 			gen
-				move {LABEL, $1}, %a
-				move {IND_RC_W, %a, 8}, fp
-				move {IND_RC_W, %a, 4}, sp
-				move {IND_RC_W, %a, 0}, %a
-				mtspr ctr, %a
+				move {LABEL, $1}, r12
+				move {IND_RC_W, r12, 8}, fp
+				move {IND_RC_W, r12, 4}, sp
+				move {IND_RC_W, r12, 0}, r12
+				mtspr ctr, r12
 				bctr.
 
 	pat lor $1==0                      /* Load local base */
diff --git a/tests/plat/build.lua b/tests/plat/build.lua
index fdac9bae3..42ca441d0 100644
--- a/tests/plat/build.lua
+++ b/tests/plat/build.lua
@@ -22,6 +22,7 @@ definerule("plat_testsuite",
 			"tests/plat/m2/ConvTest_mod.mod",
 			"tests/plat/m2/NestProc_mod.mod",
 			"tests/plat/m2/OpenArray_mod.mod",
+			"tests/plat/m2/SemaTest_mod.mod",
 			"tests/plat/m2/Set100_mod.mod",
 			"tests/plat/m2/StringTest_mod.mod"
 		)
diff --git a/tests/plat/m2/SemaTest_mod.mod b/tests/plat/m2/SemaTest_mod.mod
new file mode 100644
index 000000000..9ae395662
--- /dev/null
+++ b/tests/plat/m2/SemaTest_mod.mod
@@ -0,0 +1,157 @@
+(*
+ * Generates some integer sequences.  Each generator is a process that
+ * yields integers to the main process.  ACK switches processes by
+ * saving and restoring the stack.  It uses _lor_ and _str_ to save
+ * and restore the local base and stack pointer.
+ *)
+MODULE SemaTest;
+FROM Semaphores IMPORT Sema, NewSema, Down, Up, StartProcess;
+FROM Storage IMPORT ALLOCATE;
+FROM Test IMPORT fail, finished;
+
+TYPE
+  Generator = POINTER TO GeneratorRecord;
+  GeneratorRecord = RECORD
+    resume: Sema;       (* up when resuming generator *)
+    yield: Sema;        (* up when yielding value *)
+    value: INTEGER;
+  END;
+VAR
+  curgen: Generator;    (* current generator *)
+  startLock: Sema;      (* down when booting generator *)
+  startProc: PROC;
+  startSelf: Generator;
+
+PROCEDURE BootGenerator;
+  VAR pr: PROC; self: Generator;
+BEGIN
+  pr := startProc;
+  self := startSelf;
+  Up(startLock);
+  Down(self^.resume);   (* wait for first Resume *)
+  pr();
+END BootGenerator;
+
+PROCEDURE StartGenerator(gen: Generator; pr: PROC);
+BEGIN
+  gen^.resume := NewSema(0);
+  gen^.yield := NewSema(0);
+  Down(startLock);
+  startProc := pr;
+  startSelf := gen;
+  StartProcess(BootGenerator, 8192);
+END StartGenerator;
+
+PROCEDURE Resume(gen: Generator): INTEGER;
+  VAR self: Generator;
+BEGIN
+  self := curgen;
+  curgen := gen;
+  Up(gen^.resume);
+  Down(gen^.yield);     (* wait for Yield *)
+  curgen := self;
+  RETURN gen^.value
+END Resume;
+
+PROCEDURE Yield(i: INTEGER);
+  VAR self: Generator;
+BEGIN
+  self := curgen;
+  self^.value := i;
+  Up(self^.yield);      (* curgen becomes invalid *)
+  Down(self^.resume);   (* wait for Resume *)
+END Yield;
+
+PROCEDURE YieldHalfOf(i: INTEGER);
+BEGIN
+  Yield(i DIV 2);
+END YieldHalfOf;
+
+PROCEDURE Triangular;
+  (* Yields the triangular numbers, http://oeis.org/A000217 *)
+  VAR n: INTEGER;
+BEGIN
+  n := 0;
+  LOOP
+    YieldHalfOf(n * (n + 1));
+    INC(n);
+  END;
+END Triangular;
+
+PROCEDURE Pentagonal;
+  (* Yields the pentagonal numbers, http://oeis.org/A000326 *)
+  VAR n: INTEGER;
+BEGIN
+  n := 0;
+  LOOP
+    YieldHalfOf(n * (3 * n - 1));
+    INC(n);
+  END;
+END Pentagonal;
+
+PROCEDURE Odious;
+  (* Yields the odious numbers, http://oeis.org/A000069 *)
+  VAR b, i, n: INTEGER;
+BEGIN
+  n := 1;
+  LOOP
+    (* b := count bits in n *)
+    b := 0;
+    i := n;
+    WHILE i # 0 DO
+      INC(b, i MOD 2);
+      i := i DIV 2;
+    END;
+
+    IF (b MOD 2) = 1 THEN
+      Yield(n);
+    END;
+    INC(n);
+  END;
+END Odious;
+
+TYPE
+  Triple = ARRAY[1..3] OF INTEGER;
+PROCEDURE T(i1, i2, i3: INTEGER): Triple;
+  VAR t: Triple;
+BEGIN
+  t[1] := i1; t[2] := i2; t[3] := i3; RETURN t
+END T;
+
+CONST
+  two28 = 268435456D;   (* 0x1000_0000 *)
+VAR
+  a: ARRAY [0..9] OF Triple;
+  tri, pen, odi: Generator;
+  i, g1, g2, g3: INTEGER;
+BEGIN
+  startLock := NewSema(1);
+
+  ALLOCATE(tri, SIZE(GeneratorRecord));
+  ALLOCATE(pen, SIZE(GeneratorRecord));
+  ALLOCATE(odi, SIZE(GeneratorRecord));
+  StartGenerator(tri, Triangular);
+  StartGenerator(pen, Pentagonal);
+  StartGenerator(odi, Odious);
+
+  a[0] := T( 0,   0,  1);
+  a[1] := T( 1,   1,  2);
+  a[2] := T( 3,   5,  4);
+  a[3] := T( 6,  12,  7);
+  a[4] := T(10,  22,  8);
+  a[5] := T(15,  35, 11);
+  a[6] := T(21,  51, 13);
+  a[7] := T(28,  70, 14);
+  a[8] := T(36,  92, 16);
+  a[9] := T(45, 117, 19);
+
+  FOR i := 0 TO INTEGER(9) DO
+    g1 := Resume(tri);
+    g2 := Resume(pen);
+    g3 := Resume(odi);
+    IF g1 # a[i][1] THEN fail(1D * two28 + LONG(a[i][1])) END;
+    IF g2 # a[i][2] THEN fail(2D * two28 + LONG(a[i][2])) END;
+    IF g3 # a[i][3] THEN fail(3D * two28 + LONG(a[i][3])) END;
+  END;
+  finished;
+END SemaTest.
diff --git a/tests/plat/setjmp_c.c b/tests/plat/setjmp_c.c
new file mode 100644
index 000000000..2a514a03f
--- /dev/null
+++ b/tests/plat/setjmp_c.c
@@ -0,0 +1,58 @@
+#include <setjmp.h>
+#include "test.h"
+
+/*
+ * Sets i = 2 * i for each i in nums, until i == 0, but stops if
+ * 2 * i >= 1000.
+ *
+ * Uses setjmp() and longjmp() in libc.  For ACK's libc, the back end
+ * must provide EM's _gto_, and _gto_ must preserve the function
+ * return area.
+ */
+int nums1[]         = { 79, 245, 164, 403, 0};
+const int expect1[] = {158, 490, 328, 806, 0};
+int nums2[]         = {20, 221, 411, 643, 48, 272, 448, 0};
+const int expect2[] = {40, 442, 822, 643, 48, 272, 448, 0};
+int nums3[]         = {371, 265, 500, 124, 117, 0};
+const int expect3[] = {742, 530, 500, 124, 117, 0};
+int docount = 0;
+
+int twice(int i, jmp_buf esc) {
+	if (i >= 500)
+		longjmp(esc, i);
+	return 2 * i;
+}
+
+void donums(int *nums, jmp_buf esc) {
+	int *p;
+
+	docount++;
+	for (p = nums; *p != 0; p++) {
+		*p = twice(*p, esc);
+	}
+}
+
+int cknums(int *nums, const int *expect) {
+	jmp_buf env;
+	int ret;
+
+	ret = setjmp(env);
+	if (ret == 0)
+		donums(nums, env);
+	for (;;) {
+		ASSERT(*nums == *expect);
+		if (*expect == 0)
+			break;
+		nums++;
+		expect++;
+	}
+	return ret;
+}
+
+int main(void) {
+	ASSERT(cknums(nums1, expect1) == 0);
+	ASSERT(cknums(nums2, expect2) == 643);
+	ASSERT(cknums(nums3, expect3) == 500);
+	ASSERT(docount == 3);
+	finished();
+}

From 720af48d8ad469934dbc19d92b77f83f61aeed82 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 4 Jan 2018 20:40:35 -0500
Subject: [PATCH 27/55] Fix lim.  Improve lxl, lxa, lor, str, procs with no
 locals.

_lim_ must use _loe_ (load word external), not _lde_ (load double-word
external).

The new patterns for _lxl_, _lxa_, _lor_, _str_ emit shorter code in
some cases.  The change from GPR_EXPR to REG_EXPR allows moving
LXFRAME to a register variable.

Add more "reusing" clauses.  We have enough registers that ncg almost
never reuses a register, but sometimes it can reuse r3.

In mach.c, emit one fewer instruction in procedures with no locals.
---
 mach/powerpc/ncg/mach.c |  15 +++-
 mach/powerpc/ncg/table  | 183 +++++++++++++++++++++++++---------------
 2 files changed, 128 insertions(+), 70 deletions(-)

diff --git a/mach/powerpc/ncg/mach.c b/mach/powerpc/ncg/mach.c
index b67903b0a..a31879de9 100644
--- a/mach/powerpc/ncg/mach.c
+++ b/mach/powerpc/ncg/mach.c
@@ -55,10 +55,17 @@ static void
 emit_prolog(void)
 {
 	fprintf(codefile, "mfspr r0, lr\n");
-	fprintf(codefile, "addi sp, sp, %ld\n", -framesize - 8);
-	fprintf(codefile, "stw fp, %ld(sp)\n", framesize);
-	fprintf(codefile, "stw r0, %ld(sp)\n", framesize + 4);
-	fprintf(codefile, "addi fp, sp, %ld\n", framesize);
+	if (framesize) {
+		fprintf(codefile, "addi sp, sp, %ld\n", -framesize - 8);
+		fprintf(codefile, "stw fp, %ld(sp)\n", framesize);
+		fprintf(codefile, "stw r0, %ld(sp)\n", framesize + 4);
+		fprintf(codefile, "addi fp, sp, %ld\n", framesize);
+	} else {
+		/* optimize for framesize == 0 */
+		fprintf(codefile, "stwu fp, -8(sp)\n");
+		fprintf(codefile, "stw r0, 4(sp)\n");
+		fprintf(codefile, "mr fp, sp\n");
+	}
 }
 
 void
diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index a35ace230..367942408 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -106,8 +106,8 @@ TOKENS
 
 /* Allows us to use regvar() to refer to registers */
 
-	GPR_EXPR           = { GPR reg; }             4    reg.
-	FPR_EXPR           = { FPR reg; }             8    reg.
+	REG_EXPR           = { REG reg; }             4    reg.
+	FREG_EXPR          = { FREG reg; }            8    reg.
 	FSREG_EXPR         = { FSREG reg; }           4    reg.
 
 /* Constants on the stack */
@@ -689,15 +689,16 @@ MOVES
 			extrwi %2, %1.reg, {C, 1}, {C, 1}
 			xori %2, %2, {C, 1}
 
-/* GPR_EXPR exists solely to allow us to use regvar() (which can only
+/* REG_EXPR exists solely to allow us to use regvar() (which can only
    be used in an expression) as a register constant.  We can then use
-   our moves to GPR to set register variables.  We define no moves to
-   LOCAL, so we avoid confusion between GPR and FSREG in LOCAL. */
+   our moves to GPR or REG to set register variables.  This is easier
+   than defining moves to LOCAL, and avoids confusion between GPR and
+   FSREG in LOCAL. */
 
-	from INT_W to GPR_EXPR
+	from INT_W + LXFRAME to REG_EXPR
 		gen move %1, %2.reg
 
-	from FLOAT_D to FPR_EXPR
+	from FLOAT_D to FREG_EXPR
 		gen move %1, %2.reg
 
 	from FLOAT_W to FSREG_EXPR
@@ -788,18 +789,21 @@ COERCIONS
 	/* "uses REG=%1" may find and reuse a register containing the
 	 * same token.  For contrast, "uses REG gen move %1, %a" would
 	 * pick a different register before doing the move.
-         */
+	 *
+	 * "reusing %1" helps when coercing an INT_W token like
+	 * {SUM_RC, r3, 0-4} to REG3, by not stacking the token.
+	 */
 
 	from INT_W
-		uses REG=%1
+		uses reusing %1, REG=%1
 		yields %a
 
 	from FLOAT_D
-		uses FREG=%1
+		uses reusing %1, FREG=%1
 		yields %a
 
 	from FLOAT_W
-		uses FSREG=%1
+		uses reusing %1, FSREG=%1
 		yields %a
 
 	/* Splitting coercions can't allocate registers.
@@ -1001,7 +1005,7 @@ PATTERNS
 		with exact INT_W
 			/* ncg fails to infer that regvar($1) is dead! */
 			kills regvar($1)
-			gen move %1, {GPR_EXPR, regvar($1)}
+			gen move %1, {REG_EXPR, regvar($1)}
 		with STACK
 			gen
 				lwz {LOCAL, $1}, {IND_RC_W, sp, 0}
@@ -1026,7 +1030,7 @@ PATTERNS
 	pat sdl inreg($1)==reg_float       /* Store double-word to local */
 		with exact FLOAT_D
 			kills regvar_d($1, reg_float)
-			gen move %1, {FPR_EXPR, regvar_d($1, reg_float)}
+			gen move %1, {FREG_EXPR, regvar_d($1, reg_float)}
 		with STACK
 			gen
 				lfd {DLOCAL, $1}, {IND_RC_D, sp, 0}
@@ -1088,7 +1092,7 @@ PATTERNS
 		leaving lxl $1 stf $2+EM_BSIZE
 	pat lxa sdf nicelx($1)
 		leaving lxl $1 stf $2+EM_BSIZE
-	pat lxa $1==0 || nicelx($1)
+	pat lxa nicelx($1)
 		leaving lxl $1 adp EM_BSIZE
 
 	/* Load locals in statically enclosing procedures */
@@ -1146,11 +1150,21 @@ PATTERNS
 	pat lxl nicelx($1)
 		uses REG={LXFRAME, $1}
 		yields %a  /* Can't yield LXFRAME. */
+	pat lxl stl nicelx($1) && inreg($2)==reg_any
+		kills regvar($2)
+		gen move {LXFRAME, $1}, {REG_EXPR, regvar($2)}
 
-	pat lxl $1==0                      /* Our local base */
+	pat lxl cal $1==0  /* Pass our local base to procedure */
 		with STACK
 			gen stwu fp, {IND_RC_W, sp, 0-4}
-			/* Can't yield fp. */
+			leaving cal $2
+
+	pat lxl $1==0                      /* Our local base */
+		uses REG=fp
+		yields %a  /* Can't yield fp. */
+
+	pat lxa $1==0                      /* Our argument base */
+		yields {SUM_RC, fp, EM_BSIZE}
 
 
 /* Global variables */
@@ -1421,7 +1435,7 @@ PATTERNS
 
 	pat sbi $1==4                      /* Subtract word (second - top) */
 		with REG REG
-			uses reusing %2, REG
+			uses reusing %1, reusing %2, REG
 			yields {SUB_RR, %2, %1}
 		with CONST2_WHEN_NEG REG
 			yields {SUM_RC, %2, 0-%1.val}
@@ -1585,7 +1599,7 @@ PATTERNS
 			gen slwi %a, %2, {C, %1.val & 0x1F}
 			yields %a
 		with REG REG
-			uses reusing %2, REG
+			uses reusing %1, reusing %2, REG
 			gen slw %a, %2, %1
 			yields %a
 	pat sli stl $1==4 && inreg($2)==reg_any
@@ -1600,7 +1614,7 @@ PATTERNS
 			gen srawi %a, %2, {C, %1.val & 0x1F}
 			yields %a
 		with REG REG
-			uses reusing %2, REG
+			uses reusing %1, reusing %2, REG
 			gen sraw %a, %2, %1
 			yields %a
 	pat sri stl $1==4 && inreg($2)==reg_any
@@ -1615,7 +1629,7 @@ PATTERNS
 			gen srwi %a, %2, {C, %1.val & 0x1F}
 			yields %a
 		with REG REG
-			uses reusing %2, REG
+			uses reusing %1, reusing %2, REG
 			gen srw %a, %2, %1
 			yields %a
 	pat sru stl $1==4 && inreg($2)==reg_any
@@ -1630,7 +1644,7 @@ PATTERNS
 			gen rotlwi %a, %2, {C, %1.val & 0x1F}
 			yields %a
 		with REG REG
-			uses reusing %2, REG
+			uses reusing %1, reusing %2, REG
 			gen rotlw %a, %2, %1
 			yields %a
 	pat rol stl $1==4 && inreg($2)==reg_any
@@ -1776,10 +1790,10 @@ PATTERNS
 			uses reusing %1, REG={COND_RC, %1, %2.val}
 			yields {XEQ, %a}
 		with CONST2 REG
-			uses reusing %1, REG={COND_RC, %2, %1.val}
+			uses reusing %2, REG={COND_RC, %2, %1.val}
 			yields {XEQ, %a}
 		with REG REG
-			uses reusing %1, REG={COND_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={COND_RR, %2, %1}
 			yields {XEQ, %a}
 
 	pat cmi tne $1==4                  /* Signed second != top */
@@ -1787,10 +1801,10 @@ PATTERNS
 			uses reusing %1, REG={COND_RC, %1, %2.val}
 			yields {XNE, %a}
 		with CONST2 REG
-			uses reusing %1, REG={COND_RC, %2, %1.val}
+			uses reusing %2, REG={COND_RC, %2, %1.val}
 			yields {XNE, %a}
 		with REG REG
-			uses reusing %1, REG={COND_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={COND_RR, %2, %1}
 			yields {XNE, %a}
 
 	pat cmi tgt $1==4                  /* Signed second > top */
@@ -1798,10 +1812,10 @@ PATTERNS
 			uses reusing %1, REG={COND_RC, %1, %2.val}
 			yields {XLT, %a}
 		with CONST2 REG
-			uses reusing %1, REG={COND_RC, %2, %1.val}
+			uses reusing %2, REG={COND_RC, %2, %1.val}
 			yields {XGT, %a}
 		with REG REG
-			uses reusing %1, REG={COND_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={COND_RR, %2, %1}
 			yields {XGT, %a}
 
 	pat cmi tge $1==4                  /* Signed second >= top */
@@ -1809,10 +1823,10 @@ PATTERNS
 			uses reusing %1, REG={COND_RC, %1, %2.val}
 			yields {XLE, %a}
 		with CONST2 REG
-			uses reusing %1, REG={COND_RC, %2, %1.val}
+			uses reusing %2, REG={COND_RC, %2, %1.val}
 			yields {XGE, %a}
 		with REG REG
-			uses reusing %1, REG={COND_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={COND_RR, %2, %1}
 			yields {XGE, %a}
 
 	pat cmi tlt $1==4                  /* Signed second < top */
@@ -1820,10 +1834,10 @@ PATTERNS
 			uses reusing %1, REG={COND_RC, %1, %2.val}
 			yields {XGT, %a}
 		with CONST2 REG
-			uses reusing %1, REG={COND_RC, %2, %1.val}
+			uses reusing %2, REG={COND_RC, %2, %1.val}
 			yields {XLT, %a}
 		with REG REG
-			uses reusing %1, REG={COND_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={COND_RR, %2, %1}
 			yields {XLT, %a}
 
 	pat cmi tle $1==4                  /* Signed second <= top */
@@ -1831,10 +1845,10 @@ PATTERNS
 			uses reusing %1, REG={COND_RC, %1, %2.val}
 			yields {XGE, %a}
 		with CONST2 REG
-			uses reusing %1, REG={COND_RC, %2, %1.val}
+			uses reusing %2, REG={COND_RC, %2, %1.val}
 			yields {XLE, %a}
 		with REG REG
-			uses reusing %1, REG={COND_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={COND_RR, %2, %1}
 			yields {XLE, %a}
 
 	pat cmu teq $1==4                  /* Unsigned second == top */
@@ -1842,10 +1856,10 @@ PATTERNS
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
 			yields {XEQ, %a}
 		with UCONST2 REG
-			uses reusing %1, REG={CONDL_RC, %2, %1.val}
+			uses reusing %2, REG={CONDL_RC, %2, %1.val}
 			yields {XEQ, %a}
 		with REG REG
-			uses reusing %1, REG={CONDL_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={CONDL_RR, %2, %1}
 			yields {XEQ, %a}
 
 	pat cmu tne $1==4                  /* Unsigned second != top */
@@ -1853,10 +1867,10 @@ PATTERNS
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
 			yields {XNE, %a}
 		with UCONST2 REG
-			uses reusing %1, REG={CONDL_RC, %2, %1.val}
+			uses reusing %2, REG={CONDL_RC, %2, %1.val}
 			yields {XNE, %a}
 		with REG REG
-			uses reusing %1, REG={CONDL_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={CONDL_RR, %2, %1}
 			yields {XNE, %a}
 
 	pat cmu tgt $1==4                  /* Unsigned second > top */
@@ -1864,10 +1878,10 @@ PATTERNS
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
 			yields {XLT, %a}
 		with UCONST2 REG
-			uses reusing %1, REG={CONDL_RC, %2, %1.val}
+			uses reusing %2, REG={CONDL_RC, %2, %1.val}
 			yields {XGT, %a}
 		with REG REG
-			uses reusing %1, REG={CONDL_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={CONDL_RR, %2, %1}
 			yields {XGT, %a}
 
 	pat cmu tge $1==4                  /* Unsigned second >= top */
@@ -1875,10 +1889,10 @@ PATTERNS
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
 			yields {XLE, %a}
 		with UCONST2 REG
-			uses reusing %1, REG={CONDL_RC, %2, %1.val}
+			uses reusing %2, REG={CONDL_RC, %2, %1.val}
 			yields {XGE, %a}
 		with REG REG
-			uses reusing %1, REG={CONDL_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={CONDL_RR, %2, %1}
 			yields {XGE, %a}
 
 	pat cmu tlt $1==4                  /* Unsigned second < top */
@@ -1886,10 +1900,10 @@ PATTERNS
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
 			yields {XGT, %a}
 		with UCONST2 REG
-			uses reusing %1, REG={CONDL_RC, %2, %1.val}
+			uses reusing %2, REG={CONDL_RC, %2, %1.val}
 			yields {XLT, %a}
 		with REG REG
-			uses reusing %1, REG={CONDL_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={CONDL_RR, %2, %1}
 			yields {XLT, %a}
 
 	pat cmu tle $1==4                  /* Unsigned second <= top */
@@ -1897,10 +1911,10 @@ PATTERNS
 			uses reusing %1, REG={CONDL_RC, %1, %2.val}
 			yields {XGE, %a}
 		with UCONST2 REG
-			uses reusing %1, REG={CONDL_RC, %2, %1.val}
+			uses reusing %2, REG={CONDL_RC, %2, %1.val}
 			yields {XLE, %a}
 		with REG REG
-			uses reusing %1, REG={CONDL_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={CONDL_RR, %2, %1}
 			yields {XLE, %a}
 
 
@@ -1990,7 +2004,7 @@ PATTERNS
 			gen extlwi %a, %a, {C, 2}, {C, 0}
 			yields %a
 		with REG REG
-			uses reusing %1, REG={COND_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={COND_RR, %2, %1}
 			gen extlwi %a, %a, {C, 2}, {C, 0}
 			yields %a
 
@@ -2004,7 +2018,7 @@ PATTERNS
 			gen extlwi %a, %a, {C, 2}, {C, 0}
 			yields %a
 		with REG REG
-			uses reusing %1, REG={CONDL_RR, %2, %1}
+			uses reusing %1, reusing %2, REG={CONDL_RR, %2, %1}
 			gen extlwi %a, %a, {C, 2}, {C, 0}
 			yields %a
 
@@ -2159,25 +2173,28 @@ PATTERNS
 		leaving ine "hol0"
 
 	pat lim                            /* Load EM trap ignore mask */
-		leaving lde ".ignmask"
+		leaving loe ".ignmask"
 
 	pat sim                            /* Store EM trap ignore mask */
 		leaving ste ".ignmask"
 
-	pat trp                            /* Raise EM trap */
-		with REG3
-			kills ALL
-			gen bl {LABEL, ".trp"}
-
 	pat sig                            /* Set trap handler, yield old */
 		leaving
 			loe ".trppc"
 			exg 4
 			ste ".trppc"
 
+	pat trp                            /* Raise EM trap */
+		with REG3
+			kills ALL
+			gen bl {LABEL, ".trp"}
+
 	pat rtt                            /* Return from trap */
 		leaving ret 0
 
+	pat rck $1==4                      /* Range check */
+		leaving cal ".rck"
+
 	/* Our caller's local base, "lxl 0 dch", appears in
 	 * lang/cem/libcc.ansi/setjmp/setjmp.e, lang/m2/libm2/par_misc.e
 	 */
@@ -2216,16 +2233,50 @@ PATTERNS
 			uses REG=sp
 			yields %a  /* Can't yield sp. */
 
+	/* Next few patterns for "lor 1" appear in
+	 * lang/m2/libm2/par_misc.e
+	 */
+	pat lor lor $1==1 && $2==1         /* Load sp twice */
+		with STACK
+			gen stwu sp, {IND_RC_W, sp, 0-4}
+			leaving lor 1
+
+	pat lor adp $1==1 && smalls($2)    /* sp + constant */
+		with STACK
+			uses REG
+			gen addi %a, sp, {C, $2}
+			yields %a
+
+	/* Subtract stack pointer by doing %1 - (sp - 4)
+	 * because sp - 4 would point to %1.
+	 */
+	pat lor sbs loc adu $1==1 && $2==4 && $4==4
+		with REG STACK
+			uses reusing %1, REG
+			gen subf %a, sp, %1
+			yields %a
+			leaving loc $3+4 adu 4
+	pat lor sbs $1==1 && $2==4
+		with REG STACK
+			uses reusing %1, REG
+			gen subf %a, sp, %1
+			yields {SUM_RC, %a, 4}
+
 	pat str $1==0                      /* Store local base */
-		with REG
+		with INT_W
 			gen move %1, fp
+		with STACK
+			gen
+				lwz fp, {IND_RC_W, sp, 0}
+				addi sp, sp, {C, 4}
 
 	pat str $1==1                      /* Store stack pointer */
-		with REG
+		with INT_W
+			kills ALL
 			gen move %1, sp
-
-	pat rck $1==4                      /* Range check */
-		leaving cal ".rck"
+		with STACK
+			kills ALL
+			gen lwz sp, {IND_RC_W, sp, 0}
 
 
 /* Single-precision floating-point */
@@ -2235,7 +2286,7 @@ PATTERNS
 
 	pat adf $1==4                      /* Add single */
 		with FSREG FSREG
-			uses reusing %1, FSREG
+			uses reusing %1, reusing %2, FSREG
 			gen fadds %a, %2, %1
 			yields %a
 	pat adf stl $1==4 && inreg($2)==reg_float
@@ -2244,7 +2295,7 @@ PATTERNS
 
 	pat sbf $1==4                      /* Subtract single */
 		with FSREG FSREG
-			uses reusing %1, FSREG
+			uses reusing %1, reusing %2, FSREG
 			gen fsubs %a, %2, %1
 			yields %a
 	pat sbf stl $1==4 && inreg($2)==reg_float
@@ -2253,7 +2304,7 @@ PATTERNS
 
 	pat mlf $1==4                      /* Multiply single */
 		with FSREG FSREG
-			uses reusing %1, FSREG
+			uses reusing %1, reusing %2, FSREG
 			gen fmuls %a, %2, %1
 			yields %a
 	pat mlf stl $1==4 && inreg($2)==reg_float
@@ -2262,7 +2313,7 @@ PATTERNS
 
 	pat dvf $1==4                      /* Divide single */
 		with FSREG FSREG
-			uses reusing %1, FSREG
+			uses reusing %1, reusing %2, FSREG
 			gen fdivs %a, %2, %1
 			yields %a
 	pat dvf stl $1==4 && inreg($2)==reg_float
@@ -2377,7 +2428,7 @@ PATTERNS
 
 	pat adf $1==8                      /* Add double */
 		with FREG FREG
-			uses reusing %1, FREG
+			uses reusing %1, reusing %2, FREG
 			gen fadd %a, %2, %1
 			yields %a
 	pat adf sdl $1==8 && inreg($2)==reg_float
@@ -2386,7 +2437,7 @@ PATTERNS
 
 	pat sbf $1==8                      /* Subtract double */
 		with FREG FREG
-			uses reusing %1, FREG
+			uses reusing %1, reusing %2, FREG
 			gen fsub %a, %2, %1
 			yields %a
 	pat sbf sdl $1==8 && inreg($2)==reg_float
@@ -2395,7 +2446,7 @@ PATTERNS
 
 	pat mlf $1==8                      /* Multiply double */
 		with FREG FREG
-			uses reusing %1, FREG
+			uses reusing %1, reusing %2, FREG
 			gen fmul %a, %2, %1
 			yields %a
 	pat mlf sdl $1==8 && inreg($2)==reg_float
@@ -2404,7 +2455,7 @@ PATTERNS
 
 	pat dvf $1==8                      /* Divide double */
 		with FREG FREG
-			uses reusing %1, FREG
+			uses reusing %1, reusing %2, FREG
 			gen fdiv %a, %2, %1
 			yields %a
 	pat dvf sdl $1==8 && inreg($2)==reg_float

From b90c97b00bf4bef2bd51403e9bf2b4795247fd9a Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Fri, 5 Jan 2018 17:55:50 -0500
Subject: [PATCH 28/55] Teach top to merge or delete "addi sp, sp, X".

This reduces code size, because ncg emits too many "addi sp, sp, X"
instructions when unstacking things.  Now top lowers "addi sp, sp, X"
by lifting other instructions.  This sometimes creates chances to
merge or delete _addi_ instructions.  If no such chance is found, the
_addi_ remains uselessly lowered.

Edit ncg/table to remove something that top now does.

Edit ncg/mach.c to remove some spaces after commas.  This removes a
whitespace difference between *.s and *.so files, because top removes
the space.
---
 mach/powerpc/ncg/mach.c |   6 +-
 mach/powerpc/ncg/table  |  20 +++----
 mach/powerpc/top/table  | 130 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 142 insertions(+), 14 deletions(-)

diff --git a/mach/powerpc/ncg/mach.c b/mach/powerpc/ncg/mach.c
index a31879de9..06e39709f 100644
--- a/mach/powerpc/ncg/mach.c
+++ b/mach/powerpc/ncg/mach.c
@@ -203,7 +203,7 @@ saveloadregs(const char* ops, const char* opm, const char *opf)
 	for (reg = 31; reg >= 0; reg--) {
 		if (savedf[reg] != LONG_MIN) {
 			offset -= 8;
-			fprintf(codefile, "%s f%d, %ld(fp)\n",
+			fprintf(codefile, "%s f%d,%ld(fp)\n",
 				opf, reg, offset);
 		}
 	}
@@ -220,7 +220,7 @@ saveloadregs(const char* ops, const char* opm, const char *opf)
 		while (reg > 0 && savedi[reg - 1] != LONG_MIN)
 			reg--;
 		offset -= (32 - reg) * 4;
-		fprintf(codefile, "%s r%d, %ld(fp)\n", opm, reg, offset);
+		fprintf(codefile, "%s r%d,%ld(fp)\n", opm, reg, offset);
 	} else
 		reg = 32;
 
@@ -228,7 +228,7 @@ saveloadregs(const char* ops, const char* opm, const char *opf)
 	for (reg--; reg >= 0; reg--) {
 		if (savedi[reg] != LONG_MIN) {
 			offset -= 4;
-			fprintf(codefile, "%s r%d, %ld(fp)\n",
+			fprintf(codefile, "%s r%d,%ld(fp)\n",
 				ops, reg, offset);
 		}
 	}
diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 367942408..df06a5d49 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -334,9 +334,9 @@ INSTRUCTIONS
   lhax            GPR:wo, GPR:ro, GPR:ro cost(4, 3).
   lhz             GPR:wo, SET_RC_H:ro cost(4, 3).
   lhzx            GPR:wo, GPR:ro, GPR:ro cost(4, 3).
+  lwz             GPR+LOCAL:wo, SET_RC_W:ro cost(4, 3).
   lwzu            GPR:wo, IND_RC_W:rw cost(4, 3).
   lwzx            GPR:wo, GPR:ro, GPR:ro cost(4, 3).
-  lwz             GPR+LOCAL:wo, SET_RC_W:ro cost(4, 3).
   mfcr            GPR:wo cost(4,2).
   mfspr           GPR:wo, SPR:ro cost(4, 3).
   mtspr           SPR:wo, GPR:ro cost(4, 2).
@@ -361,7 +361,6 @@ INSTRUCTIONS
   rlwnm           GPR:wo, GPR:ro, GPR:ro, CONST:ro, CONST:ro.
     rotlw         GPR+LOCAL:wo, GPR:ro, GPR:ro.
   slw             GPR+LOCAL:wo, GPR:ro, GPR:ro.
-  subf            GPR:wo, GPR:ro, GPR:ro.
   sraw            GPR+LOCAL:wo, GPR:ro, GPR:ro cost(4, 2).
   srawi           GPR+LOCAL:wo, GPR:ro, CONST:ro cost(4, 2).
   srw             GPR+LOCAL:wo, GPR:ro, GPR:ro.
@@ -378,6 +377,7 @@ INSTRUCTIONS
   stw             GPR:ro, SET_RC_W:rw cost(4, 3).
   stwx            GPR:ro, GPR:ro, GPR:ro cost(4, 3).
   stwu            GPR:ro, IND_RC_W:rw cost(4, 3).
+  subf            GPR:wo, GPR:ro, GPR:ro.
   xor             GPR:wo, GPR:ro, GPR:ro.
   xori            GPR:wo, GPR:ro, CONST:ro.
   xoris           GPR:wo, GPR:ro, CONST:ro.
@@ -762,6 +762,10 @@ STACKINGRULES
 
 COERCIONS
 
+	/* The unstacking coercions emit many "addi sp, sp, X"
+	 * instructions; the target optimizer (top) will merge them.
+	 */
+
 	from STACK
 		uses REG
 		gen
@@ -2103,12 +2107,13 @@ PATTERNS
 			mr fp, r0
 			blr.
 
+	/* If "ret" coerces STACK to REG3, then top will delete the
+	 * extra "addi sp, sp, 4".
+	 */
+
 	pat ret $1==4                      /* Return from procedure, word */
 		with REG3
 			leaving ret 0
-		with STACK
-			gen lwz r3, {IND_RC_W, sp, 0}
-			leaving ret 0
 
 	pat ret $1==8                      /* Return from proc, double-word */
 		with REG3 INT_W
@@ -2117,11 +2122,6 @@ PATTERNS
 		with REG3 STACK
 			gen lwz r4, {IND_RC_W, sp, 0}
 			leaving ret 0
-		with STACK
-			gen
-				lwz r3, {IND_RC_W, sp, 0}
-				lwz r4, {IND_RC_W, sp, 4}
-			leaving ret 0
 
 	/*
 	 * These rules for blm/bls are wrong if length is zero.
diff --git a/mach/powerpc/top/table b/mach/powerpc/top/table
index b3f5b3a31..cbc16c277 100644
--- a/mach/powerpc/top/table
+++ b/mach/powerpc/top/table
@@ -1,11 +1,12 @@
 
 /* PowerPC table for ACK target optimizer */
 
-MAXOP 3;
+MAXOP 5;
 LABEL_STARTER '.';
 
 %%;
 
+L1, L2, L3, L4, L5  { not_using_sp(VAL) };
 RNZ                 { strcmp(VAL, "r0") };  /* not r0 */
 X, Y, Z             { TRUE };
 
@@ -16,6 +17,47 @@ X, Y, Z             { TRUE };
 addi  RNZ, RNZ, 0            -> ;
 addis RNZ, RNZ, 0            -> ;
 
+addi RNZ, RNZ, X : addi RNZ, RNZ, Y { plus(X, Y, Z) }
+                             -> addi RNZ, RNZ, Z ;
+
+/* Lower "addi sp, sp, X" by lifting other instructions, looking for
+ * chances to merge or delete _addi_ instructions, and assuming that
+ * the code generator uses "sp" not "r1".
+ */
+addi sp, sp, X : ANY L1                 { lift(ANY) }
+                             -> ANY L1                 : addi sp, sp, X ;
+addi sp, sp, X : ANY L1, L2             { lift(ANY) }
+                             -> ANY L1, L2             : addi sp, sp, X ;
+addi sp, sp, X : ANY L1, L2, L3         { lift(ANY) }
+                             -> ANY L1, L2, L3         : addi sp, sp, X ;
+addi sp, sp, X : ANY L1, L2, L3, L4     { lift(ANY) }
+ -> ANY L1, L2, L3, L4     : addi sp, sp, X ;
+addi sp, sp, X : ANY L1, L2, L3, L4, L5 { lift(ANY) }
+                             -> ANY L1, L2, L3, L4, L5 : addi sp, sp, X ;
+addi sp, sp, X : lmw Y, L1 { Y[0]=='r' && atoi(Y+1)>1 }
+                             -> lmw Y, L1 : addi sp, sp, X ;
+
+/* Merge _addi_ when popping from the stack. */
+addi sp, sp, X : lwz L1, Y(sp) { plus(X, Y, Z) && Z[0]!='-' }
+                             -> lwz L1, Z(sp) : addi sp, sp, X ;
+addi sp, sp, X : lfs L1, Y(sp) { plus(X, Y, Z) && Z[0]!='-' }
+                             -> lfs L1, Z(sp) : addi sp, sp, X ;
+addi sp, sp, X : lfd L1, Y(sp) { plus(X, Y, Z) && Z[0]!='-' }
+                             -> lfd L1, Z(sp) : addi sp, sp, X ;
+
+/* Lower or delete _addi_ when pushing to the stack. */
+addi sp, sp, X : stwu  L1, Y(sp) { plus(X, Y, Z) && Z[0]!='-' }
+                             -> stw  L1, Z(sp) : addi sp, sp, Z ;
+addi sp, sp, X : stfsu L1, Y(sp) { plus(X, Y, Z) && Z[0]!='-' }
+                             -> stfs L1, Z(sp) : addi sp, sp, Z ;
+addi sp, sp, X : stfdu L1, Y(sp) { plus(X, Y, Z) && Z[0]!='-' }
+                             -> stfd L1, Z(sp) : addi sp, sp, Z ;
+addi sp, sp, 4 : stfdu L1, -8(sp) -> stfdu L1, -4(sp) ;
+
+/* Delete _addi_ when setting the stack pointer. */
+addi sp, sp, X : addi sp, L1, Y   -> addi sp, L1, Y ;
+addi sp, sp, X : lwz sp, L1       -> lwz sp, L1 ;
+
 or X, Y, Y                   -> mr X, Y ;
 or. X, Y, Y                  -> mr. X, Y ;
 
@@ -50,3 +92,89 @@ b X : labdef X               -> labdef X ;
 /* LT=0, GT=1, EQ=2, OV=3 */
 
 %%;
+
+/* Word characters are the ASCII digits, the ASCII letters and '_'. */
+static int isword(char c) {
+	if (c == '_' || (c >= '0' && c <= '9'))
+		return 1;
+	return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+}
+
+/* Returns 0 when "sp" or "r1" occurs as a whole word in _s_, else 1. */
+int not_using_sp(const char *s) {
+	const char *p;
+	int word_start = 1;	/* no word character directly before p */
+
+	for (p = s; *p != '\0'; p++) {
+		int is_sp = (p[0] == 's' && p[1] == 'p');
+		int is_r1 = (p[0] == 'r' && p[1] == '1');
+
+		if (word_start && (is_sp || is_r1) && !isword(p[2]))
+			return 0;
+		word_start = !isword(*p);
+	}
+	return 1;
+}
+
+
+/* Instructions to lift(), from ../ncg/table minus branches.  Keep
+ * them in strcmp() order, because lift() searches with bsearch().
+ */
+const char *liftables[] = {
+	"add", "add.", "addi",
+	"and", "andc", "andi.", "andis.",
+	"cmp", "cmpi", "cmpl", "cmpli",
+	"cmplw", "cmplwi", "cmpw", "cmpwi",
+	"divw", "divwu", "eqv", "extlwi", "extrwi", "extsb", "extsh",
+	"fadd", "fadds", "fcmpo", "fctiwz", "fdiv", "fdivs",
+	"fmr", "fmul", "fmuls", "fneg", "frsp", "fsub", "fsubs",
+	"lbz", "lbzx",
+	"lfd", "lfdu", "lfdx", "lfs", "lfsu", "lfsx",
+	"lha", "lhax", "lhz", "lhzx",
+	"li", "lis", "lwz", "lwzu", "lwzx",
+	"mfcr", "mfspr", "mr", "mr.", "mtspr", "mullw",
+	"nand", "neg", "nor", "or", "or.", "ori", "oris",
+	"rlwinm", "rlwnm", "rotlwi", "rotrwi",
+	"slw", "slwi", "sraw", "srawi", "srw", "srwi",
+	"stb", "stbx",
+	"stfd", "stfdu", "stfdx", "stfs", "stfsu", "stfsx",
+	"sth", "sthx", "stw", "stwu", "stwx",
+	"subf", "xor", "xori", "xoris",
+};
+
+static int liftcmp(const void *a, const void *b) {	/* bsearch() comparator; a, b point to (const char *) */
+	return strcmp(*(const char **)a, *(const char **)b);
+}
+
+/* May we lift instruction _s_ above "addi SP, SP, X"?  Returns 1 or 0. */
+int lift(const char *s) {
+	return bsearch(&s, liftables,
+	    sizeof(liftables) / sizeof(liftables[0]),
+	    sizeof(liftables[0]), liftcmp) != NULL;
+}
+
+
+/* Is _l_ inside the signed 16-bit range -32768..32767? */
+static int fits16(long l) {
+	return !(l < -32768 || l > 32767);
+}
+
+/* Tries sum = a + b with signed 16-bit integers; returns 1 on success. */
+int plus(const char *a, const char *b, char *sum)
+{
+	long la, lb, lsum;
+	char *end;
+	/* Fail (return 0) unless _a_ and _b_ are whole decimal numbers. */
+	la = strtol(a, &end, 10);
+	if (*a == '\0' || *end != '\0' || !fits16(la))
+		return 0;
+	lb = strtol(b, &end, 10);
+	if (*b == '\0' || *end != '\0' || !fits16(lb))
+		return 0;
+
+	lsum = la + lb;
+	if (!fits16(lsum))
+		return 0;
+	snprintf(sum, 7, "%ld", lsum);	/* "-32768" plus NUL fits in 7 bytes */
+	return 1;
+}

From 64b50b3a45476976b9dc6ad9b91c56f50e2958b7 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Sun, 7 Jan 2018 16:03:55 -0500
Subject: [PATCH 29/55] Shrink .cfu8

With my PowerBook G4, a program that converts values from 1.0 to
4000000.0 runs in about 0.32s with the old .cfu8 and 0.29s with this
shrunken .cfu8

Leave a comment about other ways to implement .cfu8
---
 mach/powerpc/libem/build.lua     |  2 +-
 mach/powerpc/libem/cfu8.s        | 66 ++++++++++++++++++--------------
 mach/powerpc/libem/fd_80000000.s | 10 -----
 mach/powerpc/libem/fd_FFFFFFFF.s | 10 -----
 4 files changed, 39 insertions(+), 49 deletions(-)
 delete mode 100644 mach/powerpc/libem/fd_80000000.s
 delete mode 100644 mach/powerpc/libem/fd_FFFFFFFF.s

diff --git a/mach/powerpc/libem/build.lua b/mach/powerpc/libem/build.lua
index ac84e3b0f..7a0726b80 100644
--- a/mach/powerpc/libem/build.lua
+++ b/mach/powerpc/libem/build.lua
@@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
 		srcs = {
-			"./*.s", -- trp.s
+			"./*.s", -- cfu8.s
 		},
 		vars = { plat = plat },
 		deps = {
diff --git a/mach/powerpc/libem/cfu8.s b/mach/powerpc/libem/cfu8.s
index 915f84dd2..fd69ff521 100644
--- a/mach/powerpc/libem/cfu8.s
+++ b/mach/powerpc/libem/cfu8.s
@@ -1,3 +1,5 @@
+.sect .text; .sect .rom; .sect .data; .sect .bss
+
 .sect .text
 
 ! Converts a 64-bit double into a 32-bit unsigned integer.
@@ -6,32 +8,40 @@
 
 .define .cfu8
 .cfu8:
-	lis r3, ha16[.fd_00000000]
-	lfd f0, lo16[.fd_00000000](r3) ! f0 = 0.0
-
-	lfd f1, 0(sp)            ! value to be converted
-
-	lis r3, ha16[.fd_FFFFFFFF]
-	lfd f3, lo16[.fd_FFFFFFFF](r3) ! f3 = 0xFFFFFFFF
-
-	lis r3, ha16[.fd_80000000]
-	lfd f4, lo16[.fd_80000000](r3) ! f4 = 0x80000000
-
-	fsel f2, f1, f1, f0
-	fsub f5, f3, f1
-	fsel f2, f5, f2, f3
-	fsub f5, f2, f4
-	fcmpu cr0, f2, f4
-	fsel f2, f5, f5, f2
-	fctiwz f2, f2
-	
-	stfd f2, 0(sp)
-	addi sp, sp, 4
-
-	bltlr
-
-	lwz r3, 0(sp)
-	xoris r3, r3, 0x8000
-	stw r3, 0(sp)
-
+	lfd f1, 0(sp)                   ! f1 = value to convert
+	lis r3, ha16[.fs_80000000]
+	lfs f2, lo16[.fs_80000000](r3)  ! f2 = 2**31
+	fsub   f1, f1, f2
+	fctiwz f1, f1         ! convert value - 2**31
+	stfd   f1, 0(sp)
+	lwz   r3, 4(sp)
+	xoris r3, r3, 0x8000  ! add 2**31
+	stw   r3, 4(sp)
+	addi  sp, sp, 4
 	blr
+
+.sect .rom
+.fs_80000000:
+	!float 2.147483648e+9 sz 4
+	.data1 0117,00,00,00
+
+! Freescale and IBM provide an example using fsel to select value or
+! value - 2**31 for fctiwz.  The following code adapts Freescale's
+! _Programming Environments Manual for 32-Bit Implementations of the
+! PowerPC Architecture_, section C.3.2, pdf page 557.
+!
+! Given f2 = value clamped from 0 to 2**32 - 1, f4 = 2**31, then
+!	fsub	f5, f2, f4
+!	fcmpu	cr2, f2, f4
+!	fsel	f2, f5, f5, f2
+!	fctiwz	f2, f2
+!	stfdu	f2, 0(sp)
+!	lwz	r3, 4(sp)
+!	blt	cr2, 1f
+!	xoris	r3, r3, 0x8000
+! 1: yields r3 = the converted value.
+!
+! Debian's clang 3.5.0-10 and gcc 4.9.2-10 don't clamp the value
+! before conversion.  They avoid fsel and put the conditional branch
+! before fctwiz.  PowerPC 601 lacks fsel (but kernel might trap and
+! emulate fsel).  PowerPC 603, 604, G3, G4, G5 have fsel.
diff --git a/mach/powerpc/libem/fd_80000000.s b/mach/powerpc/libem/fd_80000000.s
deleted file mode 100644
index 5c153bba8..000000000
--- a/mach/powerpc/libem/fd_80000000.s
+++ /dev/null
@@ -1,10 +0,0 @@
-.sect .text; .sect .rom; .sect .data; .sect .bss
-
-.sect .rom
-
-! Contains a handy double-precision 0x80000000.
-
-.define .fd_80000000
-.fd_80000000:
-	!float 2.147483648e+9 sz 8
-	.data1 0101,0340,00,00,00,00,00,00
diff --git a/mach/powerpc/libem/fd_FFFFFFFF.s b/mach/powerpc/libem/fd_FFFFFFFF.s
deleted file mode 100644
index 88cf04bd9..000000000
--- a/mach/powerpc/libem/fd_FFFFFFFF.s
+++ /dev/null
@@ -1,10 +0,0 @@
-.sect .text; .sect .rom; .sect .data; .sect .bss
-
-.sect .rom
-
-! Contains a handy double-precision 0xFFFFFFFF.
-
-.define .fd_FFFFFFFF
-.fd_FFFFFFFF:
-	!float 4.294967295e+9 sz 8
-	.data1 0101,0357,0377,0377,0377,0340,00,00

From de2c7c3f253787e98b389305f3f3e9cdf27ce03d Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 8 Jan 2018 22:26:24 -0500
Subject: [PATCH 30/55] Hide some i386 stuff from linux68k, linuxppc.

Rename plat/linux/libsys/errno.s to plat/linux386/libsys/trapno.s and
stop building it for linux68k and linuxppc.  It defines symbols for
mach/i386/libem.

In syscalls.h, the numbers after 165 are only for i386, so hide them
from 68k, ppc.  These numbers are unused, because the system calls now
in libsys use the lower numbers.

Also teach the build system that libsys depends on the internal
headers in plat/linux/libsys/*.h
---
 plat/linux/libsys/syscalls.h                            | 7 +++++++
 plat/linux386/libsys/build.lua                          | 1 +
 plat/{linux/libsys/errno.s => linux386/libsys/trapno.s} | 0
 plat/linux68k/libsys/build.lua                          | 1 +
 plat/linuxppc/libsys/build.lua                          | 2 +-
 plat/osx386/libsys/build.lua                            | 2 +-
 6 files changed, 11 insertions(+), 2 deletions(-)
 rename plat/{linux/libsys/errno.s => linux386/libsys/trapno.s} (100%)

diff --git a/plat/linux/libsys/syscalls.h b/plat/linux/libsys/syscalls.h
index 19d5543c6..8bddcc0ee 100644
--- a/plat/linux/libsys/syscalls.h
+++ b/plat/linux/libsys/syscalls.h
@@ -174,6 +174,12 @@
 #define __NR_mremap 163
 #define __NR_setresuid 164
 #define __NR_getresuid 165
+
+/*
+ * i386, m68020, powerpc use different numbers after 165.
+ * This file only has the numbers for i386.
+ */
+#if defined(__i386)
 #define __NR_vm86 166
 #define __NR_query_module 167
 #define __NR_poll 168
@@ -324,5 +330,6 @@
 
 #define concat(x, y) x##y
 #define MAPPED_SYSCALL(p, n) .define concat(p,n); concat(p,n): xor eax, eax; movb al, concat(__NR_,n); jmp __mapped_syscall
+#endif /* __i386 */
 
 #endif
diff --git a/plat/linux386/libsys/build.lua b/plat/linux386/libsys/build.lua
index a4d2d7447..7de7b4061 100644
--- a/plat/linux386/libsys/build.lua
+++ b/plat/linux386/libsys/build.lua
@@ -6,6 +6,7 @@ acklibrary {
         "plat/linux/libsys/*.s",
     },
 	deps = {
+		"plat/linux/libsys/*.h",
 		"lang/cem/libcc.ansi/headers+headers",
 		"plat/linux386/include+headers",
 	},
diff --git a/plat/linux/libsys/errno.s b/plat/linux386/libsys/trapno.s
similarity index 100%
rename from plat/linux/libsys/errno.s
rename to plat/linux386/libsys/trapno.s
diff --git a/plat/linux68k/libsys/build.lua b/plat/linux68k/libsys/build.lua
index ded71cdd1..c17436517 100644
--- a/plat/linux68k/libsys/build.lua
+++ b/plat/linux68k/libsys/build.lua
@@ -6,6 +6,7 @@ acklibrary {
         "plat/linux/libsys/*.s",
     },
 	deps = {
+		"plat/linux/libsys/*.h",
 		"lang/cem/libcc.ansi/headers+headers",
 		"plat/linux68k/include+headers",
 	},
diff --git a/plat/linuxppc/libsys/build.lua b/plat/linuxppc/libsys/build.lua
index 696c62d42..f58df16ea 100644
--- a/plat/linuxppc/libsys/build.lua
+++ b/plat/linuxppc/libsys/build.lua
@@ -8,7 +8,6 @@ acklibrary {
 		"plat/linux/libsys/_hol0.s",
 		"plat/linux/libsys/close.c",
 		"plat/linux/libsys/creat.c",
-		"plat/linux/libsys/errno.s",
 		"plat/linux/libsys/execve.c",
 		"plat/linux/libsys/getpid.c",
 		"plat/linux/libsys/gettimeofday.c",
@@ -25,6 +24,7 @@ acklibrary {
 		"plat/linux/libsys/write.c",
 	},
 	deps = {
+		"plat/linux/libsys/*.h",
 		"lang/cem/libcc.ansi/headers+headers",
 		"plat/linuxppc/include+headers",
 	},
diff --git a/plat/osx386/libsys/build.lua b/plat/osx386/libsys/build.lua
index 23e491f7a..6a5b0e58c 100644
--- a/plat/osx386/libsys/build.lua
+++ b/plat/osx386/libsys/build.lua
@@ -19,7 +19,7 @@ acklibrary {
 		"./sigaction.s",
 		"./stat.s",
 		"./write.s",
-		"plat/linux/libsys/errno.s",
+		"plat/linux386/libsys/trapno.s",
 		"plat/osx/libsys/brk.c",
 		"plat/osx/libsys/creat.c",
 		"plat/osx/libsys/isatty.c",

From 2b09d3756c0fec8643e476f5ea7c564a9a930255 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Tue, 9 Jan 2018 00:39:03 -0500
Subject: [PATCH 31/55] These are EM trap numbers.

Remove .sect; absolute symbols don't go in a section.
---
 plat/linux386/libsys/trapno.s | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/plat/linux386/libsys/trapno.s b/plat/linux386/libsys/trapno.s
index 550fd6d7c..4996de338 100644
--- a/plat/linux386/libsys/trapno.s
+++ b/plat/linux386/libsys/trapno.s
@@ -1,21 +1,7 @@
-#
-! $Source: /cvsroot/tack/Ack/plat/linux386/libsys/errno.s,v $
-! $State: Exp $
-! $Revision: 1.1 $
-
-! Declare segments (the order is important).
-
-.sect .text
-.sect .rom
-.sect .data
-.sect .bss
-
 #define D(e) .define e; e
 
-.sect .data
-
-! Define various ACK error numbers. Note that these are *not* ANSI C
-! errnos, and are used for different purposes.
+! Define various EM trap numbers needed by mach/i386/libem.
+! Note that these are *not* ANSI C errnos.
 
 D(ERANGE)         = 1
 D(ESET)           = 2
@@ -25,4 +11,3 @@ D(EILLINS)        = 18
 D(EODDZ)          = 19
 D(ECASE)          = 20
 D(EBADMON)        = 25
-

From 103d44c27c06f07bda935f0caa4e03114cd102b1 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 11 Jan 2018 17:59:02 -0500
Subject: [PATCH 32/55] Rewrite sigaction() to prevent another race.

A signal handler might call sigaction().  We must block all signals,
not only our signal, to prevent a race between us and the next signal
handler.

Use /* comments */ because cpp might expand macros in ! comments
though such expansion is probably harmless.

The bridge is now shorter by 2 instructions.
---
 plat/linuxppc/libsys/sigaction.s | 284 ++++++++++++++++++-------------
 1 file changed, 161 insertions(+), 123 deletions(-)

diff --git a/plat/linuxppc/libsys/sigaction.s b/plat/linuxppc/libsys/sigaction.s
index 0509c8e72..1b1cea24a 100644
--- a/plat/linuxppc/libsys/sigaction.s
+++ b/plat/linuxppc/libsys/sigaction.s
@@ -1,156 +1,194 @@
 #define __NR_sigaction		67
-#define SIG_BLOCK		0
+#define __NR_sigprocmask	126
 #define SIG_SETMASK		2
-#define MAXSIG			32
 
-/* offsets into our stack frame */
-#define mynew	16	/* new sigaction */
-#define mynset	32	/* new signal set */
-#define myoset	36	/* old signal set */
-#define mysave	40
-#define mysize	56
+/* offsets into struct sigaction */
+#define sa_handler	0	/* in union with sa_sigaction */
+#define sa_mask		4
+#define sa_flags	8
+#define sa_restorer	12
+
+/* offsets from stack pointer */
+#define mynewact	16	/* struct sigaction */
+#define myoldact	32
+#define newmask		64	/* signal set */
+#define oldmask		68
+#define oldhandler	72
+#define myret		76
+#define savelr		80
+#define signum		84	/* first argument */
+#define newact		88
+#define oldact		92
 
 .sect .text; .sect .rodata; .sect .data; .sect .bss
 
 /*
  * Linux calls signal handlers with arguments in registers, but the
  * ACK expects arguments on the stack.  This sigaction() uses a
- * "bridge" to move the arguments.
+ * "bridge" to move the arguments, but
+ *
+ *  - If the caller passes a bad pointer, this sigaction() causes
+ *    SIGBUS or SIGSEGV instead of setting errno = EFAULT.
+ *
+ *  - This sigaction() only works with signals 1 to 31, not with
+ *    real-time signals 32 to 64.
+ *
+ *  - This sigaction() is not safe for multiple threads.
+ *
+ * int sigaction(int signum, const struct sigaction *newact,
+ *		 struct sigaction *oldact);
  */
 .sect .text
 .define _sigaction
 _sigaction:
 	mflr	r0
-	subi	r1, r1, mysize
-	stw	r31, mysave+8(r1)
-	stw	r30, mysave+4(r1)
-	stw	r29, mysave(r1)
-	stw	r0, mysave+12(r1)
-	li	r3, 0
-	stw	r3, mynset(r1)	   	! mynset = 0
-	lwz	r29, mysize(r1)		! r29 = signal number
-	lwz	r30, mysize+4(r1)	! r30 = new action
-	lwz	r31, mysize+8(r1)	! r31 = old action
+	li	r3, __NR_sigprocmask
+	stwu	r3, -signum(sp)		/* keep 0(sp) = __NR_sigprocmask */
+	stw	r0, savelr(sp)
+
+	/* Copy newact to stack (before blocking SIGBUS, SIGSEGV). */
+	lwz	r3, newact(sp)
+	mr.	r3, r3
+	beq	1f			/* skip if newact == NULL */
+	lwz	r4, sa_handler(r3)
+	lwz	r5, sa_mask(r3)
+	lwz	r6, sa_flags(r3)
+	lwz	r7, sa_restorer(r3)
+	stw	r4, mynewact+sa_handler(sp)
+	stw	r5, mynewact+sa_mask(sp)
+	stw	r6, mynewact+sa_flags(sp)
+	stw	r7, mynewact+sa_restorer(sp)
+
 	/*
-	 * If the new action is non-NULL, the signal number is in
-	 * range 1 to MAXSIG, and the new handler is not SIG_DFL 0
-	 * or SIG_IGN 1, then we interpose our bridge.
+	 * Block all signals to prevent a race.  After we set sharray,
+	 * we must call the kernel's sigaction before the next signal
+	 * handler runs.  This prevents two problems:
+	 *
+	 *  - The bridge might call the new handler while the kernel
+	 *    uses the mask and flags of the old handler.
+	 *
+	 *  - The signal handler might call sigaction() and destroy
+	 *    sharray.  We must block all signals because any signal
+	 *    handler might call sigaction() for our signal.
 	 */
-	cmpwi	cr0, r30, 0
-	subi	r7, r29, 1		! r7 = index in handlers
-	cmplwi	cr7, r7, MAXSIG		! unsigned comparison
-	beq	cr0, kernel
-	bge	cr7, kernel
-	lwz	r3, 0(r30)		! r3 = new handler
-	clrrwi.	r3, r3, 1
-	beq	cr0, kernel
-	/*
-	 * Block the signal while we build the bridge.  Prevents a
-	 * race if a signal arrives after we change the bridge but
-	 * before we change the action in the kernel.
-	 */
-	li	r4, 1
-	slw	r4, r4, r7
-	stw	r4, mynset(r1)		! mynmask = 1 << (signal - 1)
-	li	r3, SIG_BLOCK
-	la	r4, mynset(r1)
-	la	r5, myoset(r1)
-	stw	r3, 0(r1)
-	stw	r4, 4(r1)
-	stw	r5, 8(r1)
-	bl	_sigprocmask
-	/*
-	 * Point our bridge to the new signal handler.  Then copy the
-	 * new sigaction but point it to our bridge.
-	 */
-	lis	r6, hi16[handlers]
-	ori	r6, r6, lo16[handlers]
-	subi	r7, r29, 1
-	slwi	r7, r7, 2
-	lwz	r3, 0(r30)		! r3 = new handler
-	stwx	r3, r6, r7		! put it in array of handlers
-	lis	r3, hi16[bridge]
-	ori	r3, r3, lo16[bridge]
-	lwz	r4, 4(r30)
-	lwz	r5, 8(r30)
-	lwz	r6, 12(r30)
-	stw	r3, mynew(r1)		! sa_handler or sa_sigaction
-	stw	r4, mynew+4(r1)		! sa_mask
-	stw	r5, mynew+8(r1)		! sa_flags
-	stw	r6, mynew+12(r1)	! sa_restorer
-	la	r30, mynew(r1)
-kernel:
-	li	r3, __NR_sigaction
-	stw	r3, 0(r1)
-	stw	r29, 4(r1)
-	stw	r30, 8(r1)
-	stw	r31, 12(r1)
+1:	li	r4, SIG_SETMASK
+	li	r5, -1			/* mask signals 1 to 32 */
+	stw	r5, newmask(sp)
+	la	r5, newmask(sp)
+	la	r6, oldmask(sp)
+	stw	r4, 4(sp)		/* kept 0(sp) = __NR_sigprocmask */
+	stw	r5, 8(sp)
+	stw	r6, 12(sp)
 	bl	__syscall
+
 	/*
-	 * If we blocked the signal, then restore the old signal mask.
+	 * If the signal number is in range 1 to 31, and the new
+	 * handler is not SIG_DFL 0 or SIG_IGN 1, then we interpose
+	 * our bridge.
 	 */
-	lwz	r3, mynset(r1)
-	cmpwi	cr0, r3, 0
-	beq	cr0, fixold
-	li	r3, SIG_SETMASK
-	la	r4, myoset(r1)
-	li	r5, 0
-	stw	r3, 0(r1)
-	stw	r4, 4(r1)
-	stw	r5, 8(r1)
-	bl	_sigprocmask
-	/*
-	 * If the old sigaction is non-NULL and points to our bridge,
-	 * then point it to the signal handler.
-	 */
-fixold:
-	cmpwi	cr0, r31, 0
-	beq	cr0, leave
-	lis	r3, hi16[bridge]
-	ori	r3, r3, lo16[bridge]
-	lwz	r4, 0(r31)
-	cmpw	cr0, r3, r4
-	bne	cr0, leave
-	lis	r6, hi16[handlers]
-	ori	r6, r6, lo16[handlers]
-	subi	r7, r29, 1
-	slwi	r7, r7, 2
-	lwzx	r3, r6, r7	! get it from array of handlers
-	stw	r3, 0(r31)	! put it in old sigaction
-leave:
-	lwz	r0, mysave+12(r1)
-	lwz	r29, mysave(r1)
-	lwz	r30, mysave+4(r1)
-	lwz	r31, mysave+8(r1)
-	addi	r1, r1, mysize
+	lwz	r4, signum(sp)		/* keep r4 = signum */
+	addi	r5, r4, -1
+	cmplwi	r5, 30
+	bgt	2f			/* skip if out of range */
+
+	slwi	r5, r5, 2		/* r5 = sharray index */
+	lis	r6, ha16[sharray]
+	la	r6, lo16[sharray](r6)	/* r6 = sharray */
+	lwzx	r0, r6, r5
+	stw	r0, oldhandler(sp)	/* remember old handler */
+	lwz	r0, newact(sp)
+	mr.	r0, r0
+	beq	2f			/* skip if newact == NULL */
+
+	lwz	r3, mynewact+sa_handler(sp)
+	cmplwi	r3, 2			/* r3 = new handler */
+	blt	2f			/* skip if SIG_DFL or SIG_IGN */
+
+	stwx	r3, r6, r5		/* put new handler in sharray */
+	lis	r3, ha16[sigbridge]
+	la	r3, lo16[sigbridge](r3)
+	stw	r3, mynewact+sa_handler(sp)
+
+	/* Call the kernel's sigaction. */
+	/* sigaction(signum, &mynewact or NULL, &myoldact or NULL) */
+2:	li	r3, __NR_sigaction
+	lwz	r0, newact(sp)
+	mr.	r0, r0
+	beq	3f
+	la	r5, mynewact(sp)
+	b	4f
+3:	li	r5, 0
+4:	lwz	r0, oldact(sp)
+	mr.	r0, r0
+	beq	5f
+	la	r6, myoldact(sp)
+	b	6f
+5:	li	r6, 0
+6:	stw	r3, 0(sp)
+	stw	r4, 4(sp)		/* kept r4 = signum */
+	stw	r5, 8(sp)
+	stw	r6, 12(sp)
+	bl	__syscall
+	stw	r3, myret(sp)
+
+	/* Unblock signals by restoring old signal mask. */
+	li	r3, __NR_sigprocmask
+	li	r4, SIG_SETMASK
+	la	r5, oldmask(sp)
+	li	r6, 0
+	stw	r3, 0(sp)
+	stw	r4, 4(sp)
+	stw	r5, 8(sp)
+	stw	r6, 12(sp)
+	bl	__syscall
+
+	/* Copy oldact from stack (after unblocking BUS, SEGV). */
+	lwz	r3, oldact(sp)
+	mr.	r3, r3
+	beq	8f			/* skip if oldact == NULL */
+	lwz	r4, myoldact+sa_handler(sp)
+	lis	r5, ha16[sigbridge]
+	la	r5, lo16[sigbridge](r5)
+	cmpw	r4, r5
+	bne	7f
+	lwz	r4, oldhandler(sp)
+7:	lwz	r5, myoldact+sa_mask(sp)
+	lwz	r6, myoldact+sa_flags(sp)
+	lwz	r7, myoldact+sa_restorer(sp)
+	stw	r4, sa_handler(r3)
+	stw	r5, sa_mask(r3)
+	stw	r6, sa_flags(r3)
+	stw	r7, sa_restorer(r3)
+
+8:	lwz	r0, savelr(sp)
+	lwz	r3, myret(sp)
+	addi	sp, sp, signum
 	mtlr	r0
-	blr			! return from sigaction
+	blr
 
 /*
- * Linux calls bridge(signum) or bridge(signum, info, context) with
- * arguments in registers r3, r4, r5.
+ * Linux calls sigbridge(signum) or sigbridge(signum, info, context)
+ * with arguments in registers r3, r4, r5.
  */
-bridge:
+sigbridge:
 	mflr	r0
-	subi	r1, r1, 16
+	stwu	r3, -16(sp)	/* signal number */
+	stw	r4, 4(sp)	/* info */
+	stw	r5, 8(sp)	/* context */
 	stw	r0, 12(r1)
-	stw	r3, 0(r1)	! signal number
-	stw	r4, 4(r1)	! info
-	stw	r5, 8(r1)	! context
 
-	lis	r6, hi16[handlers]
-	ori	r6, r6, lo16[handlers]
-	subi	r7, r3, 1
-	slwi	r7, r7, 2
+	lis	r6, ha16[sharray - 4]
+	la	r6, lo16[sharray - 4](r6)	/* r6 = sharray - 4; la sign-extends lo16, so pair it with ha16 */
+	slwi	r7, r3, 2
 	lwzx	r6, r6, r7
 	mtctr	r6
-	bctrl			! call our signal handler
+	bctrl			/* call our signal handler */
 
-	lwz	r0, 12(r1)
+	lwz	r0, 12(sp)
 	addi	r1, r1, 16
 	mtlr	r0
-	blr			! return from bridge
+	blr			/* sigreturn(2) */
 
 .sect .bss
-handlers:
-	.space 4 * MAXSIG	! array of signal handlers
+sharray:
+	.space 4 * 31		/* handlers for signals 1 to 31 */

From f1304e1a3c2a6d32bf156dfbb8541d317bfd1e9a Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 11 Jan 2018 20:04:27 -0500
Subject: [PATCH 33/55] Use extended mnemonics and ha16/lo16.

Remove wrong comment: that's a right shift, not a left shift.
---
 plat/linuxppc/boot.s            |  2 +-
 plat/linuxppc/libsys/_syscall.s | 37 +++++++++++++--------------------
 plat/osxppc/boot.s              |  2 +-
 plat/osxppc/libsys/set_errno.s  |  8 +++----
 4 files changed, 20 insertions(+), 29 deletions(-)

diff --git a/plat/linuxppc/boot.s b/plat/linuxppc/boot.s
index 2da5dd556..33b2abd61 100644
--- a/plat/linuxppc/boot.s
+++ b/plat/linuxppc/boot.s
@@ -32,7 +32,7 @@ begtext:
 
 	lwz r3, 0(sp)            ! r3 = argc
 	addi r4, sp, 4           ! r4 = argv
-	rlwinm r5, r3, 32-2, 2, 31 ! shift left 2 bits
+	srwi r5, r3, 2
 	add r5, r5, r4 
 	addi r5, r5, 8           ! r5 = env
 	
diff --git a/plat/linuxppc/libsys/_syscall.s b/plat/linuxppc/libsys/_syscall.s
index c7e818830..f60423bea 100644
--- a/plat/linuxppc/libsys/_syscall.s
+++ b/plat/linuxppc/libsys/_syscall.s
@@ -12,17 +12,8 @@
 
 .sect .text
 
-EINVAL = 22
+#define EINVAL 22
 
-#define IFFALSE 4
-#define IFTRUE 12
-#define ALWAYS 20
-
-#define LT 0
-#define GT 1
-#define EQ 2
-#define OV 3
-	
 ! Perform a Linux system call.
 
 .define __syscall
@@ -32,21 +23,21 @@ __syscall:
 	lwz r4, 8(sp)
 	lwz r5, 12(sp)
 	sc 0
-	bclr IFFALSE, OV, 0
-	
+	bnslr
+
 	! On error, r3 contains the errno.	
 	! It just so happens that errnos 1-34 are the same in Linux as in ACK.
-	cmpi cr0, 0, r3, 1
-	bc IFTRUE, LT, 2f
-	cmpi cr0, 0, r3, 34
-	bc IFTRUE, GT, 2f
-	
+	cmpwi r3, 1
+	blt 2f
+	cmpwi r3, 34
+	bgt 2f
+
 3:
-	li32 r4, _errno
-	stw r3, 0(r4)
-	addi r3, r0, -1
-	bclr ALWAYS, 0, 0
-	
+	lis r4, ha16[_errno]
+	stw r3, lo16[_errno](r4)
+	li r3, -1
+	blr
+
 2:
-	addi r3, r0, EINVAL
+	li r3, EINVAL
 	b 3b
diff --git a/plat/osxppc/boot.s b/plat/osxppc/boot.s
index e96198eb4..1517ee377 100644
--- a/plat/osxppc/boot.s
+++ b/plat/osxppc/boot.s
@@ -29,7 +29,7 @@ begtext:
 
 	lwz r3, 0(sp)            ! r3 = argc
 	addi r4, sp, 4           ! r4 = argv
-	rlwinm r5, r3, 32-2, 2, 31 ! shift left 2 bits
+	srwi r5, r3, 2
 	add r5, r5, r4
 	addi r5, r5, 8           ! r5 = env
 
diff --git a/plat/osxppc/libsys/set_errno.s b/plat/osxppc/libsys/set_errno.s
index e406865a6..beb124a7c 100644
--- a/plat/osxppc/libsys/set_errno.s
+++ b/plat/osxppc/libsys/set_errno.s
@@ -1,7 +1,7 @@
 .sect .text
 .define .set_errno
 .set_errno:
-	li32 r10, _errno
-	stw r3, 0(r10)		! set errno
-	addi r3, r0, -1		! return -1
-	bclr 20, 0, 0
+	lis r4, ha16[_errno]
+	stw r3, lo16[_errno](r4)	! set errno
+	li r3, -1			! return -1
+	blr

From 66f93f08c5a0a3af85c1ce5f5278c06882403c37 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 22 Jan 2018 14:04:15 -0500
Subject: [PATCH 34/55] Add fef 4, fif 4.  Improve fef 8, fif 8.  Other float
 changes.

When I wrote fef 8, I forgot to test denormalized numbers.  Oops.  Now
fix two of my mistakes:

 - When checking for zero, `extrwi r6, r3, 22, 12` needs to be
   `extrwi r6, r3, 20, 12`.  There are only 20 bits to extract.

 - After the multiplication by 2**64, I forgot to put the fraction in
   [0.5, 1) or (-1, -0.5] by setting IEEE exponent = 1022.

Teach fif 8 about signed zero and NaN.

In ncg/table, change cmf so NaN is not equal to any value, and comment
why ordered comparisons don't work with NaN.  Also add cost for
fctiwz, remove extra `uses REG`.

Edit comment in cfu8.s because the conditional branch might be before
or after fctiwz.
---
 mach/powerpc/libem/build.lua |  2 +-
 mach/powerpc/libem/cfu8.s    |  8 +++--
 mach/powerpc/libem/fef4.s    | 48 +++++++++++++++++++++++++++
 mach/powerpc/libem/fef8.s    | 46 +++++++++++++-------------
 mach/powerpc/libem/fif4.s    | 64 ++++++++++++++++++++++++++++++++++++
 mach/powerpc/libem/fif8.s    | 45 ++++++++++++++++---------
 mach/powerpc/ncg/table       | 30 +++++++++++++----
 7 files changed, 194 insertions(+), 49 deletions(-)
 create mode 100644 mach/powerpc/libem/fef4.s
 create mode 100644 mach/powerpc/libem/fif4.s

diff --git a/mach/powerpc/libem/build.lua b/mach/powerpc/libem/build.lua
index 7a0726b80..2709a4770 100644
--- a/mach/powerpc/libem/build.lua
+++ b/mach/powerpc/libem/build.lua
@@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
 		srcs = {
-			"./*.s", -- cfu8.s
+			"./*.s", -- fif4.s
 		},
 		vars = { plat = plat },
 		deps = {
diff --git a/mach/powerpc/libem/cfu8.s b/mach/powerpc/libem/cfu8.s
index fd69ff521..710d2a65c 100644
--- a/mach/powerpc/libem/cfu8.s
+++ b/mach/powerpc/libem/cfu8.s
@@ -42,6 +42,8 @@
 ! 1: yields r3 = the converted value.
 !
 ! Debian's clang 3.5.0-10 and gcc 4.9.2-10 don't clamp the value
-! before conversion.  They avoid fsel and put the conditional branch
-! before fctwiz.  PowerPC 601 lacks fsel (but kernel might trap and
-! emulate fsel).  PowerPC 603, 604, G3, G4, G5 have fsel.
+! before conversion.  They avoid fsel and use the conditional branch
+! to pick between 2 fctwiz instructions.
+!
+! PowerPC 601 lacks fsel (but kernel might trap and emulate fsel).
+! PowerPC 603, 604, G3, G4, G5 have fsel.
diff --git a/mach/powerpc/libem/fef4.s b/mach/powerpc/libem/fef4.s
new file mode 100644
index 000000000..a338ed0a9
--- /dev/null
+++ b/mach/powerpc/libem/fef4.s
@@ -0,0 +1,48 @@
+.sect .text
+
+! Split a single-precision float into fraction and exponent, like
+! frexpf(3) in C, http://en.cppreference.com/w/c/numeric/math/frexp
+!
+! Stack: ( single -- fraction exponent )
+
+.define .fef4
+.fef4:
+	lwz r3, 0(sp)			! r3 = word of float bits
+
+	! IEEE single = sign * 1.fraction * 2**(exponent - 127)
+	!   sign  exponent  fraction
+	!   0     1..8      9..31
+	!
+	! IEEE exponent = 126 in [0.5, 1) or (-1, -0.5].
+
+	extrwi. r6, r3, 8, 1		! r6 = IEEE exponent
+	beq 3f				! jump if zero or denormalized
+	cmpwi r6, 255
+	addi r5, r6, -126		! r5 = our exponent
+	beq 2f				! jump if infinity or NaN
+	! fall through if normalized
+
+	! Put fraction in [0.5, 1) or (-1, -0.5].
+1:	li r6, 126
+	insrwi r3, r6, 8, 1		! IEEE exponent = 126
+	! fall through
+
+2:	stw r3, 0(sp)			! push fraction
+	stwu r5, -4(sp)			! push exponent
+	blr
+
+	! Got denormalized number or zero, probably zero.
+	! If zero, then exponent must also be zero.
+3:	extrwi. r6, r3, 23, 9		! r6 = fraction
+	bne 4f				! jump if not zero
+	li r5, 0			! exponent = 0
+	b 2b
+
+	! Got denormalized number = 0.fraction * 2**-126
+4:	cntlzw r5, r6
+	addi r5, r5, -8
+	slw r6, r6, r5			! shift left to make 1.fraction
+	insrwi r3, r6, 23, 9		! set new fraction
+	li r6, -126 + 1
+	subf r5, r5, r6			! r5 = our exponent
+	b 1b
diff --git a/mach/powerpc/libem/fef8.s b/mach/powerpc/libem/fef8.s
index 26a962d8b..aff5ea3b6 100644
--- a/mach/powerpc/libem/fef8.s
+++ b/mach/powerpc/libem/fef8.s
@@ -3,7 +3,7 @@
 .sect .text
 
 ! Split a double-precision float into fraction and exponent, like
-! frexp(3) in C.
+! frexp(3) in C, http://en.cppreference.com/w/c/numeric/math/frexp
 !
 ! Stack: ( double -- fraction exponent )
 
@@ -12,42 +12,41 @@
 	lwz r3, 0(sp)			! r3 = high word (bits 0..31)
 	lwz r4, 4(sp)			! r4 = low word (bits 32..63)
 
-	! IEEE double-precision format:
+	! IEEE double = sign * 1.fraction * 2**(exponent - 1023)
 	!   sign  exponent  fraction
 	!   0     1..11     12..63
 	!
-	! To get fraction in [0.5, 1) or (-1, -0.5], we subtract 1022
-	! from the IEEE exponent.
+	! IEEE exponent = 1022 in [0.5, 1) or (-1, -0.5].
 
 	extrwi. r6, r3, 11, 1		! r6 = IEEE exponent
-	addi r5, r6, -1022		! r5 = our exponent
-	beq 2f				! jump if zero or denormalized
+	beq 3f				! jump if zero or denormalized
 	cmpwi r6, 2047
-	beq 1f				! jump if infinity or NaN
+	addi r5, r6, -1022		! r5 = our exponent
+	beq 2f				! jump if infinity or NaN
 	! fall through if normalized
 
-	! Put fraction in [0.5, 1) or (-1, -0.5] by setting its
-	! IEEE exponent to 1022.
-	rlwinm r3, r3, 0, 12, 0		! clear old exponent
-	oris r3, r3, 1022 << 4		! set new exponent
+	! Put fraction in [0.5, 1) or (-1, -0.5].
+1:	li r6, 1022
+	insrwi r3, r6, 11, 1		! IEEE exponent = 1022
 	! fall through
 
-1:	stw r3, 0(sp)
+2:	stw r3, 0(sp)
 	stw r4, 4(sp)			! push fraction
 	stwu r5, -4(sp)			! push exponent
 	blr
 
-2:	! Got denormalized number or zero, probably zero.
-	extrwi r6, r3, 22, 12
+	! Got denormalized number or zero, probably zero.
+	! If zero, then exponent must also be zero.
+3:	extrwi r6, r3, 20, 12
 	or. r6, r6, r4			! r6 = high|low fraction
-	bne 3f				! jump if not zero
+	bne 4f				! jump if not zero
 	li r5, 0			! exponent = 0
-	b 1b
+	b 2b
 
-3:	! Got denormalized number, not zero.
-	lfd f0, 0(sp)
-	lis r6, ha16[_2_64]
-	lfd f1, lo16[_2_64](r6)
+	! Got denormalized number = 0.fraction * 2**-1022
+4:	lfd f0, 0(sp)
+	lis r6, ha16[.fs_2_64]
+	lfs f1, lo16[.fs_2_64](r6)
 	fmul f0, f0, f1			! multiply it by 2**64
 	stfd f0, 0(sp)
 	lwz r3, 0(sp)
@@ -57,7 +56,6 @@
 	b 1b
 
 .sect .rom
-_2_64:
-	! (double) 2**64
-	.data4 0x43f00000
-	.data4 0x00000000
+.fs_2_64:
+	!float 1.84467440737095516e+19 sz 4
+	.data1 0137,0200,00,00
diff --git a/mach/powerpc/libem/fif4.s b/mach/powerpc/libem/fif4.s
new file mode 100644
index 000000000..fc29b178c
--- /dev/null
+++ b/mach/powerpc/libem/fif4.s
@@ -0,0 +1,64 @@
+.sect .text
+
+! Multiplies two single-precision floats, then splits the product into
+! fraction and integer, both as floats, like modff(3) in C,
+! http://en.cppreference.com/w/c/numeric/math/modf
+!
+! Stack: ( a b -- fraction integer )
+
+.define .fif4
+.fif4:
+	lfs f1, 4(sp)
+	lfs f2, 0(sp)
+	fmuls f1, f1, f2		! f1 = a * b
+	stfs f1, 0(sp)
+	lwz r3, 0(sp)			! r3 = word of float bits
+
+	! IEEE single = sign * 1.fraction * 2**(exponent - 127)
+	!   sign  exponent  fraction
+	!   0     1..8      9..31
+	!
+	! Subtract 127 from the IEEE exponent.  If the result is from
+	! 0 to 23, then the IEEE fraction has that many integer bits.
+
+	extrwi r5, r3, 8, 1		! r5 = IEEE exponent
+	addic. r5, r5, -127		! r5 = nr of integer bits
+	blt 3f				! branch if no integer
+	cmpwi r5, 24
+	bge 4f				! branch if no fraction
+	! fall through if integer with fraction
+
+	! f1 has r5 = 0 to 23 integer bits in the IEEE fraction.
+	! There are 23 - r5 fraction bits.
+	li r6, 23
+	subf r6, r5, r6
+	srw r3, r3, r6
+	slw r3, r3, r6			! clear fraction in word
+	! fall through
+
+1:	stw r3, 0(sp)
+	lfs f2, 0(sp)			! integer = word of float bits
+	fsubs f1, f1, f2		! fraction = value - integer
+2:	stfs f1, 4(sp)			! push fraction
+	stfs f2, 0(sp)			! push integer
+	blr
+
+	! f1 is a fraction without integer (or zero).
+	! Then integer is zero with same sign.
+3:	extlwi r3, r3, 1, 0		! extract sign bit
+	stfs f1, 4(sp)			! push fraction
+	stw r3, 0(sp)			! push integer = zero with sign
+	blr
+
+	! f1 is an integer without fraction (or infinity or NaN).
+	! Unless NaN, then fraction is zero with same sign.
+4:	fcmpu cr0, f1, f1
+	bun cr0, 5f
+	extlwi r3, r3, 1, 0		! extract sign bit
+	stw r3, 4(sp)			! push fraction = zero with sign
+	stfs f1, 0(sp)			! push integer
+	blr
+
+	! f1 is NaN, so both fraction and integer are NaN.
+5:	fmr f2, f1
+	b 2b
diff --git a/mach/powerpc/libem/fif8.s b/mach/powerpc/libem/fif8.s
index bce4f8d24..f93a39ac2 100644
--- a/mach/powerpc/libem/fif8.s
+++ b/mach/powerpc/libem/fif8.s
@@ -1,7 +1,8 @@
 .sect .text
 
 ! Multiplies two double-precision floats, then splits the product into
-! fraction and integer, like modf(3) in C.  On entry:
+! fraction and integer, both as floats, like modf(3) in C,
+! http://en.cppreference.com/w/c/numeric/math/modf
 !
 ! Stack: ( a b -- fraction integer )
 
@@ -14,20 +15,18 @@
 	lwz r3, 0(sp)			! r3 = high word
 	lwz r4, 4(sp)			! r4 = low word
 
-	! IEEE double-precision format:
+	! IEEE double = sign * 1.fraction * 2**(exponent - 1023)
 	!   sign  exponent  fraction
 	!   0     1..11     12..63
 	!
 	! Subtract 1023 from the IEEE exponent.  If the result is from
 	! 0 to 51, then the IEEE fraction has that many integer bits.
-	! (IEEE has an implicit 1 before its fraction.  If the IEEE
-	! fraction has 0 integer bits, we still have an integer.)
 
 	extrwi r5, r3, 11, 1		! r5 = IEEE exponent
 	addic. r5, r5, -1023		! r5 = nr of integer bits
-	blt 4f				! branch if no integer
+	blt 3f				! branch if no integer
 	cmpwi r5, 52
-	bge 5f				! branch if no fraction
+	bge 4f				! branch if no fraction
 	cmpwi r5, 21
 	bge 6f				! branch if large integer
 	! fall through if small integer
@@ -44,22 +43,38 @@
 1:	stw r3, 0(sp)
 	stw r4, 4(sp)
 	lfd f2, 0(sp)			! integer = high word, low word
-2:	fsub f1, f1, f2			! fraction = value - integer
-3:	stfd f1, 8(sp)			! push fraction
+	fsub f1, f1, f2			! fraction = value - integer
+2:	stfd f1, 8(sp)			! push fraction
 	stfd f2, 0(sp)			! push integer
 	blr
 
-4:	! f1 is a fraction without integer.
-	fsub f2, f1, f1			! integer = zero
-	b 3b
+	! f1 is a fraction without integer (or zero).
+	! Then integer is zero with same sign.
+3:	extlwi r3, r3, 1, 0		! extract sign bit
+	li r4, 0
+	stfd f1, 8(sp)			! push fraction
+	stw r4, 4(sp)
+	stw r3, 0(sp)			! push integer = zero with sign
+	blr
 
-5:	! f1 is an integer without fraction (or infinity or NaN).
-	fmr f2, f1			! integer = f1
+	! f1 is an integer without fraction (or infinity or NaN).
+	! Unless NaN, then fraction is zero with same sign.
+4:	fcmpu cr0, f1, f1		! unordered if f1 is NaN
+	bun cr0, 5f
+	extlwi r3, r3, 1, 0		! extract sign bit
+	li r4, 0
+	stw r4, 12(sp)
+	stw r3, 8(sp)			! push fraction = zero with sign
+	stfd f1, 0(sp)			! push integer
+	blr
+
+	! f1 is NaN, so both fraction and integer are NaN.
+5:	fmr f2, f1
 	b 2b
 
-6:	! f1 has r5 = 21 to 51 to integer bits.
+	! f1 has r5 = 21 to 51 integer bits.
 	! Low word has 52 - r5 fraction bits.
-	li r6, 52
+6:	li r6, 52
 	subf r6, r5, r6
 	srw r4, r4, r6
 	slw r4, r4, r6			! clear fraction in low word
diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index df06a5d49..1ea0b60ec 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -310,7 +310,7 @@ INSTRUCTIONS
   fadds           FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 5).
   fcmpo           CR:wo, FREG:ro, FREG:ro cost(4, 5).
   fcmpo           CR:wo, FSREG:ro, FSREG:ro cost(4, 5).
-  fctiwz          FREG:wo, FREG:ro.
+  fctiwz          FREG:wo, FREG:ro cost(4, 5).
   fdiv            FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 35).
   fdivs           FSREG+LOCAL:wo, FSREG:ro, FSREG:ro cost(4, 21).
   fmr             FPR:wo, FPR:ro cost(4, 5).
@@ -2329,10 +2329,20 @@ PATTERNS
 		with FSREG
 			gen fneg {LOCAL, $2}, %1
 
+	/* When a or b is NaN, then a < b, a <= b, a > b, a >= b
+	 * should all be false.  We can't make them false, because
+	 *  - EM's _cmf_ is only for ordered comparisons.
+	 *  - The peephole optimizer assumes (a < b) == !(a >= b).
+	 *
+	 * We do make a == b false and a != b true, by checking the
+	 * eq (equal) bit or un (unordered) bit in cr0.
+	 */
+
 	pat cmf $1==4                      /* Compare single */
 		with FSREG FSREG
 			uses REG={COND_FS, %2, %1}
-			gen extlwi %a, %a, {C, 2}, {C, 0}
+			/* Extract lt, gt, un; put lt in sign bit. */
+			gen andisX %a, %a, {C, 0xd000}
 			yields %a
 
 	pat cmf teq $1==4                  /* Single second == top */
@@ -2367,7 +2377,6 @@ PATTERNS
 
 	proc cmf4zxx example cmf zeq
 		with FSREG FSREG STACK
-			uses REG
 			gen
 				fcmpo cr0, %2, %1
 				bxx* {LABEL, $2}
@@ -2420,6 +2429,13 @@ PATTERNS
 			loc 4
 			cff
 
+	pat fef $1==4                      /* Split fraction, exponent */
+		leaving cal ".fef4"
+
+	/* Multiply two singles, then split fraction, integer */
+	pat fif $1==4
+		leaving cal ".fif4"
+
 
 /* Double-precision floating-point */
 
@@ -2471,10 +2487,13 @@ PATTERNS
 		with FREG
 			gen fneg {DLOCAL, $2}, %1
 
+	/* To compare NaN, see comment above pat cmf $1==4 */
+
 	pat cmf $1==8                      /* Compare double */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
-			gen extlwi %a, %a, {C, 2}, {C, 0}
+			/* Extract lt, gt, un; put lt in sign bit. */
+			gen andisX %a, %a, {C, 0xd000}
 			yields %a
 
 	pat cmf teq $1==8                  /* Double second == top */
@@ -2482,7 +2501,7 @@ PATTERNS
 			uses REG={COND_FD, %2, %1}
 			yields {XEQ, %a}
 
-	pat cmf tne $1==8                  /* Single second == top */
+	pat cmf tne $1==8                  /* Double second != top */
 		with FREG FREG
 			uses REG={COND_FD, %2, %1}
 			yields {XNE, %a}
@@ -2509,7 +2528,6 @@ PATTERNS
 
 	proc cmf8zxx example cmf zeq
 		with FREG FREG STACK
-			uses REG
 			gen
 				fcmpo cr0, %2, %1
 				bxx* {LABEL, $2}

From c6ceaac1afd70ea418c64be83de2784d3a1b488b Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Tue, 23 Jan 2018 13:55:39 -0500
Subject: [PATCH 35/55] Make osx386 and osxppc use _hol0.s like the other
 platforms.

Because I'm lazy, I didn't make another copy of _hol0.s; I am building
plat/linux/libsys/_hol0.s for OS X.
---
 plat/osx386/boot.s           | 2 --
 plat/osx386/libsys/build.lua | 1 +
 plat/osxppc/boot.s           | 2 --
 plat/osxppc/libsys/build.lua | 1 +
 4 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/plat/osx386/boot.s b/plat/osx386/boot.s
index 932a716e9..c10045dd6 100644
--- a/plat/osx386/boot.s
+++ b/plat/osx386/boot.s
@@ -58,8 +58,6 @@ begdata:
 
 .sect .bss
 begbss:
-.define hol0
-.comm hol0, 8                ! line number and filename (for debugging)
 
 .define _errno
 .comm _errno, 4              ! Posix errno storage
diff --git a/plat/osx386/libsys/build.lua b/plat/osx386/libsys/build.lua
index 6a5b0e58c..3c2e96c3a 100644
--- a/plat/osx386/libsys/build.lua
+++ b/plat/osx386/libsys/build.lua
@@ -19,6 +19,7 @@ acklibrary {
 		"./sigaction.s",
 		"./stat.s",
 		"./write.s",
+		"plat/linux/libsys/_hol0.s",
 		"plat/linux386/libsys/trapno.s",
 		"plat/osx/libsys/brk.c",
 		"plat/osx/libsys/creat.c",
diff --git a/plat/osxppc/boot.s b/plat/osxppc/boot.s
index 1517ee377..8b1b7ab75 100644
--- a/plat/osxppc/boot.s
+++ b/plat/osxppc/boot.s
@@ -49,8 +49,6 @@ begdata:
 
 .sect .bss
 begbss:
-.define hol0
-.comm hol0, 8                ! line number and filename (for debugging)
 
 .define _errno
 .comm _errno, 4              ! Posix errno storage
diff --git a/plat/osxppc/libsys/build.lua b/plat/osxppc/libsys/build.lua
index 49fc0c934..cff10f29b 100644
--- a/plat/osxppc/libsys/build.lua
+++ b/plat/osxppc/libsys/build.lua
@@ -19,6 +19,7 @@ acklibrary {
 		"./sigaction.s",
 		"./stat.s",
 		"./write.s",
+		"plat/linux/libsys/_hol0.s",
 		"plat/osx/libsys/brk.c",
 		"plat/osx/libsys/creat.c",
 		"plat/osx/libsys/isatty.c",

From e3672bd66e4242ac5592295576717d568d819c4e Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Tue, 23 Jan 2018 18:18:40 -0500
Subject: [PATCH 36/55] Allow sp and fp on the fake stack.

This simplifies parts of the PowerPC table and causes ncg to better
decide whether to push sp or fp to the real stack, or coerce it to
REG3, or coerce it to REG-REG3, or move it to a regvar.  These better
decisions remove extra _mr_ instructions.

The idea comes from mach/arm/ncg/table, where SP has a property
STACKPOINTER and LB has LOCALBASE.  I don't need two properties, so I
make one property SPFP for both registers.
---
 mach/powerpc/ncg/table | 49 ++++++++++++------------------------------
 1 file changed, 14 insertions(+), 35 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 1ea0b60ec..a5ec75fc0 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -31,6 +31,7 @@ PC_OFFSET = 4   /* Offset of saved PC relative to our FP */
 PROPERTIES
 
 	GPR             /* general-purpose register */
+	SPFP            /* sp or fp */
 	REG             /* allocatable GPR */
 	REG3            /* coercion to r3 */
 
@@ -51,7 +52,8 @@ REGISTERS
 	 *   r13, r14, ..., r31: GPR, REG regvar(reg_any).
 	 */
 
-	r0, sp, fp, r12                   : GPR.
+	r0, r12                           : GPR.
+	sp, fp                            : GPR, SPFP.
 	r3                                : GPR, REG, REG3.
 	r4, r5, r6, r7, r8, r9, r10, r11  : GPR, REG.
 
@@ -238,7 +240,7 @@ SETS
 	MEMORY          = IND_V + FRAME_V.
 
 	/* any integer from stack that we can easily move to GPR */
-	INT_W   = REG + CONST_STACK + SEX_B + SEX_H +
+	INT_W   = SPFP + REG + CONST_STACK + SEX_B + SEX_H +
 	          SUM_RIS + SUM_RC + SUM_RL + SUM_RR +
 	          SUB_RR + NEG_R + MUL_RR + DIV_RR + DIV_RR_U +
 	          IND_ALL_B + IND_ALL_H + IND_ALL_W +
@@ -717,19 +719,14 @@ TESTS
 
 STACKINGRULES
 
-	/* We don't allow GPR-REG on the stack.  The intent is to ban
-	 * r0 from the stack, but this also bans fp from the stack.
-	 * This is odd because most other tables for ncg allow the
-	 * frame pointer on the stack.
-	 */
-	from REG to STACK
+	from SPFP+REG to STACK
 		gen
-			COMMENT("stack REG")
+			COMMENT("stack SPFP+REG")
 			stwu %1, {IND_RC_W, sp, 0-4}
 
-	from INT_W-REG to STACK
+	from INT_W-SPFP-REG to STACK
 		gen
-			COMMENT("stack INT_W-REG")
+			COMMENT("stack INT_W-SPFP-REG")
 			move %1, RSCRATCH
 			stwu RSCRATCH, {IND_RC_W, sp, 0-4}
 
@@ -1146,26 +1143,15 @@ PATTERNS
 			uses REG={LXFRAME, $1}
 			gen move %1, {FRAME_D, $1, %a, $2, 8}
 
-	/* Programs use "lxl cal" to pass the static chain and call a
-	 * nested procedure.  This must push a token LXFRAME or the
-	 * register fp to the real stack. */
-
-	/* Local base of procedure on static chain */
-	pat lxl nicelx($1)
+	pat lxl nicelx($1)                 /* Local base on static chain */
 		uses REG={LXFRAME, $1}
 		yields %a  /* Can't yield LXFRAME. */
 	pat lxl stl nicelx($1) && inreg($2)==reg_any
 		kills regvar($2)
 		gen move {LXFRAME, $1}, {REG_EXPR, regvar($2)}
 
-	pat lxl cal $1==0  /* Pass our local base to procedure */
-		with STACK
-			gen stwu fp, {IND_RC_W, sp, 0-4}
-			leaving cal $2
-
 	pat lxl $1==0                      /* Our local base */
-		uses REG=fp
-		yields %a  /* Can't yield fp. */
+		yields fp
 
 	pat lxa $1==0                      /* Our argument base */
 		yields {SUM_RC, fp, EM_BSIZE}
@@ -2134,7 +2120,8 @@ PATTERNS
 			bls
 
 	pat bls                            /* Block move variable length */
-		with REG REG REG
+		with REG SPFP+REG SPFP+REG
+			/* allows sp as %2, %3 */
 			/* ( src%3 dst%2 len%1 -- ) */
 			uses reusing %1, REG, REG, REG
 			gen
@@ -2230,22 +2217,14 @@ PATTERNS
 
 	pat lor $1==1                      /* Load stack pointer */
 		with STACK
-			uses REG=sp
-			yields %a  /* Can't yield sp. */
+			yields sp
 
 	/* Next few patterns for "lor 1" appear in
 	 * lang/m2/libm2/par_misc.e
 	 */
-	pat lor lor $1==1 && $2==1         /* Load sp twice */
-		with STACK
-			gen stwu sp, {IND_RC_W, sp, 0-4}
-			leaving lor 1
-
 	pat lor adp $1==1 && smalls($2)    /* sp + constant */
 		with STACK
-			uses REG
-			gen addi %a, sp, {C, $2}
-			yields %a
+			yields {SUM_RC, sp, $2}
 
 	/* Subtract stack pointer by doing %1 - (sp - 4)
 	 * because sp - 4 would point to %1.

From e83aaca3ec61f214bb48b6e3122c59bac934b0bf Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Wed, 24 Jan 2018 15:17:32 -0500
Subject: [PATCH 37/55] Add some comments before I forget how this stuff works.

---
 mach/powerpc/libem/inn.s |  3 +++
 mach/powerpc/libem/rck.s |  3 +++
 mach/powerpc/libem/set.s |  3 +++
 mach/powerpc/ncg/table   | 48 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/mach/powerpc/libem/inn.s b/mach/powerpc/libem/inn.s
index 8925e776e..32275c117 100644
--- a/mach/powerpc/libem/inn.s
+++ b/mach/powerpc/libem/inn.s
@@ -5,6 +5,9 @@
 /* Tests a bit in a bitset on the stack.
  *
  * Stack: ( bitset bitnum setsize -- bool )
+ *
+ * Some back ends push false if bitnum is too large.  We don't because
+ * the compilers tend to pass a small enough bitnum.
  */
 
 .define .inn
diff --git a/mach/powerpc/libem/rck.s b/mach/powerpc/libem/rck.s
index f1cf7f848..1d07d5711 100644
--- a/mach/powerpc/libem/rck.s
+++ b/mach/powerpc/libem/rck.s
@@ -2,6 +2,9 @@
 
 ! Bounds check. Traps if the value is out of range.
 !  Stack: ( value descriptor -- value )
+!
+! This ".rck" only works with 4-byte integers.  The name is ".rck" and
+! not ".rck4" because many back ends only do rck with the word size.
 
 .define .rck
 .rck:
diff --git a/mach/powerpc/libem/set.s b/mach/powerpc/libem/set.s
index 3c4a9e579..8faf84a09 100644
--- a/mach/powerpc/libem/set.s
+++ b/mach/powerpc/libem/set.s
@@ -2,6 +2,9 @@
 
 ! Create singleton set.
 !  Stack: ( bitnumber size -- set )
+!
+! Some back ends trap ESET if bitnumber is out of range.  We don't
+! because the compilers tend to pass a valid bitnumber.
 
 .define .set
 .set:
diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index a5ec75fc0..efdb681db 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -1,3 +1,32 @@
+/*
+ * PowerPC table for ncg
+ *
+ * David Given created this table.
+ * George Koehler made many changes in years 2016 to 2018.
+ *
+ * This back end provides 4-byte integers, 4-byte floats, and 8-byte
+ * floats.  It should provide enough of EM for the ACK's compilers.
+ *  - It doesn't provide "mon" (monitor call) nor "lor 2", "str 2"
+ *    (heap pointer).  Programs should call procedures in libsys to
+ *    make system calls or allocate heap memory.
+ *  - It generates only a few EM traps:
+ *     - EARRAY from aar, lar, sar
+ *     - ERANGE from rck
+ *     - ECASE from csa, csb
+ *  - It uses floating-point registers to move 8-byte values that
+ *    aren't floats.  This might cause extra FPU context switches in
+ *    programs that don't use floating point.
+ *
+ * The EM stack is less than optimal for PowerPC, and incompatible
+ * with the calling conventions of other compilers (like gcc).
+ *  - EM and ncg use the stack to pass parameters to procedures.  For
+ *    PowerPC, this is probably slower than passing them in registers.
+ *  - This back end misaligns some 8-byte floats, because EM's stack
+ *    has only 4-byte alignment.  (This kind of misalignment also
+ *    happened in IBM's AIX and Apple's Mac OS, where data structures
+ *    had 8-byte floats with only 4-byte alignment.)
+ */
+
 EM_WSIZE = 4
 EM_PSIZE = 4
 EM_BSIZE = 8    /* two words saved in call frame */
@@ -46,6 +75,15 @@ PROPERTIES
 REGISTERS
 
 	/*
+	 * We use r1 as stack pointer and r2 as frame pointer.
+	 * Our assembler has aliases sp -> r1 and fp -> r2.
+	 *
+	 * We preserve r13 to r31 and f14 to f31 across function
+	 * calls to mimic other compilers (like gcc).  See
+	 *  - http://refspecs.linuxbase.org/elf/elfspec_ppc.pdf
+	 *  - https://github.com/ryanarn/powerabi -> chap3-elf32abi.sgml
+	 *  - Apple's "32-bit PowerPC Function Calling Conventions"
+	 *
 	 * When ncg allocates regvars, it seems to start with the last
 	 * register in the first class.  To encourage ncg to allocate
 	 * them from r31 down, we list them in one class as
@@ -85,7 +123,7 @@ REGISTERS
 	  : FSREG regvar(reg_float).
 
 	lr, ctr     : SPR.
-	cr0         : CR.
+	cr0         : CR.   /* We use cr0, ignore cr1 to cr7. */
 
 	/* The stacking rules can't allocate registers.  We use these
 	 * scratch registers to stack tokens.
@@ -1405,6 +1443,10 @@ PATTERNS
 
 /* Word arithmetic */
 
+	/* Like most back ends, this one doesn't trap EIOVFL, so it
+	 * ignores overflow in signed integers.
+	 */
+
 	pat adi $1==4                      /* Add word (second + top) */
 		with REG REG
 			yields {SUM_RR, %1, %2}
@@ -1468,6 +1510,10 @@ PATTERNS
 
 /* Bitwise logic */
 
+	/* This back end doesn't know how to combine shifts and
+	 * bitwise ops to emit rlwinm, rlwnm, or rlwimi instructions.
+	 */
+
 	pat and $1==4                      /* AND word */
 		with REG NOT_R
 			yields {ANDC_RR, %1, %2.reg}

From 7c9c4f82fdda6dc02ce0ff24da958060d6a8f75a Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Fri, 26 Jan 2018 20:08:03 -0500
Subject: [PATCH 38/55] Get `ack -mosxppc -g` to partly work with gdb.

Copy and adapt code from mach/{i386,m68020}/ncg/mach.c to pass the
debugging stabs from EM to assembly.  The next tools (as, led, cv)
already know how to put the stabs in the Mach-o executable.

Modify the function prolog/prologue so gdb uses fp, not sp, for N_LSYM
and N_PSYM stabs.  Simplify prolog() by reducing differences between
stabs and no stabs, and zero and nonzero framesize.  For files without
stabs, the new prolog has the same number of instructions and memory
accesses as the old prolog, and seems to run at about the same speed on my
PowerPC Mac.

This is enough to see some info for global and local variables in gdb
for Mac OS X.  I still can't get a backtrace; gdb gets confused
because EM and ncg don't link 0(sp) to the previous stack frame.

I don't expect `ack -mlinuxppc -g` to work with gdb for Linux, because
we prepend underscores to the symbol table, which is correct for
Mach-o but wrong for ELF.
---
 mach/powerpc/ncg/mach.c | 88 ++++++++++++++++++++++++++++++-----------
 1 file changed, 66 insertions(+), 22 deletions(-)

diff --git a/mach/powerpc/ncg/mach.c b/mach/powerpc/ncg/mach.c
index 06e39709f..1a1d98d6c 100644
--- a/mach/powerpc/ncg/mach.c
+++ b/mach/powerpc/ncg/mach.c
@@ -10,8 +10,13 @@
 
 #include <limits.h>
 #include <stdint.h>
+#include <stb.h>
 
+static int writing_stabs = 0;
+
+#ifdef REGVARS
 static long framesize;
+#endif
 
 void
 con_part(int sz, word w)
@@ -51,39 +56,42 @@ con_mult(word sz)
 #define FL_MSB_AT_LOW_ADDRESS	1
 #include <con_float>
 
-static void
-emit_prolog(void)
-{
-	fprintf(codefile, "mfspr r0, lr\n");
-	if (framesize) {
-		fprintf(codefile, "addi sp, sp, %ld\n", -framesize - 8);
-		fprintf(codefile, "stw fp, %ld(sp)\n", framesize);
-		fprintf(codefile, "stw r0, %ld(sp)\n", framesize + 4);
-		fprintf(codefile, "addi fp, sp, %ld\n", framesize);
-	} else {
-		/* optimize for framesize == 0 */
-		fprintf(codefile, "stwu fp, -8(sp)\n");
-		fprintf(codefile, "stw r0, 4(sp)\n");
-		fprintf(codefile, "mr fp, sp\n");
-	}
-}
-
 void
 prolog(full nlocals)
 {
-	framesize = nlocals;
+	/*
+	 * For N_LSYM and N_PSYM stabs, we want gdb to use fp, not sp.
+	 * The trick is to use "stwu sp, _(sp)" then "addi fp, sp, 0"
+	 * before we save lr with "stw r0, _(sp)".
+	 *
+	 * Tried with Apple's gdb-696.  Refer to
+	 *  - gdb-696/src/gdb/rs6000-tdep.c, skip_prologue(), line 1101
+	 *  - gdb-696/src/gdb/macosx/ppc-macosx-frameinfo.c,
+	 *    ppc_parse_instructions(), line 717
+	 * https://opensource.apple.com/release/developer-tools-25.html
+	 */
+	fprintf(codefile, "mfspr r0, lr\n");
+	if (writing_stabs) {
+		fprintf(codefile, "stwu sp, -8(sp)\n");  /* for gdb */
+		fprintf(codefile, "stw fp, 0(sp)\n");
+	} else
+		fprintf(codefile, "stwu fp, -8(sp)\n");
+	fprintf(codefile, "addi fp, sp, 0\n");           /* for gdb */
+	fprintf(codefile, "stw r0, 4(sp)\n");
 
 #ifdef REGVARS
-	/* f_regsave() will call emit_prolog() */
+	framesize = nlocals;
+	/* regsave() increases framesize; f_regsave() adjusts sp. */
 #else
-	emit_prolog();
+	if (nlocals)
+		fprintf(codefile, "addi sp, sp, %ld\n", -nlocals);
 #endif
 }
 
 void
 mes(word type)
 {
-	int argt ;
+	int argt, a1, a2 ;
 
 	switch ( (int)type ) {
 	case ms_ext :
@@ -98,6 +106,41 @@ mes(word type)
 				break ;
 			}
 		}
+	case ms_stb:
+		argt = getarg(str_ptyp | cst_ptyp);
+		if (argt == sp_cstx)
+			fputs(".symb \"\", ", codefile);
+		else {
+			fprintf(codefile, ".symb \"%s\", ", str);
+			argt = getarg(cst_ptyp);
+		}
+		a1 = argval;
+		argt = getarg(cst_ptyp);
+		a2 = argval;
+		argt = getarg(cst_ptyp|nof_ptyp|sof_ptyp|ilb_ptyp|pro_ptyp);
+		if (a1 == N_PSYM) {
+			/* Change offset from AB into offset from
+			   the frame pointer.
+			*/
+			argval += 8;
+		}
+		fprintf(codefile, "%s, 0x%x, %d\n", strarg(argt), a1, a2);
+		argt = getarg(end_ptyp);
+		break;
+	case ms_std:
+		writing_stabs = 1;  /* set by first "mes 13,...,100,0" */
+		argt = getarg(str_ptyp | cst_ptyp);
+		if (argt == sp_cstx)
+			str[0] = '\0';
+		else {
+			argt = getarg(cst_ptyp);
+		}
+		swtxt();
+		fprintf(codefile, ".symd \"%s\", 0x%x,", str, (int) argval);
+		argt = getarg(cst_ptyp);
+		fprintf(codefile, "%d\n", (int) argval);
+		argt = getarg(end_ptyp);
+		break;
 	default :
 		while ( getarg(any_ptyp) != sp_cend ) ;
 		break ;
@@ -239,7 +282,8 @@ f_regsave(void)
 {
 	int reg;
 
-	emit_prolog();
+	if (framesize)
+		fprintf(codefile, "addi sp, sp, %ld\n", -framesize);
 	saveloadregs("stw", "stmw", "stfd");
 
 	/*

From 3dae9e49ccd6b753f412995d873a6760c38fea1a Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Sat, 27 Jan 2018 15:33:43 -0500
Subject: [PATCH 39/55] Use subfic (val - reg) and mulli (reg * val).

In the instruction list, put /* kills xer */ for sraw, srawi, subfic;
and correct the (now unused) "add." and "lfdu".

Change MACHOPT_F from -m3 to -m2.  This changes the code for 15 * i
from

    slwi r3,r4,4
    subfic r5,r4,0
    add r3,r3,r5

to

    mulli r3,r4,15

If the sequence "slwi subfic add" takes 3 cycles and 12 bytes, and
mulli takes 3 cycles and 4 bytes, then mulli is better.
---
 mach/powerpc/ncg/table | 30 ++++++++++++++++++++++++------
 plat/linuxppc/descr    |  2 +-
 plat/osxppc/descr      |  2 +-
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index efdb681db..82cada71a 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -170,8 +170,10 @@ TOKENS
 	SUM_RL      = { GPR reg; ADDR adr; }   4.   /* reg + lo16[adr] */
 	SUM_RR      = { GPR reg1; GPR reg2; }  4.   /* reg1 + reg2 */
 
+	SUB_CR      = { INT val; GPR reg; }    4.   /* val - reg */
 	SUB_RR      = { GPR reg1; GPR reg2; }  4.   /* reg1 - reg2 */
 	NEG_R       = { GPR reg; }             4.   /* -reg */
+	MUL_RC      = { GPR reg; INT val; }    4.   /* reg * val */
 	MUL_RR      = { GPR reg1; GPR reg2; }  4.   /* reg1 * reg2 */
 	DIV_RR      = { GPR reg1; GPR reg2; }  4.   /* reg1 / reg2 signed */
 	DIV_RR_U    = { GPR reg1; GPR reg2; }  4.   /* reg1 / reg2 unsigned */
@@ -280,9 +282,10 @@ SETS
 	/* any integer from stack that we can easily move to GPR */
 	INT_W   = SPFP + REG + CONST_STACK + SEX_B + SEX_H +
 	          SUM_RIS + SUM_RC + SUM_RL + SUM_RR +
-	          SUB_RR + NEG_R + MUL_RR + DIV_RR + DIV_RR_U +
+	          SUB_CR + SUB_RR + NEG_R +
+	          MUL_RC + MUL_RR + DIV_RR + DIV_RR_U +
 	          IND_ALL_B + IND_ALL_H + IND_ALL_W +
-		  FRAME_B + FRAME_H + FRAME_H_S + FRAME_W +
+	          FRAME_B + FRAME_H + FRAME_H_S + FRAME_W +
 	          NOT_R + AND_RIS + AND_RC + AND_RR + ANDC_RR +
 	          OR_RIS + OR_RC + OR_RR + ORC_RR +
 	          XOR_RIS + XOR_RC + XOR_RR + NAND_RR + NOR_RR + EQV_RR +
@@ -307,7 +310,7 @@ INSTRUCTIONS
   cost(4, 1) /* space, time */
 
   add             GPR:wo, GPR:ro, GPR:ro.
-  addX "add."     GPR:wo, GPR:ro, GPR:ro.
+  addX "add."     GPR:wo:cc, GPR:ro, GPR:ro.
   addi            GPR:wo, GPR:ro, CONST+LABEL_LO:ro.
     li            GPR:wo, CONST:ro.
   addis           GPR:wo, GPR:ro, CONST+LABEL_HI+LABEL_HA:ro.
@@ -365,7 +368,7 @@ INSTRUCTIONS
   lbz             GPR:wo, SET_RC_B:ro cost(4, 3).
   lbzx            GPR:wo, GPR:ro, GPR:ro cost(4, 3).
   lfd             FPR+DLOCAL:wo, SET_RC_D:ro cost(4, 5).
-  lfdu            FPR:wo, IND_RC_D:ro cost(4, 5).
+  lfdu            FPR:wo, IND_RC_D:rw cost(4, 5).
   lfdx            FPR:wo, GPR:ro, GPR:ro cost(4, 5).
   lfs             FSREG+LOCAL:wo, SET_RC_W:ro cost(4, 4).
   lfsu            FSREG:wo, IND_RC_W:rw cost(4, 4).
@@ -380,6 +383,7 @@ INSTRUCTIONS
   mfcr            GPR:wo cost(4,2).
   mfspr           GPR:wo, SPR:ro cost(4, 3).
   mtspr           SPR:wo, GPR:ro cost(4, 2).
+  mulli           GPR:wo, GPR:ro, CONST:ro cost(4, 3).
   mullw           GPR:wo, GPR:ro, GPR:ro cost(4, 4).
   nand            GPR:wo, GPR:ro, GPR:ro.
   neg             GPR:wo, GPR:ro.
@@ -401,8 +405,8 @@ INSTRUCTIONS
   rlwnm           GPR:wo, GPR:ro, GPR:ro, CONST:ro, CONST:ro.
     rotlw         GPR+LOCAL:wo, GPR:ro, GPR:ro.
   slw             GPR+LOCAL:wo, GPR:ro, GPR:ro.
-  sraw            GPR+LOCAL:wo, GPR:ro, GPR:ro cost(4, 2).
-  srawi           GPR+LOCAL:wo, GPR:ro, CONST:ro cost(4, 2).
+  sraw            GPR+LOCAL:wo, GPR:ro, GPR:ro /* kills xer */ cost(4, 2).
+  srawi           GPR+LOCAL:wo, GPR:ro, CONST:ro /* kills xer */ cost(4, 2).
   srw             GPR+LOCAL:wo, GPR:ro, GPR:ro.
   stb             GPR:ro, SET_RC_B:rw cost(4, 3).
   stbx            GPR:ro, GPR:ro, GPR:ro cost(4, 3).
@@ -418,6 +422,7 @@ INSTRUCTIONS
   stwx            GPR:ro, GPR:ro, GPR:ro cost(4, 3).
   stwu            GPR:ro, IND_RC_W:rw cost(4, 3).
   subf            GPR:wo, GPR:ro, GPR:ro.
+  subfic          GPR:wo, GPR:ro, CONST:ro /* kills xer */.
   xor             GPR:wo, GPR:ro, GPR:ro.
   xori            GPR:wo, GPR:ro, CONST:ro.
   xoris           GPR:wo, GPR:ro, CONST:ro.
@@ -490,6 +495,10 @@ MOVES
 
 /* Other arithmetic */
 
+	from SUB_CR to GPR
+		/* val - reg -> subtract reg from val */
+		gen subfic %2, %1.reg, {C, %1.val}
+
 	from SUB_RR to GPR
 		/* reg1 - reg2 -> subtract reg2 from reg1 */
 		gen subf %2, %1.reg2, %1.reg1
@@ -497,6 +506,9 @@ MOVES
 	from NEG_R to GPR
 		gen neg %2, %1.reg
 
+	from MUL_RC to GPR
+		gen mulli %2, %1.reg, {C, %1.val}
+
 	from MUL_RR to GPR
 		gen mullw %2, %1.reg1, %1.reg2
 
@@ -1471,6 +1483,8 @@ PATTERNS
 			yields {SUB_RR, %2, %1}
 		with CONST2_WHEN_NEG REG
 			yields {SUM_RC, %2, 0-%1.val}
+		with REG CONST2
+			yields {SUB_CR, %2.val, %1}
 		with CONST_HI_ZR REG
 			yields {SUM_RIS, %2, his(0-%1.val)}
 		with CONST_STACK-CONST2_WHEN_NEG-CONST_HI_ZR REG
@@ -1482,6 +1496,10 @@ PATTERNS
 			yields {NEG_R, %1}
 
 	pat mli $1==4                      /* Multiply word (second * top) */
+		with CONST2 REG
+			yields {MUL_RC, %2, %1.val}
+		with REG CONST2
+			yields {MUL_RC, %1, %2.val}
 		with REG REG
 			yields {MUL_RR, %2, %1}
 
diff --git a/plat/linuxppc/descr b/plat/linuxppc/descr
index 1bbb9fbd9..7f6f8fc02 100644
--- a/plat/linuxppc/descr
+++ b/plat/linuxppc/descr
@@ -19,7 +19,7 @@ var PLATFORM=linuxppc
 var PLATFORMDIR={EM}/share/ack/{PLATFORM}
 var CPP_F=-D__unix
 var ALIGN=-a0:4 -a1:4 -a2:4 -a3:4 -b0:0x10000054
-var MACHOPT_F=-m3
+var MACHOPT_F=-m2
 var EGO_PLAT_FLAGS=-M{EM}/share/ack/ego/{ARCH}.descr
 
 # Override the setting in fe so that files compiled for linuxppc can see
diff --git a/plat/osxppc/descr b/plat/osxppc/descr
index 5f416c44c..072a79dbc 100644
--- a/plat/osxppc/descr
+++ b/plat/osxppc/descr
@@ -19,7 +19,7 @@ var PLATFORM=osxppc
 var PLATFORMDIR={EM}/share/ack/{PLATFORM}
 var CPP_F=-D__unix
 var ALIGN=-a0:4 -a1:4 -a2:4096 -a3:4 -b0:0x129c
-var MACHOPT_F=-m3
+var MACHOPT_F=-m2
 var EGO_PLAT_FLAGS=-M{EM}/share/ack/ego/{ARCH}.descr
 
 # Override the setting in fe so that files compiled for osxppc can see

From cdde55535ef4417b0e08eeb6248e0b635fa0968a Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Sat, 27 Jan 2018 16:35:48 -0500
Subject: [PATCH 40/55] For osxppc, change size 8 to alignment 4.

You may need to delete and recompile some .o files!  This changes the
alignment of 8-byte values in C structs to match what Apple's gcc
does.  See Apple's "32-bit PowerPC Function Calling Conventions" at

    https://developer.apple.com
      /library/content/documentation/DeveloperTools/Conceptual/LowLevelABI
      /100-32-bit_PowerPC_Function_Calling_Conventions/32bitPowerPC.html
---
 plat/osxppc/descr | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/plat/osxppc/descr b/plat/osxppc/descr
index 072a79dbc..77fc45260 100644
--- a/plat/osxppc/descr
+++ b/plat/osxppc/descr
@@ -10,10 +10,11 @@ var l={w}
 var la={w}
 var f={w}
 var fa={w}
+# Size 8 has alignment 4 in Mac OS, 8 in Linux.
 var d=8
-var da={d}
+var da=4
 var x=8
-var xa={x}
+var xa=4
 var ARCH=powerpc
 var PLATFORM=osxppc
 var PLATFORMDIR={EM}/share/ack/{PLATFORM}

From b38fcdded3d6a258560f0ad4bbaf093b8a78a749 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Sat, 27 Jan 2018 20:09:16 -0500
Subject: [PATCH 41/55] Add tests for clearing BSS, copying C structs.

The new tests are bss_e.c, structcopy_e.c.  We do clear the BSS before
calling _m_a_i_n, so fix the comments in the other tests.
---
 tests/plat/_dummy_e.c                 |   2 +-
 tests/plat/bss_e.c                    |  27 ++++++
 tests/plat/bugs/bug-62-notvar_var_e.c |   2 +-
 tests/plat/build.lua                  |  18 ++--
 tests/plat/csa_e.c                    |   4 +-
 tests/plat/csb_e.c                    |   4 +-
 tests/plat/doublecmp_e.c              |   4 +-
 tests/plat/from_d_to_si_e.c           |   4 +-
 tests/plat/from_d_to_ui_e.c           |   4 +-
 tests/plat/from_si_to_d_e.c           |   4 +-
 tests/plat/from_ui_to_d_e.c           |   4 +-
 tests/plat/intadd_e.c                 |   4 +-
 tests/plat/intcmp_e.c                 |   4 +-
 tests/plat/intdiv_e.c                 |   4 +-
 tests/plat/intrem_e.c                 |   4 +-
 tests/plat/intshift_e.c               |   4 +-
 tests/plat/intsub_e.c                 |   4 +-
 tests/plat/structcopy_e.c             | 113 ++++++++++++++++++++++++++
 18 files changed, 173 insertions(+), 41 deletions(-)
 create mode 100644 tests/plat/bss_e.c
 create mode 100644 tests/plat/structcopy_e.c

diff --git a/tests/plat/_dummy_e.c b/tests/plat/_dummy_e.c
index 48104b5aa..39262eaaa 100644
--- a/tests/plat/_dummy_e.c
+++ b/tests/plat/_dummy_e.c
@@ -1,6 +1,6 @@
 #include "test.h"
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT(0 == 0);
diff --git a/tests/plat/bss_e.c b/tests/plat/bss_e.c
new file mode 100644
index 000000000..547e7f7f1
--- /dev/null
+++ b/tests/plat/bss_e.c
@@ -0,0 +1,27 @@
+#include "test.h"
+
+/*
+ * EM puts these variables in BSS.  Their initial values must be zero.
+ * Some platforms, like Linux, clear the BSS before they run the
+ * program.  For other platforms, like pc86, we clear the BSS in
+ * boot.s before we call _m_a_i_n.
+ */
+char c;
+int array[9000];
+short s;
+
+/* Bypasses the CRT, so there's no stdio. */
+void _m_a_i_n(void)
+{
+	int bad, i;
+
+	ASSERT(c == 0);
+	bad = 0;
+	for (i = 0; i < sizeof(array) / sizeof(array[0]); i++) {
+		if(array[i])
+			bad++;
+	}
+	ASSERT(bad == 0);
+	ASSERT(s == 0);
+	finished();
+}
diff --git a/tests/plat/bugs/bug-62-notvar_var_e.c b/tests/plat/bugs/bug-62-notvar_var_e.c
index d3813bb91..cde84eed1 100644
--- a/tests/plat/bugs/bug-62-notvar_var_e.c
+++ b/tests/plat/bugs/bug-62-notvar_var_e.c
@@ -40,7 +40,7 @@ void c(int i, int tru, int fal) {
   ASSERT((i != i) == fal);
 }
 
-/* Bypasses the CRT. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void) {
   a();
   b();
diff --git a/tests/plat/build.lua b/tests/plat/build.lua
index 42ca441d0..26676b0b1 100644
--- a/tests/plat/build.lua
+++ b/tests/plat/build.lua
@@ -9,22 +9,14 @@ definerule("plat_testsuite",
 		-- Remember this is executed from the caller's directory; local
 		-- target names will resolve there.
 		local testfiles = filenamesof(
+			-- added structcopy_e.c
 			"tests/plat/*.c",
-			"tests/plat/dup_e.e",
-			"tests/plat/exg_e.e",
-			"tests/plat/inn_e.e",
-			"tests/plat/rck_e.e",
-			"tests/plat/rotate_e.e",
+			"tests/plat/*.e",
 			"tests/plat/*.p",
 			"tests/plat/b/*.b",
-			"tests/plat/bugs/bug-22-inn_mod.mod",
-			"tests/plat/bugs/bug-62-notvar_var_e.c",
-			"tests/plat/m2/ConvTest_mod.mod",
-			"tests/plat/m2/NestProc_mod.mod",
-			"tests/plat/m2/OpenArray_mod.mod",
-			"tests/plat/m2/SemaTest_mod.mod",
-			"tests/plat/m2/Set100_mod.mod",
-			"tests/plat/m2/StringTest_mod.mod"
+			"tests/plat/bugs/*.c",
+			"tests/plat/bugs/*.mod",
+			"tests/plat/m2/*.mod"
 		)
 
 		acklibrary {
diff --git a/tests/plat/csa_e.c b/tests/plat/csa_e.c
index 355b75ee7..470fbebc5 100644
--- a/tests/plat/csa_e.c
+++ b/tests/plat/csa_e.c
@@ -11,7 +11,7 @@ int csa(int i)
     }
 }
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT(csa(0) == 0);
@@ -23,4 +23,4 @@ void _m_a_i_n(void)
     ASSERT(csa(6) == 0);
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/csb_e.c b/tests/plat/csb_e.c
index c86d31fa6..38ce05402 100644
--- a/tests/plat/csb_e.c
+++ b/tests/plat/csb_e.c
@@ -11,7 +11,7 @@ int csa(int i)
     }
 }
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT(csa(0) == 0);
@@ -23,4 +23,4 @@ void _m_a_i_n(void)
     ASSERT(csa(600) == 0);
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/doublecmp_e.c b/tests/plat/doublecmp_e.c
index f6c1582dc..b6fe5bbad 100644
--- a/tests/plat/doublecmp_e.c
+++ b/tests/plat/doublecmp_e.c
@@ -4,7 +4,7 @@
 double one = 1.0;
 double zero = 0.0;
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT(zero == zero);
@@ -17,4 +17,4 @@ void _m_a_i_n(void)
     ASSERT(one  >= one);
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/from_d_to_si_e.c b/tests/plat/from_d_to_si_e.c
index 7f51e6c5b..bc06c755c 100644
--- a/tests/plat/from_d_to_si_e.c
+++ b/tests/plat/from_d_to_si_e.c
@@ -8,7 +8,7 @@ double minusone = -1.0;
 double big = (double)INT_MAX;
 double minusbig = (double)INT_MIN;
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT((int)zero == 0);
@@ -18,4 +18,4 @@ void _m_a_i_n(void)
     ASSERT((int)minusbig == INT_MIN);
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/from_d_to_ui_e.c b/tests/plat/from_d_to_ui_e.c
index 811780b87..7d18ca9e5 100644
--- a/tests/plat/from_d_to_ui_e.c
+++ b/tests/plat/from_d_to_ui_e.c
@@ -6,7 +6,7 @@ double one = 1.0;
 double zero = 0.0;
 double big = (double)UINT_MAX;
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT((unsigned int)zero == 0);
@@ -14,4 +14,4 @@ void _m_a_i_n(void)
     ASSERT((unsigned int)big == UINT_MAX);
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/from_si_to_d_e.c b/tests/plat/from_si_to_d_e.c
index b6c7a25ba..172361dfa 100644
--- a/tests/plat/from_si_to_d_e.c
+++ b/tests/plat/from_si_to_d_e.c
@@ -8,7 +8,7 @@ int minusone = -1;
 int big = INT_MAX;
 int minusbig = INT_MIN;
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT((double)zero == 0.0);
@@ -18,4 +18,4 @@ void _m_a_i_n(void)
     /* ASSERT((double)minusbig == (double)INT_MIN); FIXME: fails for now */
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/from_ui_to_d_e.c b/tests/plat/from_ui_to_d_e.c
index b8e017c99..383d9afad 100644
--- a/tests/plat/from_ui_to_d_e.c
+++ b/tests/plat/from_ui_to_d_e.c
@@ -6,7 +6,7 @@ unsigned int one_u = 1;
 unsigned int zero_u = 0;
 unsigned int big_u = UINT_MAX;
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT((double)zero_u == 0.0);
@@ -14,4 +14,4 @@ void _m_a_i_n(void)
     ASSERT((double)big_u == (double)UINT_MAX);
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/intadd_e.c b/tests/plat/intadd_e.c
index 8e4868a62..94549814c 100644
--- a/tests/plat/intadd_e.c
+++ b/tests/plat/intadd_e.c
@@ -6,7 +6,7 @@ int one = 1;
 int zero = 0;
 int minusone = -1;
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT((two + one)      == 3);
@@ -28,4 +28,4 @@ void _m_a_i_n(void)
     ASSERT(((unsigned int)-1  + (unsigned int)two) == 1);
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/intcmp_e.c b/tests/plat/intcmp_e.c
index dd7f1da75..72cfc06b1 100644
--- a/tests/plat/intcmp_e.c
+++ b/tests/plat/intcmp_e.c
@@ -4,7 +4,7 @@
 int one = 1;
 int zero = 0;
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT(zero == zero);
@@ -62,4 +62,4 @@ void _m_a_i_n(void)
     ASSERT((unsigned int)1 >= (unsigned int)one);
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/intdiv_e.c b/tests/plat/intdiv_e.c
index c90964ced..cab76cdad 100644
--- a/tests/plat/intdiv_e.c
+++ b/tests/plat/intdiv_e.c
@@ -6,7 +6,7 @@ int two = 2;
 int one = 1;
 int zero = 0;
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT((three / two) == 1);
@@ -25,4 +25,4 @@ void _m_a_i_n(void)
     ASSERT((3 / -two) == -1);
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/intrem_e.c b/tests/plat/intrem_e.c
index 40f68d654..424152106 100644
--- a/tests/plat/intrem_e.c
+++ b/tests/plat/intrem_e.c
@@ -6,7 +6,7 @@ int two = 2;
 int one = 1;
 int zero = 0;
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT((three % two) == 1);
@@ -25,4 +25,4 @@ void _m_a_i_n(void)
     ASSERT((3 % -two) == 1);
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/intshift_e.c b/tests/plat/intshift_e.c
index 3cc6d52f9..08ef05ca1 100644
--- a/tests/plat/intshift_e.c
+++ b/tests/plat/intshift_e.c
@@ -6,7 +6,7 @@ int one = 1;
 int zero = 0;
 int minusone = -1;
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT((one     <<zero) == 1);
@@ -50,4 +50,4 @@ void _m_a_i_n(void)
     ASSERT(((unsigned int)minusone>>(unsigned int)1)  == (UINT_MAX>>1));
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/intsub_e.c b/tests/plat/intsub_e.c
index d8f67d3a3..b0cf08ae6 100644
--- a/tests/plat/intsub_e.c
+++ b/tests/plat/intsub_e.c
@@ -7,7 +7,7 @@ int one = 1;
 int zero = 0;
 int minusone = -1;
 
-/* Bypasses the CRT, so there's no stdio or BSS initialisation. */
+/* Bypasses the CRT, so there's no stdio. */
 void _m_a_i_n(void)
 {
     ASSERT((two - one) == 1);
@@ -29,4 +29,4 @@ void _m_a_i_n(void)
     ASSERT(((unsigned int)1   - (unsigned int)two) == UINT_MAX);
 
     finished();
-}
\ No newline at end of file
+}
diff --git a/tests/plat/structcopy_e.c b/tests/plat/structcopy_e.c
new file mode 100644
index 000000000..74a9e2d30
--- /dev/null
+++ b/tests/plat/structcopy_e.c
@@ -0,0 +1,113 @@
+#include "test.h"
+
+/* ACK's C compiler uses EM's loi, sti, blm, or an inline loop to copy
+ * these structs.  The compiler doesn't call memcpy() or other
+ * functions in libc, so this test passes without linking the CRT.
+ */
+
+struct c5 {       /* not a whole number of words */
+	char one[5];
+};
+
+struct ii {       /* two words */
+	int one;
+	int two;
+};
+
+struct iii {      /* three words */
+	int one;
+	int two;
+	int three;
+};
+
+int equal5(char *a, char *b) {  /* a, b must have 5 characters */
+	int i;
+
+	for (i = 0; i < 5; i++)
+		if (a[i] != b[i]) return 0;
+	return 1;
+}
+
+struct c5 make_c5(char *str) {  /* str must have 5 characters */
+	struct c5 out;
+	int i;
+
+	for (i = 0; i < 5; i++)
+		out.one[i] = str[i];
+	return out;
+}
+
+struct ii make_ii(int i, int j) {
+	struct ii out;
+
+	out.one = i;
+	out.two = j;
+	return out;
+}
+
+struct iii make_iii(struct ii in, int k) {
+	struct iii out;
+
+	out.one = in.one;
+	out.two = in.two;
+	out.three = k;
+	return out;
+}
+
+struct c5 rotate_left_c5(struct c5 in) {
+	int i;
+	char c = in.one[0];
+
+	/* Modifies our copy of _in_, not caller's copy. */
+	for (i = 0; i < 4; i++)
+		in.one[i] = in.one[i + 1];
+	in.one[4] = c;
+	return in;
+}
+
+struct iii rotate_left_iii(struct iii in) {
+	int i = in.one;
+
+	/* Modifies our copy of _in_, not caller's copy. */
+	in.one = in.two;
+	in.two = in.three;
+	in.three = i;
+	return in;
+}
+
+/* Bypasses the CRT, so there's no stdio. */
+void _m_a_i_n(void) {
+	struct c5 earth, heart, dup_heart, rol_heart;
+	struct ii pair, dup_pair;
+	struct iii triple, dup_triple, rol_triple;
+
+	earth = make_c5("earth");
+	heart = make_c5("heart");
+	dup_heart = heart;
+	rol_heart = rotate_left_c5(heart);
+	ASSERT(equal5(earth.one, "earth"));
+	ASSERT(equal5(heart.one, "heart"));
+	ASSERT(equal5(dup_heart.one, "heart"));
+	ASSERT(equal5(rol_heart.one, "earth"));
+
+	pair = make_ii(29, 31);
+	dup_pair = pair;
+	triple = make_iii(pair, -9);
+	dup_triple = triple;
+	rol_triple = rotate_left_iii(triple);
+	ASSERT(pair.one == 29);
+	ASSERT(pair.two == 31);
+	ASSERT(dup_pair.one == 29);
+	ASSERT(dup_pair.two == 31);
+	ASSERT(triple.one == 29);
+	ASSERT(triple.two == 31);
+	ASSERT(triple.three == -9);
+	ASSERT(dup_triple.one == 29);
+	ASSERT(dup_triple.two == 31);
+	ASSERT(dup_triple.three == -9);
+	ASSERT(rol_triple.one == 31);
+	ASSERT(rol_triple.two == -9);
+	ASSERT(rol_triple.three == 29);
+
+	finished();
+}

From b3c0a767a5b548a5a392d90d24fbabdc8e9ba148 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Sat, 27 Jan 2018 21:41:13 -0500
Subject: [PATCH 42/55] Sync qemuppc with linuxppc.

 - Don't reverse bitfields; do use ego (41f3bf7).
 - Use MACHOPT_F=-m2 (3dae9e4).
 - Remove old trap.s (26de4c1).

At this commit, one can build qemuppc with mcg by editing the root
build.lua to uncomment "qemuppc" in "vars.plats".  If one also
uncomments "qemuppc" from "vars.plats_with_tests", then mcg fails to
build the tests.  If one uses ncg (by editing plat/qemuppc/descr to
change "mcg" to "ncg"), then the tests pass.
---
 plat/qemuppc/descr         |  7 ++--
 plat/qemuppc/libsys/trap.s | 65 --------------------------------------
 2 files changed, 2 insertions(+), 70 deletions(-)
 delete mode 100644 plat/qemuppc/libsys/trap.s

diff --git a/plat/qemuppc/descr b/plat/qemuppc/descr
index f5191b249..9d1a80427 100644
--- a/plat/qemuppc/descr
+++ b/plat/qemuppc/descr
@@ -19,11 +19,8 @@ var PLATFORM=qemuppc
 var PLATFORMDIR={EM}/share/ack/{PLATFORM}
 var CPP_F=-D__unix
 var ALIGN=-a0:4 -a1:4 -a2:4 -a3:4 -b0:0x01000000
-var C_LIB={PLATFORMDIR}/libc-ansi.a
-# bitfields reversed for compatibility with (g)cc.
-var CC_ALIGN=-Vr
-var OLD_C_LIB={C_LIB}
-var MACHOPT_F=
+var MACHOPT_F=-m2
+var EGO_PLAT_FLAGS=-M{EM}/share/ack/ego/{ARCH}.descr
 
 # Override the setting in fe so that files compiled for qemuppc can see
 # the platform-specific headers.
diff --git a/plat/qemuppc/libsys/trap.s b/plat/qemuppc/libsys/trap.s
deleted file mode 100644
index e00c4d561..000000000
--- a/plat/qemuppc/libsys/trap.s
+++ /dev/null
@@ -1,65 +0,0 @@
-#
-! $Source: /cvsroot/tack/Ack/plat/linux386/libsys/_syscall.s,v $
-! $State: Exp $
-! $Revision: 1.1 $
-
-! Declare segments (the order is important).
-
-.sect .text
-.sect .rom
-.sect .data
-.sect .bss
-
-.sect .text
-
-#define IFFALSE 4
-#define IFTRUE 12
-#define ALWAYS 20
-
-#define LT 0
-#define GT 1
-#define EQ 2
-#define OV 3
-
-EARRAY	=  0
-ERANGE	=  1
-ESET	=  2
-EIOVFL	=  3
-EFOVFL	=  4
-EFUNFL	=  5
-EIDIVZ	=  6
-EFDIVZ	=  7
-EIUND	=  8
-EFUND	=  9
-ECONV	= 10
-ESTACK  = 16
-EHEAP	= 17
-EILLINS = 18
-EODDZ	= 19
-ECASE	= 20
-EMEMFLT	= 21
-EBADPTR = 22
-EBADPC  = 23
-EBADLAE = 24
-EBADMON = 25
-EBADLIN = 26
-EBADGTO = 27
-EUNIMPL = 63		! unimplemented em-instruction called
-
-.define .trap_ecase
-.trap_ecase:
-	b .trp
-
-.define .trap_earray
-.trap_earray:
-	b .trp
-
-.define .trap_erange
-.trap_erange:
-	b .trap
-
-.define .trp
-.define .trap
-.trp:
-.trap:
-	b .trp					! spin forever

From 9077b3a5ab37034932642fd11cf3eee3f887849b Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Tue, 30 Jan 2018 15:53:26 -0500
Subject: [PATCH 43/55] Teach mcg to pass our tests.

Tests pass if one edits the top build.lua to uncomment "qemuppc" from
both vars.plats and vars.plats_with_tests, and one leaves mcg in
plat/qemuppc/descr.

Add or correct some EM instructions in treebuilder.c:
 - "lof", "stf": handle negative offsets in load() and store().
 - "cuu": add using IR_FROMUI.
 - "lim", "sim": keep an entire word in ".ignmask", to be compatible
   with mach/powerpc/libem/trp.s and ncg.  We also keep a word in
   ".ignmask" in ncg for both i386 and m68020.
 - "trp": pass trap number in register.  See comment in
   helper_function_with_arg().
 - "sig": push the old value of .trppc on the stack.
 - "and ?", "ior ?", "xor ?", "com ?", "cms ?", "set ?", "inn ?":
   connect to helper functions in libem.
 - "blm", "bls": drop call to memmove() and use new helper ".bls4",
   because tests/plat/structcopy_e.c can't call memmove().
 - "xor s", "cms s": if s is large, fall back on helper function.
 - "rol", "ror": add by decomposing each rotate into 4 IR ops.
 - "rck s", "bls s": make fatal unless s is word size.
 - "loi": push multiple loads in the correct order.
 - "dup s", "exg s": if s is large, fall back on helper.
 - "dus": add using new helper ".dus4".
 - "lxl", "lxa": follow the static chain, not the dynamic chain.
 - "lor 1": materialise the stack before pushing the stack pointer.
 - "lor 2", "str 2": make fatal.
 - "los", "sts": drop calls to memcpy() and use helpers ".los4" and
   ".sts4", so lang/m2/libm2/LtoUset.e starts working.
 - "gto": correctly read descriptor.

Change mach/powerpc/mcg/table:
 - ANY.L: add for "asp -8".
 - LOAD.L: work around register corruption.
 - COMPAREUL.I: add for "cms 8".
---
 mach/powerpc/libem/bls4.s    |  19 ++
 mach/powerpc/libem/build.lua |   2 +-
 mach/powerpc/libem/dus4.s    |  16 ++
 mach/powerpc/mcg/table       |  23 ++-
 mach/proto/mcg/treebuilder.c | 346 +++++++++++++++++------------------
 5 files changed, 223 insertions(+), 183 deletions(-)
 create mode 100644 mach/powerpc/libem/bls4.s
 create mode 100644 mach/powerpc/libem/dus4.s

diff --git a/mach/powerpc/libem/bls4.s b/mach/powerpc/libem/bls4.s
new file mode 100644
index 000000000..a36faca68
--- /dev/null
+++ b/mach/powerpc/libem/bls4.s
@@ -0,0 +1,19 @@
+.sect .text
+
+! Does a block move of words between non-overlapping buffers.
+!  Stack: ( src dst len -- )
+
+.define .bls4
+.bls4:
+	lwz	r3, 0(sp)	! len
+	lwz	r4, 4(sp)	! dst
+	lwz	r5, 8(sp)	! src
+	addi	sp, sp, 12
+	srwi	r3, r3, 2
+	mtspr	ctr, r3
+	addi	r5, r5, -4
+	addi	r4, r4, -4
+1:	lwzu	r3, 4(r5)
+	stwu	r3, 4(r4)
+	bdnz	1b
+	blr
diff --git a/mach/powerpc/libem/build.lua b/mach/powerpc/libem/build.lua
index 2709a4770..5ed9b52e8 100644
--- a/mach/powerpc/libem/build.lua
+++ b/mach/powerpc/libem/build.lua
@@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
 		srcs = {
-			"./*.s", -- fif4.s
+			"./*.s", -- dus4.s
 		},
 		vars = { plat = plat },
 		deps = {
diff --git a/mach/powerpc/libem/dus4.s b/mach/powerpc/libem/dus4.s
new file mode 100644
index 000000000..9c751947a
--- /dev/null
+++ b/mach/powerpc/libem/dus4.s
@@ -0,0 +1,16 @@
+.sect .text
+
+! Duplicates some words on top of stack.
+!  Stack: ( a size -- a a )
+
+.define .dus4
+.dus4:
+	lwz	r3, 0(sp)
+	addi	sp, sp, 4
+	srwi	r4, r3, 2
+	mtspr	ctr, r4
+	add	r5, sp, r3
+1:	lwzu	r4, -4(r5)
+	stwu	r4, -4(sp)
+	bdnz	1b
+	blr
diff --git a/mach/powerpc/mcg/table b/mach/powerpc/mcg/table
index b72990c36..ca44ce869 100644
--- a/mach/powerpc/mcg/table
+++ b/mach/powerpc/mcg/table
@@ -237,10 +237,13 @@ PATTERNS
     SETSP.I(in:(int)reg)
         emit "mr sp, %in"
         cost 4;
-    
+
     out:(int)reg = ANY.I
         cost 1;
 
+    out:(long)reg = ANY.L
+        cost 1;
+
     out:(int)reg = COPYF.I(in:(float)reg)
         emit "stfsu %in, -4(sp)"
         emit "lwz %out, 0(sp)"
@@ -306,10 +309,21 @@ PATTERNS
 		emit "lwz %out, %addr"
 		cost 4;
 
+#if 0
+    /* FIXME: Doesn't work because %out.0 and %addr might share a
+     * register, so it corrupts %addr before it loads %out.1. */
     out:(long)reg = LOAD.L(addr:address)
         emit "lwz %out.0, 4+%addr"
         emit "lwz %out.1, 0+%addr"
         cost 8;
+#else
+    /* Works, but costs an extra instruction. */
+    out:(long)reg = LOAD.L(addr:address)
+        emit "la %out.1, %addr"
+        emit "lwz %out.0, 4(%out.1)"
+        emit "lwz %out.1, 0(%out.1)"
+        cost 12;
+#endif
 
 	out:(int)ushort0 = LOADH.I(addr:address)
 		emit "lhz %out, %addr"
@@ -566,6 +580,13 @@ PATTERNS
         emit "! COMPARESI.I(cr, 0)"
         cost 4;
 
+    cr:(cr)cr = COMPAREUL.I(left:(long)reg, right:(long)reg)
+        emit "cmpl %cr, 0, %left.1, %right.1"
+        emit "bne 1f"
+        emit "cmpl %cr, 0, %left.0, %right.0"
+        emit "1:"
+        cost 12;
+
 
 
 /* Booleans */
diff --git a/mach/proto/mcg/treebuilder.c b/mach/proto/mcg/treebuilder.c
index eed770170..ac811fc14 100644
--- a/mach/proto/mcg/treebuilder.c
+++ b/mach/proto/mcg/treebuilder.c
@@ -274,7 +274,7 @@ static struct ir* store(int size, struct ir* address, int offset, struct ir* val
     else
         opcode = IR_STORE;
 
-    if (offset > 0)
+    if (offset != 0)
         address = new_ir2(
             IR_ADD, EM_pointersize,
             address, new_wordir(offset)
@@ -304,7 +304,7 @@ static struct ir* load(int size, struct ir* address, int offset)
     else
         opcode = IR_LOAD;
 
-    if (offset > 0)
+    if (offset != 0)
         address = new_ir2(
             IR_ADD, EM_pointersize,
             address, new_wordir(offset)
@@ -416,6 +416,31 @@ static void helper_function(const char* name)
     );
 }
 
+static void helper_function_with_arg(const char* name, struct ir* arg)
+{
+    /* Abuses IR_SETRET to set a register to pass one argument to a
+     * helper function.
+     *
+     * FIXME:  As of January 2018, mach/powerpc/libem takes an
+     * argument in register r3 only for ".los4", ".sts4", ".trp".
+     * This is an accident.  Should the argument be on the stack, or
+     * should other helpers use a register? */
+
+    materialise_stack();
+    appendir(
+        new_ir1(
+            IR_SETRET, arg->size,
+            arg
+        )
+    );
+    appendir(
+        new_ir1(
+            IR_CALL, 0,
+            new_labelir(name)
+        )
+    );
+}
+
 static void insn_simple(int opcode)
 {
     switch (opcode)
@@ -437,6 +462,7 @@ static void insn_simple(int opcode)
         case op_cii: simple_convert(IR_FROMSI); break;
         case op_ciu: simple_convert(IR_FROMSI); break;
         case op_cui: simple_convert(IR_FROMUI); break;
+        case op_cuu: simple_convert(IR_FROMUI); break;
         case op_cfu: simple_convert(IR_FROMUF); break;
         case op_cfi: simple_convert(IR_FROMSF); break;
         case op_cif: simple_convert(IR_FROMSI); break;
@@ -496,10 +522,12 @@ static void insn_simple(int opcode)
 
         case op_lim:
         {
+            /* Traps use only 16 bits of .ignmask, but we keep an
+             * entire word, even if a word has more than 2 bytes. */
             push(
-                new_ir1(
-                    (EM_wordsize == 2) ? IR_LOAD : IR_LOADH, EM_wordsize,
-                    new_labelir(".ignmask")
+                load(
+                    EM_wordsize,
+                    new_labelir(".ignmask"), 0
                 )
             );
             break;
@@ -507,26 +535,34 @@ static void insn_simple(int opcode)
 
         case op_sim:
         {
-            sequence_point();
             appendir(
-                new_ir2(
-                    (EM_wordsize == 2) ? IR_STORE : IR_STOREH, EM_wordsize,
-                    new_labelir(".ignmask"),
+                store(
+                    EM_wordsize,
+                    new_labelir(".ignmask"), 0,
                     pop(EM_wordsize)
                 )
             );
             break;
         }
 
-        case op_trp: helper_function(".trp"); break;
+        case op_trp:
+            helper_function_with_arg(".trp", pop(EM_wordsize));
+            break;
 
         case op_sig:
         {
+            struct ir* label = new_labelir(".trppc");
             struct ir* value = pop(EM_pointersize);
+            push(
+                load(
+                    EM_pointersize,
+                    label, 0
+                )
+            );
             appendir(
                 store(
                     EM_pointersize,
-                    new_labelir(".trppc"), 0,
+                    label, 0,
                     value
                 )
             );
@@ -539,12 +575,13 @@ static void insn_simple(int opcode)
             break;
         }
 
-        /* FIXME: These instructions are really complex and barely used
-         * (Modula-2 and Pascal set support, I believe). Leave them until
-         * later. */
-        case op_set: helper_function(".unimplemented_set"); break;
-        case op_ior: helper_function(".unimplemented_ior"); break;
-
+        case op_and: helper_function(".and"); break;
+        case op_ior: helper_function(".ior"); break;
+        case op_xor: helper_function(".xor"); break;
+        case op_com: helper_function(".com"); break;
+        case op_cms: helper_function(".cms"); break;
+        case op_set: helper_function(".set"); break;
+        case op_inn: helper_function(".inn"); break;
 
         case op_dch:
             push(
@@ -670,6 +707,31 @@ static void simple_alu2(int opcode, int size, int irop, const char* fallback)
     }
 }
 
+static void rotate(int opcode, int size, int irop, int irop_reverse)
+{
+    if (size > (2*EM_wordsize))
+        fatal("treebuilder: can't do opcode %s with size %d", em_mnem[opcode - sp_fmnem], size);
+    else
+    {
+        struct ir* right = pop(size);
+        struct ir* left = pop(size);
+        struct ir* bits = new_wordir(8 * size);
+
+        /* a rol b -> (a << b) | (a >> (32 - b)) */
+        push(
+            new_ir2(
+                IR_OR, size,
+                new_ir2(irop, size, left, right),
+                new_ir2(
+                    irop_reverse, size,
+                    left,
+                    new_ir2(IR_SUB, size, bits, right)
+                )
+            )
+        );
+    }
+}
+
 static struct ir* extract_block_refs(struct basicblock* bb)
 {
     struct ir* outir = NULL;
@@ -720,26 +782,28 @@ static struct ir* ptradd(struct ir* address, int offset)
         );
 }
 
-static void blockmove(struct ir* dest, struct ir* src, struct ir* size)
+static struct ir* walk_static_chain(int level)
 {
-    /* memmove stack: ( size src dest -- ) */
-    push(size);
-    push(src);
-    push(dest);
+    struct ir* ir;
 
-    materialise_stack();
-    appendir(
-        new_ir1(
-            IR_CALL, 0,
-            new_labelir("memmove")
-        )
-    );
-    appendir(
-        new_ir1(
-            IR_STACKADJUST, EM_pointersize,
-            new_wordir(EM_pointersize*2 + EM_wordsize)
-        )
+    /* The static chain, when it exists, is the first argument of each
+     * procedure.  The chain begins with the current frame at level 0,
+     * and continues until we reach the outermost procedure. */
+    ir = new_ir0(
+        IR_GETFP, EM_pointersize
     );
+    while (level--)
+    {
+        /* Walk to the next frame pointer. */
+        ir = load(
+            EM_pointersize,
+            new_ir1(
+                IR_FPTOAB, EM_pointersize,
+                ir
+            ), 0
+        );
+    }
+    return ir;
 }
 
 static void insn_ivalue(int opcode, arith value)
@@ -765,8 +829,10 @@ static void insn_ivalue(int opcode, arith value)
 
         case op_and: simple_alu2(opcode, value, IR_AND, ".and"); break;
         case op_ior: simple_alu2(opcode, value, IR_OR, ".ior"); break;
-        case op_xor: simple_alu2(opcode, value, IR_EOR, NULL); break;
+        case op_xor: simple_alu2(opcode, value, IR_EOR, ".xor"); break;
         case op_com: simple_alu1(opcode, value, IR_NOT, ".com"); break;
+        case op_rol: rotate(opcode, value, IR_LSL, IR_LSR); break;
+        case op_ror: rotate(opcode, value, IR_LSR, IR_LSL); break;
 
         case op_adf: simple_alu2(opcode, value, IR_ADDF, NULL); break;
         case op_sbf: simple_alu2(opcode, value, IR_SUBF, NULL); break;
@@ -774,12 +840,23 @@ static void insn_ivalue(int opcode, arith value)
         case op_dvf: simple_alu2(opcode, value, IR_DIVF, NULL); break;
         case op_ngf: simple_alu1(opcode, value, IR_NEGF, NULL); break;
 
-        case op_cmu: /* fall through */
-        case op_cms: push(tristate_compare(value, IR_COMPAREUI)); break;
+        case op_cms:
+            if (value > (2*EM_wordsize))
+            {
+                push(new_wordir(value));
+                helper_function(".cms");
+                break;
+            }
+            /* fall through */
+        case op_cmu: push(tristate_compare(value, IR_COMPAREUI)); break;
         case op_cmi: push(tristate_compare(value, IR_COMPARESI)); break;
         case op_cmf: push(tristate_compare(value, IR_COMPAREF)); break;
 
-        case op_rck: helper_function(".rck"); break;
+        case op_rck:
+            if (value != EM_wordsize)
+                fatal("'rck %d' not supported", value);
+            helper_function(".rck");
+            break;
         case op_set: push(new_wordir(value)); helper_function(".set"); break;
         case op_inn: push(new_wordir(value)); helper_function(".inn"); break;
 
@@ -930,26 +1007,24 @@ static void insn_ivalue(int opcode, arith value)
 
             if (value > (EM_wordsize*2))
             {
-                /* We're going to need to do multiple stores; fix the address
+                /* We're going to need to do multiple loads; fix the address
                  * so it'll go into a register and we can do maths on it. */
                 appendir(ptr);
             }
 
+            /* Stack grows down.  Load backwards. */
             while (value > 0)
             {
                 int s = EM_wordsize*2;
                 if (value < s)
                     s = value;
-
+                value -= s;
                 push(
                     load(
                         s,
-                        ptr, offset
+                        ptr, value
                     )
                 );
-
-                value -= s;
-                offset += s;
             }
 
             assert(value == 0);
@@ -1099,7 +1174,12 @@ static void insn_ivalue(int opcode, arith value)
         case op_dup:
         {
             sequence_point();
-            if ((value == (EM_wordsize*2)) && (peek(0) == EM_wordsize) && (peek(1) == EM_wordsize))
+            if (value > (2*EM_wordsize))
+            {
+                push(new_wordir(value));
+                helper_function(".dus4");
+            }
+            else if ((value == (EM_wordsize*2)) && (peek(0) == EM_wordsize) && (peek(1) == EM_wordsize))
             {
                 struct ir* v1 = pop(EM_wordsize);
                 struct ir* v2 = pop(EM_wordsize);
@@ -1117,12 +1197,30 @@ static void insn_ivalue(int opcode, arith value)
             break;
         }
 
+        case op_dus:
+        {
+            if (value != EM_wordsize)
+                fatal("'dus %d' not supported", value);
+            helper_function(".dus4");
+            break;
+        }
+
         case op_exg:
         {
-            struct ir* v1 = pop(value);
-            struct ir* v2 = pop(value);
-            push(v1);
-            push(v2);
+            if (value > (2*EM_wordsize))
+            {
+                push(
+                    new_wordir(value)
+                );
+                helper_function(".exg");
+            }
+            else
+            {
+                struct ir* v1 = pop(value);
+                struct ir* v2 = pop(value);
+                push(v1);
+                push(v2);
+            }
             break;
         }
 
@@ -1285,53 +1383,19 @@ static void insn_ivalue(int opcode, arith value)
         }
 
         case op_lxl:
-        {
-            struct ir* ir;
-
-            /* Walk the static chain. */
-
-            ir = new_ir0(
-                IR_GETFP, EM_pointersize
+            push(
+                walk_static_chain(value)
             );
-
-            while (value--)
-            {
-                ir = new_ir1(
-                    IR_CHAINFP, EM_pointersize,
-                    ir
-                );
-            }
-
-            push(ir);
             break;
-        }
 
         case op_lxa:
-        {
-            struct ir* ir;
-
-            /* Walk the static chain. */
-
-            ir = new_ir0(
-                IR_GETFP, EM_pointersize
-            );
-
-            while (value--)
-            {
-                ir = new_ir1(
-                    IR_CHAINFP, EM_pointersize,
-                    ir
-                );
-            }
-
             push(
                 new_ir1(
                     IR_FPTOAB, EM_pointersize,
-                    ir
+                    walk_static_chain(value)
                 )
             );
             break;
-        }
 
         case op_fef:
         {
@@ -1394,6 +1458,7 @@ static void insn_ivalue(int opcode, arith value)
                     break;
 
                 case 1:
+                    materialise_stack();
                     push(
                         appendir(
                             new_ir0(
@@ -1403,10 +1468,6 @@ static void insn_ivalue(int opcode, arith value)
                     );
                     break;
 
-                case 2:
-                    helper_function(".unimplemented_lor_2");
-                    break;
-
                 default:
                     fatal("'lor %d' not supported", value);
             }
@@ -1436,10 +1497,6 @@ static void insn_ivalue(int opcode, arith value)
                     );
                     break;
 
-                case 2:
-                    helper_function(".unimplemented_str_2");
-                    break;
-
                 default:
                     fatal("'str %d' not supported", value);
             }
@@ -1448,100 +1505,27 @@ static void insn_ivalue(int opcode, arith value)
         }
 
         case op_blm:
-        {
-            /* Input stack: ( src dest -- ) */
-            struct ir* dest = pop(EM_pointersize);
-            struct ir* src = pop(EM_pointersize);
-            blockmove(dest, src, new_wordir(value));
+            push(new_wordir(value));
+            helper_function(".bls4");
             break;
-        }
 
         case op_bls:
-        {
-            /* Input stack: ( src dest size -- ) */
-            struct ir* dest = pop(EM_pointersize);
-            struct ir* src = pop(EM_pointersize);
-            struct ir* size = pop(EM_wordsize);
-            blockmove(dest, src, size);
+            if (value != EM_wordsize)
+                fatal("'bls %d' not supported", value);
+            helper_function(".bls4");
             break;
-        }
 
         case op_los:
-        {
-            /* Copy an arbitrary amount to the stack. */
-            struct ir* bytes = pop(EM_wordsize);
-            struct ir* address = pop(EM_pointersize);
-
-            materialise_stack();
-            appendir(
-                new_ir1(
-                    IR_STACKADJUST, EM_pointersize,
-                    new_ir1(
-                        IR_NEG, EM_wordsize,
-                        bytes
-                    )
-                )
-            );
-
-            push(
-                new_ir0(
-                    IR_GETSP, EM_pointersize
-                )
-            );
-            push(address);
-            push(bytes);
-            materialise_stack();
-            appendir(
-                new_ir1(
-                    IR_CALL, 0,
-                    new_labelir("memcpy")
-                )
-            );
-            appendir(
-                new_ir1(
-                    IR_STACKADJUST, EM_pointersize,
-                    new_wordir(EM_pointersize*2 + EM_wordsize)
-                )
-            );
+            if (value != EM_wordsize)
+                fatal("'los %d' not supported", value);
+            helper_function_with_arg(".los4", pop(EM_wordsize));
             break;
-        }
 
         case op_sts:
-        {
-            /* Copy an arbitrary amount from the stack. */
-            struct ir* bytes = pop(EM_wordsize);
-            struct ir* dest = pop(EM_pointersize);
-            struct ir* src;
-
-            materialise_stack();
-            src = appendir(
-                    new_ir0(
-                        IR_GETSP, EM_pointersize
-                    )
-                );
-
-            push(dest);
-            push(src);
-            push(bytes);
-            materialise_stack();
-            appendir(
-                new_ir1(
-                    IR_CALL, 0,
-                    new_labelir("memcpy")
-                )
-            );
-            appendir(
-                new_ir1(
-                    IR_STACKADJUST, EM_pointersize,
-                    new_ir2(
-                        IR_ADD, EM_wordsize,
-                        new_wordir(EM_pointersize*2 + EM_wordsize),
-                        bytes
-                    )
-                )
-            );
+            if (value != EM_wordsize)
+                fatal("'sts %d' not supported", value);
+            helper_function_with_arg(".sts4", pop(EM_wordsize));
             break;
-        }
 
         case op_lin:
         {
@@ -1677,17 +1661,17 @@ static void insn_lvalue(int opcode, const char* label, arith offset)
 
         case op_gto:
         {
-            struct ir* descriptor = pop(EM_pointersize);
+            struct ir* descriptor = address_of_external(label, offset);
 
             appendir(
                 new_ir1(
-                    IR_SETSP, EM_pointersize,
+                    IR_SETFP, EM_pointersize,
                     load(EM_pointersize, descriptor, EM_pointersize*2)
                 )
             );
             appendir(
                 new_ir1(
-                    IR_SETFP, EM_pointersize,
+                    IR_SETSP, EM_pointersize,
                     load(EM_pointersize, descriptor, EM_pointersize*1)
                 )
             );

From 04ac91889c32128b54d4d40961eab7e8a124a51a Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 1 Feb 2018 12:20:31 -0500
Subject: [PATCH 44/55] Only lower "addi sp, sp, X" if X > 0.

If X < 0, then lowering the addi might cause the code to use the stack
space before allocating it.  This is a bug because an asynchronous
signal handler can overwrite the unallocated stack space.
---
 mach/powerpc/top/table | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/mach/powerpc/top/table b/mach/powerpc/top/table
index cbc16c277..196cae128 100644
--- a/mach/powerpc/top/table
+++ b/mach/powerpc/top/table
@@ -8,6 +8,7 @@ LABEL_STARTER '.';
 
 L1, L2, L3, L4, L5  { not_using_sp(VAL) };
 RNZ                 { strcmp(VAL, "r0") };  /* not r0 */
+UP                  { positive(VAL) };
 X, Y, Z             { TRUE };
 
 %%;
@@ -20,22 +21,22 @@ addis RNZ, RNZ, 0            -> ;
 addi RNZ, RNZ, X : addi RNZ, RNZ, Y { plus(X, Y, Z) }
                              -> addi RNZ, RNZ, Z ;
 
-/* Lower "addi sp, sp, X" by lifting other instructions, looking for
+/* Lower "addi sp, sp, UP" by lifting other instructions, looking for
  * chances to merge or delete _addi_ instructions, and assuming that
  * the code generator uses "sp" not "r1".
  */
-addi sp, sp, X : ANY L1                 { lift(ANY) }
-                             -> ANY L1                 : addi sp, sp, X ;
-addi sp, sp, X : ANY L1, L2             { lift(ANY) }
-                             -> ANY L1, L2             : addi sp, sp, X ;
-addi sp, sp, X : ANY L1, L2, L3         { lift(ANY) }
-                             -> ANY L1, L2, L3         : addi sp, sp, X ;
-addi sp, sp, X : ANY L1, L2, L3, L4     { lift(ANY) }
-                             -> ANY L1, L2, L3, L4     : addi sp, sp, X ;
-addi sp, sp, X : ANY L1, L2, L3, L4, L5 { lift(ANY) }
-                             -> ANY L1, L2, L3, L4, L5 : addi sp, sp, X ;
-addi sp, sp, X : lmw Y, L1 { Y[0]=='r' && atoi(Y+1)>1 }
-                             -> lmw Y, L1 : addi sp, sp, X ;
+addi sp, sp, UP : ANY L1                 { lift(ANY) }
+                             -> ANY L1                 : addi sp, sp, UP ;
+addi sp, sp, UP : ANY L1, L2             { lift(ANY) }
+                             -> ANY L1, L2             : addi sp, sp, UP ;
+addi sp, sp, UP : ANY L1, L2, L3         { lift(ANY) }
+                             -> ANY L1, L2, L3         : addi sp, sp, UP ;
+addi sp, sp, UP : ANY L1, L2, L3, L4     { lift(ANY) }
+                             -> ANY L1, L2, L3, L4     : addi sp, sp, UP ;
+addi sp, sp, UP : ANY L1, L2, L3, L4, L5 { lift(ANY) }
+                             -> ANY L1, L2, L3, L4, L5 : addi sp, sp, UP ;
+addi sp, sp, UP : lmw Y, L1 { Y[0]=='r' && atoi(Y+1)>1 }
+                             -> lmw Y, L1 : addi sp, sp, UP ;
 
 /* Merge _addi_ when popping from the stack. */
 addi sp, sp, X : lwz L1, Y(sp) { plus(X, Y, Z) && Z[0]!='-' }
@@ -117,6 +118,15 @@ int not_using_sp(const char *s) {
 }
 
 
+int positive(const char *s) {
+	long n;
+	char *end;
+
+	n = strtol(s, &end, 10);
+	return *s != '\0' && *end == '\0' && n > 0;
+}
+
+
 /* Instructions to lift(), sorted in strcmp() order.  These are from
  * ../ncg/table, minus branch instructions.
  */

From a60738a50dbec68b219c9ad68a867127d42fffdd Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 5 Feb 2018 14:55:10 -0500
Subject: [PATCH 45/55] Don't use '-' in option string to getopt().

Using '-' might fail on platforms like FreeBSD.  Commit 50a7031
stopped using '-' in the B compiler and ego.  I now stop using '-' in
mcg, because I can now check that mcg still works.
---
 mach/proto/mcg/main.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/mach/proto/mcg/main.c b/mach/proto/mcg/main.c
index cf8a4435f..aa0fa4816 100644
--- a/mach/proto/mcg/main.c
+++ b/mach/proto/mcg/main.c
@@ -42,13 +42,14 @@ int main(int argc, char* const argv[])
     const char* inputfilename = NULL;
     const char* outputfilename = NULL;
     FILE* output;
+    int i;
 
     program_name = argv[0];
 
     opterr = 1;
     for (;;)
     {
-        int c = getopt(argc, argv, "-d:D:C:o:");
+        int c = getopt(argc, argv, "d:D:C:o:");
         if (c == -1)
             break;
 
@@ -79,20 +80,22 @@ int main(int argc, char* const argv[])
                     fatal("already specified an output file");
                 outputfilename = optarg;
                 break;
-
-            case 1:
-                if (inputfilename)
-                    fatal("unexpected argument '%s'", optarg);
-                inputfilename = optarg;
         }
     }
 
+    for (i = optind; i < argc; i++)
+    {
+        if (inputfilename)
+            fatal("unexpected argument '%s'", argv[i]);
+        inputfilename = argv[i];
+    }
+
     symbol_init();
 
-	if (!EM_open((char*) inputfilename))
-		fatal("couldn't open input '%s': %s",
+    if (!EM_open((char*) inputfilename))
+        fatal("couldn't open input '%s': %s",
             inputfilename ? inputfilename : "<stdin>", EM_error);
-	
+
     if (outputfilename)
     {
         outputfile = fopen(outputfilename, "w");

From 0a6d3de7fe51b1f29d83dccc147532a6a59a23a7 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 5 Feb 2018 16:09:30 -0500
Subject: [PATCH 46/55] Use prototypes in ego/cs, ego/sp.

---
 util/ego/cs/cs_aux.c         | 13 ++++------
 util/ego/cs/cs_aux.h         | 12 ++++-----
 util/ego/cs/cs_avail.c       | 25 ++++++-------------
 util/ego/cs/cs_avail.h       |  6 +++--
 util/ego/cs/cs_debug.c       | 13 ++++------
 util/ego/cs/cs_debug.h       | 12 ++++++---
 util/ego/cs/cs_elim.c        | 48 ++++++++++--------------------------
 util/ego/cs/cs_elim.h        |  3 ++-
 util/ego/cs/cs_entity.c      | 14 ++++-------
 util/ego/cs/cs_entity.h      |  9 ++++---
 util/ego/cs/cs_getent.c      |  6 ++---
 util/ego/cs/cs_getent.h      |  3 ++-
 util/ego/cs/cs_kill.c        | 48 +++++++++++++-----------------------
 util/ego/cs/cs_kill.h        | 15 +++++++----
 util/ego/cs/cs_partit.c      | 24 ++++++------------
 util/ego/cs/cs_partit.h      | 30 ++++++++++++++--------
 util/ego/cs/cs_profit.c      | 18 ++++----------
 util/ego/cs/cs_stack.c       | 13 +++-------
 util/ego/cs/cs_stack.h       | 12 ++++++---
 util/ego/cs/cs_vnm.c         | 47 ++++++++++-------------------------
 util/ego/cs/cs_vnm.h         |  3 ++-
 util/ego/descr/powerpc.descr |  4 +--
 util/ego/share/aux.h         |  2 +-
 util/ego/share/global.h      |  6 ++---
 util/ego/sp/sp.c             | 39 ++++++-----------------------
 25 files changed, 166 insertions(+), 259 deletions(-)

diff --git a/util/ego/cs/cs_aux.c b/util/ego/cs/cs_aux.c
index 337deeda7..aeb582c9b 100644
--- a/util/ego/cs/cs_aux.c
+++ b/util/ego/cs/cs_aux.c
@@ -11,8 +11,7 @@
 #include "cs.h"
 #include "cs_entity.h"
 
-offset array_elemsize(vn)
-	valnum vn;
+offset array_elemsize(valnum vn)
 {
 	/* Vn is the valuenumber of an entity that points to
 	 * an array-descriptor. The third element of this descriptor holds
@@ -36,14 +35,12 @@ offset array_elemsize(vn)
 	return aoff(enp->en_ext->o_dblock->d_values, 2);
 }
 
-occur_p occ_elem(i)
-	Lindex i;
+occur_p occ_elem(Lindex i)
 {
 	return (occur_p) Lelem(i);
 }
 
-entity_p en_elem(i)
-	Lindex i;
+entity_p en_elem(Lindex i)
 {
 	return (entity_p) Lelem(i);
 }
@@ -54,14 +51,14 @@ entity_p en_elem(i)
 
 STATIC valnum val_no;
 
-valnum newvalnum()
+valnum newvalnum(void)
 {
 	/* Return a completely new value number. */
 
 	return ++val_no;
 }
 
-start_valnum()
+void start_valnum(void)
 {
 	/* Restart value numbering. */
 
diff --git a/util/ego/cs/cs_aux.h b/util/ego/cs/cs_aux.h
index 11950540e..1ce9373a0 100644
--- a/util/ego/cs/cs_aux.h
+++ b/util/ego/cs/cs_aux.h
@@ -3,28 +3,28 @@
  * (c) copyright 1987 by the Vrije Universiteit, Amsterdam, The Netherlands.
  * See the copyright notice in the ACK home directory, in the file "Copyright".
  */
-extern offset	array_elemsize();	/* (valnum vm)
+extern offset	array_elemsize(valnum vn);
+					/*
 					 * Returns the size of array-elements,
 					 * if vn is the valuenumber of the
 					 * address of an array-descriptor.
 					 */
 
-extern occur_p	occ_elem();		/* (Lindex i)
+extern occur_p	occ_elem(Lindex i);	/*
 					 * Returns a pointer to the occurrence
 					 * of which i is an index in a set.
 					 */
 
-extern entity_p	en_elem();		/* (Lindex i)
+extern entity_p	en_elem(Lindex i);	/*
 					 * Returns a pointer to the entity
 					 * of which i is an index in a set.
 					 */
 
-extern valnum	newvalnum();		/* ()
+extern valnum	newvalnum(void);	/*
 					 * Returns a completely new
 					 * value number.
 					 */
 
-extern		start_valnum();		/* ()
+extern void	start_valnum(void);	/*
 					 * Restart value numbering.
 					 */
-
diff --git a/util/ego/cs/cs_avail.c b/util/ego/cs/cs_avail.c
index 1f766a85c..025132a2e 100644
--- a/util/ego/cs/cs_avail.c
+++ b/util/ego/cs/cs_avail.c
@@ -22,8 +22,7 @@
 
 avail_p avails; /* The list of available expressions. */
 
-STATIC bool commutative(instr)
-	int instr;
+STATIC bool commutative(int instr)
 {
 	/* Is instr a commutative operator? */
 
@@ -37,9 +36,7 @@ STATIC bool commutative(instr)
 	}
 }
 
-STATIC bool same_avail(kind, avp1, avp2)
-	byte kind;
-	avail_p avp1, avp2;
+STATIC bool same_avail(byte kind, avail_p avp1, avail_p avp2)
 {
 	/* Two expressions are the same if they have the same operator,
 	 * the same size, and their operand(s) have the same value. 
@@ -75,8 +72,7 @@ STATIC bool same_avail(kind, avp1, avp2)
 	/* NOTREACHED */
 }
 
-STATIC void check_local(avp)
-	avail_p avp;
+STATIC void check_local(avail_p avp)
 {
 	/* Check if the local in which the result of avp was stored,
 	 * still holds this result. Update if not.
@@ -89,9 +85,7 @@ STATIC void check_local(avp)
 	}
 }
 
-STATIC entity_p result_local(size, l)
-	offset size;
-	line_p l;
+STATIC entity_p result_local(offset size, line_p l)
 {
 	/* If the result of an expression of size bytes is stored into a
 	 * local for which a registermessage was generated, return a pointer
@@ -114,9 +108,7 @@ STATIC entity_p result_local(size, l)
 	return (entity_p) 0;
 }
 
-STATIC copy_avail(kind, src, dst)
-	int kind;
-	avail_p src, dst;
+STATIC void copy_avail(int kind, avail_p src, avail_p dst)
 {
 	/* Copy some attributes from src to dst. */
 
@@ -143,10 +135,7 @@ STATIC copy_avail(kind, src, dst)
 	}
 }
 
-avail_p av_enter(avp, ocp, kind)
-	avail_p avp;
-	occur_p ocp;
-	int kind;
+avail_p av_enter(avail_p avp, occur_p ocp, int kind)
 {
 	/* Put the available expression avp in the list,
 	 * if it is not already there.
@@ -186,7 +175,7 @@ avail_p av_enter(avp, ocp, kind)
 	return ravp;
 }
 
-clr_avails()
+void clr_avails(void)
 {
 	/* Throw away the information about the available expressions. */
 
diff --git a/util/ego/cs/cs_avail.h b/util/ego/cs/cs_avail.h
index a0515353a..3364be2a2 100644
--- a/util/ego/cs/cs_avail.h
+++ b/util/ego/cs/cs_avail.h
@@ -5,7 +5,8 @@
  */
 extern avail_p	avails;		/* The set of available expressions. */
 
-extern avail_p	av_enter();	/* (avail_p avp, occur_p ocp, byte kind)
+extern avail_p	av_enter(avail_p avp, occur_p ocp, int kind);
+				/*
 				 * Puts the available expression in avp
 				 * in the list of available expressions,
 				 * if it is not already there. Add ocp to set of
@@ -18,6 +19,7 @@ extern avail_p	av_enter();	/* (avail_p avp, occur_p ocp, byte kind)
 				 * Returns a pointer into the list.
 				 */
 
-extern		clr_avails();	/* Release all space occupied by the old list
+extern void	clr_avails(void);
+				/* Release all space occupied by the old list
 				 * of available expressions.
 				 */
diff --git a/util/ego/cs/cs_debug.c b/util/ego/cs/cs_debug.c
index bf43d8c12..07890395b 100644
--- a/util/ego/cs/cs_debug.c
+++ b/util/ego/cs/cs_debug.c
@@ -17,8 +17,7 @@
 
 extern char em_mnem[]; /* The mnemonics of the EM instructions. */
 
-STATIC void showinstr(lnp)
-	line_p lnp;
+STATIC void showinstr(line_p lnp)
 {
 	/* Makes the instruction in `lnp' human readable. Only lines that
 	 * can occur in expressions that are going to be eliminated are
@@ -49,8 +48,7 @@ STATIC void showinstr(lnp)
 	fprintf(stderr,"\n");
 }
 
-SHOWOCCUR(ocp)
-	occur_p ocp;
+void SHOWOCCUR(occur_p ocp)
 {
 	/* Shows all instructions in an occurrence. */
 
@@ -69,8 +67,7 @@ SHOWOCCUR(ocp)
 
 #ifdef TRACE
 
-SHOWAVAIL(avp)
-	avail_p avp;
+void SHOWAVAIL(avail_p avp)
 {
 	/* Shows an available expression. */
 	showinstr(avp->av_found);
@@ -79,7 +76,7 @@ SHOWAVAIL(avp)
 
 }
 
-OUTAVAILS()
+void OUTAVAILS(void)
 {
 	register avail_p ravp;
 
@@ -110,7 +107,7 @@ STATIC char *enkinds[] = {
 	"ignore mask"
 };
 
-OUTENTITIES()
+void OUTENTITIES(void)
 {
 	register Lindex i;
 
diff --git a/util/ego/cs/cs_debug.h b/util/ego/cs/cs_debug.h
index e45287f9b..2d85ebfe8 100644
--- a/util/ego/cs/cs_debug.h
+++ b/util/ego/cs/cs_debug.h
@@ -5,7 +5,8 @@
  */
 #ifdef VERBOSE
 
-extern SHOWOCCUR();	/* (occur_p ocp)
+extern void SHOWOCCUR(occur_p ocp);
+			/*
 			 * Shows all lines in an occurrence.
 			 */
 
@@ -17,15 +18,18 @@ extern SHOWOCCUR();	/* (occur_p ocp)
 
 #ifdef TRACE
 
-extern OUTAVAILS();	/* ()
+extern void OUTAVAILS(void);
+			/*
 			 * Prints all available expressions.
 			 */
 
-extern OUTENTITIES();	/* ()
+extern void OUTENTITIES(void);
+			/*
 			 * Prints all entities.
 			 */
 
-extern SHOWAVAIL();	/* (avail_p avp)
+extern void SHOWAVAIL(avail_p avp);
+			/*
 			 * Shows an available expression.
 			 */
 
diff --git a/util/ego/cs/cs_elim.c b/util/ego/cs/cs_elim.c
index 0a253830f..767517bd8 100644
--- a/util/ego/cs/cs_elim.c
+++ b/util/ego/cs/cs_elim.c
@@ -20,8 +20,7 @@
 #include "cs_partit.h"
 #include "cs_debug.h"
 
-STATIC dlink(l1, l2)
-	line_p l1, l2;
+STATIC void dlink(line_p l1, line_p l2)
 {
 	/* Doubly link the lines in l1 and l2. */
 
@@ -31,11 +30,10 @@ STATIC dlink(l1, l2)
 		l2->l_prev = l1;
 }
 
-STATIC remove_lines(first, last)
-	line_p first, last;
+STATIC void remove_lines(line_p first, line_p last)
 {
 	/* Throw away the lines between and including first and last.
-	 * Don't worry about any pointers; the (must) have been taken care of.
+	 * Don't worry about any pointers; they (must) have been taken care of.
 	 */
 	register line_p lnp, next;
 
@@ -46,8 +44,7 @@ STATIC remove_lines(first, last)
 	}
 }
 
-STATIC bool contained(ocp1, ocp2)
-	occur_p ocp1, ocp2;
+STATIC bool contained(occur_p ocp1, occur_p ocp2)
 {
 	/* Determine whether ocp1 is contained within ocp2. */
 
@@ -61,9 +58,7 @@ STATIC bool contained(ocp1, ocp2)
 	return FALSE;
 }
 
-STATIC delete(ocp, start)
-	occur_p ocp;
-	avail_p start;
+STATIC void delete(occur_p ocp, avail_p start)
 {
 	/* Delete all occurrences that are contained within ocp.
 	 * They must have been entered in the list before start:
@@ -90,10 +85,7 @@ STATIC delete(ocp, start)
 	}
 }
 
-STATIC complete_aar(lnp, instr, descr_vn)
-	line_p lnp;
-	int instr;
-	valnum descr_vn;
+STATIC void complete_aar(line_p lnp, int instr, valnum descr_vn)
 {
 	/* Lnp is an instruction that loads the address of an array-element.
 	 * Instr tells us what effect we should achieve; load (instr is op_lar)
@@ -109,10 +101,7 @@ STATIC complete_aar(lnp, instr, descr_vn)
 	dlink(lnp, lindir);
 }
 
-STATIC replace(ocp, tmp, avp)
-	occur_p ocp;
-	offset tmp;
-	avail_p avp;
+STATIC void replace(occur_p ocp, offset tmp, avail_p avp)
 {
 	/* Replace the lines in the occurrence in ocp by a load of the
 	 * temporary with offset tmp.
@@ -143,9 +132,7 @@ STATIC replace(ocp, tmp, avp)
 	remove_lines(first, last);
 }
 
-STATIC append(avp, tmp)
-	avail_p avp;
-	offset tmp;
+STATIC void append(avail_p avp, offset tmp)
 {
 	/* Avp->av_found points to a line with an operator in it. This 
 	 * routine emits a sequence of instructions that saves the result
@@ -177,9 +164,7 @@ STATIC append(avp, tmp)
 	}
 }
 
-STATIC set_replace(avp, tmp)
-	avail_p avp;
-	offset tmp;
+STATIC void set_replace(avail_p avp, offset tmp)
 {
 	/* Avp->av_occurs is now a set of occurrences, each of which will be
 	 * replaced by a reference to a local.
@@ -199,8 +184,7 @@ STATIC set_replace(avp, tmp)
 	}
 }
 
-STATIC int reg_score(enp)
-	entity_p enp;
+STATIC int reg_score(entity_p enp)
 {
 	/* Enp is a local that will go into a register.
 	 * We return its score upto now.
@@ -209,10 +193,7 @@ STATIC int reg_score(enp)
 	return regv_arg(enp->en_loc, 4);
 }
 
-STATIC line_p gen_mesreg(off, avp, pp)
-	offset off;
-	avail_p avp;
-	proc_p pp;
+STATIC line_p gen_mesreg(offset off, avail_p avp, proc_p pp)
 {
 	/* Generate a register message for the local that will hold the
 	 * result of the expression in avp, at the appropriate place in
@@ -226,9 +207,7 @@ STATIC line_p gen_mesreg(off, avp, pp)
 	return reg;
 }
 
-STATIC change_score(mes, score)
-	line_p mes;
-	int score;
+STATIC void change_score(line_p mes, int score)
 {
 	/* Change the score in the register message in mes to score. */
 
@@ -242,8 +221,7 @@ STATIC change_score(mes, score)
 	ap->a_a.a_offset = score;
 }
 
-eliminate(pp)
-	proc_p pp;
+void eliminate(proc_p pp)
 {
 	/* Eliminate costly common subexpressions within procedure pp.
 	 * We scan the available expressions in - with respect to time found -
diff --git a/util/ego/cs/cs_elim.h b/util/ego/cs/cs_elim.h
index 4c6a61669..9c7d86477 100644
--- a/util/ego/cs/cs_elim.h
+++ b/util/ego/cs/cs_elim.h
@@ -3,7 +3,8 @@
  * (c) copyright 1987 by the Vrije Universiteit, Amsterdam, The Netherlands.
  * See the copyright notice in the ACK home directory, in the file "Copyright".
  */
-extern eliminate();	/* (proc_p pp)
+extern void eliminate(proc_p pp);
+			/*
 			 * Eliminate some of the recurrences of expressions
 			 * that were found by the valuenumbering
 			 * algorithm.
diff --git a/util/ego/cs/cs_entity.c b/util/ego/cs/cs_entity.c
index e4e49ff9a..a2cd5228d 100644
--- a/util/ego/cs/cs_entity.c
+++ b/util/ego/cs/cs_entity.c
@@ -18,8 +18,7 @@
 
 lset entities; /* Our pseudo symbol-table. */
 
-entity_p find_entity(vn)
-	valnum vn;
+entity_p find_entity(valnum vn)
 {
 	/* Try to find the entity with valuenumber vn. */
 
@@ -33,8 +32,7 @@ entity_p find_entity(vn)
 	return (entity_p) 0;
 }
 
-STATIC bool same_entity(enp1, enp2)
-	entity_p enp1, enp2;
+STATIC bool same_entity(entity_p enp1, entity_p enp2)
 {
 	if (enp1->en_kind != enp2->en_kind) return FALSE;
 	if (enp1->en_size != enp2->en_size) return FALSE;
@@ -69,8 +67,7 @@ STATIC bool same_entity(enp1, enp2)
 	}
 }
 
-STATIC copy_entity(src, dst)
-	entity_p src, dst;
+STATIC void copy_entity(entity_p src, entity_p dst)
 {
 	dst->en_static = src->en_static;
 	dst->en_kind = src->en_kind;
@@ -111,8 +108,7 @@ STATIC copy_entity(src, dst)
 	}
 }
 
-entity_p en_enter(enp)
-	register entity_p enp;
+entity_p en_enter(entity_p enp)
 {
 	/* Put the entity in enp in the entity set, if it is not already there.
 	 * Return pointer to stored entity.
@@ -133,7 +129,7 @@ entity_p en_enter(enp)
 	return new;
 }
 
-clr_entities()
+void clr_entities(void)
 {
 	/* Throw away all pseudo-symboltable information. */
 
diff --git a/util/ego/cs/cs_entity.h b/util/ego/cs/cs_entity.h
index c669efb58..0a222f96e 100644
--- a/util/ego/cs/cs_entity.h
+++ b/util/ego/cs/cs_entity.h
@@ -5,16 +5,19 @@
  */
 extern lset	entities;	/* The pseudo-symboltable. */
 
-extern entity_p	find_entity();	/* (valnum vn)
+extern entity_p	find_entity(valnum vn);
+				/*
 				 * Tries to find an entity with value number vn.
 				 */
 
-extern entity_p	en_enter();	/* (entity_p enp)
+extern entity_p	en_enter(entity_p enp);
+				/*
 				 * Enter the entity in enp in the set of
 				 * entities if it was not already there.
 				 */
 
-extern		clr_entities();	/* ()
+extern void	clr_entities(void);
+				/*
 				 * Release all space occupied by our
 				 * pseudo-symboltable.
 				 */
diff --git a/util/ego/cs/cs_getent.c b/util/ego/cs/cs_getent.c
index ef8694536..144750802 100644
--- a/util/ego/cs/cs_getent.c
+++ b/util/ego/cs/cs_getent.c
@@ -67,8 +67,7 @@ STATIC struct inf_entity {
 #define ENKIND(ip)	ip->inf_used
 #define SIZEINF(ip)	ip->inf_size
 
-STATIC struct inf_entity *getinf(n)
-	int n;
+STATIC struct inf_entity *getinf(int n)
 {
 	struct inf_entity *ip;
 
@@ -78,8 +77,7 @@ STATIC struct inf_entity *getinf(n)
 	return (struct inf_entity *) 0;
 }
 
-entity_p getentity(lnp, l_out)
-	line_p lnp, *l_out;
+entity_p getentity(line_p lnp, line_p *l_out)
 {
 	/* Build the entities where lnp refers to, and enter them.
 	 * If a token needs to be popped, the first line that pushed
diff --git a/util/ego/cs/cs_getent.h b/util/ego/cs/cs_getent.h
index e37e37404..f1c4e955d 100644
--- a/util/ego/cs/cs_getent.h
+++ b/util/ego/cs/cs_getent.h
@@ -3,7 +3,8 @@
  * (c) copyright 1987 by the Vrije Universiteit, Amsterdam, The Netherlands.
  * See the copyright notice in the ACK home directory, in the file "Copyright".
  */
-extern entity_p getentity();	/* (line_p lnp, *l_out)
+extern entity_p getentity(line_p lnp, line_p *l_out);
+				/*
 				 * Extract the entity lnp refers and enter it
 				 * in the table of entities. The main entity
 				 * lnp refers to is returned; sometimes there
diff --git a/util/ego/cs/cs_kill.c b/util/ego/cs/cs_kill.c
index 520366f23..fc3144397 100644
--- a/util/ego/cs/cs_kill.c
+++ b/util/ego/cs/cs_kill.c
@@ -16,9 +16,9 @@
 #include "cs_debug.h"
 #include "cs_avail.h"
 #include "cs_entity.h"
+#include "cs_kill.h"
 
-STATIC base_valno(enp)
-	entity_p enp;
+STATIC valnum base_valno(entity_p enp)
 {
 	/* Return the value number of the (base) address of an indirectly
 	 * accessed entity.
@@ -37,8 +37,7 @@ STATIC base_valno(enp)
 	/* NOTREACHED */
 }
 
-STATIC entity_p find_base(vn)
-	valnum vn;
+STATIC entity_p find_base(valnum vn)
 {
 	/* Vn is the valuenumber of the (base) address of an indirectly
 	 * accessed entity. Return the entity that holds this address
@@ -79,8 +78,7 @@ STATIC entity_p find_base(vn)
 	return (entity_p) 0;
 }
 
-STATIC bool obj_overlap(op1, op2)
-	obj_p op1, op2;
+STATIC bool obj_overlap(obj_p op1, obj_p op2)
 {
 	/* Op1 and op2 point to two objects in the same datablock.
 	 * Obj_overlap returns whether these objects might overlap.
@@ -97,8 +95,7 @@ STATIC bool obj_overlap(op1, op2)
 
 #define same_datablock(o1, o2)	((o1)->o_dblock == (o2)->o_dblock)
 
-STATIC bool addr_local(enp)
-	entity_p enp;
+STATIC bool addr_local(entity_p enp)
 {
 	/* Is enp the address of a stack item. */
 
@@ -108,17 +105,14 @@ STATIC bool addr_local(enp)
 		enp->en_kind == ENAARGBASE;
 }
 
-STATIC bool addr_external(enp)
-	entity_p enp;
+STATIC bool addr_external(entity_p enp)
 {
 	/* Is enp the address of an external. */
 
 	return enp != (entity_p) 0 && enp->en_kind == ENAEXTERNAL;
 }
 
-STATIC kill_external(obp, indir)
-	obj_p obp;
-	int indir;
+STATIC void kill_external(obj_p obp, int indir)
 {
 	/* A store is done via the object in obp. If this store is direct
 	 * we kill directly accessed entities in the same data block only
@@ -164,8 +158,7 @@ STATIC kill_external(obp, indir)
 	}
 }
 
-STATIC bool loc_overlap(enp1, enp2)
-	entity_p enp1, enp2;
+STATIC bool loc_overlap(entity_p enp1, entity_p enp2)
 {
 	/* Enp1 and enp2 point to two locals. Loc_overlap returns whether
 	 * they overlap.
@@ -184,9 +177,7 @@ STATIC bool loc_overlap(enp1, enp2)
 			enp1->en_loc + enp1->en_size > enp2->en_loc;
 }
 
-STATIC kill_local(enp, indir)
-	entity_p enp;
-	bool indir;
+STATIC void kill_local(entity_p enp, bool indir)
 {
 	/* This time a store is done into an ENLOCAL. */
 
@@ -234,7 +225,7 @@ STATIC kill_local(enp, indir)
 	}
 }
 
-STATIC void kill_sim()
+STATIC void kill_sim(void)
 {
 	/* A store is done into the ENIGNMASK. */
 
@@ -252,8 +243,7 @@ STATIC void kill_sim()
 	}
 }
 
-kill_direct(enp)
-	entity_p enp;
+void kill_direct(entity_p enp)
 {
 	/* A store will be done into enp. We must forget the values of all the
 	 * entities this one may overlap with.
@@ -274,8 +264,7 @@ kill_direct(enp)
 	}
 }
 
-kill_indir(enp)
-	entity_p enp;
+void kill_indir(entity_p enp)
 {
 	/* An indirect store is done, in an ENINDIR,
 	 * an ENOFFSETTED or an ENARRELEM.
@@ -306,7 +295,7 @@ kill_indir(enp)
 	}
 }
 
-kill_much()
+extern void kill_much(void)
 {
 	/* Kills all killable entities,
 	 * except the locals for which a registermessage was generated.
@@ -324,8 +313,7 @@ kill_much()
 	}
 }
 
-STATIC bool bad_procflags(pp)
-	proc_p pp;
+STATIC bool bad_procflags(proc_p pp)
 {
 	/* Return whether the flags about the procedure in pp indicate
 	 * that we have little information about it. It might be that
@@ -335,8 +323,7 @@ STATIC bool bad_procflags(pp)
 	return !(pp->p_flags1 & PF_BODYSEEN) || (pp->p_flags1 & PF_CALUNKNOWN);
 }
 
-STATIC kill_globset(s)
-	cset s;
+STATIC void kill_globset(cset s)
 {
 	/* S is a set of global variables that might be changed.
 	 * We act as if a direct store is done into each of them.
@@ -349,8 +336,7 @@ STATIC kill_globset(s)
 	}
 }
 
-kill_call(pp)
-	proc_p pp;
+void kill_call(proc_p pp)
 {
 	/* Kill everything that might be destroyed by calling
 	 * the procedure in pp.
@@ -367,7 +353,7 @@ kill_call(pp)
 	}
 }
 
-kill_all()
+void kill_all(void)
 {
 	/* Kills all entities. */
 
diff --git a/util/ego/cs/cs_kill.h b/util/ego/cs/cs_kill.h
index 6fa6859b8..347e3eb16 100644
--- a/util/ego/cs/cs_kill.h
+++ b/util/ego/cs/cs_kill.h
@@ -3,27 +3,32 @@
  * (c) copyright 1987 by the Vrije Universiteit, Amsterdam, The Netherlands.
  * See the copyright notice in the ACK home directory, in the file "Copyright".
  */
-extern	kill_call();	/* (proc_p pp)
+extern void kill_call(proc_p pp);
+			/*
 			 * Kill all entities that might have an other value
 			 * after execution of the procedure in pp.
 			 */
 
-extern	kill_much();	/* ()
+extern void kill_much(void);
+			/*
 			 * Kill all killable entities except those for which
 			 * a register message was generated.
 			 * Constants, addresses, etc are not killable.
 			 */
 
-extern	kill_indir();	/* (entity_p enp)
+extern void kill_indir(entity_p enp);
+			/*
 			 * Kill all entities that might have an other value
 			 * after indirect assignment to the entity in enp.
 			 */
 
-extern	kill_direct();	/* (entity_p enp)
+extern void kill_direct(entity_p enp);
+			/*
 			 * Kill all entities that might have an other value
 			 * after direct assignment to the entity in enp.
 			 */
 
-extern	kill_all();	/* ()
+extern void kill_all(void);
+			/*
 			 * Kill all entities.
 			 */
diff --git a/util/ego/cs/cs_partit.c b/util/ego/cs/cs_partit.c
index 9a1bde042..e6d3b1fdf 100644
--- a/util/ego/cs/cs_partit.c
+++ b/util/ego/cs/cs_partit.c
@@ -178,8 +178,7 @@ STATIC struct {
 #define AVSIZE(l)	(info[INSTR(l)].i_av)
 #define REGTYPE(n)	(info[n].i_regtype)
 
-int instrgroup(lnp)
-	line_p lnp;
+int instrgroup(line_p lnp)
 {
 	if (INSTR(lnp) == op_lor && SHORT(lnp) == 1) {
 		/* We can't do anything with the stackpointer. */
@@ -192,8 +191,7 @@ int instrgroup(lnp)
 	return GROUP(INSTR(lnp));
 }
 
-bool stack_group(instr)
-	int instr;
+bool stack_group(int instr)
 {
 	/* Is this an instruction that only does something to the top of
 	 * the stack?
@@ -211,8 +209,7 @@ bool stack_group(instr)
 	}
 }
 
-STATIC offset argw(lnp)
-	line_p lnp;
+STATIC offset argw(line_p lnp)
 {
 	/* Some EM-instructions have their argument either on the same line,
 	 * or on top of the stack. We give up when the argument is on top of
@@ -228,8 +225,7 @@ STATIC offset argw(lnp)
 	}
 }
 
-offset op11size(lnp)
-	line_p lnp;
+offset op11size(line_p lnp)
 {
 	/* Returns the size of the first argument of
 	 * the unary operator in lnp.
@@ -248,8 +244,7 @@ offset op11size(lnp)
 	/* NOTREACHED */
 }
 
-offset op12size(lnp)
-	line_p lnp;
+offset op12size(line_p lnp)
 {
 	/* Same for first of binary. */
 
@@ -264,8 +259,7 @@ offset op12size(lnp)
 	/* NOTREACHED */
 }
 
-offset op22size(lnp)
-	line_p lnp;
+offset op22size(line_p lnp)
 {
 	switch (OP2SIZE(lnp)) {
 		case ARGW:
@@ -319,8 +313,7 @@ offset op33size(lnp)
 		return ws;
 }
 
-offset avsize(lnp)
-	line_p lnp;
+offset avsize(line_p lnp)
 {
 	/* Returns the size of the result of the instruction in lnp.
 	 * If the instruction is a conversion this size is given on the stack.
@@ -359,8 +352,7 @@ offset avsize(lnp)
 	/* NOTREACHED */
 }
 
-int regtype(instr)
-	byte instr;
+int regtype(byte instr)
 {
 	switch (REGTYPE(instr & BMASK)) {
 		case ANY:
diff --git a/util/ego/cs/cs_partit.h b/util/ego/cs/cs_partit.h
index 27e7a00bc..ffcc321cb 100644
--- a/util/ego/cs/cs_partit.h
+++ b/util/ego/cs/cs_partit.h
@@ -7,53 +7,63 @@
  * "manageable chunks.
  */
 
-extern int	instrgroup();	/* (line_p lnp)
+extern int	instrgroup(line_p lnp);
+				/*
 				 * Return the group into which the instruction
 				 * in lnp belongs to.
 				 */
 
-extern bool	stack_group();	/* (int instr)
+extern bool	stack_group(int instr);
+				/*
 				 * Return whether instr is an instruction that
 				 * only changes the state of the stack, i.e.
 				 * is a "true" operator.
 				 */
 
-extern offset	op11size();	/* (line_p lnp)
+extern offset	op11size(line_p lnp);
+				/*
 				 * Return the size of the operand of the unary
 				 * operator in lnp.
 				 */
 
-extern offset	op12size();	/* (line_p lnp)
+extern offset	op12size(line_p lnp);
+				/*
 				 * Return the size of the first operand of the
 				 * binary operator in lnp.
 				 */
 
-extern offset	op22size();	/* (line_p lnp)
+extern offset	op22size(line_p lnp);
+				/*
 				 * Return the size of the second operand of the
 				 * binary operator in lnp.
 				 */
 
-extern offset	op13size();	/* (line_p lnp)
+extern offset	op13size(line_p lnp);
+				/*
 				 * Return the size of the first operand of the
 				 * ternary operator in lnp.
 				 */
 
-extern offset	op23size();	/* (line_p lnp)
+extern offset	op23size(line_p lnp);
+				/*
 				 * Return the size of the second operand of the
 				 * ternary operator in lnp.
 				 */
 
-extern offset	op33size();	/* (line_p lnp)
+extern offset	op33size(line_p lnp);
+				/*
 				 * Return the size of the third operand of the
 				 * ternary operator in lnp.
 				 */
 
-extern offset	avsize();	/* (line_p lnp)
+extern offset	avsize(line_p lnp);
+				/*
 				 * Return the size of the result of the
 				 * operator in lnp.
 				 */
 
-extern int	regtype();	/* (byte instr)
+extern int	regtype(byte instr);
+				/*
 				 * Return in what kind of machine-register
 				 * the result of instr should be stored:
 				 * pointer, float, or any.
diff --git a/util/ego/cs/cs_profit.c b/util/ego/cs/cs_profit.c
index 259a6114d..50cb708fd 100644
--- a/util/ego/cs/cs_profit.c
+++ b/util/ego/cs/cs_profit.c
@@ -26,9 +26,7 @@ STATIC cset	sli_counts;
 STATIC short	LX_threshold;
 STATIC short	AR_limit;
 
-STATIC get_instrs(f, s_p)
-	FILE *f;
-	cset *s_p;
+STATIC void get_instrs(FILE *f, cset *s_p)
 {
 	/* Read a set of integers from inputfile f into *s_p.
 	 * Such a set must be delimited by a negative number.
@@ -42,9 +40,7 @@ STATIC get_instrs(f, s_p)
 	}
 }
 
-STATIC choose_cset(f, s_p, max)
-	FILE *f;
-	cset *s_p;
+STATIC void choose_cset(FILE *f, cset *s_p, int max)
 {
 	/* Read two compact sets of integers from inputfile f.
 	 * Choose the first if we optimize with respect to time,
@@ -115,8 +111,7 @@ void cs_machinit(void *vp)
 	choose_cset(f, &forbidden, sp_lmnem);
 }
 
-STATIC bool sli_no_eliminate(lnp)
-	line_p lnp;
+STATIC bool sli_no_eliminate(line_p lnp)
 {
 	/* Return whether the SLI-instruction in lnp is part of
 	 * an array-index computation, and should not be eliminated.
@@ -130,8 +125,7 @@ STATIC bool sli_no_eliminate(lnp)
 		;
 }
 
-STATIC bool gains(avp)
-	avail_p avp;
+STATIC bool gains(avail_p avp)
 {
 	/* Return whether we can gain something, when we eliminate
 	 * an expression such as in avp. We just glue together some
@@ -161,9 +155,7 @@ STATIC bool gains(avp)
 	return TRUE;
 }
 
-STATIC bool okay_lines(avp, ocp)
-	avail_p avp;
-	occur_p ocp;
+STATIC bool okay_lines(avail_p avp, occur_p ocp)
 {
 	register line_p lnp, next;
 	offset sz;
diff --git a/util/ego/cs/cs_stack.c b/util/ego/cs/cs_stack.c
index 7927438a5..670955d1e 100644
--- a/util/ego/cs/cs_stack.c
+++ b/util/ego/cs/cs_stack.c
@@ -23,8 +23,7 @@ STATIC token_p		free_token;
 #define Stack_empty()	(free_token == &Stack[0])
 #define Top		(free_token - 1)
 
-Push(tkp)
-	token_p tkp;
+void Push(token_p tkp)
 {
 	if (tkp->tk_size == UNKNOWN_SIZE) {
 		Empty_stack(); /* The contents of the Stack is useless. */
@@ -39,10 +38,7 @@ Push(tkp)
 
 #define WORD_MULTIPLE(n)	((n / ws) * ws + ( n % ws ? ws : 0 ))
 
-void
-Pop(tkp, size)
-	token_p tkp;
-	offset size;
+void Pop(token_p tkp, offset size)
 {
 	/* Pop a token with given size from the valuenumber stack into tkp. */
 
@@ -85,8 +81,7 @@ Pop(tkp, size)
 	}
 }
 
-Dup(lnp)
-	line_p lnp;
+void Dup(line_p lnp)
 {
 	/* Duplicate top bytes on the Stack. */
 
@@ -132,7 +127,7 @@ Dup(lnp)
 	}
 }
 
-clr_stack()
+void clr_stack(void)
 {
 	free_token = &Stack[0];
 }
diff --git a/util/ego/cs/cs_stack.h b/util/ego/cs/cs_stack.h
index 64d59cf90..e5a79b858 100644
--- a/util/ego/cs/cs_stack.h
+++ b/util/ego/cs/cs_stack.h
@@ -3,21 +3,25 @@
  * (c) copyright 1987 by the Vrije Universiteit, Amsterdam, The Netherlands.
  * See the copyright notice in the ACK home directory, in the file "Copyright".
  */
-extern	Push();		/* (token_p tkp)
+extern void Push(token_p tkp);
+			/*
 			 * Push the token in tkp on the fake-stack.
 			 */
 
-extern	Pop();		/* (token_p tkp; offset size)
+extern void Pop(token_p tkp, offset size);
+			/*
 			 * Pop a token of size bytes from the fake-stack
 			 * into tkp. If such a token is not there
 			 * we put a dummy in tkp and adjust the fake-stack.
 			 */
 
-extern	Dup();		/* (line_p lnp)
+extern void Dup(line_p lnp);
+			/*
 			 * Reflect the changes made by the dup-instruction
 			 * in lnp to the EM-stack into the fake-stack.
 			 */
 
-extern	clr_stack();	/* ()
+extern void clr_stack(void);
+			/*
 			 * Clear the fake-stack.
 			 */
diff --git a/util/ego/cs/cs_vnm.c b/util/ego/cs/cs_vnm.c
index a4813411c..4dbeb3df2 100644
--- a/util/ego/cs/cs_vnm.c
+++ b/util/ego/cs/cs_vnm.c
@@ -21,9 +21,7 @@
 #include "cs_partit.h"
 #include "cs_getent.h"
 
-STATIC push_entity(enp, lfirst)
-	entity_p enp;
-	line_p lfirst;
+STATIC void push_entity(entity_p enp, line_p lfirst)
 {
 	/* Build token and Push it. */
 
@@ -35,10 +33,8 @@ STATIC push_entity(enp, lfirst)
 	Push(&tk);
 }
 
-STATIC put_expensive_load(bp, lnp, lfirst, enp)
-	bblock_p bp;
-	line_p lnp, lfirst;
-	entity_p enp;
+STATIC void put_expensive_load(bblock_p bp, line_p lnp, line_p lfirst,
+			       entity_p enp)
 {
 	struct avail av;
 	occur_p	ocp;
@@ -52,10 +48,7 @@ STATIC put_expensive_load(bp, lnp, lfirst, enp)
 	av_enter(&av, ocp, EXPENSIVE_LOAD);
 }
 
-STATIC put_aar(bp, lnp, lfirst, enp)
-	bblock_p bp;
-	line_p lnp, lfirst;
-	entity_p enp;
+STATIC void put_aar(bblock_p bp, line_p lnp, line_p lfirst, entity_p enp)
 {
 	/* Enp points to an ENARRELEM. We do as if its address was computed. */
 
@@ -74,9 +67,7 @@ STATIC put_aar(bp, lnp, lfirst, enp)
 	av_enter(&av, ocp, TERNAIR_OP);
 }
 
-STATIC push_avail(avp, lfirst)
-	avail_p avp;
-	line_p lfirst;
+STATIC void push_avail(avail_p avp, line_p lfirst)
 {
 	struct token tk;
 
@@ -86,10 +77,7 @@ STATIC push_avail(avp, lfirst)
 	Push(&tk);
 }
 
-STATIC push_unair_op(bp, lnp, tkp1)
-	bblock_p bp;
-	line_p lnp;
-	token_p tkp1;
+STATIC void push_unair_op(bblock_p bp, line_p lnp, token_p tkp1)
 {
 	struct avail av;
 	occur_p	ocp;
@@ -103,10 +91,7 @@ STATIC push_unair_op(bp, lnp, tkp1)
 	push_avail(av_enter(&av, ocp, UNAIR_OP), tkp1->tk_lfirst);
 }
 
-STATIC push_binair_op(bp, lnp, tkp1, tkp2)
-	bblock_p bp;
-	line_p lnp;
-	token_p tkp1, tkp2;
+STATIC void push_binair_op(bblock_p bp, line_p lnp, token_p tkp1, token_p tkp2)
 {
 	struct avail av;
 	occur_p	ocp;
@@ -121,10 +106,8 @@ STATIC push_binair_op(bp, lnp, tkp1, tkp2)
 	push_avail(av_enter(&av, ocp, BINAIR_OP), tkp1->tk_lfirst);
 }
 
-STATIC push_ternair_op(bp, lnp, tkp1, tkp2, tkp3)
-	bblock_p bp;
-	line_p lnp;
-	token_p tkp1, tkp2, tkp3;
+STATIC void push_ternair_op(bblock_p bp, line_p lnp, token_p tkp1,
+			    token_p tkp2, token_p tkp3)
 {
 	struct avail av;
 	occur_p	ocp;
@@ -140,8 +123,7 @@ STATIC push_ternair_op(bp, lnp, tkp1, tkp2, tkp3)
 	push_avail(av_enter(&av, ocp, TERNAIR_OP), tkp1->tk_lfirst);
 }
 
-STATIC fiddle_stack(lnp)
-	line_p lnp;
+STATIC void fiddle_stack(line_p lnp)
 {
 	/* The instruction in lnp does something to the valuenumber-stack. */
 
@@ -232,8 +214,7 @@ STATIC proc_p find_proc(vn)
 	return (proc_p) 0;
 }
 
-STATIC side_effects(lnp)
-	line_p lnp;
+STATIC void side_effects(line_p lnp)
 {
 	/* Lnp contains a cai or cal instruction. We try to find the callee
 	 * and see what side-effects it has.
@@ -255,8 +236,7 @@ STATIC side_effects(lnp)
 	}
 }
 
-hopeless(instr)
-	int instr;
+STATIC void hopeless(int instr)
 {
 	/* The effect of `instr' is too difficult to
 	 * compute. We assume worst case behaviour.
@@ -281,8 +261,7 @@ hopeless(instr)
 	}
 }
 
-vnm(bp)
-	bblock_p bp;
+void vnm(bblock_p bp)
 {
 	register line_p lnp;
 	register entity_p rep;
diff --git a/util/ego/cs/cs_vnm.h b/util/ego/cs/cs_vnm.h
index 0fbce5d72..0c86a77e8 100644
--- a/util/ego/cs/cs_vnm.h
+++ b/util/ego/cs/cs_vnm.h
@@ -3,7 +3,8 @@
  * (c) copyright 1987 by the Vrije Universiteit, Amsterdam, The Netherlands.
  * See the copyright notice in the ACK home directory, in the file "Copyright".
  */
-extern vnm();	/* (bblock_p bp)
+extern void vnm(bblock_p bp);
+		/*
 		 * Performs the valuenumbering algorithm on the basic
 		 * block in bp.
 		 */
diff --git a/util/ego/descr/powerpc.descr b/util/ego/descr/powerpc.descr
index e59990ea1..5fb9bb628 100644
--- a/util/ego/descr/powerpc.descr
+++ b/util/ego/descr/powerpc.descr
@@ -102,7 +102,7 @@ register save costs:
 	17 -> (102,136)
 	18 -> (108,144)
 	19 -> (114,152)
-        20 -> (120,160)
+	20 -> (120,160)
 	21 -> (126,168)
 	22 -> (132,176)
 	23 -> (138,184)
@@ -137,7 +137,7 @@ reduce sli if shift count larger than:  0
 first time then space:
 addressing modes: op_ads op_adp op_lof op_ldf op_loi op_dch op_lpb -1
 		  op_ads op_adp op_lof op_ldf op_loi op_dch op_lpb -1
-cheap operations: op_cii op_ciu op_cui op_cuu op_cmi op_cmu op_cmp -1 
+cheap operations: op_cii op_ciu op_cui op_cuu op_cmi op_cmu op_cmp -1
 		  op_cii op_ciu op_cui op_cuu op_cmi op_cmu op_cmp -1
 lexical tresholds: 1 1
 indirection limit: 8
diff --git a/util/ego/share/aux.h b/util/ego/share/aux.h
index 6a6770469..db2d3f8da 100644
--- a/util/ego/share/aux.h
+++ b/util/ego/share/aux.h
@@ -36,7 +36,7 @@ line_p reg_mes(offset tmp, short size, int typ, int score);
 bool dom(bblock_p b1, bblock_p b2);
 				/*
 				 * See if b1 dominates b2. Note that a
-				 * block always * dominates itself.
+				 * block always dominates itself.
 				 */
 bblock_p common_dom(bblock_p a, bblock_p b);
 				/*
diff --git a/util/ego/share/global.h b/util/ego/share/global.h
index f97df2fa2..4121a5b85 100644
--- a/util/ego/share/global.h
+++ b/util/ego/share/global.h
@@ -40,13 +40,13 @@ extern int ws;		/* word size	*/
 
 #define UNKNOWN_SIZE (-1)
 
-extern proc_p curproc;  /* current procedure */
+extern proc_p curproc;	/* current procedure */
 
-extern char *filename; /* name of current input file */
+extern char *filename;	/* name of current input file */
 
 extern lset mesregs;	/* set of MES ms_reg pseudos */
 
-extern short time_space_ratio; /* 0   if optimizing for space only,
+extern short time_space_ratio;	/* 0   if optimizing for space only,
 				 * 100 if optimizing for time only,
 				 * else something 'in between'.
 				 */
diff --git a/util/ego/sp/sp.c b/util/ego/sp/sp.c
index 8538d3dfb..051281d7e 100644
--- a/util/ego/sp/sp.c
+++ b/util/ego/sp/sp.c
@@ -65,9 +65,8 @@ STATIC void sp_machinit(void *vp)
 	}
 	fscanf(f,"%d",&globl_sp_allowed);
 }
-comb_asps(l1,l2,b)
-	line_p l1,l2;
-	bblock_p b;
+
+STATIC void comb_asps(line_p l1, line_p l2, bblock_p b)
 {
 	assert(INSTR(l1) == op_asp);
 	assert(INSTR(l2) == op_asp);
@@ -78,11 +77,7 @@ comb_asps(l1,l2,b)
 	rm_line(l1,b);
 }
 	
-
-
-
-stack_pollution(b)
-	bblock_p b;
+STATIC void stack_pollution(bblock_p b)
 {
 	/* For every pair of successive ASP instructions in basic
 	 * block b, try to combine the two into one ASP.
@@ -134,8 +129,7 @@ stack_pollution(b)
 	} while (asp != (line_p) 0);
 }
 
-STATIC bool block_save(b)
-	bblock_p b;
+STATIC bool block_save(bblock_p b)
 {
 
 	register line_p l;
@@ -159,10 +153,7 @@ STATIC bool block_save(b)
 	return stack_diff >= 0;
 }
 
-
-
-STATIC mark_pred(b)
-	bblock_p b;
+STATIC void mark_pred(bblock_p b)
 {
 	Lindex i;
 	bblock_p x;
@@ -176,12 +167,7 @@ STATIC mark_pred(b)
 	}
 }
 
-
-
-
-
-STATIC mark_unsave_blocks(p)
-	proc_p p;
+STATIC void mark_unsave_blocks(proc_p p)
 {
 	register bblock_p b;
 
@@ -193,8 +179,7 @@ STATIC mark_unsave_blocks(p)
 	}
 }
 
-
-void sp_optimize(void *vp)
+STATIC void sp_optimize(void *vp)
 {
 	proc_p p = vp;
 	register bblock_p b;
@@ -206,21 +191,13 @@ void sp_optimize(void *vp)
 	}
 }
 
-
-
-
-main(argc,argv)
-	int argc;
-	char *argv[];
+int main(int argc, char *argv[])
 {
 	go(argc,argv,no_action,sp_optimize,sp_machinit,no_action);
 	report("stack adjustments deleted",Ssp);
 	exit(0);
 }
 
-
-
-
 /***** DEBUGGING:
 
 debug_stack_pollution(p)

From a7bb4ec4b1b4fd77c312d2b5ce2bc0f8d4569a0d Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 1 Mar 2018 13:19:38 -0500
Subject: [PATCH 47/55] Fixes for compiling ego with -DTRACE

 - In share/debug.c, undo my mistake in commit 9037d13 by changing
   vfprintf back to fprintf in OUTTRACE.

 - In ud/ud.c, move the trace output from stdout to stderr, because
   stdout has ego's output file, which becomes opt2's input file.  If
   trace output goes to stdout, it gets prepended to the output file,
   and opt2 errors with "wrong input file".

I also edit both build.lua files so ego depends on its header files;
this part isn't needed for -DTRACE.

One can now use -DTRACE by adding it to the cflags in both build.lua
files.
---
 util/ego/build.lua       |  1 +
 util/ego/share/build.lua |  3 +--
 util/ego/share/debug.c   |  2 +-
 util/ego/ud/ud.c         | 41 ++++++++++++++++++++--------------------
 4 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/util/ego/build.lua b/util/ego/build.lua
index 864447550..78895f508 100644
--- a/util/ego/build.lua
+++ b/util/ego/build.lua
@@ -3,6 +3,7 @@ local function build_ego(name)
 		name = name,
 		srcs = { "./"..name.."/*.c" },
 		deps = {
+			"./"..name.."/*.h",
 			"util/ego/share+lib",
 			"modules/src/em_data+lib",
 			"h+emheaders",
diff --git a/util/ego/share/build.lua b/util/ego/share/build.lua
index ab1068d2c..5ca714897 100644
--- a/util/ego/share/build.lua
+++ b/util/ego/share/build.lua
@@ -48,6 +48,7 @@ clibrary {
 		"./init_glob.c",
 	},
 	deps = {
+		"./*.h",
 		"+classdefs_h",
 		"+pop_push_h",
 		"h+emheaders",
@@ -57,5 +58,3 @@ clibrary {
 		["+cflags"] = {"-DVERBOSE", "-DNOTCOMPACT"}
 	}
 }
-
-
diff --git a/util/ego/share/debug.c b/util/ego/share/debug.c
index 81080f7cf..56514d149 100644
--- a/util/ego/share/debug.c
+++ b/util/ego/share/debug.c
@@ -45,7 +45,7 @@ void error(const char *s, ...)
 void OUTTRACE(const char *s, int n)
 {
 	fprintf(stderr,"> ");
-	vfprintf(stderr,s,n);
+	fprintf(stderr,s,n);
 	fprintf(stderr,"\n");
 }
 #endif
diff --git a/util/ego/ud/ud.c b/util/ego/ud/ud.c
index c0fe613fd..087337144 100644
--- a/util/ego/ud/ud.c
+++ b/util/ego/ud/ud.c
@@ -269,13 +269,13 @@ pr_localtab() {
 	short i;
 	local_p lc;
 
-	printf("LOCAL-TABLE (%d)\n\n",nrlocals);
+	fprintf(stderr,"LOCAL-TABLE (%d)\n\n",nrlocals);
 	for (i = 1; i <= nrlocals; i++) {
 		lc = locals[i];
-		printf("LOCAL %d\n",i);
-		printf("	offset= %ld\n",lc->lc_off);
-		printf("	size=   %d\n",lc->lc_size);
-		printf("	flags=  %d\n",lc->lc_flags);
+		fprintf(stderr,"LOCAL %d\n",i);
+		fprintf(stderr,"\toffset= %ld\n",lc->lc_off);
+		fprintf(stderr,"\tsize=   %d\n",lc->lc_size);
+		fprintf(stderr,"\tflags=  %d\n",lc->lc_flags);
 	}
 }
 
@@ -284,12 +284,13 @@ pr_globals()
 	dblock_p d;
 	obj_p obj;
 
-	printf("GLOBALS (%d)\n\n",nrglobals);
-	printf("ID	GLOBNR\n");
+	fprintf(stderr,"GLOBALS (%d)\n\n",nrglobals);
+	fprintf(stderr,"ID\tGLOBNR\n");
 	for (d = fdblock; d != (dblock_p) 0; d = d->d_next) {
 		for (obj = d->d_objlist; obj != (obj_p) 0; obj = obj->o_next) {
 			if (obj->o_globnr != 0) {
-			   printf("%d	%d\n", obj->o_id,obj->o_globnr);
+				fprintf(stderr,"%d\t%d\n",
+				    obj->o_id,obj->o_globnr);
 			}
 		}
 	}
@@ -302,20 +303,20 @@ pr_defs()
 	short i;
 	line_p l;
 
-	printf("DEF TABLE\n\n");
+	fprintf(stderr,"DEF TABLE\n\n");
 	for (i = 1; i <= nrexpldefs; i++) {
 		l = defs[i];
-		printf("%d	%s ",EXPL_TO_DEFNR(i),
+		fprintf(stderr,"%d\t%s ",EXPL_TO_DEFNR(i),
 			&em_mnem[(INSTR(l)-sp_fmnem)*4]);
 		switch(TYPE(l)) {
 			case OPSHORT:
-				printf("%d\n",SHORT(l));
+				fprintf(stderr,"%d\n",SHORT(l));
 				break;
 			case OPOFFSET:
-				printf("%ld\n",OFFSET(l));
+				fprintf(stderr,"%ld\n",OFFSET(l));
 				break;
 			case OPOBJECT:
-				printf("%d\n",OBJ(l)->o_id);
+				fprintf(stderr,"%d\n",OBJ(l)->o_id);
 				break;
 			default:
 				assert(FALSE);
@@ -331,13 +332,13 @@ pr_set(name,k,s,n)
 {
 	short i;
 
-	printf("%s(%d) =	{",name,k);
+	fprintf(stderr,"%s(%d) =\t{",name,k);
 	for (i = 1; i <= n; i++) {
 		if (Cis_elem(i,s)) {
-			printf("%d ",i);
+			fprintf(stderr,"%d ",i);
 		}
 	}
-	printf ("}\n");
+	fprintf(stderr,"}\n");
 }
 
 pr_blocks(p)
@@ -347,7 +348,7 @@ pr_blocks(p)
 	short n;
 
 	for (b = p->p_start; b != 0; b = b->b_next) {
-		printf ("\n");
+		fprintf(stderr,"\n");
 		n = b->b_id;
 		pr_set("GEN",n,GEN(b),nrdefs);
 		pr_set("KILL",n,KILL(b),nrdefs);
@@ -361,10 +362,10 @@ pr_copies()
 {
 	short i;
 
-	printf("\nCOPY TABLE\n\n");
+	fprintf(stderr,"\nCOPY TABLE\n\n");
 	for (i = 1; i <= nrdefs; i++) {
 		if (def_to_copynr[i] != 0) {
-			printf("%d	%d\n",i,def_to_copynr[i]);
+			fprintf(stderr,"%d\t%d\n",i,def_to_copynr[i]);
 		}
 	}
 }
@@ -376,7 +377,7 @@ pr_cblocks(p)
 	short n;
 
 	for (b = p->p_start; b != 0; b = b->b_next) {
-		printf ("\n");
+		fprintf(stderr,"\n");
 		n = b->b_id;
 		pr_set("CGEN",n,C_GEN(b),nrcopies);
 		pr_set("CKILL",n,C_KILL(b),nrcopies);

From f26259caac62b30bc710f6aa28c6e08253b6c5b2 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Fri, 2 Mar 2018 16:06:21 -0500
Subject: [PATCH 48/55] Check AAR earlier to prevent LOI/STI unknown size.

In ego, the CS phase may convert a LAR/SAR to AAR LOI/STI so it can
optimize multiple occurrences of AAR of the same array element.  This
conversion should not happen if it would LOI/STI a large or unknown
size.

cs_profit.c okay_lines() checked the size of each occurrence of AAR
except the first.  If the first AAR was the implicit AAR in a LAR/SAR,
then the conversion happened without checking the size.  For unknown
size, this made a bad LOI -1 or STI -1.  Fix by checking the size
earlier: if a LAR/SAR has a bad size, then don't enter it as an AAR.

This Modula-2 code showed the bug.  Given M.def:

    DEFINITION MODULE M;
    TYPE S = SET OF [0..95];
    PROCEDURE F(a: ARRAY OF S; i, j: INTEGER);
    END M.

and M.mod:

    (*$R-*) IMPLEMENTATION MODULE M;
    FROM SYSTEM IMPORT ADDRESS, ADR;
    PROCEDURE G(s: S; p, q: ADDRESS; t: S); BEGIN
      s := s; p := p; q := q; t := t;
    END G;
    PROCEDURE F(a: ARRAY OF S; i, j: INTEGER); BEGIN
      G(a[i + j], ADR(a[i + j]), ADR(a[i + j]), a[i + j])
    END F;
    END M.

then the bug caused an error:

    $ ack -mlinuxppc -O3 -c.e M.mod
    /tmp/Ack_b357d.g, line 57: Argument range error

The bug had put LOI -1 in the code, then em_decode got an error
because -1 is out of range for LOI.

Procedure F has 4 occurrences of `a[i + j]`.  The size of `a[i + j]`
is 96 bits, or 12 bytes, but the EM code hides the size in an array
descriptor, so the size is unknown to CS.  The pragma `(*$R-*)`
disables a range check on `i + j` so CS can work.  EM uses AAR for the
2 `ADR(a[i + j])` and LAR for the other 2 `a[i + j]`.  EM pushes the
arguments to G in reverse order, so the last `a[i + j]` in Modula-2 is
the first LAR in EM.

CS found 4 occurrences of AAR.  The first AAR was an implicit AAR in
LAR.  Because of the bug, CS converted this LAR 4 to AAR 4 LOI -1.
---
 util/ego/cs/cs_profit.c | 31 ++++++++++++++++++-------------
 util/ego/cs/cs_profit.h |  6 ++++++
 util/ego/cs/cs_vnm.c    | 17 ++++++++++++-----
 3 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/util/ego/cs/cs_profit.c b/util/ego/cs/cs_profit.c
index 50cb708fd..8845aaa29 100644
--- a/util/ego/cs/cs_profit.c
+++ b/util/ego/cs/cs_profit.c
@@ -111,6 +111,21 @@ void cs_machinit(void *vp)
 	choose_cset(f, &forbidden, sp_lmnem);
 }
 
+bool may_become_aar(avail_p avp)
+{
+	/* Check whether it is desirable to treat a LAR or SAR as an
+	 * AAR LOI/STI. This depends on the size of the array-elements.
+	 */
+	offset sz;
+
+	sz = array_elemsize(avp->av_othird);
+	if (sz == UNKNOWN_SIZE)
+		return FALSE;
+	if (time_space_ratio < 50)
+		return sz <= AR_limit;
+	return TRUE;
+}
+
 STATIC bool sli_no_eliminate(line_p lnp)
 {
 	/* Return whether the SLI-instruction in lnp is part of
@@ -157,8 +172,10 @@ STATIC bool gains(avail_p avp)
 
 STATIC bool okay_lines(avail_p avp, occur_p ocp)
 {
+	/* Check whether all lines in this occurrence can in
+	 * principle be eliminated; no stores, messages, calls etc.
+	 */
 	register line_p lnp, next;
-	offset sz;
 
 	for (lnp = ocp->oc_lfirst; lnp != (line_p) 0; lnp = next) {
 		next = lnp != ocp->oc_llast ? lnp->l_next : (line_p) 0;
@@ -171,18 +188,6 @@ STATIC bool okay_lines(avail_p avp, occur_p ocp)
 				return FALSE;
 		}
 	}
-	/* All lines in this occurrence can in principle be eliminated;
-	 * no stores, messages, calls etc.
-	 * We now check whether it is desirable to treat a LAR or a SAR
-	 * as an AAR LOI/STI. This depends on the size of the array-elements.
-	 */
-	if (INSTR(ocp->oc_llast) == op_lar || INSTR(ocp->oc_llast) == op_sar) {
-		sz = array_elemsize(avp->av_othird);
-		if (sz == UNKNOWN_SIZE) return FALSE;
-		if (avp->av_instr == (byte) op_aar && time_space_ratio < 50) {
-			return sz <= AR_limit;
-		}
-	}
 	return TRUE;
 }
 
diff --git a/util/ego/cs/cs_profit.h b/util/ego/cs/cs_profit.h
index 7ec5e3c17..43f2bade9 100644
--- a/util/ego/cs/cs_profit.h
+++ b/util/ego/cs/cs_profit.h
@@ -7,6 +7,12 @@ void cs_machinit(void *vp);	/* (FILE *f)
 				 * Read phase-specific information from f.
 				 */
 
+bool may_become_aar(avail_p avp);
+				/*
+				 * Return whether a LAR/SAR may become
+				 * an AAR LOI/STI.
+				 */
+
 bool desirable(avail_p avp);	/*
 				 * Return whether it is desirable to eliminate
 				 * the recurrences of the expression in avp.
diff --git a/util/ego/cs/cs_vnm.c b/util/ego/cs/cs_vnm.c
index 4dbeb3df2..67507f805 100644
--- a/util/ego/cs/cs_vnm.c
+++ b/util/ego/cs/cs_vnm.c
@@ -50,11 +50,13 @@ STATIC void put_expensive_load(bblock_p bp, line_p lnp, line_p lfirst,
 
 STATIC void put_aar(bblock_p bp, line_p lnp, line_p lfirst, entity_p enp)
 {
-	/* Enp points to an ENARRELEM. We do as if its address was computed. */
-
+	/* Enter the implicit AAR in a LAR or SAR, where enp points to
+	 * the ENARRELEM, and AAR computes its address.
+	 */
 	struct avail av;
 	occur_p	ocp;
 
+	assert(INSTR(lnp) == op_lar || INSTR(lnp) == op_sar);
 	assert(enp->en_kind == ENARRELEM);
 	av.av_instr = op_aar;
 	av.av_size = ps;
@@ -62,9 +64,14 @@ STATIC void put_aar(bblock_p bp, line_p lnp, line_p lfirst, entity_p enp)
 	av.av_osecond = enp->en_index;
 	av.av_othird = enp->en_adesc;
 
-	ocp = newoccur(lfirst, lnp, bp);
-
-	av_enter(&av, ocp, TERNAIR_OP);
+	/* Before we enter an available AAR, we must check whether we
+	 * may convert this LAR/SAR to AAR LOI/STI.  This is so we
+	 * don't LOI/STI a large or unknown size.
+	 */
+	if (may_become_aar(&av)) {
+		ocp = newoccur(lfirst, lnp, bp);
+		av_enter(&av, ocp, TERNAIR_OP);
+	}
 }
 
 STATIC void push_avail(avail_p avp, line_p lfirst)

From b1b737ed6cac47b82f267a2c3e7d4a36d40a02cf Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 5 Mar 2018 13:32:06 -0500
Subject: [PATCH 49/55] Optimize procedures that do both a / b and a % b.

Enable this in CS for PowerPC; disable it for all other machines.
PowerPC has no remainder instruction; the back end uses division to
compute remainder.  If CS finds both a / b and a % b, then CS now
rewrites a % b as a - b * (a / b) and computes a / b only once.  This
removes an extra division in the PowerPC code, so it saves both time
and space.

I have not considered whether to enable this optimization for other
machines.  It might be less useful in machines with a remainder
instruction.  Also, if a % b occurs before a / b, the EM code gets a
DUP.  PowerPC ncg handles this DUP well; other back ends might not.
---
 util/ego/cs/cs.h             | 13 +++---
 util/ego/cs/cs_avail.c       |  5 +-
 util/ego/cs/cs_elim.c        | 90 ++++++++++++++++++++++++++++++------
 util/ego/cs/cs_partit.c      |  5 +-
 util/ego/cs/cs_profit.c      | 12 +++++
 util/ego/cs/cs_profit.h      |  5 ++
 util/ego/cs/cs_vnm.c         | 37 +++++++++++++++
 util/ego/descr/em22.descr    |  1 +
 util/ego/descr/em24.descr    |  1 +
 util/ego/descr/em44.descr    |  1 +
 util/ego/descr/i386.descr    |  1 +
 util/ego/descr/i86.descr     |  1 +
 util/ego/descr/m68020.descr  |  1 +
 util/ego/descr/m68k2.descr   |  1 +
 util/ego/descr/m68k4.descr   |  1 +
 util/ego/descr/pdp.descr     |  1 +
 util/ego/descr/powerpc.descr |  1 +
 util/ego/descr/sparc.descr   |  1 +
 util/ego/descr/vax4.descr    |  1 +
 19 files changed, 156 insertions(+), 23 deletions(-)

diff --git a/util/ego/cs/cs.h b/util/ego/cs/cs.h
index c749427a5..7a2ebde7b 100644
--- a/util/ego/cs/cs.h
+++ b/util/ego/cs/cs.h
@@ -88,12 +88,13 @@ struct occur {
 #define UNAIR_OP	6
 #define BINAIR_OP	7
 #define TERNAIR_OP	8
-#define KILL_ENTITY	9
-#define SIDE_EFFECTS	10
-#define FIDDLE_STACK	11
-#define IGNORE		12
-#define HOPELESS	13
-#define BBLOCK_END	14
+#define REMAINDER	9
+#define KILL_ENTITY	10
+#define SIDE_EFFECTS	11
+#define FIDDLE_STACK	12
+#define IGNORE		13
+#define HOPELESS	14
+#define BBLOCK_END	15
 
 struct avail {
 	avail_p	av_before;	/* Ptr to earlier discovered expressions. */
diff --git a/util/ego/cs/cs_avail.c b/util/ego/cs/cs_avail.c
index 025132a2e..b28cc496a 100644
--- a/util/ego/cs/cs_avail.c
+++ b/util/ego/cs/cs_avail.c
@@ -54,6 +54,7 @@ STATIC bool same_avail(byte kind, avail_p avp1, avail_p avp2)
 		case UNAIR_OP:
 			return	avp1->av_operand == avp2->av_operand;
 		case BINAIR_OP:
+		case REMAINDER:
 			if (commutative(avp1->av_instr & BMASK))
 				return	avp1->av_oleft == avp2->av_oleft &&
 					avp1->av_oright == avp2->av_oright
@@ -124,6 +125,7 @@ STATIC void copy_avail(int kind, avail_p src, avail_p dst)
 			dst->av_operand = src->av_operand;
 			break;
 		case BINAIR_OP:
+		case REMAINDER:
 			dst->av_oleft = src->av_oleft;
 			dst->av_oright = src->av_oright;
 			break;
@@ -160,7 +162,8 @@ avail_p av_enter(avail_p avp, occur_p ocp, int kind)
 	/* Remember local, if any, that holds result. */
 	if (avp->av_instr != (byte) INSTR(last)) {
 		/* Only possible when instr is the implicit AAR in 
-		 * a LAR or SAR.
+		 * a LAR or SAR, or the implicit DVI in an RMI, or
+		 * DVU in RMU.
 		 */
 		ravp->av_saveloc = (entity_p) 0;
 	} else {
diff --git a/util/ego/cs/cs_elim.c b/util/ego/cs/cs_elim.c
index 767517bd8..7dce0df09 100644
--- a/util/ego/cs/cs_elim.c
+++ b/util/ego/cs/cs_elim.c
@@ -101,12 +101,49 @@ STATIC void complete_aar(line_p lnp, int instr, valnum descr_vn)
 	dlink(lnp, lindir);
 }
 
+STATIC void complete_dv_as_rm(line_p lnp, avail_p avp, bool first)
+{
+	/* Complete a / b as a % b = a - b * (a / b). For the first
+	 * occurrence, lnp must stack q, where q = a / b. We prepend a
+	 * DUP to change postfix a b / into a b a b /, then append a
+	 * MLI/MLU and SBI/SBU to make a b a b / * -.
+	 *
+	 * For later occurrences, lnp must stack a b q.  We append the
+	 * MLI/MLU and SBI/SBU.
+	 */
+	line_p dv, dup, ml, sb;
+	offset size;
+	bool s;
+
+	size = avp->av_size;
+	s = (avp->av_instr == (byte) op_dvi);
+	assert(s || avp->av_instr == (byte) op_dvu);
+	if (first) {
+		/* Prepend our DUP to avp->av_found, to get before the
+		 * DVI if lnp points to the LOL in DVI STL LOL.
+		 */
+		dup = int_line(2 * size);
+		dup->l_instr = op_dup;
+		dv = avp->av_found;
+		dlink(dv->l_prev, dup);
+		dlink(dup, dv);
+	}
+	ml = int_line(size);
+	sb = int_line(size);
+	ml->l_instr = (s ? op_mli : op_mlu);
+	sb->l_instr = (s ? op_sbi : op_sbu);
+	dlink(sb, lnp->l_next);
+	dlink(ml, sb);
+	dlink(lnp, ml);
+}
+
 STATIC void replace(occur_p ocp, offset tmp, avail_p avp)
 {
 	/* Replace the lines in the occurrence in ocp by a load of the
 	 * temporary with offset tmp.
 	 */
 	register line_p lol, first, last;
+	register int instr;
 
 	assert(avp->av_size == ws || avp->av_size == 2*ws);
 
@@ -119,13 +156,24 @@ STATIC void replace(occur_p ocp, offset tmp, avail_p avp)
 	if (first->l_prev == (line_p) 0) ocp->oc_belongs->b_start = lol;
 	dlink(first->l_prev, lol);
 
-	if (avp->av_instr == (byte) op_aar) {
-		/* There may actually be a LAR or a SAR instruction; in that
-		 * case we have to complete the array-instruction.
-		 */
-		register int instr = INSTR(last);
-
-		if (instr != op_aar) complete_aar(lol, instr, avp->av_othird);
+	instr = INSTR(last);
+	switch (avp->av_instr & 0377) {
+		case op_aar:
+			/* There may actually be a LAR or a SAR
+			 * instruction; in that case we have to
+			 * complete the array-instruction.
+			 */
+			if (instr != op_aar)
+				complete_aar(lol, instr, avp->av_othird);
+			break;
+		case op_dvi:
+			if (instr == op_rmi)
+				complete_dv_as_rm(lol, avp, FALSE);
+			break;
+		case op_dvu:
+			if (instr == op_rmu)
+				complete_dv_as_rm(lol, avp, FALSE);
+			break;
 	}
 
 	/* Throw away the by now useless lines. */
@@ -142,6 +190,7 @@ STATIC void append(avail_p avp, offset tmp)
 	 * within a lar or sar, we must first generate the aar.
 	 */
 	register line_p stl, lol;
+	register int instr;
 
 	assert(avp->av_size == ws || avp->av_size == 2*ws);
 
@@ -154,13 +203,26 @@ STATIC void append(avail_p avp, offset tmp)
 	dlink(stl, lol);
 	dlink(avp->av_found, stl);
 
-	if (avp->av_instr == (byte) op_aar) {
-		register int instr = INSTR(avp->av_found);
-
-		if (instr != op_aar) {
-			complete_aar(lol, instr, avp->av_othird);
-			avp->av_found->l_instr = op_aar;
-		}
+	instr = INSTR(avp->av_found);
+	switch (avp->av_instr & 0377) {
+		case op_aar:
+			if (instr != op_aar) {
+				complete_aar(lol, instr, avp->av_othird);
+				avp->av_found->l_instr = op_aar;
+			}
+			break;
+		case op_dvi:
+			if (instr == op_rmi) {
+				complete_dv_as_rm(lol, avp, TRUE);
+				avp->av_found->l_instr = op_dvi;
+			}
+			break;
+		case op_dvu:
+			if (instr == op_rmu) {
+				complete_dv_as_rm(lol, avp, TRUE);
+				avp->av_found->l_instr = op_dvu;
+			}
+			break;
 	}
 }
 
diff --git a/util/ego/cs/cs_partit.c b/util/ego/cs/cs_partit.c
index e6d3b1fdf..b020ebcfa 100644
--- a/util/ego/cs/cs_partit.c
+++ b/util/ego/cs/cs_partit.c
@@ -125,8 +125,8 @@ STATIC struct {
 /* nop */	HOPELESS,	XXX,	XXX,	XXX,	XXX,
 /* rck */	BBLOCK_END,	XXX,	XXX,	XXX,	XXX,
 /* ret */	BBLOCK_END,	XXX,	XXX,	XXX,	XXX,
-/* rmi */	BINAIR_OP,	ARGW,	ARGW,	ARGW,	ANY,
-/* rmu */	BINAIR_OP,	ARGW,	ARGW,	ARGW,	ANY,
+/* rmi */	REMAINDER,	ARGW,	ARGW,	ARGW,	ANY,
+/* rmu */	REMAINDER,	ARGW,	ARGW,	ARGW,	ANY,
 /* rol */	BINAIR_OP,	ARGW,	WS,	ARGW,	ANY,
 /* ror */	BINAIR_OP,	ARGW,	WS,	ARGW,	ANY,
 /* rtt */	BBLOCK_END,	XXX,	XXX,	XXX,	XXX,
@@ -203,6 +203,7 @@ bool stack_group(int instr)
 		case UNAIR_OP:
 		case BINAIR_OP:
 		case TERNAIR_OP:
+		case REMAINDER:
 			return TRUE;
 		default:
 			return FALSE;
diff --git a/util/ego/cs/cs_profit.c b/util/ego/cs/cs_profit.c
index 8845aaa29..a92028c36 100644
--- a/util/ego/cs/cs_profit.c
+++ b/util/ego/cs/cs_profit.c
@@ -25,6 +25,7 @@ STATIC cset	forbidden;
 STATIC cset	sli_counts;
 STATIC short	LX_threshold;
 STATIC short	AR_limit;
+STATIC bool	RM_to_DV;
 
 STATIC void get_instrs(FILE *f, cset *s_p)
 {
@@ -97,6 +98,12 @@ void cs_machinit(void *vp)
 	fscanf(f, "%d", &space);
 	AR_limit = space;
 
+	/* Read whether to convert a remainder RMI/RMU to a division
+	 * DVI/DVU using the formula a % b = a - b * (a / b).
+	 */
+	fscanf(f, "%d %d", &time, &space);
+	RM_to_DV = time_space_ratio >= 50 ? time : space;
+
 	/* Read for what counts we must not eliminate an SLI instruction
 	 * when it is part of an array-index computation.
 	 */
@@ -126,6 +133,11 @@ bool may_become_aar(avail_p avp)
 	return TRUE;
 }
 
+bool may_become_dv(void)
+{
+	return RM_to_DV;
+}
+
 STATIC bool sli_no_eliminate(line_p lnp)
 {
 	/* Return whether the SLI-instruction in lnp is part of
diff --git a/util/ego/cs/cs_profit.h b/util/ego/cs/cs_profit.h
index 43f2bade9..3d1972d24 100644
--- a/util/ego/cs/cs_profit.h
+++ b/util/ego/cs/cs_profit.h
@@ -13,6 +13,11 @@ bool may_become_aar(avail_p avp);
 				 * an AAR LOI/STI.
 				 */
 
+bool may_become_dv(void);	/*
+				 * Return whether an RMI/RMU may become
+				 * a DVI/DVU: a % b = a - (a / b * b).
+				 */
+
 bool desirable(avail_p avp);	/*
 				 * Return whether it is desirable to eliminate
 				 * the recurrences of the expression in avp.
diff --git a/util/ego/cs/cs_vnm.c b/util/ego/cs/cs_vnm.c
index 67507f805..435dd4658 100644
--- a/util/ego/cs/cs_vnm.c
+++ b/util/ego/cs/cs_vnm.c
@@ -20,6 +20,7 @@
 #include "cs_kill.h"
 #include "cs_partit.h"
 #include "cs_getent.h"
+#include "cs_profit.h"
 
 STATIC void push_entity(entity_p enp, line_p lfirst)
 {
@@ -130,6 +131,37 @@ STATIC void push_ternair_op(bblock_p bp, line_p lnp, token_p tkp1,
 	push_avail(av_enter(&av, ocp, TERNAIR_OP), tkp1->tk_lfirst);
 }
 
+STATIC void push_remainder(bblock_p bp, line_p lnp, token_p tkp1, token_p tkp2)
+{
+	/* Enter the implicit division tkp1 / tkp2,
+	 * then push the remainder tkp1 % tkp2.
+	 */
+	struct avail av;
+	occur_p	ocp;
+
+	assert(INSTR(lnp) == op_rmi || INSTR(lnp) == op_rmu);
+	av.av_size = avsize(lnp);
+	av.av_oleft = tkp1->tk_vn;
+	av.av_oright = tkp2->tk_vn;
+
+	/* Check whether we may convert RMI/RMU to DVI/DVU. */
+	if (may_become_dv()) {
+		/* The division is DVI in RMI, or DVU in RMU. */
+		av.av_instr = (INSTR(lnp) == op_rmi ? op_dvi : op_dvu);
+
+		/* In postfix, a b % becomes a b a b / * -.  We must
+		 * keep a and b on the stack, so the first instruction
+		 * to eliminate is lnp, not tkp1->tk_lfirst.
+		 */
+		ocp = newoccur(lnp, lnp, bp);
+		av_enter(&av, ocp, BINAIR_OP);
+	}
+
+	av.av_instr = INSTR(lnp);
+	ocp = newoccur(tkp1->tk_lfirst, lnp, bp);
+	push_avail(av_enter(&av, ocp, REMAINDER), tkp1->tk_lfirst);
+}
+
 STATIC void fiddle_stack(line_p lnp)
 {
 	/* The instruction in lnp does something to the valuenumber-stack. */
@@ -317,6 +349,11 @@ void vnm(bblock_p bp)
 				Pop(&tk1, op13size(lnp));
 				push_ternair_op(bp, lnp, &tk1, &tk2, &tk3);
 				break;
+			case REMAINDER:
+				Pop(&tk2, op22size(lnp));
+				Pop(&tk1, op12size(lnp));
+				push_remainder(bp, lnp, &tk1, &tk2);
+				break;
 			case KILL_ENTITY:
 				kill_direct(rep);
 				break;
diff --git a/util/ego/descr/em22.descr b/util/ego/descr/em22.descr
index f995d631c..d9c39226b 100644
--- a/util/ego/descr/em22.descr
+++ b/util/ego/descr/em22.descr
@@ -78,6 +78,7 @@ cheap operations: -1
                   -1
 lexical tresholds: 1 1
 indirection limit: 8
+convert remainder to division?: no no
 do not eliminate sli if index on shiftcounts:   -1
                                                 -1
 forbidden operators: -1 -1
diff --git a/util/ego/descr/em24.descr b/util/ego/descr/em24.descr
index a95751170..cbe0ab5c3 100644
--- a/util/ego/descr/em24.descr
+++ b/util/ego/descr/em24.descr
@@ -78,6 +78,7 @@ cheap operations: -1
                   -1
 lexical tresholds: 1 1
 indirection limit: 8
+convert remainder to division?: no no
 do not eliminate sli if index on shiftcounts:   -1
                                                 -1
 forbidden operators: -1 -1
diff --git a/util/ego/descr/em44.descr b/util/ego/descr/em44.descr
index 117f26591..b6dbebba3 100644
--- a/util/ego/descr/em44.descr
+++ b/util/ego/descr/em44.descr
@@ -78,6 +78,7 @@ cheap operations: -1
                   -1
 lexical tresholds: 1 1
 indirection limit: 8
+convert remainder to division?: no no
 do not eliminate sli if index on shiftcounts:   -1
                                                 -1
 forbidden operators: -1 -1
diff --git a/util/ego/descr/i386.descr b/util/ego/descr/i386.descr
index 264151a60..d5a2014bf 100644
--- a/util/ego/descr/i386.descr
+++ b/util/ego/descr/i386.descr
@@ -93,6 +93,7 @@ cheap operations: op_cii op_cui op_ciu op_cuu -1
                   op_cii op_cui op_ciu op_cuu -1
 lexical tresholds: 1 1
 indirection limit: 8
+convert remainder to division?: no no
 do not eliminate sli if index on shiftcounts:   -1
                                                 -1
 forbidden operators: -1 -1
diff --git a/util/ego/descr/i86.descr b/util/ego/descr/i86.descr
index 8be3ec23e..9b27cf840 100644
--- a/util/ego/descr/i86.descr
+++ b/util/ego/descr/i86.descr
@@ -93,6 +93,7 @@ cheap operations: op_cii op_cui op_ciu op_cuu -1
                   op_cii op_cui op_ciu op_cuu -1
 lexical tresholds: 1 1
 indirection limit: 8
+convert remainder to division?: no no
 do not eliminate sli if index on shiftcounts:   -1
                                                 -1
 forbidden operators: -1 -1
diff --git a/util/ego/descr/m68020.descr b/util/ego/descr/m68020.descr
index 9d2f46b2b..f568e00e2 100644
--- a/util/ego/descr/m68020.descr
+++ b/util/ego/descr/m68020.descr
@@ -102,6 +102,7 @@ cheap operations: op_cii op_ciu op_cui op_cuu op_cmi op_cmu op_cmp -1
 		  op_cii op_ciu op_cui op_cuu op_cmi op_cmu op_cmp -1
 lexical tresholds: 1 1
 indirection limit: 8
+convert remainder to division?: no no
 do not eliminate sli if index on shiftcounts:	1 2 3 -1
 						1 2 3 -1
 forbidden operators: -1 -1
diff --git a/util/ego/descr/m68k2.descr b/util/ego/descr/m68k2.descr
index 58e433db8..6b144cba0 100644
--- a/util/ego/descr/m68k2.descr
+++ b/util/ego/descr/m68k2.descr
@@ -99,6 +99,7 @@ addressing modes: op_adp op_lof op_ldf op_loi op_dch op_lpb -1
 cheap operations: -1 -1
 lexical tresholds: 1 1
 indirection limit: 8
+convert remainder to division?: no no
 do not eliminate sli if index on shiftcounts:	-1
 						-1
 forbidden operators: -1 -1
diff --git a/util/ego/descr/m68k4.descr b/util/ego/descr/m68k4.descr
index 8e1da4c5e..6b9d23dfa 100644
--- a/util/ego/descr/m68k4.descr
+++ b/util/ego/descr/m68k4.descr
@@ -102,6 +102,7 @@ cheap operations: op_ciu op_cui op_cuu op_cmi op_cmu op_cmp -1
 		  op_ciu op_cui op_cuu op_cmi op_cmu op_cmp -1
 lexical tresholds: 1 1
 indirection limit: 8
+convert remainder to division?: no no
 do not eliminate sli if index on shiftcounts:	-1
 						-1
 forbidden operators: -1 -1
diff --git a/util/ego/descr/pdp.descr b/util/ego/descr/pdp.descr
index e73b3aaf1..ec8f3abca 100644
--- a/util/ego/descr/pdp.descr
+++ b/util/ego/descr/pdp.descr
@@ -92,6 +92,7 @@ cheap operations: op_cii op_cui op_cfi op_ciu op_cff op_cuu op_cif -1
 		  op_cii op_cui op_cfi op_ciu op_cff op_cuu op_cif -1
 lexical tresholds: 1 1
 indirection limit: 8
+convert remainder to division?: no no
 do not eliminate sli if index on shiftcounts:	-1
 						-1
 forbidden operators: -1 -1
diff --git a/util/ego/descr/powerpc.descr b/util/ego/descr/powerpc.descr
index 5fb9bb628..cf613e96c 100644
--- a/util/ego/descr/powerpc.descr
+++ b/util/ego/descr/powerpc.descr
@@ -141,6 +141,7 @@ cheap operations: op_cii op_ciu op_cui op_cuu op_cmi op_cmu op_cmp -1
 		  op_cii op_ciu op_cui op_cuu op_cmi op_cmu op_cmp -1
 lexical tresholds: 1 1
 indirection limit: 8
+convert remainder to division?: yes yes
 do not eliminate sli if index on shiftcounts:	-1
 						-1
 forbidden operators: -1 -1
diff --git a/util/ego/descr/sparc.descr b/util/ego/descr/sparc.descr
index 978c39ba3..79c33decb 100644
--- a/util/ego/descr/sparc.descr
+++ b/util/ego/descr/sparc.descr
@@ -100,6 +100,7 @@ cheap operations: op_cuu op_ciu op_cui op_cii -1
                   op_cuu op_ciu op_cui op_cii -1
 lexical tresholds: 1 1
 indirection limit: 8
+convert remainder to division?: no no
 do not eliminate sli if index on shiftcounts:   -1
                                                 -1
 forbidden operators: -1 -1
diff --git a/util/ego/descr/vax4.descr b/util/ego/descr/vax4.descr
index 5a39ea759..beaf0c427 100644
--- a/util/ego/descr/vax4.descr
+++ b/util/ego/descr/vax4.descr
@@ -113,6 +113,7 @@ cheap operations: op_cii op_cui op_cfi op_ciu op_cff op_cuu op_cif
 		  op_cmi op_cmu op_cmf op_cms op_cmp -1
 lexical thresholds: 1 1
 indirection limit: 8
+convert remainder to division?: no no
 do not eliminate sli if index on shiftcounts:	1 2 3 -1
 						1 2 3 -1
 forbidden operators: -1 -1

From b1badf18511ef050ed7bb38c7b52ea52c28dcb3c Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Wed, 7 Mar 2018 13:37:31 -0500
Subject: [PATCH 50/55] Add instructions like "lwarx".  Extend manual.

Add more page numbers from PowerPC version 2.01.  Remove "xnop" not in
2.01, add "mtcr" from 2.01.  Add "lwarx" and the other instructions
from Book II.  I did not try all the newly added instructions, but
these seem to work: dcbt, dcbtst, icbi, isync, lwarx, stwcx., mftb,
mftbu

In man/powerpc_as.6 (not installed), add a summary of the registers
and addressing modes (like in i386_as.6), describe short forms, update
description of hi16/ha16, add CAVEATS about instructions that some
processors can't run.
---
 mach/powerpc/as/mach2.c |  11 ++-
 mach/powerpc/as/mach3.c |  80 ++++++++++++++++------
 mach/powerpc/as/mach4.c |  50 +++++++++++---
 man/powerpc_as.6        | 145 ++++++++++++++++++++++++++++++++++------
 4 files changed, 230 insertions(+), 56 deletions(-)

diff --git a/mach/powerpc/as/mach2.c b/mach/powerpc/as/mach2.c
index 4065334e6..e8e61ea0c 100644
--- a/mach/powerpc/as/mach2.c
+++ b/mach/powerpc/as/mach2.c
@@ -47,11 +47,15 @@
 %token <y_word> OP_FRT_FRB_C
 %token <y_word> OP_FRT_RA_D
 %token <y_word> OP_FRT_RA_RB
+%token <y_word> OP_L
 %token <y_word> OP_LEV
 %token <y_word> OP_LIA
 %token <y_word> OP_LIL
 %token <y_word> OP_LI32
+%token <y_word> OP_RA_RB
+%token <y_word> OP_RA_RB_TH
 %token <y_word> OP_RA_RS_C
+%token <y_word> OP_RA_RS_RA_C
 %token <y_word> OP_RA_RS_RB_C
 %token <y_word> OP_RA_RS_RB_MB5_ME5_C
 %token <y_word> OP_RA_RS_RB_MB6_C
@@ -61,14 +65,14 @@
 %token <y_word> OP_RA_RS_SH6_MB6_C
 %token <y_word> OP_RA_RS_UI
 %token <y_word> OP_RA_RS_UI_CC
+%token <y_word> OP_RS
 %token <y_word> OP_RS_FXM
 %token <y_word> OP_RS_RA
 %token <y_word> OP_RS_RA_D
 %token <y_word> OP_RS_RA_DS
 %token <y_word> OP_RS_RA_NB
 %token <y_word> OP_RS_RA_RB
-%token <y_word> OP_RS_RA_RB_C
-%token <y_word> OP_RS_RA_RA_C
+%token <y_word> OP_RS_RA_RB_CC
 %token <y_word> OP_RS_RB
 %token <y_word> OP_RS_SPR
 %token <y_word> OP_RS_SR
@@ -104,4 +108,5 @@
 
 %type <y_word> c
 %type <y_word> e16 negate16 u8 u7 u6 u5 u4 u2 u1
-%type <y_word> opt_bh cr_opt nb ds bda bdl lia lil spr_num
+%type <y_word> opt_bh cr_opt nb ds bda bdl lia lil
+%type <y_word> spr_num tbr_num opt_tbr
diff --git a/mach/powerpc/as/mach3.c b/mach/powerpc/as/mach3.c
index 91b088a6a..99507087d 100644
--- a/mach/powerpc/as/mach3.c
+++ b/mach/powerpc/as/mach3.c
@@ -103,6 +103,10 @@
 0,     OP_HA,                0,                                        "ha16",
 0,     OP_LO,                 0,                                       "lo16",
 
+/* The next page numbers are from PowerPC User Instruction Set
+ * Architecture, Book I, Version 2.01.
+ */
+
 /* Branch processor instructions (page 20) */
 
 0,     OP_LIL,                18<<26 | 0<<1 | 0<<0,                    "b",
@@ -128,7 +132,7 @@
 0,     OP_BT_BA_BB,           19<<26 | 417<<1,                         "crorc",
 0,     OP_BF_BFA,             19<<26 | 0<<1,                           "mcrf",
 
-/* extended mnemonics for bc, bcctr, bclr */
+/* extended mnemonics for bc, bcctr, bclr (page 144) */
 0,     OP_BH,       19<<26 | 20<<21 | 528<<1 | 0<<0,            "bctr",
 0,     OP_BH,       19<<26 | 20<<21 | 528<<1 | 1<<0,            "bctrl",
 0,     OP_BDL,      16<<26 | 16<<21 | 0<<1 | 0<<0,              "bdnz",
@@ -186,7 +190,7 @@
 0,     OP_BI_BH,    19<<26 | 12<<21 | 16<<1 | 0<<0,             "btlr",
 0,     OP_BI_BH,    19<<26 | 12<<21 | 16<<1 | 1<<0,             "btlrl",
 
-/* extended m with condition in BI */
+/* extended m with condition in BI (page 146) */
 0,     OP_BICR_BDL,  16<<26 | 12<<21 | 2<<16 | 0<<1 | 0<<0,     "beq",
 0,     OP_BICR_BDA,  16<<26 | 12<<21 | 2<<16 | 1<<1 | 0<<0,     "beqa",
 0,     OP_BICR_BH,   19<<26 | 12<<21 | 2<<16 | 528<<1 | 0<<0,   "beqctr",
@@ -284,7 +288,7 @@
 0,     OP_BICR_BH,   19<<26 | 12<<21 | 3<<16 | 16<<1 | 0<<0,    "bunlr",
 0,     OP_BICR_BH,   19<<26 | 12<<21 | 3<<16 | 16<<1 | 1<<0,    "bunlrl",
 
-/* extended m for cr logic */
+/* extended m for cr logic (page 147) */
 0,     OP_BT_BT_BT,  19<<26 | 289<<1,                           "crset",
 0,     OP_BT_BT_BT,  19<<26 | 193<<1,                           "crclr",
 0,     OP_BT_BA_BA,  19<<26 | 449<<1,                           "crmove",
@@ -377,12 +381,12 @@
 0,     OP_RT_RA_C,            31<<26 | 0<<10 | 104<<1,                 "neg",
 0,     OP_RT_RA_C,            31<<26 | 1<<10 | 104<<1,                 "nego",
 
-/* extended m for addition */
+/* extended m for addition (pages 153, 154) */
 0,     OP_RT_RA_D,            14<<26,                       "la",
 0,     OP_RT_SI,              14<<26 | 0<<16,               "li",
 0,     OP_RT_SI,              15<<26 | 0<<16,               "lis",
 
-/* extended m for subtraction */
+/* extended m for subtraction (pages 147, 148) */
 0,     OP_RT_RB_RA_C,         31<<26 | 0<<10 | 40<<1,       "sub",
 0,     OP_RT_RB_RA_C,         31<<26 | 1<<10 | 40<<1,       "subo",
 0,     OP_RT_RB_RA_C,         31<<26 | 0<<10 | 8<<1,        "subc",
@@ -418,7 +422,7 @@
 0,     OP_BF_L_RA_UI,         10<<26,                                  "cmpli",
 0,     OP_BF_L_RA_RB,         31<<26 | 32<<1,                          "cmpl",
 
-/* extended m for comparison */
+/* extended m for comparison (page 149) */
 0,     OP_BF_RA_SI,           11<<26 | 1<<21,               "cmpdi",
 0,     OP_BF_RA_RB,           31<<26 | 1<<21 | 0<<1,        "cmpd",
 0,     OP_BF_RA_UI,           10<<26 | 1<<21,               "cmpldi",
@@ -434,7 +438,7 @@
 0,     OP_TO_RA_RB,           31<<26 | 68<<1,                          "td",
 0,     OP_TO_RA_RB,           31<<26 | 4<<1,                           "tw",
 
-/* extended m for traps */
+/* extended m for traps (page 150) */
 0,     OP_TOX_RA_RB,          31<<26 | 4<<21 | 68<<1,       "tdeq",
 0,     OP_TOX_RA_SI,          2<<26 | 4<<21,                "tdeqi",
 0,     OP_TOX_RA_RB,          31<<26 | 12<<21 | 68<<1,      "tdge",
@@ -518,11 +522,10 @@
 0,     OP_RA_RS_C,            31<<26 | 58<<1,                          "cntlzd",
 0,     OP_RA_RS_C,            31<<26 | 26<<1,                          "cntlzw",
 
-/* extended m using logic */
-0,     OP_RS_RA_RA_C,         31<<26 | 444<<1,              "mr",
+/* extended m using logic (pages 153, 154) */
+0,     OP_RA_RS_RA_C,         31<<26 | 444<<1,              "mr",
 0,     OP,                    24<<26,                       "nop",
-0,     OP_RS_RA_RA_C,         31<<26 | 124<<1,              "not",
-0,     OP,                    26<<26,                       "xnop",
+0,     OP_RA_RS_RA_C,         31<<26 | 124<<1,              "not",
 
 /* page 69 */
 0,     OP_RA_RS_SH6_MB6_C,     30<<26 | 0<<2,               "rldicl",
@@ -535,7 +538,7 @@
 0,     OP_RA_RS_SH6_MB6_C,     30<<26 | 3<<2,               "rldimi",
 0,     OP_RA_RS_SH5_MB5_ME5_C, 20<<26,                      "rlwimi",
 
-/* extended m for doubleword rotation */
+/* extended m for doubleword rotation (page 151) */
 0,     OP_clrlsldi,           30<<26 | 2<<2,                "clrlsldi",
 0,     OP_clrldi,             30<<26 | 0<<2,                "clrldi",
 0,     OP_clrrdi,             30<<26 | 1<<2,                "clrrdi",
@@ -548,7 +551,7 @@
 0,     OP_sldi,               30<<26 | 1<<2,                "sldi",
 0,     OP_srdi,               30<<26 | 0<<2,                "srdi",
 
-/* extended m for word rotation */
+/* extended m for word rotation (page 152) */
 0,     OP_clrlslwi,           21<<26,                       "clrlslwi",
 0,     OP_clrlwi,             21<<26,                       "clrlwi",
 0,     OP_clrrwi,             21<<26,                       "clrrwi",
@@ -573,21 +576,25 @@
 0,     OP_RA_RS_RB_C,         31<<26 | 792<<1,              "sraw",
 
 /* page 78 */
-0,     OP_RS_SPR,             31<<26 | 467<<1,                         "mtspr",
-0,     OP_RT_SPR,             31<<26 | 339<<1,                         "mfspr",
-0,     OP_RS_FXM,             31<<26 | 0<<21 | 144<<1,                 "mtcrf",
-0,     OP_RT,                 31<<26 | 0<<21 | 19<<1,                  "mfcr",
+0,     OP_RS_SPR,             31<<26 | 467<<1,              "mtspr",
+0,     OP_RT_SPR,             31<<26 | 339<<1,              "mfspr",
+0,     OP_RS_FXM,             31<<26 | 0<<20 | 144<<1,      "mtcrf",
+0,     OP_RT,                 31<<26 | 0<<20 | 19<<1,       "mfcr",
 
-/* extended m for special purpose registers */
+/* extended m for special purpose registers (page 153) */
 0,     OP_RT,       31<<26 | 9<<16 | 0<<11 | 339<<1,        "mfctr",
 0,     OP_RT,       31<<26 | 8<<16 | 0<<11 | 339<<1,        "mflr",
 0,     OP_RT,       31<<26 | 1<<16 | 0<<11 | 339<<1,        "mfxer",
-0,     OP_RT,       31<<26 | 9<<16 | 0<<11 | 467<<1,        "mtctr",
-0,     OP_RT,       31<<26 | 8<<16 | 0<<11 | 467<<1,        "mtlr",
-0,     OP_RT,       31<<26 | 1<<16 | 0<<11 | 467<<1,        "mtxer",
+0,     OP_RS,       31<<26 | 9<<16 | 0<<11 | 467<<1,        "mtctr",
+0,     OP_RS,       31<<26 | 8<<16 | 0<<11 | 467<<1,        "mtlr",
+0,     OP_RS,       31<<26 | 1<<16 | 0<<11 | 467<<1,        "mtxer",
+
+/* extended m for condition register (page 154) */
+0,     OP_RS,       31<<26 | 0<<20 | 255<<12 | 144<<1,      "mtcr",
 
 /* Floating point instructions (page 83) */
 
+/* page 98 */
 0,     OP_FRT_RA_D,           48<<26,                                  "lfs",
 0,     OP_FRT_RA_RB,          31<<26 | 535<<1,                         "lfsx",
 0,     OP_FRT_RA_D,           49<<26,                                  "lfsu",
@@ -606,6 +613,7 @@
 0,     OP_FRS_RA_RB,          31<<26 | 759<<1,                         "stfdux",
 0,     OP_FRS_RA_RB,          31<<26 | 983<<1,                         "stfiwx",
 
+/* page 104 */
 0,     OP_FRT_FRB_C,          63<<26 | 72<<1,                          "fmr",
 0,     OP_FRT_FRB_C,          63<<26 | 40<<1,                          "fneg",
 0,     OP_FRT_FRB_C,          63<<26 | 264<<1,                         "fabs",
@@ -629,6 +637,7 @@
 0,     OP_FRT_FRA_FRC_FRB_C,  63<<26 | 30<<1,                          "fnmsub",
 0,     OP_FRT_FRA_FRC_FRB_C,  59<<26 | 30<<1,                          "fnmsubs",
 
+/* page 109 */
 0,     OP_FRT_FRB_C,          63<<26 | 12<<1,                          "frsp",
 0,     OP_FRT_FRB_C,          63<<26 | 814<<1,                         "fctid",
 0,     OP_FRT_FRB_C,          63<<26 | 815<<1,                         "fctidz",
@@ -652,4 +661,31 @@
 0,     OP_FRT_FRB_C,          63<<26 | 26<<1,                          "frsqrte",
 0,     OP_FRT_FRA_FRC_FRB_C,  63<<26 | 23<<1,                          "fsel",
 
-/* page 98 */
+/* Storage control instructions (Book II, page 15) */
+
+/* Book II, page 17 */
+0,     OP_RA_RB,              31<<26 | 982<<1,              "icbi",
+0,     OP_RA_RB_TH /* page 35 */,          31<<26 | 278<<1, "dcbt",
+0,     OP_RA_RB,              31<<26 | 246<<1,              "dcbtst",
+0,     OP_RA_RB,              31<<26 | 1014<<1,             "dcbz",
+0,     OP_RA_RB,              31<<26 | 54<<1,               "dcbst",
+0,     OP_RA_RB,              31<<26 | 86<<1,               "dcbf",
+0,     OP,                    19<<26 | 150<<1,              "isync",
+0,     OP_RT_RA_RB,           31<<26 | 20<<1,               "lwarx",
+0,     OP_RT_RA_RB,           31<<26 | 84<<1,               "ldarx",
+0,     OP_RS_RA_RB_CC,        31<<26 | 150<<1 | 1<<0,       "stwcx",
+0,     OP_RS_RA_RB_CC,        31<<26 | 214<<1 | 1<<0,       "stdcx",
+0,     OP_L,                  31<<26 | 598<<1,              "sync",
+0,     OP,                    31<<26 | 1<<21 | 598<<1,      "lwsync",
+0,     OP,                    31<<26 | 2<<21 | 598<<1,      "ptesync",
+0,     OP,                    31<<26 | 854<<1,              "eieio",
+
+/* Time base (Book II, page 30) */
+
+0,     OP_RT_TBR,   31<<26 | 371<<1,                        "mftb",
+0,     OP_RT,       31<<26 | 8<<11 | 13<<16 | 371<<1,       "mftbu",
+
+/* External control (Book II, page 33) */
+
+0,     OP_RT_RA_RB,           31<<26 | 310<<1,              "eciwx",
+0,     OP_RS_RA_RB,           31<<26 | 438<<1,              "ecowx",
diff --git a/mach/powerpc/as/mach4.c b/mach/powerpc/as/mach4.c
index 8a0cca9de..b344ba8ce 100644
--- a/mach/powerpc/as/mach4.c
+++ b/mach/powerpc/as/mach4.c
@@ -42,7 +42,23 @@ operation
 	| OP_FRT_RA_D          FPR ',' e16 '(' GPR ')'    { emit_hl($1 | ($2<<21) | ($6<<16) | $4); }
 	| OP_FRT_RA_RB         FPR ',' GPR ',' GPR        { emit4($1 | ($2<<21) | ($4<<16) | ($6<<11)); }
 	| OP_FRT_C             c FPR                      { emit4($1 | $2 | ($3<<21)); }
-	| OP_RA_RS_C           c GPR ',' GPR              { emit4($1 | $2 | ($5<<21) | ($3<<16)); }
+	| OP_L                              { emit4($1); }
+	| OP_L                 u2           { emit4($1 | ($2<<21)); }
+	| OP_LEV                            { emit4($1); }
+	| OP_LEV               u7           { emit4($1 | ($2<<5)); }
+	| OP_RA_RB             GPR ',' GPR
+	{ emit4($1 | ($2<<16) | ($4<<11)); }
+	| OP_RA_RB_TH          GPR ',' GPR opt_bh
+	{ /* opt_bh yields u2<<11; TH lives at bit 21 */ emit4($1 | ($5<<10) | ($2<<16) | ($4<<11)); }
+	/*
+	 * For instructions with "mnemonic RS, RA, ..."
+	 * OP_RA_RS_... swaps RS and RA to (RA<<21) || (RS<<16)
+	 * OP_RS_RA_... keeps RS and RA as (RS<<21) || (RA<<16)
+	 */
+	| OP_RA_RS_C           c GPR ',' GPR
+	{ emit4($1 | $2 | ($5<<21) | ($3<<16)); }
+	| OP_RA_RS_RA_C        c GPR ',' GPR
+	{ emit4($1 | $2 | ($5<<21) | ($3<<16) | ($5<<11)); }
 	| OP_RA_RS_RB_C        c GPR ',' GPR ',' GPR
 	{ emit4($1 | $2 | ($5<<21) | ($3<<16) | ($7<<11)); }
 	| OP_RA_RS_RB_MB5_ME5_C c GPR ',' GPR ',' GPR ',' u5 ',' u5
@@ -75,20 +91,19 @@ operation
 	| OP_RT_RB_RA_C        c GPR ',' GPR ',' GPR      { emit4($1 | $2 | ($3<<21) | ($7<<16) | ($5<<11)); }
 	| OP_RT_SI             GPR ',' e16                { emit_hl($1 | ($2<<21) | $4); }
 	| OP_RT_SPR            GPR ',' spr_num            { emit4($1 | ($2<<21) | ($4<<11)); }
+	| OP_RT_TBR            GPR opt_tbr                { emit4($1 | ($2<<21) | ($3<<11)); }
+	| OP_RS                GPR                        { emit4($1 | ($2<<21)); }
 	| OP_RS_FXM            u7 ',' GPR                 { emit4($1 | ($4<<21) | ($2<<12)); }
 	| OP_RS_RA_D           GPR ',' e16 '(' GPR ')'    { emit_hl($1 | ($2<<21) | ($6<<16) | $4); }
 	| OP_RS_RA_DS          GPR ',' ds '(' GPR ')'     { emit_hl($1 | ($2<<21) | ($6<<16) | $4); }
 	| OP_RS_RA_NB          GPR ',' GPR ',' nb         { emit4($1 | ($2<<21) | ($4<<16) | ($6<<11)); }
 	| OP_RS_RA_RB          GPR ',' GPR ',' GPR        { emit4($1 | ($2<<21) | ($4<<16) | ($6<<11)); }
-	| OP_RS_RA_RB_C        c GPR ',' GPR ',' GPR      { emit4($1 | $2 | ($5<<21) | ($3<<16) | ($7<<11)); }
-	| OP_RS_RA_RA_C        c GPR ',' GPR              { emit4($1 | $2 | ($5<<21) | ($3<<16) | ($5<<11)); }
+	| OP_RS_RA_RB_CC       C GPR ',' GPR ',' GPR      { emit4($1 | ($3<<21) | ($5<<16) | ($7<<11)); }
 	| OP_RS_SPR            spr_num ',' GPR            { emit4($1 | ($4<<21) | ($2<<11)); }
 	| OP_TO_RA_RB          u5 ',' GPR ',' GPR         { emit4($1 | ($2<<21) | ($4<<16) | ($6<<11)); }
 	| OP_TO_RA_SI          u5 ',' GPR ',' e16         { emit_hl($1 | ($2<<21) | ($4<<16) | $6); }
 	| OP_TOX_RA_RB         GPR ',' GPR                { emit4($1 | ($2<<16) | ($4<<11)); }
 	| OP_TOX_RA_SI         GPR ',' e16                { emit_hl($1 | ($2<<16) | $4); }
-	| OP_LEV                                          { emit4($1); }
-	| OP_LEV               u7                         { emit4($1 | ($2<<5)); }
 	| OP_LIA               lia                        { emit4($1 | $2); }
 	| OP_LIL               lil                        { emit4($1 | $2); }
 	| OP_LI32              li32                       /* emitted in subrule */
@@ -298,7 +313,7 @@ u2
 	}
 	;
 
-/* Optional comma, branch hint. */
+/* Optional comma, branch hint (or touch hint). */
 opt_bh
 	: /* nothing */         { $$ = 0; }
 	| ',' u2                { $$ = ($2<<11); }
@@ -409,13 +424,28 @@ lia
 	}
 	;
 
+/*
+ * Instructions "mfspr", "mtspr", and "mftb" encode the 10-bit special
+ * purpose register (spr) or time base register (tbr) by swapping the
+ * low 5 bits with the high 5 bits.  The value from an SPR token has
+ * already been swapped.
+ */
+
 spr_num
-	: SPR { $$ = $1; }
-	| absexp
+	: SPR     { $$ = $1; }
+	| tbr_num { $$ = $1; }
+	;
+
+opt_tbr
+	: /* nothing */         { $$ = 8 | (12<<5); }
+	| ',' tbr_num           { $$ = $2; }
+	;
+
+tbr_num
+	: absexp
 	{
 		if (($1 < 0) || ($1 > 0x3ff))
-			serror("spr number out of range");
-		/* mfspr, mtspr swap the low and high 5 bits */
+			serror("10-bit unsigned value out of range");
 		$$ = ($1 >> 5) | (($1 & 0x1f) << 5);
 	}
 	;
diff --git a/man/powerpc_as.6 b/man/powerpc_as.6
index 8198d6bce..f6bb90818 100644
--- a/man/powerpc_as.6
+++ b/man/powerpc_as.6
@@ -1,33 +1,136 @@
-.TH POWERPC_AS 1
+.TH POWERPC_AS 1 2018-03-07
 .ad
 .SH NAME
 powerpc_as \- assembler for PowerPC
-
 .SH SYNOPSIS
 as [options] argument ...
-
 .SH DESCRIPTION
 This assembler is made with the general framework
 described in \fIuni_ass\fP(6).
-
+.PP
+It can assemble the instructions from Book I and Book II of PowerPC
+version 2.01.
+This includes the branch, integer, and floating point instructions
+from Book I; and the cache, synchronization, and time base
+instructions from Book II.
+.PP
+There is no support for other instructions, such as supervisor-mode
+instructions or vector instructions.
+There is some support for 64-bit integer instructions, but the
+assembler only has 32-bit symbols.
 .SH SYNTAX
-Most 32-bit integer and floating point instructions are supported, but not many
-short form instructions. Instructions which take 16-bit operands can additionally
-use the following special functions:
-
-.IP hi16[value], ha16[value]
-Returns the high half of the value of the expression; if the value is not absolute,
-also generates the appropriate fixup. Use of either of these \fImust\fR be followed,
-in the next instruction, by the corresponding use of \fBlo16[]\fR. Use \fBhi16[]\fR
-if the low half is going to interpret its payload as an unsigned value, and
-\fBha16[]\fR if it will be interpreted as a signed value (so that the high half can
-be adjusted to match).
-
-.IP lo16[]
-Returns the low half of the value of the expression. No fixup is generated. Use of
-\fBlo16[]\fR must come in the instruction immediately after a use of \fBhi16[]\fR or
-\fBha16[]\fR.
-
+.SS general purpose registers
+There are 32 GPRs from \fBr0\fP to \fBr31\fP.
+In this assembler, \fBsp\fP is an alias for \fBr1\fP, and \fBfp\fP is
+an alias for \fBr2\fP, because \fIack\fP uses r1 as the stack pointer
+and r2 as the frame pointer.
+Other compilers don't use r2 as the frame pointer.
+.PP
+GPR syntax requires a register name, not a number.
+For example, \(oqaddi\ r5,\ r4,\ 1\(cq works, but
+\(oqaddi\ 5,\ 4,\ 1\(cq is a syntax error.
+.PP
+Certain instructions ignore the contents of \fBr0\fP and use zero.
+This happens when using r0 as the second operand of \fIaddi\fP or
+\fIaddis\fP, or when addressing \(oqexpr(r0)\(cq or
+\(oqr0,\ gpr\(cq.
+The syntax is still the name r0, not the number 0.
+.SS floating point registers
+There are 32 FPRs from \fBf0\fP to \fBf31\fP.
+Each FPR has 64 bits and can hold a single-precision or
+double-precision number.
+FPR syntax requires a register name, not a number.
+.SS special purpose registers
+The three named SPRs are \fBctr\fP (count register), \fBlr\fP (link
+register), and \fBxer\fP (exception register).
+\(oqmfspr\(cq and \(oqmtspr\(cq allow these names or a number.
+.SS condition register
+There is a 32-bit condition register, where bit 0 is most significant,
+and bit 31 is least significant.
+This gets split into 8 registers of 4 bits each, from \fBcr0\fP (with
+bits 0 to 3) to \fBcr7\fP (with bits 28 to 31).
+Some instructions use the names cr0 to cr7, others use a bit numbered
+0 to 31, and others use all 32 bits.
+.SS addressing modes
+\(oqexpr(gpr)\(cq addresses \fIexpr\fP + the contents of \fIgpr\fP,
+except that \(oqexpr(r0)\(cq addresses \fIexpr\fP\ +\ 0.
+A few instructions, like \(oqstwu\(cq, also update \fIgpr\fP by
+setting it to the address.
+.PP
+\(oqgprA,\ gprB\(cq in certain instructions addresses the contents of
+\fIgprA\fP + the contents of \fIgprB\fP, except that \(oqr0,\ gprB\(cq
+addresses 0\ +\ the contents of \fIgprB\fP.
+.SS 16-bit operands
+Some instructions have a 16-bit operand.
+This can be a bare \fIexpr\fP (which must fit in 16 bits, signed or
+unsigned), or it can be one of these special functions:
+.IP "hi16[expr], ha16[expr]"
+Returns the high half of the 32-bit value of the expression.
+If the low half is negative (from 0x8000 to 0xffff),
+then \fBha16[]\fP adjusts the high half by adding 1.
+Use \fBhi16[]\fP if the instruction with \fBlo16[]\fP is going to
+interpret its operand as an unsigned value, or \fBha16[]\fP if it will
+interpret it as signed.
+.IP
+If \fIexpr\fP is not absolute, then the assembler must generate a
+fixup for the linker.
+The fixup only works if the instruction is
+\(oqaddis gpr, r0, hx16[expr]\(cq or \(oqlis gpr, hx16[expr]\(cq.
+.IP lo16[expr]
+Returns the low half of the 32-bit value of the expression.
+.SS short forms
+Some instructions have short forms using extended mnemonics (or
+simplified mnemonics) like \fIli\fP, \fIsrwi\fP, and many others.
+.IP "li r6, 789"
+is short for: addi r6, r0, 789
+.IP "srwi r3, r4, 2"
+is short for: rlwinm r3, r4, 30, 2, 31
+.PP
+This assembler doesn't support extended mnemonics with branch
+prediction, such as \fIblt+\fP or \fIbne-\fP.
+It always parses \(oq+\(cq and \(oq-\(cq as operators,
+never as part of a mnemonic.
+.SH EXAMPLES
+There are two ways to load r3 with _symbol\ =\ 0x1234abcd.
+One way is
+.PP
+.nf
+   lis  r3, hi16[_symbol]
+   ori  r3, r3, lo16[_symbol]  ! r3 = 0x12340000 | 0x0000abcd
+.fi
+.PP
+The other way is
+.PP
+.nf
+   lis  r3, ha16[_symbol]
+   addi r3, r3, lo16[_symbol]  ! r3 = 0x12350000 + 0xffffabcd
+.fi
+.PP
+The next code adds 1 to a global variable.
+.PP
+.nf
+   lis  r3, ha16[_var]
+   lwz  r4, lo16[_var](r3)
+   addi r4, r4, 1
+   stw  r4, lo16[_var](r3)
+.fi
 .SH "SEE ALSO"
 uni_ass(6),
 ack(1)
+.PP
+Freescale Semiconductor, \fIProgramming Environments Manual for 32-Bit
+Implementations of the PowerPC Architecture\fP, Rev. 3, September 2005.
+.PP
+IBM, \fIPowerPC User Instruction Set Architecture, Book I\fP, Version
+2.01, September 2003.
+.PP
+IBM, \fIPowerPC Virtual Environment Architecture, Book II\fP, Version
+2.01, December 2003.
+.SH CAVEATS
+Beware that not every processor can run every instruction.
+The 32-bit processors can't run 64-bit instructions like \fIlwa\fP,
+\fIstd\fP, and \fIfctid\fP.
+The PowerPC 601 can't run \fIstfiwx\fP, \fIfres\fP, \fIfrsqrte\fP, or
+\fIfsel\fP.
+Many models, like the PowerPC G4, can't run \fIfsqrt\fP or
+\fIfsqrts\fP.

From 0720671f7ab6bd2c8a60d2138b4befb52a1862db Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 8 Mar 2018 11:49:40 -0500
Subject: [PATCH 51/55] Fix wr_ranlib() for big-endian machines.

With this change, I built and ran ack on a big-endian PowerPC Linux
machine.  I used gcc 4.9.4 to build ack, and I only built the linuxppc
back end.

Before this change, wr_ranlib() corrupted a value by changing it from
0x66 to 0x66000066.  This value was too big, so led made a fatal
error, "bad ranlib string offset".
---
 modules/src/object/wr_ranlib.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/modules/src/object/wr_ranlib.c b/modules/src/object/wr_ranlib.c
index 91274d71c..b515ffb3b 100644
--- a/modules/src/object/wr_ranlib.c
+++ b/modules/src/object/wr_ranlib.c
@@ -10,16 +10,27 @@ wr_ranlib(fd, ran, cnt1)
 	struct ranlib	*ran;
 	long	cnt1;
 {
-	{
-		register long cnt = cnt1;
-		register struct ranlib *r = ran;
-		register char *c = (char *) r;
+	struct ranlib *r;
+	long cnt, val;
+	char *c;
 
-		while (cnt--) {
-			put4(r->ran_off,c); c += 4;
-			put4(r->ran_pos,c); c += 4;
-			r++;
-		}
+	/*
+	 * We overwrite the structs in r with the bytes in c, so we
+	 * don't need to allocate another buffer.
+	 *
+	 * On a big-endian machine, put4(r->ran_off, c) can fail if
+	 * r->ran_off and c overlap in memory.  It tries to swap the
+	 * bytes from big to little endian, but overwrites some bytes
+	 * before reading them.  To prevent this, we must copy each
+	 * value before we overwrite it.
+	 */
+	r = ran;
+	c = (char *)r;
+	cnt = cnt1;
+	while (cnt--) {
+		val = r->ran_off; put4(val, c); c += 4;
+		val = r->ran_pos; put4(val, c); c += 4;
+		r++;
 	}
 	wr_bytes(fd, (char *) ran, cnt1 * SZ_RAN);
 }

From 860df1b067051d59e7db02d68522adc5b859c7db Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 8 Mar 2018 12:04:02 -0500
Subject: [PATCH 52/55] Read from new, not old, buffer after realloc.

This got caught by MALLOC_OPTIONS=S in OpenBSD.  The B compiler filled
the buffer while compiling hilo.b.  Then realloc moved the buffer and
unmapped the old buffer.  The compiler tried to read the old buffer
and segfaulted.
---
 modules/src/em_code/insert.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/modules/src/em_code/insert.c b/modules/src/em_code/insert.c
index 36950c3ea..00c628dcb 100644
--- a/modules/src/em_code/insert.c
+++ b/modules/src/em_code/insert.c
@@ -99,20 +99,19 @@ C_out_parts(pp)
 		}
 		else {
 			/* copy the chunk to output */
-#ifdef INCORE
-			register char *s = C_BASE + pp->pp_begin;
-			char *se = C_BASE + pp->pp_end;
-
-			while (s < se) {
-				put(*s++);
-			}
-#else
 			register long b = pp->pp_begin;
 
 			while (b < pp->pp_end) {
+#ifdef INCORE
+				/* C_BASE is not constant: put() may
+				   realloc the buffer and move C_BASE,
+				   so each iteration must reread it.
+				*/
+				put(C_BASE[b++]);
+#else
 				put(getbyte(b++));
-			}
 #endif
+			}
 		}
 		prev = pp;
 		pp = pp->pp_next;

From 12643f17401f031f7301af9c373d44b51bbe28ab Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Thu, 8 Mar 2018 18:51:07 -0500
Subject: [PATCH 53/55] Solve some gcc warnings in ego.

Some of these are from gcc -Wimplicit
---
 util/ego/cs/cs.c        | 6 ++----
 util/ego/cs/cs_debug.c  | 3 ++-
 util/ego/cs/cs_profit.c | 1 +
 util/ego/share/files.c  | 4 +++-
 util/ego/share/files.h  | 4 ++--
 util/ego/share/get.c    | 2 +-
 util/ego/share/go.c     | 2 +-
 util/ego/share/go.h     | 2 +-
 util/ego/share/types.h  | 2 +-
 9 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/util/ego/cs/cs.c b/util/ego/cs/cs.c
index dfcccbbf7..068ddc3dc 100644
--- a/util/ego/cs/cs.c
+++ b/util/ego/cs/cs.c
@@ -25,7 +25,7 @@
 
 int Scs; /* Number of optimizations found. */
 
-STATIC cs_clear()
+STATIC void cs_clear()
 {
 	clr_avails();
 	clr_entities();
@@ -74,9 +74,7 @@ STATIC void cs_optimize(void *vp)
 	}
 }
 
-main(argc, argv)
-	int	argc;
-	char	*argv[];
+int main(int argc, char *argv[])
 {
 	Scs = 0;
 	go(argc, argv, no_action, cs_optimize, cs_machinit, no_action);
diff --git a/util/ego/cs/cs_debug.c b/util/ego/cs/cs_debug.c
index 07890395b..3d5509ddc 100644
--- a/util/ego/cs/cs_debug.c
+++ b/util/ego/cs/cs_debug.c
@@ -11,6 +11,7 @@
 #include "cs.h"
 #include "cs_aux.h"
 #include "cs_avail.h"
+#include "cs_debug.h"
 #include "cs_entity.h"
 
 #ifdef VERBOSE
@@ -48,7 +49,7 @@ STATIC void showinstr(line_p lnp)
 	fprintf(stderr,"\n");
 }
 
-SHOWOCCUR(occur_p ocp)
+void SHOWOCCUR(occur_p ocp)
 {
 	/* Shows all instructions in an occurrence. */
 
diff --git a/util/ego/cs/cs_profit.c b/util/ego/cs/cs_profit.c
index a92028c36..2efabcb03 100644
--- a/util/ego/cs/cs_profit.c
+++ b/util/ego/cs/cs_profit.c
@@ -14,6 +14,7 @@
 #include "../share/cset.h"
 #include "../share/lset.h"
 #include "cs.h"
+#include "cs_alloc.h"
 #include "cs_aux.h"
 #include "cs_debug.h"
 #include "cs_avail.h"
diff --git a/util/ego/share/files.c b/util/ego/share/files.c
index e45f9b7fb..2dd10b25f 100644
--- a/util/ego/share/files.c
+++ b/util/ego/share/files.c
@@ -9,9 +9,11 @@
  */
 
 #include <stdio.h>
+#include "types.h"
+#include "debug.h"
 #include "files.h"
 
-struct files* findfiles(int argc, const char** argv)
+struct files* findfiles(int argc, char * const *argv)
 {
 	static struct files files;
 
diff --git a/util/ego/share/files.h b/util/ego/share/files.h
index 46b19917a..ab2076ea1 100644
--- a/util/ego/share/files.h
+++ b/util/ego/share/files.h
@@ -33,11 +33,11 @@ struct files
 
 	/* The rest of the arguments. */
 
-	const char** argv;
+	char * const *argv;
 	int argc;
 };
 
-struct files* findfiles(int argc, const char** argv);
+struct files* findfiles(int argc, char * const *argv);
 
 FILE *openfile(const char *name, const char *mode);
 				/*
diff --git a/util/ego/share/get.c b/util/ego/share/get.c
index 94c7aabe2..a433b0946 100644
--- a/util/ego/share/get.c
+++ b/util/ego/share/get.c
@@ -285,7 +285,7 @@ dblock_p getdtable(const char *dname)
 
 /* getbblocks */
 
-STATIC argstring(short length, argb_p abp)
+STATIC void argstring(short length, argb_p abp)
 {
 
 	while (length--) {
diff --git a/util/ego/share/go.c b/util/ego/share/go.c
index 9a2107d3d..0ccd3c6e9 100644
--- a/util/ego/share/go.c
+++ b/util/ego/share/go.c
@@ -42,7 +42,7 @@ STATIC void mach_init(char* machfile, void (*phase_machinit)(void *))
 	fclose(f);
 }
 
-void go(int argc, const char** argv,
+void go(int argc, char * const *argv,
 	void (*initialize)(void *), void (*optimize)(void *),
 	void (*phase_machinit)(void *), void (*proc_flag)(void *))
 {
diff --git a/util/ego/share/go.h b/util/ego/share/go.h
index 3bb8c1f54..55f1b48e8 100644
--- a/util/ego/share/go.h
+++ b/util/ego/share/go.h
@@ -22,7 +22,7 @@
  * and 'optimize' is called with the current procedure
  * as parameter.
  */
-void go(int argc, const char** argv,
+void go(int argc, char * const *argv,
 	void (*initialize)(void *null),
 	void (*optimize)(void *),	/* (proc_p *p) */
 	void (*phase_machinit)(void *),	/* (FILE *f) */
diff --git a/util/ego/share/types.h b/util/ego/share/types.h
index cae4d6074..cabc5818d 100644
--- a/util/ego/share/types.h
+++ b/util/ego/share/types.h
@@ -46,7 +46,7 @@ typedef struct elemholder *lset;
 typedef struct bitvector  *cset;
 typedef elem_p Lindex;
 typedef short  Cindex;
-typedef char   *Lelem_t;
+typedef void   *Lelem_t;
 typedef short  Celem_t;
 
 typedef union pext_t *pext_p;

From ebba76e08ff15aa2621319513239a51e6caaf0a7 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Sun, 11 Mar 2018 20:10:13 -0400
Subject: [PATCH 54/55] Don't read INSTR(l) after oldline(l) frees it.

This bug got in my way while I was looking for another read-after-free
bug in the CS phase.
---
 util/ego/ca/ca.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/util/ego/ca/ca.c b/util/ego/ca/ca.c
index 095736665..1bf73d24a 100644
--- a/util/ego/ca/ca.c
+++ b/util/ego/ca/ca.c
@@ -72,6 +72,7 @@ proc_p* p_out;
 			{
 				/* register message without arguments */
 				oldline(l);
+				continue;
 			}
 			else
 			{

From 85fcbde22fe1068e9ef2d73a57d8a813e24eef12 Mon Sep 17 00:00:00 2001
From: George Koehler <xkernigh@netscape.net>
Date: Mon, 12 Mar 2018 20:58:31 -0400
Subject: [PATCH 55/55] Check LOI expressions to prevent a read after free.

CS eliminates outer expressions before inner ones, as `x * y * z`
before `x * y`.  It does this by reversing the order of expressions in
the code.  This almost always works, but it sometimes doesn't work if
a STI changes the value number of a LOI.  In code like `expr1 LOI
expr2 STI expr2 LOI`, CS might eliminate the inner `expr2` before the
outer `expr2 LOI`.  This caused a read after free because the
occurrence of `expr2 LOI` pointed to the eliminated lines of `expr2`.

This bug went unnoticed until my recent changes caused CS to crash
with a double free.  I did not get the crash in OpenBSD, but I saw the
crash in Travis, then David Given reproduced the crash in Linux.  See
the discussion in https://github.com/davidgiven/ack/pull/73
---
 util/ego/cs/cs_elim.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/util/ego/cs/cs_elim.c b/util/ego/cs/cs_elim.c
index 7dce0df09..b83371416 100644
--- a/util/ego/cs/cs_elim.c
+++ b/util/ego/cs/cs_elim.c
@@ -142,8 +142,9 @@ STATIC void replace(occur_p ocp, offset tmp, avail_p avp)
 	/* Replace the lines in the occurrence in ocp by a load of the
 	 * temporary with offset tmp.
 	 */
-	register line_p lol, first, last;
-	register int instr;
+	avail_p ravp;
+	line_p lol, first, last;
+	int instr;
 
 	assert(avp->av_size == ws || avp->av_size == 2*ws);
 
@@ -176,6 +177,33 @@ STATIC void replace(occur_p ocp, offset tmp, avail_p avp)
 			break;
 	}
 
+	/* Some occurrence rocp of an expression before avp might have
+	 * rocp->oc_lfirst == first.  If so, then we must set
+	 * rocp->oc_lfirst = lol before we throw away first.
+	 *
+	 * This is very unlikely, but it can happen in code with
+	 * expr1 LOI expr2 STI expr2 LOI, where the STI causes both
+	 * LOIs to have the same value number.  Then the first LOI
+	 * might come before the first expr2, so we might replace
+	 * expr2 before we replace expr2 LOI.  Then the occurrence of
+	 * expr2 LOI must not point to the eliminated lines of expr2.
+	 */
+	for (ravp = avp->av_before; ravp != (avail_p) 0;
+	     ravp = ravp->av_before) {
+		/* We only check LOI expressions. */
+		if (ravp->av_instr == op_loi) {
+			occur_p rocp;
+			Lindex i;
+
+			for (i = Lfirst(ravp->av_occurs); i != (Lindex) 0;
+			     i = Lnext(i, ravp->av_occurs)) {
+				rocp = occ_elem(i);
+				if (rocp->oc_lfirst == first)
+					rocp->oc_lfirst = lol;
+			}
+		}
+	}
+
 	/* Throw away the by now useless lines. */
 	remove_lines(first, last);
 }