Add floating-point register variables to PowerPC ncg.

Use f14 to f31 as register variables for 8-byte double-precision.
There are no regvars for 4-byte single precision, because all
regvar(reg_float) must have the same size.  I expect more programs to
prefer 8-byte double precision.

Teach mach/powerpc/ncg/mach.c to emit stfd and lfd instructions to
save and restore 8-byte regvars.  Delay emitting the function prolog
until f_regsave(), so we can use one addi to make stack space for both
local vars and saved registers.  Be more careful with types in mach.c;
don't assume that int and long and full are the same.

In ncg table, add f14 to f31 as register variables, and some rules to
use them.  Add rules to put the result of fadd, fsub, fmul, fdiv, fneg
in a regvar.  Without such rules, the result would go in a scratch
FREG, and we would need fmr to move it to the regvar.  Also add a rule
for pat sdl inreg($1)==reg_float with STACK, so we can unstack the
value directly into the regvar, again without a scratch FREG and fmr.

Edit util/ego/descr/powerpc.descr to tell ego about the new float
regvars.  This might not be working right; ego usually decides against
using any float regvars, so ack -O1 (not running ego) uses the
regvars, but ack -O4 (running ego) doesn't use the regvars.

Beware that ack -mosxppc runs ego using powerpc.descr but -mlinuxppc
and -mqemuppc run ego without a config file (since 8ef7c31).  I am
testing powerpc.descr with a local edit to plat/linuxppc/descr to run
ego with powerpc.descr there, but I did not commit my local edit.
This commit is contained in:
George Koehler 2017-02-15 19:34:07 -05:00
parent cf728c2a2a
commit cbe5d8640b
3 changed files with 254 additions and 162 deletions

View file

@ -4,18 +4,13 @@
*
*/
#include <stdlib.h>
/*
* machine dependent back end routines for the PowerPC
*/
#include <limits.h>
#ifndef NORCSID
static char rcsid[]= "$Id$" ;
#endif
int framesize;
/*
* machine dependent back end routines for the Zilog Z80.
*/
static long framesize;
con_part(int sz, word w)
{
@ -25,17 +20,14 @@ con_part(int sz, word w)
part_flush();
if (sz == 1) {
w &= 0xFF;
w <<= 8*(3-part_size);
w <<= 8 * (3 - part_size);
part_word |= w;
} else if (sz == 2) {
w &= 0xFFFF;
if (part_size == 0) {
/* Shift 8 for m68k2, 16 otherwise */
w <<= 4 * TEM_WSIZE;
}
w <<= 8 * (2 - part_size);
part_word |= w;
} else {
assert(sz == TEM_WSIZE);
assert(sz == 4);
part_word = w;
}
part_size += sz;
@ -56,17 +48,26 @@ con_mult(word sz)
#define FL_MSB_AT_LOW_ADDRESS 1
#include <con_float>
/*
 * Write the function prolog to codefile, sized by the current value
 * of framesize:
 *   - copy lr into r0,
 *   - lower sp by framesize + 8,
 *   - store fp at framesize(sp) and r0 at framesize + 4 (sp),
 *   - point fp at sp + framesize.
 */
static void
emit_prolog(void)
{
	const long fp_slot = framesize;      /* where the old fp is stored */
	const long lr_slot = framesize + 4;  /* where the old lr (in r0) is stored */
	const long sp_drop = -framesize - 8; /* total stack adjustment */

	fputs("mfspr r0, lr\n", codefile);
	fprintf(codefile, "addi sp, sp, %ld\n", sp_drop);
	fprintf(codefile, "stw fp, %ld(sp)\n", fp_slot);
	fprintf(codefile, "stw r0, %ld(sp)\n", lr_slot);
	fprintf(codefile, "addi fp, sp, %ld\n", fp_slot);
}
void
prolog(full nlocals)
{
int ss = nlocals + 8;
fprintf(codefile, "addi sp, sp, %d\n", -ss);
fprintf(codefile, "stw fp, %d(sp)\n", nlocals);
fprintf(codefile, "mfspr r0, lr\n"
"stw r0, %d(sp)\n", nlocals+4);
fprintf(codefile, "addi fp, sp, %d\n", nlocals);
framesize = nlocals;
#ifdef REGVARS
/* f_regsave() will call emit_prolog() */
#else
emit_prolog();
#endif
}
void
@ -102,110 +103,144 @@ char *segname[] = {
#ifdef REGVARS
static int savedregsi[32];
static int numsaved;
static long savedf[32];
static long savedi[32];
static int savedtop;
/*
 * Score a local variable as a candidate for a register variable.
 *
 * A reg_float local must be 8 bytes and a local of any other type
 * must be 4 bytes; otherwise a "reject" comment is written to
 * codefile and -1 is returned.  For an accepted local the score is
 * written to codefile as a comment and returned.  totype is not used.
 */
int
regscore(long offset, int size, int type, int frequency, int totype)
{
	int uses;
	int score;

	if (type == reg_float) {
		if (size != 8) {
			fprintf(codefile, "! local %ld float size %d reject\n", offset, size);
			return -1;
		}
	} else if (size != 4) {
		fprintf(codefile, "! local %ld int size %d reject\n", offset, size);
		return -1;
	}

	/* Cap the use count so the score cannot overflow a 16-bit int. */
	uses = (frequency > 8000) ? 8000 : frequency;

	/*
	 * Every occurrence of a regvar avoids one load or store
	 * instruction, about 4 bytes.  Saving and restoring the
	 * register costs about 8 bytes, and a parameter (offset >= 0)
	 * costs 4 bytes more.
	 */
	score = 4 * uses - 8;
	if (offset >= 0)
		score -= 4;

	fprintf(codefile, "! local %ld score %d\n", offset, score);
	return score;
}
/* Initialise regvar system for one function. */
i_regsave()
i_regsave(void)
{
int i;
fprintf(codefile, "! i_regsave()\n");
for (i=0; i<32; i++)
savedregsi[i] = INT_MAX;
numsaved = 0;
for (i=0; i<32; i++) {
savedf[i] = LONG_MIN;
savedi[i] = LONG_MIN;
}
/* Set top of register save area, relative to fp. */
savedtop = -framesize;
}
/* Mark a register as being saved. */
regsave(const char* regname, full offset, int size)
regsave(const char* regname, long offset, int size)
{
int regnum = atoi(regname+1);
savedregsi[regnum] = offset;
numsaved++;
fprintf(codefile, "! %d is saved in %s\n", offset, regname);
#if 0
fprintf(codefile, "stwu %s, -4(sp)\n", regname);
if (offset >= 0)
fprintf(codefile, "lwz %s, %d(fp)\n", regname, offset);
#endif
int regnum = atoi(regname + 1);
assert(regnum >= 0 && regnum <= 31);
switch (regname[0]) {
case 'f':
savedf[regnum] = offset;
framesize += 8;
break;
case 'r':
savedi[regnum] = offset;
framesize += 4;
break;
}
}
/* Finish saving registers. */
void saveloadregs(const char* ops, const char* opm)
static void
saveloadregs(const char* ops, const char* opm, const char *opf)
{
int offset = -(framesize + numsaved*4);
int reg = 32;
/* Check for the possibility of a multiple. */
do
{
reg--;
}
while ((reg > 0) && (savedregsi[reg] != INT_MAX));
if (reg < 31)
{
fprintf(codefile, "%s r%d, %d(fp)\n", opm, reg+1, offset);
offset += (31-reg)*4;
}
/* Saved everything else singly. */
while (reg > 0)
{
if (savedregsi[reg] != INT_MAX)
{
fprintf(codefile, "%s r%d, %d(fp)\n", ops, reg, offset);
offset += 4;
long offset = savedtop;
int reg;
/* Do floating-point registers. */
for (reg = 31; reg >= 0; reg--) {
if (savedf[reg] != LONG_MIN) {
offset -= 8;
fprintf(codefile, "%s f%d, %ld(fp)\n",
opf, reg, offset);
}
}
if (savedi[31] != LONG_MIN && savedi[30] != LONG_MIN) {
/*
* Do multiple registers from reg to r31.
*
* Using stmw or lmw reduces code size, but in some
* processors, runs slower than the equivalent pile of
* stw or lwz instructions.
*/
reg = 30;
while (reg > 0 && savedi[reg - 1] != LONG_MIN)
reg--;
offset -= (32 - reg) * 4;
fprintf(codefile, "%s r%d, %ld(fp)\n", opm, reg, offset);
} else
reg = 32;
/* Do single general-purpose registers. */
for (reg--; reg >= 0; reg--) {
if (savedi[reg] != LONG_MIN) {
offset -= 4;
fprintf(codefile, "%s r%d, %ld(fp)\n",
ops, reg, offset);
}
reg--;
}
}
/*
 * f_regsave: in the new version, emit the delayed function prolog,
 * store every saved register through saveloadregs(), then load each
 * register variable whose recorded offset is >= 0 from its slot in
 * the frame (lfd for float regvars, lwz for integer regvars).
 */
f_regsave()
f_regsave(void)
{
int i;
fprintf(codefile, "! f_regsave()\n");
fprintf(codefile, "addi sp, sp, %d\n", -numsaved*4);
saveloadregs("stw", "stmw");
for (i=0; i<32; i++)
if ((savedregsi[i] != INT_MAX) && (savedregsi[i] > 0))
fprintf(codefile, "lwz r%d, %d(fp)\n", i, savedregsi[i]);
int reg;
emit_prolog();
saveloadregs("stw", "stmw", "stfd");
/* Unused entries hold LONG_MIN, so ">= 0" also skips them. */
for (reg = 31; reg >= 0; reg--)
if (savedf[reg] >= 0)
fprintf(codefile, "lfd f%d, %ld(fp)\n",
reg, savedf[reg]);
for (reg = 31; reg >= 0; reg--)
if (savedi[reg] >= 0)
fprintf(codefile, "lwz r%d, %ld(fp)\n",
reg, savedi[reg]);
}
/* Restore all saved registers. */
regreturn()
regreturn(void)
{
fprintf(codefile, "! regreturn()\n");
saveloadregs("lwz", "lmw");
}
/* Calculate the score of a given register. */
int regscore(full offset, int size, int type, int frequency, int totype)
{
int score;
fprintf(codefile, "! regscore(%ld, %d, %d, %d, %d)\n", offset, size, type, frequency, totype);
if (size != 4)
return -1;
/* Per use: 6 bytes (on average)
* Overhead in prologue: 4 bytes, plus 4 if a parameter
* Overhead in epilogue: 0 bytes
*/
score = frequency*6 - 4 - ((offset>=0) ? 4 : 0);
fprintf(codefile, "! local at offset %d has regvar score %d\n", offset, score);
return score;
saveloadregs("lwz", "lmw", "lfd");
}
#endif

View file

@ -47,14 +47,16 @@ REGISTERS
r31, r30, r29, r28, r27, r26,
r25, r24, r23, r22, r21, r20,
r19, r18, r17, r16, r15, r14,
r13 : GPR, REG regvar.
r13 : GPR, REG regvar(reg_any).
r12, r11, r10, r9, r8, r7,
r6, r5, r4, r3 : GPR, REG.
fp, sp, r0 : GPR.
/* f31 to f14 are reserved for regvar. */
f31, f30, f29, f28, f27, f26,
f25, f24, f23, f22, f21, f20,
f19, f18, f17, f16, f15, f14 : FPR, FREG regvar(reg_float).
f13, f12, f11, f10, f9, f8
f7, f6, f5, f4, f3, f2, f1 : FPR, FREG.
@ -86,6 +88,7 @@ TOKENS
LABEL_HA = { ADDR adr; } 4 "ha16[" adr "]".
LABEL_LO = { ADDR adr; } 4 "lo16[" adr "]".
LOCAL = { INT off; } 4 ">>> BUG IN LOCAL".
DLOCAL = { INT off; } 8 ">>> BUG IN DLOCAL".
/* Allows us to use regvar() to refer to registers */
@ -239,27 +242,27 @@ INSTRUCTIONS
eqv GPR:wo, GPR:ro, GPR:ro.
extsb GPR:wo, GPR:ro.
extsh GPR:wo, GPR:ro.
fadd FREG:wo, FREG:ro, FREG:ro cost(4, 5).
fadd FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 5).
fadds FSREG:wo, FSREG:ro, FSREG:ro cost(4, 5).
fcmpo CR:wo, FREG:ro, FREG:ro cost(4, 5).
fcmpo CR:wo, FSREG:ro, FSREG:ro cost(4, 5).
fctiwz FREG:wo, FREG:ro.
fdiv FREG:wo, FREG:ro, FREG:ro cost(4, 35).
fdiv FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 35).
fdivs FSREG:wo, FSREG:ro, FSREG:ro cost(4, 21).
fmr FPR:wo, FPR:ro cost(4, 5).
fmr FPR+DLOCAL:wo, FPR:ro cost(4, 5).
fmr FSREG:wo, FSREG:ro cost(4, 5).
fmul FREG:wo, FREG:ro, FREG:ro cost(4, 5).
fmul FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 5).
fmuls FSREG:wo, FSREG:ro, FSREG:ro cost(4, 5).
fneg FREG:wo, FREG:ro cost(4, 5).
fneg FREG+DLOCAL:wo, FREG:ro cost(4, 5).
fneg FSREG:wo, FSREG:ro cost(4, 5).
frsp FSREG:wo, FREG:ro cost(4, 5).
fsub FREG:wo, FREG:ro, FREG:ro cost(4, 5).
fsub FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 5).
fsubs FSREG:wo, FSREG:ro, FSREG:ro cost(4, 5).
lbz GPR:wo, IND_RC_B+IND_RL_B:ro cost(4, 3).
lbzx GPR:wo, GPR:ro, GPR:ro cost(4, 3).
lfd FPR:wo, IND_RC_D+IND_RL_D:ro cost(4, 5).
lfd FPR+DLOCAL:wo, IND_RC_D+IND_RL_D:ro cost(4, 5).
lfdu FPR:wo, IND_RC_D:ro cost(4, 5).
lfdx FPR:wo, GPR:ro, GPR:ro cost(4, 5).
lfdx FPR+DLOCAL:wo, GPR:ro, GPR:ro cost(4, 5).
lfs FSREG:wo, IND_RC_W+IND_RL_W:ro cost(4, 4).
lfsu FSREG:wo, IND_RC_W:rw cost(4, 4).
lfsx FSREG:wo, GPR:ro, GPR:ro cost(4, 4).
@ -296,7 +299,7 @@ INSTRUCTIONS
stb GPR:ro, IND_RC_B+IND_RL_B:rw cost(4, 3).
stbx GPR:ro, GPR:ro, GPR:ro cost(4, 3).
stfd FPR:ro, IND_RC_D+IND_RL_D:rw cost(4, 4).
stfdu FPR:ro, IND_RC_D:rw cost(4, 4).
stfdu FPR+DLOCAL:ro, IND_RC_D:rw cost(4, 4).
stfdx FPR:ro, GPR:ro, GPR:ro cost(4, 4).
stfs FSREG:ro, IND_RC_W+IND_RL_W:rw cost(4, 3).
stfsu FSREG:ro, IND_RC_W:rw cost(4, 3).
@ -318,6 +321,9 @@ MOVES
from GPR to GPR
gen mr %2, %1
from FPR to FPR+DLOCAL
gen fmr %2, %1
/* Constants */
from CONST + CONST_STACK smalls(%val) to GPR
@ -437,10 +443,10 @@ MOVES
/* Read double */
from IND_RC_D+IND_RL_D to FPR
from IND_RC_D+IND_RL_D to FPR+DLOCAL
gen lfd %2, %1
from IND_RR_D to FPR
from IND_RR_D to FPR+DLOCAL
gen lfdx %2, %1.reg1, %1.reg2
/* Write double */
@ -586,9 +592,9 @@ STACKINGRULES
move %1, FSCRATCH
stfdu FSCRATCH, {IND_RC_D, sp, 0-8}
from FREG to STACK
from FREG+DLOCAL to STACK
gen
COMMENT("stack FPR")
COMMENT("stack FREG+DLOCAL")
stfdu %1, {IND_RC_D, sp, 0-8}
from FSREG to STACK
@ -761,47 +767,57 @@ PATTERNS
uses REG={SUM_RIS, fp, his($1)}
yields {SUM_RC, %a, los($1)}
pat lol inreg($1)>0 /* Load from local */
/* Load word from local */
pat lol inreg($1)==reg_any
yields {LOCAL, $1}
pat lol /* Load from local */
pat lol
leaving
lal $1
loi INT32
loi 4
pat ldl /* Load double-word from local */
/* Load double-word from local */
pat ldl inreg($1)==reg_float
yields {DLOCAL, $1}
pat ldl
leaving
lal $1
loi INT32*2
loi 8
pat stl inreg($1)>0 /* Store to local */
/* Store word to local */
pat stl inreg($1)==reg_any
with ANY_BHW
kills regvar($1), LOCAL %off==$1
gen move %1, {GPRE, regvar($1)}
pat stl
leaving
lal $1
sti 4
/* Store double-word to local */
pat sdl inreg($1)==reg_float
with exact FREG+IND_ALL_D
gen move %1, {DLOCAL, $1}
with STACK
gen
move %1, {GPRE, regvar($1)}
pat stl /* Store to local */
lfd {DLOCAL, $1}, {IND_RC_D, sp, 0}
addi sp, sp, {CONST, 8}
pat sdl
leaving
lal $1
sti INT32
sti 8
pat sdl /* Store double-word to local */
leaving
lal $1
sti INT32*2
pat lil inreg($1)>0 /* Load from indirected local */
/* Load indirect from local */
pat lil inreg($1)==reg_any
yields {IND_RC_W, regvar($1), 0}
pat lil /* Load from indirected local */
pat lil
leaving
lol $1
loi INT32
loi 4
pat sil /* Save to indirected local */
leaving
lol $1
sti INT32
sti 4
pat zrl /* Zero local */
leaving
@ -2021,12 +2037,7 @@ PATTERNS
yields %1
/* Floating point support */
/* All very cheap and nasty --- this needs to be properly integrated into
* the code generator. ncg doesn't like having separate FPU registers. */
/* Single-precision */
/* Single-precision floating-point */
pat zrf $1==INT32 /* Push zero */
leaving
@ -2168,46 +2179,62 @@ PATTERNS
loc 4
cff
/* Double-precision */
/* Double-precision floating-point */
pat zrf $1==INT64 /* Push zero */
leaving
lde ".fd_00000000"
pat adf $1==INT64 /* Add double */
pat adf $1==8 /* Add double */
with FREG FREG
uses FREG
uses reusing %1, FREG
gen
fadd %a, %2, %1
yields %a
pat sbf $1==INT64 /* Subtract double */
pat adf sdl $1==8 && inreg($2)==reg_float
with FREG FREG
uses FREG
gen fadd {DLOCAL, $2}, %2, %1
pat sbf $1==8 /* Subtract double */
with FREG FREG
uses reusing %1, FREG
gen
fsub %a, %2, %1
yields %a
pat sbf sdl $1==8 && inreg($2)==reg_float
with FREG FREG
gen fsub {DLOCAL, $2}, %2, %1
pat mlf $1==INT64 /* Multiply double */
pat mlf $1==8 /* Multiply double */
with FREG FREG
uses reusing %1, FREG
gen
fmul %a, %2, %1
yields %a
pat mlf sdl $1==8 && inreg($2)==reg_float
with FREG FREG
gen fmul {DLOCAL, $2}, %2, %1
pat dvf $1==INT64 /* Divide double */
pat dvf $1==8 /* Divide double */
with FREG FREG
uses reusing %1, FREG
gen
fdiv %a, %2, %1
yields %a
pat dvf sdl $1==8 && inreg($2)==reg_float
with FREG FREG
gen fdiv {DLOCAL, $2}, %2, %1
pat ngf $1==INT64 /* Negate double */
pat ngf $1==8 /* Negate double */
with FREG
uses reusing %1, FREG
gen
fneg %a, %1
yields %a
pat ngf sdl $1==8 && inreg($2)==reg_float
with FREG
gen fneg {DLOCAL, $2}, %1
pat cmf $1==INT64 /* Compare double */
with FREG FREG

View file

@ -3,26 +3,32 @@ pointersize: 4
%%RA
general registers: 19
address registers: 0
floating point registers: 0
floating point registers: 18
use general as pointer: yes
register score parameters:
local variable:
(2 cases)
(3 cases)
pointer,general
(1 size)
default -> (3,4)
general,general
(1 size)
default -> (3,4)
float,float
(1 size)
default -> (5,4)
address of local variable:
(2 cases)
(3 cases)
pointer,general
(1 size)
default -> (0,0)
general,general
(1 size)
default -> (0,0)
float,float
(1 size)
default -> (0,0)
constant:
(2 sizes)
fitbyte -> (-1,-1)
@ -39,21 +45,27 @@ register score parameters:
opening cost parameters:
local variable:
(2 cases)
(3 cases)
pointer
(1 size)
default -> (3,4)
general
(1 size)
default -> (3,4)
float
(1 size)
default -> (5,4)
address of local variable:
(2 cases)
(3 cases)
pointer
(1 size)
default -> (1,4)
general
(1 size)
general -> (1,4)
default -> (1,4)
float
(1 size)
default -> (1,4)
constant:
(2 sizes)
fitbyte -> (1000,1000)
@ -69,7 +81,7 @@ opening cost parameters:
default -> (1000,1000)
register save costs:
(21 cases)
(39 cases)
0 -> (0,0)
1 -> (6,8)
2 -> (12,16)
@ -90,6 +102,24 @@ register save costs:
17 -> (102,136)
18 -> (108,144)
19 -> (114,152)
20 -> (120,160)
21 -> (126,168)
22 -> (132,176)
23 -> (138,184)
24 -> (144,192)
25 -> (150,200)
26 -> (156,208)
27 -> (162,216)
28 -> (168,224)
29 -> (174,232)
30 -> (180,240)
31 -> (186,248)
32 -> (192,256)
33 -> (198,264)
34 -> (204,272)
35 -> (210,280)
36 -> (216,288)
37 -> (222,296)
0 -> (0,0)
%%UD
access costs of global variables: