From cbe5d8640b9b268f5e085b7fd319a5bf8ad8775a Mon Sep 17 00:00:00 2001 From: George Koehler Date: Wed, 15 Feb 2017 19:34:07 -0500 Subject: [PATCH] Add floating-point register variables to PowerPC ncg. Use f14 to f31 as register variables for 8-byte double-precision. There are no regvars for 4-byte single precision, because all regvar(reg_float) must have the same size. I expect more programs to prefer 8-byte double precision. Teach mach/powerpc/ncg/mach.c to emit stfd and lfd instructions to save and restore 8-byte regvars. Delay emitting the function prolog until f_regsave(), so we can use one addi to make stack space for both local vars and saved registers. Be more careful with types in mach.c; don't assume that int and long and full are the same. In ncg table, add f14 to f31 as register variables, and some rules to use them. Add rules to put the result of fadd, fsub, fmul, fdiv, fneg in a regvar. Without such rules, the result would go in a scratch FREG, and we would need fmr to move it to the regvar. Also add a rule for pat sdl inreg($1)==reg_float with STACK, so we can unstack the value directly into the regvar, again without a scratch FREG and fmr. Edit util/ego/descr/powerpc.descr to tell ego about the new float regvars. This might not be working right; ego usually decides against using any float regvars, so ack -O1 (not running ego) uses the regvars, but ack -O4 (running ego) doesn't use the regvars. Beware that ack -mosxppc runs ego using powerpc.descr but -mlinuxppc and -mqemuppc run ego without a config file (since 8ef7c31). I am testing powerpc.descr with a local edit to plat/linuxppc/descr to run ego with powerpc.descr there, but I did not commit my local edit. 
--- mach/powerpc/ncg/mach.c | 243 ++++++++++++++++++++--------------- mach/powerpc/ncg/table | 129 +++++++++++-------- util/ego/descr/powerpc.descr | 44 ++++++- 3 files changed, 254 insertions(+), 162 deletions(-) diff --git a/mach/powerpc/ncg/mach.c b/mach/powerpc/ncg/mach.c index e4ab3c078..c63cc20be 100644 --- a/mach/powerpc/ncg/mach.c +++ b/mach/powerpc/ncg/mach.c @@ -4,18 +4,13 @@ * */ -#include +/* + * machine dependent back end routines for the PowerPC + */ + #include -#ifndef NORCSID -static char rcsid[]= "$Id$" ; -#endif - -int framesize; - -/* - * machine dependent back end routines for the Zilog Z80. - */ +static long framesize; con_part(int sz, word w) { @@ -25,17 +20,14 @@ con_part(int sz, word w) part_flush(); if (sz == 1) { w &= 0xFF; - w <<= 8*(3-part_size); + w <<= 8 * (3 - part_size); part_word |= w; } else if (sz == 2) { w &= 0xFFFF; - if (part_size == 0) { - /* Shift 8 for m68k2, 16 otherwise */ - w <<= 4 * TEM_WSIZE; - } + w <<= 8 * (2 - part_size); part_word |= w; } else { - assert(sz == TEM_WSIZE); + assert(sz == 4); part_word = w; } part_size += sz; @@ -56,17 +48,26 @@ con_mult(word sz) #define FL_MSB_AT_LOW_ADDRESS 1 #include +static void +emit_prolog(void) +{ + fprintf(codefile, "mfspr r0, lr\n"); + fprintf(codefile, "addi sp, sp, %ld\n", -framesize - 8); + fprintf(codefile, "stw fp, %ld(sp)\n", framesize); + fprintf(codefile, "stw r0, %ld(sp)\n", framesize + 4); + fprintf(codefile, "addi fp, sp, %ld\n", framesize); +} + void prolog(full nlocals) { - int ss = nlocals + 8; - fprintf(codefile, "addi sp, sp, %d\n", -ss); - fprintf(codefile, "stw fp, %d(sp)\n", nlocals); - fprintf(codefile, "mfspr r0, lr\n" - "stw r0, %d(sp)\n", nlocals+4); - fprintf(codefile, "addi fp, sp, %d\n", nlocals); - framesize = nlocals; + +#ifdef REGVARS + /* f_regsave() will call emit_prolog() */ +#else + emit_prolog(); +#endif } void @@ -102,110 +103,144 @@ char *segname[] = { #ifdef REGVARS -static int savedregsi[32]; -static int numsaved; +static long savedf[32]; 
+static long savedi[32]; +static int savedtop; + +/* Calculate the register score of a local variable. */ +int +regscore(long offset, int size, int type, int frequency, int totype) +{ + int score; + + switch (type) { + case reg_float: + if (size != 8) { + fprintf(codefile, "! local %ld float size %d reject\n", offset, size); + return -1; + } + break; + default: + if (size != 4) { + fprintf(codefile, "! local %ld int size %d reject\n", offset, size); + return -1; + } + break; + } + + /* Clamp to avoid overflowing 16-bit int score. */ + if (frequency > 8000) + frequency = 8000; + + /* + * Each occurrence of a regvar saves about 4 bytes by not + * emitting a load or store instruction. The overhead is + * about 8 bytes to save and restore the register, plus + * 4 bytes if the local is a parameter. + */ + score = 4 * frequency - 8 - ((offset >= 0) ? 4 : 0); + fprintf(codefile, "! local %ld score %d\n", offset, score); + return score; +} /* Initialise regvar system for one function. */ -i_regsave() +i_regsave(void) { int i; - - fprintf(codefile, "! i_regsave()\n"); - for (i=0; i<32; i++) - savedregsi[i] = INT_MAX; - numsaved = 0; + + for (i=0; i<32; i++) { + savedf[i] = LONG_MIN; + savedi[i] = LONG_MIN; + } + + /* Set top of register save area, relative to fp. */ + savedtop = -framesize; } /* Mark a register as being saved. */ -regsave(const char* regname, full offset, int size) +regsave(const char* regname, long offset, int size) { - int regnum = atoi(regname+1); - savedregsi[regnum] = offset; - numsaved++; - - fprintf(codefile, "! 
%d is saved in %s\n", offset, regname); -#if 0 - fprintf(codefile, "stwu %s, -4(sp)\n", regname); - if (offset >= 0) - fprintf(codefile, "lwz %s, %d(fp)\n", regname, offset); -#endif + int regnum = atoi(regname + 1); + + assert(regnum >= 0 && regnum <= 31); + switch (regname[0]) { + case 'f': + savedf[regnum] = offset; + framesize += 8; + break; + case 'r': + savedi[regnum] = offset; + framesize += 4; + break; + } } -/* Finish saving ragisters. */ - -void saveloadregs(const char* ops, const char* opm) +static void +saveloadregs(const char* ops, const char* opm, const char *opf) { - int offset = -(framesize + numsaved*4); - int reg = 32; - - /* Check for the possibility of a multiple. */ - - do - { - reg--; - } - while ((reg > 0) && (savedregsi[reg] != INT_MAX)); - if (reg < 31) - { - fprintf(codefile, "%s r%d, %d(fp)\n", opm, reg+1, offset); - offset += (31-reg)*4; - } - - /* Saved everything else singly. */ - - while (reg > 0) - { - if (savedregsi[reg] != INT_MAX) - { - fprintf(codefile, "%s r%d, %d(fp)\n", ops, reg, offset); - offset += 4; + long offset = savedtop; + int reg; + + /* Do floating-point registers. */ + for (reg = 31; reg >= 0; reg--) { + if (savedf[reg] != LONG_MIN) { + offset -= 8; + fprintf(codefile, "%s f%d, %ld(fp)\n", + opf, reg, offset); + } + } + + if (savedi[31] != LONG_MIN && savedi[30] != LONG_MIN) { + /* + * Do multiple registers from reg to r31. + * + * Using stmw or lmw reduces code size, but in some + * processors, runs slower than the equivalent pile of + * stw or lwz instructions. + */ + reg = 30; + while (reg > 0 && savedi[reg - 1] != LONG_MIN) + reg--; + offset -= (32 - reg) * 4; + fprintf(codefile, "%s r%d, %ld(fp)\n", opm, reg, offset); + } else + reg = 32; + + /* Do single general-purpose registers. */ + for (reg--; reg >= 0; reg--) { + if (savedi[reg] != LONG_MIN) { + offset -= 4; + fprintf(codefile, "%s r%d, %ld(fp)\n", + ops, reg, offset); } - reg--; } } -f_regsave() +f_regsave(void) { - int i; - fprintf(codefile, "! 
f_regsave()\n"); - fprintf(codefile, "addi sp, sp, %d\n", -numsaved*4); - - saveloadregs("stw", "stmw"); - - for (i=0; i<32; i++) - if ((savedregsi[i] != INT_MAX) && (savedregsi[i] > 0)) - fprintf(codefile, "lwz r%d, %d(fp)\n", i, savedregsi[i]); + int reg; + + emit_prolog(); + saveloadregs("stw", "stmw", "stfd"); + + for (reg = 31; reg >= 0; reg--) + if (savedf[reg] >= 0) + fprintf(codefile, "lfd f%d, %ld(fp)\n", + reg, savedf[reg]); + + for (reg = 31; reg >= 0; reg--) + if (savedi[reg] >= 0) + fprintf(codefile, "lwz r%d, %ld(fp)\n", + reg, savedi[reg]); } /* Restore all saved registers. */ -regreturn() +regreturn(void) { - fprintf(codefile, "! regreturn()\n"); - saveloadregs("lwz", "lmw"); -} - -/* Calculate the score of a given register. */ - -int regscore(full offset, int size, int type, int frequency, int totype) -{ - int score; - - fprintf(codefile, "! regscore(%ld, %d, %d, %d, %d)\n", offset, size, type, frequency, totype); - - if (size != 4) - return -1; - - /* Per use: 6 bytes (on average) - * Overhead in prologue: 4 bytes, plus 4 if a parameter - * Overhead in epilogue: 0 bytes - */ - - score = frequency*6 - 4 - ((offset>=0) ? 4 : 0); - fprintf(codefile, "! local at offset %d has regvar score %d\n", offset, score); - return score; + saveloadregs("lwz", "lmw", "lfd"); } #endif diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table index 76bc5c90a..264767e8b 100644 --- a/mach/powerpc/ncg/table +++ b/mach/powerpc/ncg/table @@ -47,14 +47,16 @@ REGISTERS r31, r30, r29, r28, r27, r26, r25, r24, r23, r22, r21, r20, r19, r18, r17, r16, r15, r14, - r13 : GPR, REG regvar. + r13 : GPR, REG regvar(reg_any). r12, r11, r10, r9, r8, r7, r6, r5, r4, r3 : GPR, REG. fp, sp, r0 : GPR. - /* f31 to f14 are reserved for regvar. */ + f31, f30, f29, f28, f27, f26, + f25, f24, f23, f22, f21, f20, + f19, f18, f17, f16, f15, f14 : FPR, FREG regvar(reg_float). f13, f12, f11, f10, f9, f8 f7, f6, f5, f4, f3, f2, f1 : FPR, FREG. 
@@ -86,6 +88,7 @@ TOKENS LABEL_HA = { ADDR adr; } 4 "ha16[" adr "]". LABEL_LO = { ADDR adr; } 4 "lo16[" adr "]". LOCAL = { INT off; } 4 ">>> BUG IN LOCAL". + DLOCAL = { INT off; } 8 ">>> BUG IN DLOCAL". /* Allows us to use regvar() to refer to registers */ @@ -239,27 +242,27 @@ INSTRUCTIONS eqv GPR:wo, GPR:ro, GPR:ro. extsb GPR:wo, GPR:ro. extsh GPR:wo, GPR:ro. - fadd FREG:wo, FREG:ro, FREG:ro cost(4, 5). + fadd FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 5). fadds FSREG:wo, FSREG:ro, FSREG:ro cost(4, 5). fcmpo CR:wo, FREG:ro, FREG:ro cost(4, 5). fcmpo CR:wo, FSREG:ro, FSREG:ro cost(4, 5). fctiwz FREG:wo, FREG:ro. - fdiv FREG:wo, FREG:ro, FREG:ro cost(4, 35). + fdiv FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 35). fdivs FSREG:wo, FSREG:ro, FSREG:ro cost(4, 21). - fmr FPR:wo, FPR:ro cost(4, 5). + fmr FPR+DLOCAL:wo, FPR:ro cost(4, 5). fmr FSREG:wo, FSREG:ro cost(4, 5). - fmul FREG:wo, FREG:ro, FREG:ro cost(4, 5). + fmul FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 5). fmuls FSREG:wo, FSREG:ro, FSREG:ro cost(4, 5). - fneg FREG:wo, FREG:ro cost(4, 5). + fneg FREG+DLOCAL:wo, FREG:ro cost(4, 5). fneg FSREG:wo, FSREG:ro cost(4, 5). frsp FSREG:wo, FREG:ro cost(4, 5). - fsub FREG:wo, FREG:ro, FREG:ro cost(4, 5). + fsub FREG+DLOCAL:wo, FREG:ro, FREG:ro cost(4, 5). fsubs FSREG:wo, FSREG:ro, FSREG:ro cost(4, 5). lbz GPR:wo, IND_RC_B+IND_RL_B:ro cost(4, 3). lbzx GPR:wo, GPR:ro, GPR:ro cost(4, 3). - lfd FPR:wo, IND_RC_D+IND_RL_D:ro cost(4, 5). + lfd FPR+DLOCAL:wo, IND_RC_D+IND_RL_D:ro cost(4, 5). lfdu FPR:wo, IND_RC_D:ro cost(4, 5). - lfdx FPR:wo, GPR:ro, GPR:ro cost(4, 5). + lfdx FPR+DLOCAL:wo, GPR:ro, GPR:ro cost(4, 5). lfs FSREG:wo, IND_RC_W+IND_RL_W:ro cost(4, 4). lfsu FSREG:wo, IND_RC_W:rw cost(4, 4). lfsx FSREG:wo, GPR:ro, GPR:ro cost(4, 4). @@ -296,7 +299,7 @@ INSTRUCTIONS stb GPR:ro, IND_RC_B+IND_RL_B:rw cost(4, 3). stbx GPR:ro, GPR:ro, GPR:ro cost(4, 3). stfd FPR:ro, IND_RC_D+IND_RL_D:rw cost(4, 4). - stfdu FPR:ro, IND_RC_D:rw cost(4, 4). 
+ stfdu FPR+DLOCAL:ro, IND_RC_D:rw cost(4, 4). stfdx FPR:ro, GPR:ro, GPR:ro cost(4, 4). stfs FSREG:ro, IND_RC_W+IND_RL_W:rw cost(4, 3). stfsu FSREG:ro, IND_RC_W:rw cost(4, 3). @@ -318,6 +321,9 @@ MOVES from GPR to GPR gen mr %2, %1 + from FPR to FPR+DLOCAL + gen fmr %2, %1 + /* Constants */ from CONST + CONST_STACK smalls(%val) to GPR @@ -437,10 +443,10 @@ MOVES /* Read double */ - from IND_RC_D+IND_RL_D to FPR + from IND_RC_D+IND_RL_D to FPR+DLOCAL gen lfd %2, %1 - from IND_RR_D to FPR + from IND_RR_D to FPR+DLOCAL gen lfdx %2, %1.reg1, %1.reg2 /* Write double */ @@ -586,9 +592,9 @@ STACKINGRULES move %1, FSCRATCH stfdu FSCRATCH, {IND_RC_D, sp, 0-8} - from FREG to STACK + from FREG+DLOCAL to STACK gen - COMMENT("stack FPR") + COMMENT("stack FREG+DLOCAL") stfdu %1, {IND_RC_D, sp, 0-8} from FSREG to STACK @@ -761,47 +767,57 @@ PATTERNS uses REG={SUM_RIS, fp, his($1)} yields {SUM_RC, %a, los($1)} - pat lol inreg($1)>0 /* Load from local */ + /* Load word from local */ + pat lol inreg($1)==reg_any yields {LOCAL, $1} - - pat lol /* Load from local */ + pat lol leaving lal $1 - loi INT32 + loi 4 - pat ldl /* Load double-word from local */ + /* Load double-word from local */ + pat ldl inreg($1)==reg_float + yields {DLOCAL, $1} + pat ldl leaving lal $1 - loi INT32*2 + loi 8 - pat stl inreg($1)>0 /* Store to local */ + /* Store word to local */ + pat stl inreg($1)==reg_any with ANY_BHW kills regvar($1), LOCAL %off==$1 + gen move %1, {GPRE, regvar($1)} + pat stl + leaving + lal $1 + sti 4 + + /* Store double-word to local */ + pat sdl inreg($1)==reg_float + with exact FREG+IND_ALL_D + gen move %1, {DLOCAL, $1} + with STACK gen - move %1, {GPRE, regvar($1)} - - pat stl /* Store to local */ + lfd {DLOCAL, $1}, {IND_RC_D, sp, 0} + addi sp, sp, {CONST, 8} + pat sdl leaving lal $1 - sti INT32 + sti 8 - pat sdl /* Store double-word to local */ - leaving - lal $1 - sti INT32*2 - - pat lil inreg($1)>0 /* Load from indirected local */ + /* Load indirect from local */ + pat lil 
inreg($1)==reg_any yields {IND_RC_W, regvar($1), 0} - - pat lil /* Load from indirected local */ + pat lil leaving lol $1 - loi INT32 + loi 4 pat sil /* Save to indirected local */ leaving lol $1 - sti INT32 + sti 4 pat zrl /* Zero local */ leaving @@ -2021,12 +2037,7 @@ PATTERNS yields %1 -/* Floating point support */ - - /* All very cheap and nasty --- this needs to be properly integrated into - * the code generator. ncg doesn't like having separate FPU registers. */ - - /* Single-precision */ +/* Single-precision floating-point */ pat zrf $1==INT32 /* Push zero */ leaving @@ -2168,46 +2179,62 @@ PATTERNS loc 4 cff - /* Double-precision */ + +/* Double-precision floating-point */ pat zrf $1==INT64 /* Push zero */ leaving lde ".fd_00000000" - pat adf $1==INT64 /* Add double */ + pat adf $1==8 /* Add double */ with FREG FREG - uses FREG + uses reusing %1, FREG gen fadd %a, %2, %1 yields %a - - pat sbf $1==INT64 /* Subtract double */ + pat adf sdl $1==8 && inreg($2)==reg_float with FREG FREG - uses FREG + gen fadd {DLOCAL, $2}, %2, %1 + + pat sbf $1==8 /* Subtract double */ + with FREG FREG + uses reusing %1, FREG gen fsub %a, %2, %1 yields %a + pat sbf sdl $1==8 && inreg($2)==reg_float + with FREG FREG + gen fsub {DLOCAL, $2}, %2, %1 - pat mlf $1==INT64 /* Multiply double */ + pat mlf $1==8 /* Multiply double */ with FREG FREG uses reusing %1, FREG gen fmul %a, %2, %1 yields %a + pat mlf sdl $1==8 && inreg($2)==reg_float + with FREG FREG + gen fmul {DLOCAL, $2}, %2, %1 - pat dvf $1==INT64 /* Divide double */ + pat dvf $1==8 /* Divide double */ with FREG FREG uses reusing %1, FREG gen fdiv %a, %2, %1 yields %a + pat dvf sdl $1==8 && inreg($2)==reg_float + with FREG FREG + gen fdiv {DLOCAL, $2}, %2, %1 - pat ngf $1==INT64 /* Negate double */ + pat ngf $1==8 /* Negate double */ with FREG uses reusing %1, FREG gen fneg %a, %1 yields %a + pat ngf sdl $1==8 && inreg($2)==reg_float + with FREG + gen fneg {DLOCAL, $2}, %1 pat cmf $1==INT64 /* Compare double */ with FREG 
FREG diff --git a/util/ego/descr/powerpc.descr b/util/ego/descr/powerpc.descr index 5138cc44b..e59990ea1 100644 --- a/util/ego/descr/powerpc.descr +++ b/util/ego/descr/powerpc.descr @@ -3,26 +3,32 @@ pointersize: 4 %%RA general registers: 19 address registers: 0 -floating point registers: 0 +floating point registers: 18 use general as pointer: yes register score parameters: local variable: - (2 cases) + (3 cases) pointer,general (1 size) default -> (3,4) general,general (1 size) default -> (3,4) + float,float + (1 size) + default -> (5,4) address of local variable: - (2 cases) + (3 cases) pointer,general (1 size) default -> (0,0) general,general (1 size) default -> (0,0) + float,float + (1 size) + default -> (0,0) constant: (2 sizes) fitbyte -> (-1,-1) @@ -39,21 +45,27 @@ register score parameters: opening cost parameters: local variable: - (2 cases) + (3 cases) pointer (1 size) default -> (3,4) general (1 size) default -> (3,4) + float + (1 size) + default -> (5,4) address of local variable: - (2 cases) + (3 cases) pointer (1 size) default -> (1,4) general (1 size) - general -> (1,4) + default -> (1,4) + float + (1 size) + default -> (1,4) constant: (2 sizes) fitbyte -> (1000,1000) @@ -69,7 +81,7 @@ opening cost parameters: default -> (1000,1000) register save costs: - (21 cases) + (39 cases) 0 -> (0,0) 1 -> (6,8) 2 -> (12,16) @@ -90,6 +102,24 @@ register save costs: 17 -> (102,136) 18 -> (108,144) 19 -> (114,152) + 20 -> (120,160) + 21 -> (126,168) + 22 -> (132,176) + 23 -> (138,184) + 24 -> (144,192) + 25 -> (150,200) + 26 -> (156,208) + 27 -> (162,216) + 28 -> (168,224) + 29 -> (174,232) + 30 -> (180,240) + 31 -> (186,248) + 32 -> (192,256) + 33 -> (198,264) + 34 -> (204,272) + 35 -> (210,280) + 36 -> (216,288) + 37 -> (222,296) 0 -> (0,0) %%UD access costs of global variables: