From 504d2aa34e606b6a51643b2fcdd49312d8ece1d0 Mon Sep 17 00:00:00 2001
From: George Koehler
Date: Sat, 9 Dec 2017 17:21:06 -0500
Subject: [PATCH] Revise stack shuffles and integer conversions in PowerPC
 ncg.

Allow asp 4, exg 4 to shuffle tokens without coercing them into
registers; but comment why dup 4, dup 8 coerce tokens into registers.

Allow dup, dus, exg with larger sizes; and add tests dup_e.e and
exg_e.e to check that dup 20, dus, exg 20 work as well in powerpc as
in i80 and i86.

Then powerpc failed to compile loc 2 loc 4 cuu in dup_e.e.  Revise the
integer conversions, so powerpc can compile and pass the test.

Use addis, not addi, in asp when only the high half of the constant
is nonzero.
---
 mach/powerpc/libem/build.lua |   2 +-
 mach/powerpc/libem/exg.s     |  22 ++++++
 mach/powerpc/ncg/table       | 134 +++++++++++++++++++-------------
 tests/plat/build.lua         |   2 +
 tests/plat/dup_e.e           | 145 +++++++++++++++++++++++++++++++++++
 tests/plat/exg_e.e           |  86 +++++++++++++++++++++
 6 files changed, 338 insertions(+), 53 deletions(-)
 create mode 100644 mach/powerpc/libem/exg.s
 create mode 100644 tests/plat/dup_e.e
 create mode 100644 tests/plat/exg_e.e

diff --git a/mach/powerpc/libem/build.lua b/mach/powerpc/libem/build.lua
index 16a03147e..cb5efd281 100644
--- a/mach/powerpc/libem/build.lua
+++ b/mach/powerpc/libem/build.lua
@@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
 	acklibrary {
 		name = "lib_"..plat,
 		srcs = {
-			"./*.s", -- rm ret.s
+			"./*.s", -- exg.s
 		},
 		vars = { plat = plat },
 		deps = {
diff --git a/mach/powerpc/libem/exg.s b/mach/powerpc/libem/exg.s
new file mode 100644
index 000000000..eb631b697
--- /dev/null
+++ b/mach/powerpc/libem/exg.s
@@ -0,0 +1,22 @@
+.sect .text
+
+! Exchange top two values on stack.
+! Stack: ( a b size -- b a )
+
+.define .exg
+.exg:
+	lwz r3, 0(sp)		! r3 = size
+	srwi r7, r3, 2
+	mtspr ctr, r7		! ctr = size / 4
+	mr r4, sp		! r4 = pointer before value b
+	add r5, r4, r3		! r5 = pointer before value a
+
+	! Loop to swap each pair of words.
+1:	lwzu r6, 4(r4)
+	lwzu r7, 4(r5)
+	stw r6, 0(r5)
+	stw r7, 0(r4)
+	bdnz 1b			! loop ctr times
+
+	addi sp, sp, 4		! drop size from stack
+	blr
diff --git a/mach/powerpc/ncg/table b/mach/powerpc/ncg/table
index 5768c4382..90fd9448d 100644
--- a/mach/powerpc/ncg/table
+++ b/mach/powerpc/ncg/table
@@ -217,6 +217,9 @@ SETS
 		XOR_RIS + XOR_RC + XOR_RR + NAND_RR + NOR_RR + EQV_RR +
 		XEQ + XNE + XGT + XGE + XLT + XLE.
 
+	/* any register or token of each size */
+	ANY4 = ANY_BHW + FSREG.
+	ANY8 = IND_ALL_D + FREG.
 
 
 INSTRUCTIONS
@@ -756,7 +759,7 @@ COERCIONS
 
 PATTERNS
 
-/* Intrinsics */
+/* Constants */
 
 	pat loc $1==(0-0x8000) /* Load constant */
 		yields {CONST_N8000, $1}
@@ -773,22 +776,78 @@ PATTERNS
 	pat loc
 		yields {CONST_HL, $1}
 
-	pat dup $1==INT32 /* Duplicate word on top of stack */
-		with REG
-			yields %1 %1
-		with FSREG
+
+/* Stack shuffles */
+
+	/* The peephole optimizer does: loc $1 ass 4 -> asp $1
+	 * To optimize multiplication, it uses: dup 8 asp 4
+	 */
+
+	pat asp $1==4 /* Adjust stack by constant */
+		with exact ANY4
+			/* drop %1 */
+		with STACK
+			gen addi sp, sp, {C, 4}
+	pat asp smalls($1)
+		with STACK
+			gen addi sp, sp, {C, $1}
+	pat asp lo($1)==0 /* low half is zero: add only the shifted high half */
+		with STACK
+			gen addis sp, sp, {C, hi($1)}
+	pat asp /* general case: high half with addis, then low half with addi */
+		with STACK
+			gen
+				addis sp, sp, {C, his($1)}
+				addi sp, sp, {C, los($1)}
+
+	pat ass $1==4 /* Adjust stack by variable */
+		with REG STACK
+			gen add sp, sp, %1
+
+	/* To duplicate a token, we coerce the token into a register,
+	 * then duplicate the register.  This decreases code size.
+	 */
+
+	pat dup $1==4 /* Duplicate word on top of stack */
+		with REG+FSREG
 			yields %1 %1
 
-	pat dup $1==INT64 /* Duplicate double-word on top of stack */
-		with REG REG
+	pat dup $1==8 /* Duplicate double-word */
+		with REG+FSREG REG+FSREG
 			yields %2 %1 %2 %1
 		with FREG
 			yields %1 %1
 
-	pat exg $1==INT32 /* Exchange top two words on stack */
-		with REG REG
+	pat dup /* Duplicate other size */
+		leaving
+			loc $1
+			dus 4
+
+	pat dus $1==4 /* Duplicate variable size */
+		with REG STACK
+			/* ( a size%1 -- a a ) */
+			uses REG, REG
+			gen
+				srwi %a, %1, {C, 2} /* %a = size / 4 */
+				mtspr ctr, %a /* ctr = number of words to copy */
+				add %b, sp, %1 /* %b = pointer just past value a */
+			1:	lwzu %a, {IND_RC_W, %b, 0-4} /* step %b down one word and load it */
+				stwu %a, {IND_RC_W, sp, 0-4} /* push that word */
+				bdnz {LABEL, "1b"} /* loop ctr times */
+
+	pat exg $1==4 /* Exchange top two words */
+		with ANY4 ANY4
 			yields %1 %2
 
+	pat exg defined($1) /* Exchange other size */
+		leaving
+			loc $1
+			cal ".exg"
+
+	pat exg !defined($1)
+		leaving
+			cal ".exg"
+
 	pat ste loe $1==$2 /* Store then load external */
 		leaving
 			dup 4
@@ -797,32 +856,30 @@ PATTERNS
 
 /* Type conversions */
 
-	pat loc loc ciu /* signed X -> unsigned X */
+	pat loc loc ciu /* signed -> unsigned */
 		leaving
 			loc $1
 			loc $2
 			cuu
 
-	pat loc loc cuu $1==$2 /* unsigned X -> unsigned X */
+	pat loc loc cui /* unsigned -> signed */
+		leaving
+			loc $1
+			loc $2
+			cuu
+
+	pat loc loc cuu $1<=4 && $2<=4 /* unsigned -> unsigned */
 		/* nop */
 
-	pat loc loc cii $1==$2 /* signed X -> signed X */
-		/* nop */
+	pat loc loc cii $1<=4 && $2<=$1
+		/* signed -> signed of smaller or same size,
+		 * no sign extension */
 
-	pat loc loc cui $1==$2 /* unsigned X -> signed X */
-		/* nop */
-
-	pat loc loc cui $1==INT8 && $2==INT32 /* unsigned char -> signed int */
-		/* nop */
-
-	pat loc loc cui $1==INT16 && $2==INT32 /* unsigned short -> signed int */
-		/* nop */
-
-	pat loc loc cii $1==INT8 && $2==INT32 /* signed char -> signed int */
+	pat loc loc cii $1==1 && $2<=4 /* sign-extend char */
 		with REG
 			yields {SEX_B, %1}
 
-	pat loc loc cii $1==2 && $2==4 /* signed char -> signed short */
+	pat loc loc cii $1==2 && $2<=4 /* sign-extend short */
 		with REG
 			yields {SEX_H, %1}
 
@@ -1362,7 +1419,7 @@ PATTERNS
 		leaving
 			loc 0
 
-	pat zer defined($1) /* Create empty set */
+	pat zer defined($1) /* Create empty set */
 		leaving
 			loc $1
 			cal ".zer"
@@ -2038,33 +2095,6 @@ PATTERNS
 		gen
 			move %1, sp
 
-	pat loc ass $1==4 && $2==4 /* Drop 4 bytes from stack */
-		with exact REG
-			/* nop */
-		with STACK
-			gen
-				addi sp, sp, {C, 4}
-
-	pat ass $1==4 /* Adjust stack by variable amount */
-		with CONST2 STACK
-			gen
-				move {SUM_RC, sp, %1.val}, sp
-		with CONST_HZ STACK
-			gen
-				move {SUM_RC, sp, his(%1.val)}, sp
-		with CONST_STACK-CONST2-CONST_HZ STACK
-			gen
-				move {SUM_RC, sp, his(%1.val)}, sp
-				move {SUM_RC, sp, los(%1.val)}, sp
-		with REG STACK
-			gen
-				move {SUM_RR, sp, %1}, sp
-
-	pat asp /* Adjust stack by constant amount */
-		leaving
-			loc $1
-			ass 4
-
 	pat lae rck $2==4 /* Range check */
 		with REG
 			kills ALL
diff --git a/tests/plat/build.lua b/tests/plat/build.lua
index cbd39468e..609771ed1 100644
--- a/tests/plat/build.lua
+++ b/tests/plat/build.lua
@@ -10,6 +10,8 @@ definerule("plat_testsuite",
 		-- target names will resolve there.
 		local testfiles = filenamesof(
 			"tests/plat/*.c",
+			"tests/plat/dup_e.e",
+			"tests/plat/exg_e.e",
 			"tests/plat/inn_e.e",
 			"tests/plat/rotate_e.e",
 			"tests/plat/*.p",
diff --git a/tests/plat/dup_e.e b/tests/plat/dup_e.e
new file mode 100644
index 000000000..600161be4
--- /dev/null
+++ b/tests/plat/dup_e.e
@@ -0,0 +1,145 @@
+#
+	mes 2, EM_WSIZE, EM_PSIZE
+
+/*
+ * Test _dup_ and _dus_ by loading 20 bytes from _src_, then making
+ * and checking some duplicates.
+ */
+
+	exa src
+	exa size
+src
+	con 3593880729I4, 782166578I4, 4150666996I4, 2453272937I4, 3470523049I4
+size
+	con 20I2
+
+	exp $check
+	exp $_m_a_i_n
+	pro $_m_a_i_n, 0
+
+	/* Push 3 copies of src on stack. */
+	lae src
+	loi 20 /* 1st copy */
+	dup 20 /* 2nd copy */
+	lae size
+	loi 2
+	loc 2
+	loc EM_WSIZE
+	cuu
+	dus EM_WSIZE /* 3rd copy */
+
+	cal $check
+	cal $finished
+	end /* $_m_a_i_n */
+
+	pro $check, 4 * EM_PSIZE + 2 * EM_WSIZE
+#define p1 (-1 * EM_PSIZE)
+#define p2 (-2 * EM_PSIZE)
+#define p3 (-3 * EM_PSIZE)
+#define p4 (-4 * EM_PSIZE)
+#define b (p4 - 1 * EM_WSIZE)
+#define i (p4 - 2 * EM_WSIZE)
+
+	/* Set pointers to all 4 copies. */
+	lae src
+	lal p4
+	sti EM_PSIZE /* p4 = src */
+	lal 0
+	lal p3
+	sti EM_PSIZE /* p3 = 3rd copy */
+	lal 20
+	lal p2
+	sti EM_PSIZE /* p2 = 2nd copy */
+	lal 40
+	lal p1
+	sti EM_PSIZE /* p1 = 1st copy */
+
+	/* Loop 20 times to verify each byte. */
+	loc 0
+	stl i
+4
+	lal p4
+	loi EM_PSIZE
+	loi 1
+	loc 1
+	loc EM_WSIZE
+	cii
+	stl b /* b = byte from src */
+	lol b
+	lal p3
+	loi EM_PSIZE
+	loi 1 /* byte from 3rd copy */
+	loc 1
+	loc EM_WSIZE
+	cii
+	beq *3
+	loc (3 * 256)
+	lol i
+	adi EM_WSIZE
+	loc EM_WSIZE
+	loc 4
+	cuu
+	cal $fail
+	asp 4
+3
+	lol b
+	lal p2
+	loi EM_PSIZE
+	loi 1 /* byte from 2nd copy */
+	loc 1
+	loc EM_WSIZE
+	cii
+	beq *2
+	loc (2 * 256)
+	lol i
+	adi EM_WSIZE
+	loc EM_WSIZE
+	loc 4
+	cuu
+	cal $fail
+	asp 4
+2
+	lol b
+	lal p1
+	loi EM_PSIZE
+	loi 1 /* byte from 1st copy */
+	loc 1
+	loc EM_WSIZE
+	cii
+	beq *1
+	loc (1 * 256)
+	lol i
+	adi EM_WSIZE
+	loc EM_WSIZE
+	loc 4
+	cuu
+	cal $fail
+	asp 4
+1
+	lal p4
+	loi EM_PSIZE
+	adp 1
+	lal p4
+	sti EM_PSIZE /* increment p4 */
+	lal p3
+	loi EM_PSIZE
+	adp 1
+	lal p3
+	sti EM_PSIZE /* increment p3 */
+	lal p2
+	loi EM_PSIZE
+	adp 1
+	lal p2
+	sti EM_PSIZE /* increment p2 */
+	lal p1
+	loi EM_PSIZE
+	adp 1
+	lal p1
+	sti EM_PSIZE /* increment p1 */
+	inl i
+	lol i
+	loc 20
+	blt *4 /* loop 20 times */
+
+	ret 0
+	end /* $check */
diff --git a/tests/plat/exg_e.e b/tests/plat/exg_e.e
new file mode 100644
index 000000000..3a1f06d3b
--- /dev/null
+++ b/tests/plat/exg_e.e
@@ -0,0 +1,86 @@
+#
+	mes 2, EM_WSIZE, EM_PSIZE
+
+/*
+ * Test _exg_ by loading 40 bytes from _src_, then exchanging 20 and
+ * 20 bytes, and checking the result.
+ */
+
+	exa src
+src
+	con 1539465570I4, 1344465418I4, 1317578918I4, 1163467696I4, 2645261331I4
+	con 3981585269I4, 1433968975I4, 4256886989I4, 4114909542I4, 1817334375I4
+
+	exp $check
+	exp $_m_a_i_n
+	pro $_m_a_i_n, 0
+
+	lae src
+	loi 40
+	exg 20
+	cal $check
+	cal $finished
+	end /* $_m_a_i_n */
+
+	pro $check, 2 * EM_PSIZE + EM_WSIZE
+#define p1 (-1 * EM_PSIZE)
+#define p2 (-2 * EM_PSIZE)
+#define i (p2 - EM_WSIZE)
+
+	lae src
+	lal p2
+	sti EM_PSIZE /* p2 = src */
+	lal 0
+	adp 20
+	lal p1
+	sti EM_PSIZE /* p1 = exchanged copy + 20 */
+
+	/* Loop 40 times to verify each byte. */
+	loc 0
+	stl i
+1
+	lal p2
+	loi EM_PSIZE
+	loi 1 /* byte from src */
+	loc 1
+	loc EM_WSIZE
+	cii
+	lal p1
+	loi EM_PSIZE
+	loi 1 /* byte from exchanged copy */
+	loc 1
+	loc EM_WSIZE
+	cii
+	beq *2
+	lol i
+	loc EM_WSIZE
+	loc 4
+	cuu
+	cal $fail
+	asp 4
+2
+	lal p2
+	loi EM_PSIZE
+	adp 1
+	lal p2
+	sti EM_PSIZE /* increment p2 */
+	lal p1
+	loi EM_PSIZE /* p1 */
+	inl i
+	/* When i reaches 20, p1 would reach end of exchanged copy. */
+	lol i
+	loc 20
+	beq *3
+	adp 1 /* p1 + 1 */
+	bra *4
+3
+	adp -39 /* p1 - 39, beginning of exchanged copy */
+4
+	lal p1
+	sti EM_PSIZE /* move p1 */
+	lol i
+	loc 40
+	blt *1
+
+	ret 0
+	end /* $check */
\ No newline at end of file