Revise stack shuffles and integer conversions in PowerPC ncg.
Allow asp 4, exg 4 to shuffle tokens without coercing them into registers; but comment why dup 4, dup 8 coerce tokens into registers. Allow dup, dus, exg with larger sizes; and add tests dup_e.e and exg_e.e to check that dup 20, dus, exg 20 work as well in powerpc as in i80 and i86. Then powerpc failed to compile loc 2 loc 4 cuu in dup_e.e. Revise the integer conversions, so powerpc can compile and pass the test.
This commit is contained in:
parent
48788287b8
commit
504d2aa34e
|
@ -6,7 +6,7 @@ for _, plat in ipairs(vars.plats) do
|
|||
acklibrary {
|
||||
name = "lib_"..plat,
|
||||
srcs = {
|
||||
"./*.s", -- rm ret.s
|
||||
"./*.s", -- exg.s
|
||||
},
|
||||
vars = { plat = plat },
|
||||
deps = {
|
||||
|
|
22
mach/powerpc/libem/exg.s
Normal file
22
mach/powerpc/libem/exg.s
Normal file
|
@ -0,0 +1,22 @@
|
|||
.sect .text
|
||||
|
||||
! Exchange the top two values on the stack; each value is size bytes long.
|
||||
! Stack: ( a b size -- b a ), where size is the word at 0(sp) on entry.
! Uses r3-r7 and ctr as scratch; swaps the values in place, word by word.
! NOTE(review): the loop count is size / 4, so this assumes size is a
! nonzero multiple of 4.  With ctr = 0, bdnz decrements before testing
! and would not stop after zero passes -- confirm callers never pass that.
|
||||
|
||||
.define .exg
|
||||
.exg:
|
||||
lwz r3, 0(sp) ! r3 = size in bytes
|
||||
srwi r7, r3, 2 ! r7 = size / 4 = words per value
|
||||
mtspr ctr, r7 ! ctr = size / 4
|
||||
mr r4, sp ! r4 = sp, 4 bytes before value b (the size word is in between)
|
||||
add r5, r4, r3 ! r5 = sp + size, 4 bytes before value a
|
||||
|
||||
! Loop to swap each pair of words.
|
||||
1: lwzu r6, 4(r4) ! r4 += 4; r6 = next word of b
|
||||
lwzu r7, 4(r5) ! r5 += 4; r7 = next word of a
|
||||
stw r6, 0(r5) ! word of b goes into a's slot
|
||||
stw r7, 0(r4) ! word of a goes into b's slot
|
||||
bdnz 1b ! loop ctr times
|
||||
|
||||
addi sp, sp, 4 ! drop size from stack
|
||||
blr ! return through the link register
|
|
@ -217,6 +217,9 @@ SETS
|
|||
XOR_RIS + XOR_RC + XOR_RR + NAND_RR + NOR_RR + EQV_RR +
|
||||
XEQ + XNE + XGT + XGE + XLT + XLE.
|
||||
|
||||
/* any register or token of each size */
|
||||
ANY4 = ANY_BHW + FSREG.
|
||||
ANY8 = IND_ALL_D + FREG.
|
||||
|
||||
INSTRUCTIONS
|
||||
|
||||
|
@ -756,7 +759,7 @@ COERCIONS
|
|||
|
||||
PATTERNS
|
||||
|
||||
/* Intrinsics */
|
||||
/* Constants */
|
||||
|
||||
pat loc $1==(0-0x8000) /* Load constant */
|
||||
yields {CONST_N8000, $1}
|
||||
|
@ -773,22 +776,78 @@ PATTERNS
|
|||
pat loc
|
||||
yields {CONST_HL, $1}
|
||||
|
||||
pat dup $1==INT32 /* Duplicate word on top of stack */
|
||||
with REG
|
||||
yields %1 %1
|
||||
with FSREG
|
||||
|
||||
/* Stack shuffles */
|
||||
|
||||
/* The peephole optimizer does: loc $1 ass 4 -> asp $1
|
||||
* To optimize multiplication, it uses: dup 8 asp 4
|
||||
*/
|
||||
|
||||
pat asp $1==4 /* Adjust stack by constant */
|
||||
with exact ANY4
|
||||
/* drop %1 */
|
||||
with STACK
|
||||
gen addi sp, sp, {C, 4}
|
||||
pat asp smalls($1)
|
||||
with STACK
|
||||
gen addi sp, sp, {C, $1}
|
||||
pat asp lo($1)==0
|
||||
with STACK
|
||||
gen addi sp, sp, {C, hi($1)}
|
||||
pat asp
|
||||
with STACK
|
||||
gen
|
||||
addis sp, sp, {C, his($1)}
|
||||
addi sp, sp, {C, los($1)}
|
||||
|
||||
pat ass $1==4 /* Adjust stack by variable */
|
||||
with REG STACK
|
||||
gen add sp, sp, %1
|
||||
|
||||
/* To duplicate a token, we coerce the token into a register,
|
||||
* then duplicate the register. This decreases code size.
|
||||
*/
|
||||
|
||||
pat dup $1==4 /* Duplicate word on top of stack */
|
||||
with REG+FSREG
|
||||
yields %1 %1
|
||||
|
||||
pat dup $1==INT64 /* Duplicate double-word on top of stack */
|
||||
with REG REG
|
||||
pat dup $1==8 /* Duplicate double-word */
|
||||
with REG+FSREG REG+FSREG
|
||||
yields %2 %1 %2 %1
|
||||
with FREG
|
||||
yields %1 %1
|
||||
|
||||
pat exg $1==INT32 /* Exchange top two words on stack */
|
||||
with REG REG
|
||||
pat dup /* Duplicate other size */
|
||||
leaving
|
||||
loc $1
|
||||
dus 4
|
||||
|
||||
pat dus $1==4 /* Duplicate variable size; the size itself is a 4-byte word */
|
||||
with REG STACK
|
||||
/* ( a size%1 -- a a ); %1 = size in bytes, a lies in memory at sp */
|
||||
uses REG, REG
|
||||
gen
|
||||
srwi %a, %1, {C, 2} /* %a = size / 4 = number of words */
|
||||
mtspr ctr, %a /* ctr = number of words; %a is free again */
|
||||
add %b, sp, %1 /* %b = address just past the end of a */
|
||||
1: lwzu %a, {IND_RC_W, %b, 0-4} /* %b -= 4; %a = word of a */
|
||||
stwu %a, {IND_RC_W, sp, 0-4} /* sp -= 4; push that word */
|
||||
bdnz {LABEL, "1b"} /* loop ctr times, from a's highest word down */
|
||||
|
||||
pat exg $1==4 /* Exchange top two words */
|
||||
with ANY4 ANY4
|
||||
yields %1 %2
|
||||
|
||||
pat exg defined($1) /* Exchange other size */
|
||||
leaving
|
||||
loc $1
|
||||
cal ".exg"
|
||||
|
||||
pat exg !defined($1)
|
||||
leaving
|
||||
cal ".exg"
|
||||
|
||||
pat ste loe $1==$2 /* Store then load external */
|
||||
leaving
|
||||
dup 4
|
||||
|
@ -797,32 +856,30 @@ PATTERNS
|
|||
|
||||
/* Type conversions */
|
||||
|
||||
pat loc loc ciu /* signed X -> unsigned X */
|
||||
pat loc loc ciu /* signed -> unsigned */
|
||||
leaving
|
||||
loc $1
|
||||
loc $2
|
||||
cuu
|
||||
|
||||
pat loc loc cuu $1==$2 /* unsigned X -> unsigned X */
|
||||
pat loc loc cui /* unsigned -> signed */
|
||||
leaving
|
||||
loc $1
|
||||
loc $2
|
||||
cuu
|
||||
|
||||
pat loc loc cuu $1<=4 && $2<=4 /* unsigned -> unsigned */
|
||||
/* nop */
|
||||
|
||||
pat loc loc cii $1==$2 /* signed X -> signed X */
|
||||
/* nop */
|
||||
pat loc loc cii $1<=4 && $2<=$1
|
||||
/* signed -> signed of smaller or same size,
|
||||
* no sign extension */
|
||||
|
||||
pat loc loc cui $1==$2 /* unsigned X -> signed X */
|
||||
/* nop */
|
||||
|
||||
pat loc loc cui $1==INT8 && $2==INT32 /* unsigned char -> signed int */
|
||||
/* nop */
|
||||
|
||||
pat loc loc cui $1==INT16 && $2==INT32 /* unsigned short -> signed int */
|
||||
/* nop */
|
||||
|
||||
pat loc loc cii $1==INT8 && $2==INT32 /* signed char -> signed int */
|
||||
pat loc loc cii $1==1 && $2<=4 /* sign-extend char */
|
||||
with REG
|
||||
yields {SEX_B, %1}
|
||||
|
||||
pat loc loc cii $1==2 && $2==4 /* signed short -> signed int */
|
||||
pat loc loc cii $1==2 && $2<=4 /* sign-extend short */
|
||||
with REG
|
||||
yields {SEX_H, %1}
|
||||
|
||||
|
@ -2038,33 +2095,6 @@ PATTERNS
|
|||
gen
|
||||
move %1, sp
|
||||
|
||||
pat loc ass $1==4 && $2==4 /* Drop 4 bytes from stack */
|
||||
with exact REG
|
||||
/* nop */
|
||||
with STACK
|
||||
gen
|
||||
addi sp, sp, {C, 4}
|
||||
|
||||
pat ass $1==4 /* Adjust stack by variable amount */
|
||||
with CONST2 STACK
|
||||
gen
|
||||
move {SUM_RC, sp, %1.val}, sp
|
||||
with CONST_HZ STACK
|
||||
gen
|
||||
move {SUM_RC, sp, his(%1.val)}, sp
|
||||
with CONST_STACK-CONST2-CONST_HZ STACK
|
||||
gen
|
||||
move {SUM_RC, sp, his(%1.val)}, sp
|
||||
move {SUM_RC, sp, los(%1.val)}, sp
|
||||
with REG STACK
|
||||
gen
|
||||
move {SUM_RR, sp, %1}, sp
|
||||
|
||||
pat asp /* Adjust stack by constant amount */
|
||||
leaving
|
||||
loc $1
|
||||
ass 4
|
||||
|
||||
pat lae rck $2==4 /* Range check */
|
||||
with REG
|
||||
kills ALL
|
||||
|
|
|
@ -10,6 +10,8 @@ definerule("plat_testsuite",
|
|||
-- target names will resolve there.
|
||||
local testfiles = filenamesof(
|
||||
"tests/plat/*.c",
|
||||
"tests/plat/dup_e.e",
|
||||
"tests/plat/exg_e.e",
|
||||
"tests/plat/inn_e.e",
|
||||
"tests/plat/rotate_e.e",
|
||||
"tests/plat/*.p",
|
||||
|
|
145
tests/plat/dup_e.e
Normal file
145
tests/plat/dup_e.e
Normal file
|
@ -0,0 +1,145 @@
|
|||
#
|
||||
mes 2, EM_WSIZE, EM_PSIZE
|
||||
|
||||
/*
|
||||
* Test _dup_ and _dus_ by loading 20 bytes from _src_, then making
|
||||
* and checking some duplicates.
|
||||
*/
|
||||
|
||||
exa src
|
||||
exa size
|
||||
src
|
||||
con 3593880729I4, 782166578I4, 4150666996I4, 2453272937I4, 3470523049I4
|
||||
size
|
||||
con 20I2
|
||||
|
||||
exp $check
|
||||
exp $_m_a_i_n
|
||||
pro $_m_a_i_n, 0
|
||||
|
||||
/* Push 3 copies of src on stack. */
|
||||
lae src /* address of the 20-byte test data */
|
||||
loi 20 /* 1st copy */
|
||||
dup 20 /* 2nd copy */
|
||||
lae size
|
||||
loi 2 /* load the 2-byte value stored at size (20) */
|
||||
loc 2
|
||||
loc EM_WSIZE
|
||||
cuu /* convert that size from 2 bytes to EM_WSIZE bytes, unsigned */
|
||||
dus EM_WSIZE /* 3rd copy */
|
||||
|
||||
cal $check /* compare the three copies against src */
|
||||
cal $finished /* not defined in this file; presumably supplied by the test harness */
|
||||
end /* $_m_a_i_n */
|
||||
|
||||
pro $check, 4 * EM_PSIZE + 2 * EM_WSIZE
/* Locals: pointers p1..p4, the current src byte b, and the loop index i. */
/* On a mismatch, calls $fail with (copy number * 256 + i) as a 4-byte value. */
|
||||
#define p1 (-1 * EM_PSIZE)
|
||||
#define p2 (-2 * EM_PSIZE)
|
||||
#define p3 (-3 * EM_PSIZE)
|
||||
#define p4 (-4 * EM_PSIZE)
|
||||
#define b (p4 - 1 * EM_WSIZE)
|
||||
#define i (p4 - 2 * EM_WSIZE)
|
||||
|
||||
/* Set pointers to src and to its 3 copies. */
|
||||
lae src
|
||||
lal p4
|
||||
sti EM_PSIZE /* p4 = src */
|
||||
lal 0
|
||||
lal p3
|
||||
sti EM_PSIZE /* p3 = 3rd copy */
|
||||
lal 20
|
||||
lal p2
|
||||
sti EM_PSIZE /* p2 = 2nd copy */
|
||||
lal 40
|
||||
lal p1
|
||||
sti EM_PSIZE /* p1 = 1st copy */
|
||||
|
||||
/* Loop 20 times to verify each byte. */
|
||||
loc 0
|
||||
stl i
|
||||
4
|
||||
lal p4
|
||||
loi EM_PSIZE
|
||||
loi 1
|
||||
loc 1
|
||||
loc EM_WSIZE
|
||||
cii
|
||||
stl b /* b = byte from src */
|
||||
lol b
|
||||
lal p3
|
||||
loi EM_PSIZE
|
||||
loi 1 /* byte from 3rd copy */
|
||||
loc 1
|
||||
loc EM_WSIZE
|
||||
cii
|
||||
beq *3
|
||||
loc (3 * 256) /* failure code = 3 * 256 + i */
|
||||
lol i
|
||||
adi EM_WSIZE
|
||||
loc EM_WSIZE
|
||||
loc 4
|
||||
cuu
|
||||
cal $fail
|
||||
asp 4
|
||||
3
|
||||
lol b
|
||||
lal p2
|
||||
loi EM_PSIZE
|
||||
loi 1 /* byte from 2nd copy */
|
||||
loc 1
|
||||
loc EM_WSIZE
|
||||
cii
|
||||
beq *2
|
||||
loc (2 * 256) /* failure code = 2 * 256 + i */
|
||||
lol i
|
||||
adi EM_WSIZE
|
||||
loc EM_WSIZE
|
||||
loc 4
|
||||
cuu
|
||||
cal $fail
|
||||
asp 4
|
||||
2
|
||||
lol b
|
||||
lal p1
|
||||
loi EM_PSIZE
|
||||
loi 1 /* byte from 1st copy */
|
||||
loc 1
|
||||
loc EM_WSIZE
|
||||
cii
|
||||
beq *1
|
||||
loc (1 * 256) /* failure code = 1 * 256 + i */
|
||||
lol i
|
||||
adi EM_WSIZE
|
||||
loc EM_WSIZE
|
||||
loc 4
|
||||
cuu
|
||||
cal $fail
|
||||
asp 4
|
||||
1
|
||||
lal p4
|
||||
loi EM_PSIZE
|
||||
adp 1
|
||||
lal p4
|
||||
sti EM_PSIZE /* increment p4 */
|
||||
lal p3
|
||||
loi EM_PSIZE
|
||||
adp 1
|
||||
lal p3
|
||||
sti EM_PSIZE /* increment p3 */
|
||||
lal p2
|
||||
loi EM_PSIZE
|
||||
adp 1
|
||||
lal p2
|
||||
sti EM_PSIZE /* increment p2 */
|
||||
lal p1
|
||||
loi EM_PSIZE
|
||||
adp 1
|
||||
lal p1
|
||||
sti EM_PSIZE /* increment p1 */
|
||||
inl i
|
||||
lol i
|
||||
loc 20
|
||||
blt *4 /* loop 20 times */
|
||||
|
||||
ret 0
|
||||
end /* $check */
|
86
tests/plat/exg_e.e
Normal file
86
tests/plat/exg_e.e
Normal file
|
@ -0,0 +1,86 @@
|
|||
#
|
||||
mes 2, EM_WSIZE, EM_PSIZE
|
||||
|
||||
/*
|
||||
* Test _exg_ by loading 40 bytes from _src_, then exchanging 20 and
|
||||
* 20 bytes, and checking the result.
|
||||
*/
|
||||
|
||||
exa src
|
||||
src
|
||||
con 1539465570I4, 1344465418I4, 1317578918I4, 1163467696I4, 2645261331I4
|
||||
con 3981585269I4, 1433968975I4, 4256886989I4, 4114909542I4, 1817334375I4
|
||||
|
||||
exp $check
|
||||
exp $_m_a_i_n
|
||||
pro $_m_a_i_n, 0
|
||||
|
||||
lae src /* address of the 40-byte test data */
|
||||
loi 40 /* push all 40 bytes of src */
|
||||
exg 20 /* exchange the two 20-byte values on top of the stack */
|
||||
cal $check /* verify the exchanged bytes against src */
|
||||
cal $finished /* not defined in this file; presumably supplied by the test harness */
|
||||
end /* $_m_a_i_n */
|
||||
|
||||
pro $check, 2 * EM_PSIZE + EM_WSIZE
/* Locals: pointers p1 and p2, and the loop index i. */
/* Byte i of src must equal the exchanged copy's byte at offset (i + 20) mod 40. */
/* On a mismatch, calls $fail with i converted to a 4-byte value. */
|
||||
#define p1 (-1 * EM_PSIZE)
|
||||
#define p2 (-2 * EM_PSIZE)
|
||||
#define i (p2 - EM_WSIZE)
|
||||
|
||||
lae src
|
||||
lal p2
|
||||
sti EM_PSIZE /* p2 = src */
|
||||
lal 0
|
||||
adp 20
|
||||
lal p1
|
||||
sti EM_PSIZE /* p1 = exchanged copy + 20 */
|
||||
|
||||
/* Loop 40 times to verify each byte. */
|
||||
loc 0
|
||||
stl i
|
||||
1
|
||||
lal p2
|
||||
loi EM_PSIZE
|
||||
loi 1 /* byte from src */
|
||||
loc 1
|
||||
loc EM_WSIZE
|
||||
cii
|
||||
lal p1
|
||||
loi EM_PSIZE
|
||||
loi 1 /* byte from exchanged copy */
|
||||
loc 1
|
||||
loc EM_WSIZE
|
||||
cii
|
||||
beq *2
|
||||
lol i /* mismatch: report the byte index */
|
||||
loc EM_WSIZE
|
||||
loc 4
|
||||
cuu
|
||||
cal $fail
|
||||
asp 4
|
||||
2
|
||||
lal p2
|
||||
loi EM_PSIZE
|
||||
adp 1
|
||||
lal p2
|
||||
sti EM_PSIZE /* increment p2 */
|
||||
lal p1
|
||||
loi EM_PSIZE /* p1; this value stays on the stack across the branches below */
|
||||
inl i
|
||||
/* When i reaches 20, p1 would reach end of exchanged copy. */
|
||||
lol i
|
||||
loc 20
|
||||
beq *3
|
||||
adp 1 /* p1 + 1 */
|
||||
bra *4
|
||||
3
|
||||
adp -39 /* p1 - 39, beginning of exchanged copy */
|
||||
4
|
||||
lal p1
|
||||
sti EM_PSIZE /* move p1 */
|
||||
lol i
|
||||
loc 40
|
||||
blt *1 /* loop 40 times */
|
||||
|
||||
ret 0
|
||||
end /* $check */
|
Loading…
Reference in a new issue