Use lwzu, stwu to tighten more loops.

Because lwzu or stwu advances the pointer as part of the load or store,
I can remove the separate addi instruction from each loop, so the loops
are slightly faster.

I wrote a benchmark in Modula-2 that exercises some of these loops.  I
measured its time on my old PowerPC Mac.  Its user time decreased from
8.401s to 8.217s with the tighter loops.
This commit is contained in:
George Koehler 2017-10-18 12:12:42 -04:00
parent ac2b0710c8
commit 459a9b5949
7 changed files with 70 additions and 79 deletions

View file

@ -1,22 +1,20 @@
.sect .text
! Set intersection.
! Stack: ( b a size -- a*b )
! Stack: ( a b size -- a*b )
!
! NOTE(review): this listing looks like a diff shown without +/- markers,
! so two versions of the routine appear interleaved: an indexed loop
! (lwzx/stwx with r6 as byte index, ending in `mr sp, r5`) and an
! update-form loop (lwzu, ending in `addi sp, sp, 4`).  Read them as
! alternatives, not as one instruction stream -- confirm against the file.
!
! Both loops run ctr = size / 4 times and only test ctr after the first
! pass (bdnz), so size is presumably a nonzero multiple of 4.
.define .and
.and:
lwz r3, 0(sp) ! r3 = size
srwi r7, r3, 2 ! r7 = size / 4 (number of words)
mtspr ctr, r7 ! ctr = size / 4
addi r4, sp, 4 ! r4 = ptr to set a
add r5, r4, r3 ! r5 = ptr to set b
li r6, 0 ! r6 = index
1:
lwzx r7, r4, r6 ! r7 = word at r4 + r6
lwzx r8, r5, r6 ! r8 = word at r5 + r6
and r8, r7, r8 ! intersection of words
stwx r8, r5, r6 ! result replaces the word at r5 + r6
addi r6, r6, 4 ! next word
add r4, sp, r3 ! r4 = pointer before set a
! Loop with r4 in set a and sp in set b.
1: lwzu r5, 4(r4) ! r4 += 4, then r5 = word of set a
lwzu r6, 4(sp) ! sp += 4, then r6 = word of set b
and r7, r5, r6 ! intersection of words
stw r7, 0(r4) ! result replaces the word of set a just loaded
bdnz 1b ! loop ctr times
mr sp, r5 ! (indexed version) sp = r5, where the result was stored
addi sp, sp, 4 ! drop last word of set b
blr

View file

@ -1,7 +1,7 @@
.sect .text
! Compare sets a, b.
! Stack: ( b a size -- result )
! Stack: ( a b size -- result )
! Result is 0 if equal, nonzero if not equal.
.define .cms
@ -9,22 +9,19 @@
! NOTE(review): this listing looks like a diff shown without +/- markers,
! so two versions of the routine appear interleaved: an indexed loop
! (lwzx with r6 as byte index, result in r9) and an update-form loop
! (lwzu, result in r3).  Read them as alternatives, not as one instruction
! stream -- confirm against the file.  The `.cms:` label is not visible here.
!
! Both loops run at most ctr = size / 4 times and only test ctr after the
! first pass (bdnz), so size is presumably a nonzero multiple of 4.
lwz r3, 0(sp) ! r3 = size of each set
srwi r7, r3, 2 ! r7 = size / 4 (number of words)
mtspr ctr, r7 ! ctr = size / 4
addi r4, sp, 4 ! r4 = ptr to set a
add r5, r4, r3 ! r5 = ptr to set b
li r6, 0 ! r6 = index
1:
lwzx r7, r4, r6 ! r7 = word at r4 + r6
lwzx r8, r5, r6 ! r8 = word at r5 + r6
cmpw cr0, r7, r8 ! compare words in sets
addi r6, r6, 4 ! next word (addi leaves cr0 untouched)
bne cr0, 2f ! branch if not equal
add r4, sp, r3 ! r4 = pointer before set a
add r7, r4, r3 ! r7 = sp + 2 * size = pointer to store result
! Loop with r4 in set a and sp in set b.
1: lwzu r5, 4(r4) ! r4 += 4, then r5 = word of set a
lwzu r6, 4(sp) ! sp += 4, then r6 = word of set b
cmpw r5, r6 ! compare words
bne 2f ! branch if not equal
bdnz 1b ! loop ctr times
li r9, 0 ! equal: return 0
li r3, 0 ! equal: return 0
b 3f
2:
li r9, 1 ! not equal: return 1
3:
slwi r7, r3, 1 ! r7 = 2 * size
add sp, sp, r7 ! adjust stack pointer
stw r9, 0(sp) ! push result
2: li r3, 1 ! not equal: return 1
3: mr sp, r7 ! sp = result slot computed before the loop
stw r3, 0(sp) ! push result
blr

View file

@ -5,16 +5,15 @@
! NOTE(review): this listing looks like a diff shown without +/- markers,
! so two versions of the routine appear interleaved: one that pops the size
! first and walks the set with lwz/stw/addi, and one that walks it with lwzu
! and pops the size at the end.  Read them as alternatives, not as one
! instruction stream -- confirm against the file.
!
! Both loops run ctr = size / 4 times and only test ctr after the first
! pass (bdnz), so size is presumably a nonzero multiple of 4.
.define .com
.com:
lwz r3, 0 (sp) ! size
addi sp, sp, 4 ! drop size; sp now points to set a
lwz r3, 0(sp) ! r3 = size
srwi r7, r3, 2 ! r7 = size / 4 (number of words)
mtspr ctr, r7 ! ctr = size / 4
mr r4, sp ! r4 = pointer before set a
mr r4, sp ! r4 = pointer to set a
srwi r5, r3, 2
mtspr ctr, r5 ! ctr = r3 / 4
1:
lwz r6, 0(r4)
nor r6, r6, r6 ! complement of word
stw r6, 0(r4) ! write the complement back in place
addi r4, r4, 4 ! next word
! Loop with r4 in set a.
1: lwzu r5, 4(r4) ! r4 += 4, then r5 = word of set a
nor r7, r5, r5 ! complement of word
stw r7, 0(r4) ! write the complement back over the word just loaded
bdnz 1b ! loop ctr times
addi sp, sp, 4 ! drop size from stack
blr

View file

@ -1,22 +1,20 @@
.sect .text
! Set union.
! Stack: ( b a size -- a+b )
! Stack: ( a b size -- a+b )
!
! NOTE(review): this listing looks like a diff shown without +/- markers,
! so two versions of the routine appear interleaved: an indexed loop
! (lwzx/stwx with r6 as byte index, ending in `mr sp, r5`) and an
! update-form loop (lwzu, ending in `addi sp, sp, 4`).  Read them as
! alternatives, not as one instruction stream -- confirm against the file.
!
! Both loops run ctr = size / 4 times and only test ctr after the first
! pass (bdnz), so size is presumably a nonzero multiple of 4.
.define .ior
.ior:
lwz r3, 0(sp) ! r3 = size
srwi r7, r3, 2 ! r7 = size / 4 (number of words)
mtspr ctr, r7 ! ctr = size / 4
addi r4, sp, 4 ! r4 = ptr to set a
add r5, r4, r3 ! r5 = ptr to set b
li r6, 0 ! r6 = index
1:
lwzx r7, r4, r6 ! r7 = word at r4 + r6
lwzx r8, r5, r6 ! r8 = word at r5 + r6
or r8, r7, r8 ! union of words
stwx r8, r5, r6 ! result replaces the word at r5 + r6
addi r6, r6, 4 ! next word
add r4, sp, r3 ! r4 = pointer before set a
! Loop with r4 in set a and sp in set b.
1: lwzu r5, 4(r4) ! r4 += 4, then r5 = word of set a
lwzu r6, 4(sp) ! sp += 4, then r6 = word of set b
or r7, r5, r6 ! union of words
stw r7, 0(r4) ! result replaces the word of set a just loaded
bdnz 1b ! loop ctr times
mr sp, r5 ! (indexed version) sp = r5, where the result was stored
addi sp, sp, 4 ! drop last word of set b
blr

View file

@ -1,22 +1,20 @@
.sect .text
! Set symmetric difference.
! Stack: ( b a size -- a/b )
! Stack: ( a b size -- a/b )
!
! NOTE(review): this listing looks like a diff shown without +/- markers,
! so two versions of the routine appear interleaved: an indexed loop
! (lwzx/stwx with r6 as byte index, ending in `mr sp, r5`) and an
! update-form loop (lwzu, ending in `addi sp, sp, 4`).  Read them as
! alternatives, not as one instruction stream -- confirm against the file.
!
! Both loops run ctr = size / 4 times and only test ctr after the first
! pass (bdnz), so size is presumably a nonzero multiple of 4.
.define .xor
.xor:
lwz r3, 0(sp) ! r3 = size
srwi r7, r3, 2 ! r7 = size / 4 (number of words)
mtspr ctr, r7 ! ctr = size / 4
addi r4, sp, 4 ! r4 = ptr to set a
add r5, r4, r3 ! r5 = ptr to set b
li r6, 0 ! r6 = index
1:
lwzx r7, r4, r6 ! r7 = word at r4 + r6
lwzx r8, r5, r6 ! r8 = word at r5 + r6
xor r8, r7, r8 ! symmetric difference of words
stwx r8, r5, r6 ! result replaces the word at r5 + r6
addi r6, r6, 4 ! next word
add r4, sp, r3 ! r4 = pointer before set a
! Loop with r4 in set a and sp in set b.
1: lwzu r5, 4(r4) ! r4 += 4, then r5 = word of set a
lwzu r6, 4(sp) ! sp += 4, then r6 = word of set b
xor r7, r5, r6 ! symmetric difference of words
stw r7, 0(r4) ! result replaces the word of set a just loaded
bdnz 1b ! loop ctr times
mr sp, r5 ! (indexed version) sp = r5, where the result was stored
addi sp, sp, 4 ! drop last word of set b
blr

View file

@ -6,14 +6,11 @@
! NOTE(review): this listing looks like a diff shown without +/- markers,
! so two versions of the routine appear interleaved: one that moves sp down
! by size and fills the set with stwx, and one that pushes zero words one at
! a time with stwu.  Read them as alternatives, not as one instruction
! stream -- confirm against the file.
!
! Both loops run ctr = size / 4 times and only test ctr after the first
! pass (bdnz), so size is presumably a nonzero multiple of 4.
.define .zer
.zer:
lwz r3, 0(sp) ! r3 = size
srwi r5, r3, 2
mtspr ctr, r5 ! ctr = word size - 4
li r4, 0 ! r4 = 0
addi sp, sp, 4
subf sp, r3, sp ! sp = ptr to new set
li r6, 0 ! r6 = index
1:
stwx r4, sp, r6 ! store zero in set
addi r6, r6, 4 ! next word
srwi r7, r3, 2 ! r7 = size / 4 (number of words)
mtspr ctr, r7 ! ctr = size / 4
addi sp, sp, 4 ! drop size from stack
li r4, 0 ! r4 = 0, the word to push
1: stwu r4, -4(sp) ! push zero: store at sp - 4, then sp -= 4
bdnz 1b ! loop ctr times
blr

View file

@ -1897,6 +1897,11 @@ PATTERNS
gen move %2, r4
leaving ret 0
/*
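* These rules for blm/bls are wrong if length is zero: they load
* the word count into ctr and loop with bdnz, which decrements ctr
* before testing it, so a count of zero wraps around instead of
* skipping the loop.  Several procedures in libem have the same
* problem.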
*/
pat blm /* Block move constant length */
leaving
loc $1
@ -1904,15 +1909,15 @@ PATTERNS
pat bls /* Block move variable length */
/*
 * NOTE(review): this listing looks like a diff shown without +/-
 * markers, so two versions of the rule appear interleaved: one with
 * two scratch registers and an indexed lwzx/stwx loop, and one with
 * three scratch registers and an lwzu/stwu loop.  Read them as
 * alternatives, not as one rule -- confirm against the file.
 */
with REG REG REG
uses reusing %1, REG, REG={CONST_0000_7FFF, 0}
/* ( src%3 dst%2 len%1 -- ) */
uses reusing %1, REG, REG, REG
gen
/* Wrong if size is zero */
srwi %1, %1, {CONST, 2} /* %1 = len / 4 (number of words) */
mtspr ctr, %1
1:
lwzx %a, %3, %b /* %a = word at src + %b */
stwx %a, %2, %b /* store it at dst + %b */
addi %b, %b, {CONST, 4} /* next word */
srwi %a, %1, {CONST, 2} /* %a = len / 4 (number of words) */
mtspr ctr, %a
addi %b, %3, {CONST, 0-4} /* %b = src - 4, so the first lwzu reads src */
addi %c, %2, {CONST, 0-4} /* %c = dst - 4, so the first stwu writes dst */
1: lwzu %a, {IND_RC_W, %b, 4} /* %b += 4, then %a = word at %b */
stwu %a, {IND_RC_W, %c, 4} /* %c += 4, then store %a at %c */
bdnz {LABEL, "1b"} /* loop ctr times */
pat csa /* Array-lookup switch */
@ -1987,8 +1992,7 @@ PATTERNS
REG={CONST_0000_7FFF, $1-1}
gen
mtspr ctr, %b
1:
lwz %a, {IND_RC_W, %a, SL_OFFSET}
1: lwz %a, {IND_RC_W, %a, SL_OFFSET}
bdnz {LABEL, "1b"}
yields %a