Because the update-form instructions lwzu and stwu write the effective address back into the base register, they move the pointer as a side effect of the load or store. That lets me remove an addi instruction from the loop, so the loop is slightly faster. I wrote a benchmark in Modula-2 that exercises some of these loops and measured its time on my old PowerPC Mac: its user time decreased from 8.401s to 8.217s with the tighter loops.
		
			
				
	
	
		
			20 lines
		
	
	
	
		
			400 B
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			20 lines
		
	
	
	
		
			400 B
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
.sect .text

! .ior - set union: OR two equal-sized sets word by word.
!
!  Stack: ( a b size -- a+b )
!
!  On entry:  0(sp)          = size, the byte length of each set
!             4(sp) ...      = set b (size bytes)
!             4+size(sp) ... = set a (size bytes)
!  On exit:   sp points at the first word of set a, which has been
!             overwritten in place with a OR b; the size word and
!             set b are dropped.
!  Writes:    r3, r4, r5, r6, r7, ctr
!
!  NOTE(review): assumes size is a nonzero multiple of 4. A size
!  below 4 would load ctr with 0, and bdnz decrements before it
!  tests -- confirm that callers never pass such a size.

.define .ior
.ior:
	lwz	r3, 0(sp)		! r3 = size
	add	r4, sp, r3		! r4 = last word of b, one word before set a
	srwi	r7, r3, 2
	mtspr	ctr, r7			! ctr = size / 4 = words per set

	! Each lwzu writes its effective address back into the base
	! register, so sp steps through set b and r4 steps through
	! set a without a separate addi.
1:	lwzu	r6, 4(sp)		! r6 = next word of b
	lwzu	r5, 4(r4)		! r5 = next word of a
	or	r7, r6, r5		! union of words
	stw	r7, 0(r4)		! result replaces the word of a
	bdnz	1b			! loop ctr times
	addi	sp, sp, 4		! step past last word of b; sp = result
	blr