Because lwzu and stwu update the base pointer as part of the load or store, I can remove an addi instruction from each loop, so the loops are slightly faster. I wrote a benchmark in Modula-2 that exercises some of these loops and timed it on my old PowerPC Mac: its user time decreases from 8.401s to 8.217s with the tighter loops.
		
			
				
	
	
		
			16 lines
		
	
	
	
		
			273 B
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			16 lines
		
	
	
	
		
			273 B
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
.sect .text
 | 
						|
 | 
						|
! Create empty set.
!  Stack: ( size -- set )
!
! Pops the byte count `size` from the stack, then pushes size / 4
! zero words in its place.  srwi truncates, so any remainder of
! size modulo 4 is ignored.  When size / 4 is zero, nothing is
! pushed and only the size word is dropped.
!
! Uses r3, r4, r7, ctr and cr0.

.define .zer
.zer:
	lwz	r3, 0(sp)		! r3 = size
	addi	sp, sp, 4		! drop size from stack
	srwi	r7, r3, 2		! r7 = size / 4
	cmpwi	r7, 0
	beq	2f			! no words to push; bdnz with ctr = 0
					! would wrap ctr to 0xffffffff
	mtspr	ctr, r7			! ctr = size / 4
	li	r4, 0
1:	stwu	r4, -4(sp)		! push zero (stwu also moves sp down)
	bdnz	1b			! loop ctr times
2:	blr
 |