powerpc/32: optimise memset()

There is no need to replicate the set value across a full word when the
length is less than 4, since in that case only byte stores are
performed. We can therefore branch immediately to the code handling
that case.
By separating it from the normal case, we also eliminate a few
adjustments of the destination pointer.
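
As a minimal C sketch of the resulting control flow (illustrative
only; the function name and the simplified word loop are assumptions,
the authoritative implementation is the assembly below):

	#include <stddef.h>
	#include <string.h>

	static void *memset_sketch(void *s, int c, size_t n)
	{
		unsigned char *p = s;
		unsigned int w;

		if (n < 4) {	/* short path (label 7 below): byte */
			while (n--)	/* stores only, no need to  */
				*p++ = (unsigned char)c; /* build a word */
			return s;
		}

		/* Normal case: replicate the byte into all four bytes
		 * of a word, the job of the two rlwimi instructions. */
		w = (unsigned char)c;
		w |= w << 8;
		w |= w << 16;
		while (n >= 4) {
			memcpy(p, &w, 4);	/* stands in for stw/stwu */
			p += 4;
			n -= 4;
		}
		while (n--)	/* trailing bytes */
			*p++ = (unsigned char)c;
		return s;
	}

The sketch omits the alignment and cacheline handling of the real
routine; the point is only that the sub-word case never touches the
rlwimi pair.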

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index a3ffeac..05aaee2 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -91,17 +91,17 @@
  * replaced by a nop once cache is active. This is done in machine_init()
  */
 _GLOBAL(memset)
+	cmplwi	0,r5,4
+	blt	7f
+
 	rlwimi	r4,r4,8,16,23
 	rlwimi	r4,r4,16,0,15
 
-	addi	r6,r3,-4
-	cmplwi	0,r5,4
-	blt	7f
-	stwu	r4,4(r6)
+	stw	r4,0(r3)
 	beqlr
-	andi.	r0,r6,3
+	andi.	r0,r3,3
 	add	r5,r0,r5
-	subf	r6,r0,r6
+	subf	r6,r0,r3
 	cmplwi	0,r4,0
 	bne	2f	/* Use normal procedure if r4 is not zero */
 _GLOBAL(memset_nocache_branch)
@@ -132,13 +132,20 @@
 1:	stwu	r4,4(r6)
 	bdnz	1b
 6:	andi.	r5,r5,3
-7:	cmpwi	0,r5,0
 	beqlr
 	mtctr	r5
 	addi	r6,r6,3
 8:	stbu	r4,1(r6)
 	bdnz	8b
 	blr
+
+7:	cmpwi	0,r5,0
+	beqlr
+	mtctr	r5
+	addi	r6,r3,-1
+9:	stbu	r4,1(r6)
+	bdnz	9b
+	blr
 EXPORT_SYMBOL(memset)
 
 /*