MIPS: csum_partial: Improve instruction parallelism. Computing sum introduces true data dependency. This patch removes some true data depdendencies, hence increases instruction level parallelism. This patch brings up to 50% csum performance gain on Loongson 3a. One example about how this patch works is in CSUM_BIGCHUNK1: // ** original ** vs ** patch applied ** ADDC(sum, t0) ADDC(t0, t1) ADDC(sum, t1) ADDC(t2, t3) ADDC(sum, t2) ADDC(sum, t0) ADDC(sum, t3) ADDC(sum, t2) In the original implementation, each ADDC(sum, ...) depends on the sum value updated by previous ADDC(as source operand). With this patch applied, the first two ADDC operations are independent, hence can be executed simultaneously if possible. Another example is in the "copy and sum calculating chunk": // ** original ** vs ** patch applied ** STORE(t0, UNIT(0) ... STORE(t0, UNIT(0) ... ADDC(sum, t0) ADDC(t0, t1) STORE(t1, UNIT(1) ... STORE(t1, UNIT(1) ... ADDC(sum, t1) ADDC(sum, t0) STORE(t2, UNIT(2) ... STORE(t2, UNIT(2) ... ADDC(sum, t2) ADDC(t2, t3) STORE(t3, UNIT(3) ... STORE(t3, UNIT(3) ... ADDC(sum, t3) ADDC(sum, t2) With this patch applied, ADDC and the **next next** ADDC are independent. Signed-off-by: chenj <chenj@lemote.com> Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/9608/ Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

commit: 615eb603f4e1da4f151c7fac4aa175753a9913ec [log] [tgz]
author: Chen Jie <chenj@lemote.com> Fri Mar 27 01:07:24 2015 +0800
committer: Ralf Baechle <ralf@linux-mips.org> Wed Apr 01 17:22:11 2015 +0200
tree: 5f879da5f72fd016a77522a3bfea67a0486ac42c
parent: d548ca6b0784a99f0fcae397f115823ccd0361a5 [diff] [blame]
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index 4c721e2..ed88647 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S

@@ -76,10 +76,10 @@
 	LOAD	_t1, (offset + UNIT(1))(src);			\
 	LOAD	_t2, (offset + UNIT(2))(src);			\
 	LOAD	_t3, (offset + UNIT(3))(src);			\
+	ADDC(_t0, _t1);						\
+	ADDC(_t2, _t3);						\
 	ADDC(sum, _t0);						\
-	ADDC(sum, _t1);						\
-	ADDC(sum, _t2);						\
-	ADDC(sum, _t3)
+	ADDC(sum, _t2)
 
 #ifdef USE_DOUBLE
 #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
@@ -504,21 +504,21 @@
 	SUB	len, len, 8*NBYTES
 	ADD	src, src, 8*NBYTES
 	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
-	ADDC(sum, t0)
+	ADDC(t0, t1)
 	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
-	ADDC(sum, t1)
+	ADDC(sum, t0)
 	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
-	ADDC(sum, t2)
+	ADDC(t2, t3)
 	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
-	ADDC(sum, t3)
+	ADDC(sum, t2)
 	STORE(t4, UNIT(4)(dst),	.Ls_exc\@)
-	ADDC(sum, t4)
+	ADDC(t4, t5)
 	STORE(t5, UNIT(5)(dst),	.Ls_exc\@)
-	ADDC(sum, t5)
+	ADDC(sum, t4)
 	STORE(t6, UNIT(6)(dst),	.Ls_exc\@)
-	ADDC(sum, t6)
+	ADDC(t6, t7)
 	STORE(t7, UNIT(7)(dst),	.Ls_exc\@)
-	ADDC(sum, t7)
+	ADDC(sum, t6)
 	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 8*NBYTES
 	bgez	len, 1b
@@ -544,13 +544,13 @@
 	SUB	len, len, 4*NBYTES
 	ADD	src, src, 4*NBYTES
 	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
-	ADDC(sum, t0)
+	ADDC(t0, t1)
 	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
-	ADDC(sum, t1)
+	ADDC(sum, t0)
 	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
-	ADDC(sum, t2)
+	ADDC(t2, t3)
 	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
-	ADDC(sum, t3)
+	ADDC(sum, t2)
 	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 4*NBYTES
 	beqz	len, .Ldone\@
@@ -649,13 +649,13 @@
 	nop				# improves slotting
 #endif
 	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
-	ADDC(sum, t0)
+	ADDC(t0, t1)
 	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
-	ADDC(sum, t1)
+	ADDC(sum, t0)
 	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
-	ADDC(sum, t2)
+	ADDC(t2, t3)
 	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
-	ADDC(sum, t3)
+	ADDC(sum, t2)
 	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 4*NBYTES
 	bne	len, rem, 1b
commit	615eb603f4e1da4f151c7fac4aa175753a9913ec	[log] [tgz]
author	Chen Jie <chenj@lemote.com>	Fri Mar 27 01:07:24 2015 +0800
committer	Ralf Baechle <ralf@linux-mips.org>	Wed Apr 01 17:22:11 2015 +0200
tree	5f879da5f72fd016a77522a3bfea67a0486ac42c
parent	d548ca6b0784a99f0fcae397f115823ccd0361a5 [diff] [blame]