arm64: atomics: prefetch the destination word for write prior to stxr

The cost of changing a cacheline from shared to exclusive state can be
significant, especially when this is triggered by an exclusive store,
since it may result in having to retry the transaction.

This patch uses prfm to prefetch cachelines for write prior to the
ldxr/stxr loops in the ll/sc atomic routines, so that the line can
already be held in a writeable state when the store-exclusive executes.
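
As an illustration only (a minimal, standalone sketch: the function
name example_atomic_add and the userspace framing are assumptions for
this example; the real change lives in the ATOMIC_OP()/cmpxchg macros
touched below), the shape of an LL/SC sequence with the prefetch added
is:

	static inline void example_atomic_add(int i, int *v)
	{
		unsigned long tmp;
		int result;

		asm volatile("// example_atomic_add\n"
		"	prfm	pstl1strm, %2\n"	/* pull the line in for write */
		"1:	ldxr	%w0, %2\n"		/* load-exclusive */
		"	add	%w0, %w0, %w3\n"
		"	stxr	%w1, %w0, %2\n"		/* store-exclusive, %w1 = status */
		"	cbnz	%w1, 1b"		/* non-zero status => retry */
		: "=&r" (result), "=&r" (tmp), "+Q" (*v)
		: "Ir" (i));
	}

PSTL1STRM requests a prefetch for store, targeting the L1 cache, with a
streaming (non-temporal) hint, which matches what is added to each
ldxr/stxr and ldxp/stxp sequence in the hunks below.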

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index 5a9fb37..50d6abd 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -45,6 +45,7 @@
 	int result;							\
 									\
 	asm volatile("// atomic_" #op "\n"				\
+"	prfm	pstl1strm, %2\n"					\
 "1:	ldxr	%w0, %2\n"						\
 "	" #asm_op "	%w0, %w0, %w3\n"				\
 "	stxr	%w1, %w0, %2\n"						\
@@ -62,6 +63,7 @@
 	int result;							\
 									\
 	asm volatile("// atomic_" #op "_return\n"			\
+"	prfm	pstl1strm, %2\n"					\
 "1:	ldxr	%w0, %2\n"						\
 "	" #asm_op "	%w0, %w0, %w3\n"				\
 "	stlxr	%w1, %w0, %2\n"						\
@@ -98,6 +100,7 @@
 	int oldval;
 
 	asm volatile("// atomic_cmpxchg\n"
+"	prfm	pstl1strm, %2\n"
 "1:	ldxr	%w1, %2\n"
 "	eor	%w0, %w1, %w3\n"
 "	cbnz	%w0, 2f\n"
@@ -121,6 +124,7 @@
 	unsigned long tmp;						\
 									\
 	asm volatile("// atomic64_" #op "\n"				\
+"	prfm	pstl1strm, %2\n"					\
 "1:	ldxr	%0, %2\n"						\
 "	" #asm_op "	%0, %0, %3\n"					\
 "	stxr	%w1, %0, %2\n"						\
@@ -138,6 +142,7 @@
 	unsigned long tmp;						\
 									\
 	asm volatile("// atomic64_" #op "_return\n"			\
+"	prfm	pstl1strm, %2\n"					\
 "1:	ldxr	%0, %2\n"						\
 "	" #asm_op "	%0, %0, %3\n"					\
 "	stlxr	%w1, %0, %2\n"						\
@@ -174,6 +179,7 @@
 	unsigned long res;
 
 	asm volatile("// atomic64_cmpxchg\n"
+"	prfm	pstl1strm, %2\n"
 "1:	ldxr	%1, %2\n"
 "	eor	%0, %1, %3\n"
 "	cbnz	%w0, 2f\n"
@@ -196,6 +202,7 @@
 	unsigned long tmp;
 
 	asm volatile("// atomic64_dec_if_positive\n"
+"	prfm	pstl1strm, %2\n"
 "1:	ldxr	%0, %2\n"
 "	subs	%0, %0, #1\n"
 "	b.mi	2f\n"
@@ -220,6 +227,7 @@
 	unsigned long tmp, oldval;					\
 									\
 	asm volatile(							\
+	"	prfm	pstl1strm, %2\n"				\
 	"1:	ldxr" #sz "\t%" #w "[oldval], %[v]\n"			\
 	"	eor	%" #w "[tmp], %" #w "[oldval], %" #w "[old]\n"	\
 	"	cbnz	%" #w "[tmp], 2f\n"				\
@@ -259,6 +267,7 @@
 	unsigned long tmp, ret;						\
 									\
 	asm volatile("// __cmpxchg_double" #name "\n"			\
+	"	prfm	pstl1strm, %2\n"				\
 	"1:	ldxp	%0, %1, %2\n"					\
 	"	eor	%0, %0, %3\n"					\
 	"	eor	%1, %1, %4\n"					\
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index f702126..7bfda09 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -33,12 +33,14 @@
 	case 1:
 		asm volatile(ARM64_LSE_ATOMIC_INSN(
 		/* LL/SC */
+		"	prfm	pstl1strm, %2\n"
 		"1:	ldxrb	%w0, %2\n"
 		"	stlxrb	%w1, %w3, %2\n"
 		"	cbnz	%w1, 1b\n"
 		"	dmb	ish",
 		/* LSE atomics */
 		"	nop\n"
+		"	nop\n"
 		"	swpalb	%w3, %w0, %2\n"
 		"	nop\n"
 		"	nop")
@@ -49,12 +51,14 @@
 	case 2:
 		asm volatile(ARM64_LSE_ATOMIC_INSN(
 		/* LL/SC */
+		"	prfm	pstl1strm, %2\n"
 		"1:	ldxrh	%w0, %2\n"
 		"	stlxrh	%w1, %w3, %2\n"
 		"	cbnz	%w1, 1b\n"
 		"	dmb	ish",
 		/* LSE atomics */
 		"	nop\n"
+		"	nop\n"
 		"	swpalh	%w3, %w0, %2\n"
 		"	nop\n"
 		"	nop")
@@ -65,12 +69,14 @@
 	case 4:
 		asm volatile(ARM64_LSE_ATOMIC_INSN(
 		/* LL/SC */
+		"	prfm	pstl1strm, %2\n"
 		"1:	ldxr	%w0, %2\n"
 		"	stlxr	%w1, %w3, %2\n"
 		"	cbnz	%w1, 1b\n"
 		"	dmb	ish",
 		/* LSE atomics */
 		"	nop\n"
+		"	nop\n"
 		"	swpal	%w3, %w0, %2\n"
 		"	nop\n"
 		"	nop")
@@ -81,12 +87,14 @@
 	case 8:
 		asm volatile(ARM64_LSE_ATOMIC_INSN(
 		/* LL/SC */
+		"	prfm	pstl1strm, %2\n"
 		"1:	ldxr	%0, %2\n"
 		"	stlxr	%w1, %3, %2\n"
 		"	cbnz	%w1, 1b\n"
 		"	dmb	ish",
 		/* LSE atomics */
 		"	nop\n"
+		"	nop\n"
 		"	swpal	%3, %0, %2\n"
 		"	nop\n"
 		"	nop")
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 775e85b..007a69f 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -30,6 +30,7 @@
 	asm volatile(							\
 	ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN,		\
 		    CONFIG_ARM64_PAN)					\
+"	prfm	pstl1strm, %2\n"					\
 "1:	ldxr	%w1, %2\n"						\
 	insn "\n"							\
 "2:	stlxr	%w3, %w0, %2\n"						\
@@ -120,6 +121,7 @@
 		return -EFAULT;
 
 	asm volatile("// futex_atomic_cmpxchg_inatomic\n"
+"	prfm	pstl1strm, %2\n"
 "1:	ldxr	%w1, %2\n"
 "	sub	%w3, %w1, %w4\n"
 "	cbnz	%w3, 3f\n"
diff --git a/arch/arm64/lib/bitops.S b/arch/arm64/lib/bitops.S
index bc18457..43ac736 100644
--- a/arch/arm64/lib/bitops.S
+++ b/arch/arm64/lib/bitops.S
@@ -31,6 +31,7 @@
 	eor	w0, w0, w3		// Clear low bits
 	mov	x2, #1
 	add	x1, x1, x0, lsr #3	// Get word offset
+alt_lse "	prfm	pstl1strm, [x1]",	"nop"
 	lsl	x3, x2, x3		// Create mask
 
 alt_lse	"1:	ldxr	x2, [x1]",		"\lse	x3, [x1]"
@@ -48,6 +49,7 @@
 	eor	w0, w0, w3		// Clear low bits
 	mov	x2, #1
 	add	x1, x1, x0, lsr #3	// Get word offset
+alt_lse "	prfm	pstl1strm, [x1]",	"nop"
 	lsl	x4, x2, x3		// Create mask
 
 alt_lse	"1:	ldxr	x2, [x1]",		"\lse	x4, x2, [x1]"