[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>

commit: 7bcd3f34e262bbebffa954d80eab3a84f053da31 [log] [tgz]
author: Andi Kleen <ak@suse.de> Fri Feb 03 21:51:02 2006 +0100
committer: Linus Torvalds <torvalds@g5.osdl.org> Sat Feb 04 16:43:13 2006 -0800
tree: f0765da9eaa8024a2b1d67d3e43730cb32f99fa7
parent: 6bca52b544489b626c7d0db801df6b4aa3d5adb5 [diff] [blame]
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 2aa48f2..ad397f2 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S

@@ -13,6 +13,98 @@
 	.p2align 4
 memset:	
 __memset:
+	movq %rdi,%r10
+	movq %rdx,%r11
+
+	/* expand byte value  */
+	movzbl %sil,%ecx
+	movabs $0x0101010101010101,%rax
+	mul    %rcx		/* with rax, clobbers rdx */
+
+	/* align dst */
+	movl  %edi,%r9d
+	andl  $7,%r9d
+	jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+
+	movl %r11d,%ecx
+	shrl $6,%ecx
+	jz	 .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decl   %ecx
+	movq  %rax,(%rdi)
+	movq  %rax,8(%rdi)
+	movq  %rax,16(%rdi)
+	movq  %rax,24(%rdi)
+	movq  %rax,32(%rdi)
+	movq  %rax,40(%rdi)
+	movq  %rax,48(%rdi)
+	movq  %rax,56(%rdi)
+	leaq  64(%rdi),%rdi
+	jnz    .Lloop_64
+
+	/* Handle tail in loops. The loops should be faster than hard
+	   to predict jump tables. */
+	.p2align 4
+.Lhandle_tail:
+	movl	%r11d,%ecx
+	andl    $63&(~7),%ecx
+	jz 		.Lhandle_7
+	shrl	$3,%ecx
+	.p2align 4
+.Lloop_8:
+	decl   %ecx
+	movq  %rax,(%rdi)
+	leaq  8(%rdi),%rdi
+	jnz    .Lloop_8
+
+.Lhandle_7:
+	movl	%r11d,%ecx
+	andl	$7,%ecx
+	jz      .Lende
+	.p2align 4
+.Lloop_1:
+	decl    %ecx
+	movb 	%al,(%rdi)
+	leaq	1(%rdi),%rdi
+	jnz     .Lloop_1
+
+.Lende:
+	movq	%r10,%rax
+	ret
+
+.Lbad_alignment:
+	cmpq $7,%r11
+	jbe	.Lhandle_7
+	movq %rax,(%rdi)	/* unaligned store */
+	movq $8,%r8
+	subq %r9,%r8
+	addq %r8,%rdi
+	subq %r8,%r11
+	jmp .Lafter_bad_alignment
+
+	/* Some CPUs run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad  memset
+	.quad  memset_c
+	.byte  X86_FEATURE_REP_GOOD
+	.byte  memset_c_end-memset_c
+	.byte  memset_c_end-memset_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+ /* rdi	destination
+  * rsi value
+  * rdx count
+  */
+memset_c:
 	movq %rdi,%r9
 	movl %edx,%r8d
 	andl $7,%r8d		
@@ -29,3 +121,5 @@
 	stosb
 	movq %r9,%rax
 	ret
+memset_c_end:
+	.previous
commit	7bcd3f34e262bbebffa954d80eab3a84f053da31	[log] [tgz]
author	Andi Kleen <ak@suse.de>	Fri Feb 03 21:51:02 2006 +0100
committer	Linus Torvalds <torvalds@g5.osdl.org>	Sat Feb 04 16:43:13 2006 -0800
tree	f0765da9eaa8024a2b1d67d3e43730cb32f99fa7
parent	6bca52b544489b626c7d0db801df6b4aa3d5adb5 [diff] [blame]