Merge git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc

author: Paul Mackerras <paulus@samba.org> 2006-02-08 09:43:08 +1100
committer: Paul Mackerras <paulus@samba.org> 2006-02-08 09:43:08 +1100
commit: 8f75015f33c3005e0bbf83ffc0d5e0b4262cc03d (patch)
tree: a3c34ad86ccdc904bb43af6cd1cb163231c29276 /arch/x86_64/lib/memset.S
parent: 076d022c566fddde41fd4a858dd24bacad8304d7 (diff)
parent: e060e084e7d9e1c62d02cb6b8d3fe07db5317eaa (diff)
1 files changed, 94 insertions, 0 deletions
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 2aa48f24ed1e..ad397f2c7de8 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,6 +13,98 @@
 	.p2align 4
 memset:	
 __memset:
+	movq %rdi,%r10		/* r10 = original dst (rdi), handed back in rax at .Lende */
+	movq %rdx,%r11		/* r11 = byte count; rdx itself is clobbered by mul below */
+
+	/* expand byte value: rax = low byte of rsi replicated into all 8 bytes */
+	movzbl %sil,%ecx
+	movabs $0x0101010101010101,%rax
+	mul    %rcx		/* with rax, clobbers rdx */
+
+	/* align dst: r9d = dst & 7, non-zero means dst is not 8-byte aligned */
+	movl  %edi,%r9d
+	andl  $7,%r9d
+	jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+	/* 64-bit block count so that sizes of 4GB and above are not truncated */
+	movq %r11,%rcx
+	shrq $6,%rcx		/* rcx = number of whole 64-byte blocks */
+	jz	 .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:
+	decq   %rcx		/* sets ZF for the jnz below; mov/lea do not touch flags */
+	movq  %rax,(%rdi)
+	movq  %rax,8(%rdi)
+	movq  %rax,16(%rdi)
+	movq  %rax,24(%rdi)
+	movq  %rax,32(%rdi)
+	movq  %rax,40(%rdi)
+	movq  %rax,48(%rdi)
+	movq  %rax,56(%rdi)
+	leaq  64(%rdi),%rdi
+	jnz    .Lloop_64
+
+	/* Handle tail in loops. The loops should be faster than hard
+	   to predict jump tables. */
+	.p2align 4
+.Lhandle_tail:
+	movl	%r11d,%ecx	/* only bits 3..5 of the count matter here, 32 bits suffice */
+	andl    $63&(~7),%ecx	/* bytes left in whole qwords: 0, 8, ..., 56 */
+	jz 		.Lhandle_7
+	shrl	$3,%ecx		/* ecx = number of qwords to store */
+	.p2align 4
+.Lloop_8:
+	decl   %ecx
+	movq  %rax,(%rdi)
+	leaq  8(%rdi),%rdi
+	jnz    .Lloop_8
+
+.Lhandle_7:
+	movl	%r11d,%ecx
+	andl	$7,%ecx		/* ecx = trailing bytes (0..7) */
+	jz      .Lende
+	.p2align 4
+.Lloop_1:
+	decl    %ecx
+	movb 	%al,(%rdi)
+	leaq	1(%rdi),%rdi
+	jnz     .Lloop_1
+
+.Lende:
+	movq	%r10,%rax	/* return value = original dst */
+	ret
+
+.Lbad_alignment:
+	cmpq $7,%r11
+	jbe	.Lhandle_7	/* 7 bytes or fewer: the byte loop alone covers them */
+	movq %rax,(%rdi)	/* unaligned store */
+	movq $8,%r8
+	subq %r9,%r8		/* r8 = 8 - (dst & 7): bytes up to the next 8-byte boundary */
+	addq %r8,%rdi		/* dst is now 8-byte aligned */
+	subq %r8,%r11		/* those bytes were already written by the store above */
+	jmp .Lafter_bad_alignment
+
+	/* Some CPUs run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstructions,"a"	/* allocatable section; one patch record for memset follows */
+	.align 8		/* NOTE(review): record layout presumably mirrors struct alt_instr - confirm */
+	.quad  memset		/* address of the generic routine */
+	.quad  memset_c		/* address of the string-instruction variant */
+	.byte  X86_FEATURE_REP_GOOD	/* feature constant, presumably from <asm/cpufeature.h> */
+	.byte  memset_c_end-memset_c	/* byte length of memset_c */
+	.byte  memset_c_end-memset_c	/* same length emitted a second time */
+	.previous
+
+	.section .altinstr_replacement,"ax"
+ /* rdi	destination
+  * rsi value
+  * rdx count
+  */
+memset_c:
 	movq %rdi,%r9
 	movl %edx,%r8d
 	andl $7,%r8d		
@@ -29,3 +121,5 @@ __memset:
 	stosb
 	movq %r9,%rax
 	ret
+memset_c_end:
+	.previous
author	Paul Mackerras <paulus@samba.org>	2006-02-08 09:43:08 +1100
committer	Paul Mackerras <paulus@samba.org>	2006-02-08 09:43:08 +1100
commit	8f75015f33c3005e0bbf83ffc0d5e0b4262cc03d (patch)
tree	a3c34ad86ccdc904bb43af6cd1cb163231c29276 /arch/x86_64/lib/memset.S
parent	076d022c566fddde41fd4a858dd24bacad8304d7 (diff)
parent	e060e084e7d9e1c62d02cb6b8d3fe07db5317eaa (diff)