diff options
| author | Paul Mackerras <paulus@samba.org> | 2006-02-08 09:43:08 +1100 |
|---|---|---|
| committer | Paul Mackerras <paulus@samba.org> | 2006-02-08 09:43:08 +1100 |
| commit | 8f75015f33c3005e0bbf83ffc0d5e0b4262cc03d (patch) | |
| tree | a3c34ad86ccdc904bb43af6cd1cb163231c29276 /arch/x86_64/lib/memset.S | |
| parent | 076d022c566fddde41fd4a858dd24bacad8304d7 (diff) | |
| parent | e060e084e7d9e1c62d02cb6b8d3fe07db5317eaa (diff) | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc
Diffstat (limited to 'arch/x86_64/lib/memset.S')
| -rw-r--r-- | arch/x86_64/lib/memset.S | 94 |
1 files changed, 94 insertions, 0 deletions
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S index 2aa48f24ed1e..ad397f2c7de8 100644 --- a/arch/x86_64/lib/memset.S +++ b/arch/x86_64/lib/memset.S @@ -13,6 +13,98 @@ .p2align 4 memset: __memset: + movq %rdi,%r10 + movq %rdx,%r11 + + /* expand byte value */ + movzbl %sil,%ecx + movabs $0x0101010101010101,%rax + mul %rcx /* with rax, clobbers rdx */ + + /* align dst */ + movl %edi,%r9d + andl $7,%r9d + jnz .Lbad_alignment +.Lafter_bad_alignment: + + movl %r11d,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + movq %rax,(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + movq %rax,24(%rdi) + movq %rax,32(%rdi) + movq %rax,40(%rdi) + movq %rax,48(%rdi) + movq %rax,56(%rdi) + leaq 64(%rdi),%rdi + jnz .Lloop_64 + + /* Handle tail in loops. The loops should be faster than hard + to predict jump tables. */ + .p2align 4 +.Lhandle_tail: + movl %r11d,%ecx + andl $63&(~7),%ecx + jz .Lhandle_7 + shrl $3,%ecx + .p2align 4 +.Lloop_8: + decl %ecx + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + jnz .Lloop_8 + +.Lhandle_7: + movl %r11d,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + decl %ecx + movb %al,(%rdi) + leaq 1(%rdi),%rdi + jnz .Lloop_1 + +.Lende: + movq %r10,%rax + ret + +.Lbad_alignment: + cmpq $7,%r11 + jbe .Lhandle_7 + movq %rax,(%rdi) /* unaligned store */ + movq $8,%r8 + subq %r9,%r8 + addq %r8,%rdi + subq %r8,%r11 + jmp .Lafter_bad_alignment + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include <asm/cpufeature.h> + + .section .altinstructions,"a" + .align 8 + .quad memset + .quad memset_c + .byte X86_FEATURE_REP_GOOD + .byte memset_c_end-memset_c + .byte memset_c_end-memset_c + .previous + + .section .altinstr_replacement,"ax" + /* rdi destination + * rsi value + * rdx count + */ +memset_c: movq %rdi,%r9 movl %edx,%r8d andl $7,%r8d @@ -29,3 +121,5 @@ __memset: stosb movq %r9,%rax ret +memset_c_end: + .previous |
