summaryrefslogtreecommitdiff
path: root/arch/x86/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/lib')
-rw-r--r--arch/x86/lib/clear_page_64.S33
-rw-r--r--arch/x86/lib/copy_user_64.S69
-rw-r--r--arch/x86/lib/memcpy_64.S45
-rw-r--r--arch/x86/lib/memmove_64.S29
-rw-r--r--arch/x86/lib/memset_64.S54
5 files changed, 183 insertions, 47 deletions
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index aa4326bfb24a..f2145cfa12a6 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
/*
* Zero a page.
@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
CFI_ENDPROC
ENDPROC(clear_page_c)
+ENTRY(clear_page_c_e)
+ CFI_STARTPROC
+ movl $4096,%ecx
+ xorl %eax,%eax
+ rep stosb
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page_c_e)
+
ENTRY(clear_page)
CFI_STARTPROC
xorl %eax,%eax
@@ -38,21 +48,26 @@ ENTRY(clear_page)
.Lclear_page_end:
ENDPROC(clear_page)
- /* Some CPUs run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
+ /*
+ * Some CPUs support enhanced REP MOVSB/STOSB instructions.
+ * It is recommended to use this when possible.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original function.
+ *
+ */
#include <asm/cpufeature.h>
.section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
-2:
+2: .byte 0xeb /* jmp <disp8> */
+ .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
+3:
.previous
.section .altinstructions,"a"
- .align 8
- .quad clear_page
- .quad 1b
- .word X86_FEATURE_REP_GOOD
- .byte .Lclear_page_end - clear_page
- .byte 2b - 1b
+ altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
+ .Lclear_page_end-clear_page, 2b-1b
+ altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
+ .Lclear_page_end-clear_page,3b-2b
.previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 99e482615195..024840266ba0 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,23 +15,30 @@
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
- .macro ALTERNATIVE_JUMP feature,orig,alt
+/*
+ * By placing feature2 after feature1 in altinstructions section, we logically
+ * implement:
+ * If CPU has feature2, jmp to alt2 is used
+ * else if CPU has feature1, jmp to alt1 is used
+ * else jmp to orig is used.
+ */
+ .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
0:
.byte 0xe9 /* 32bit jump */
.long \orig-1f /* by default jump to orig */
1:
.section .altinstr_replacement,"ax"
2: .byte 0xe9 /* near jump with 32bit immediate */
- .long \alt-1b /* offset */ /* or alternatively to alt */
+ .long \alt1-1b /* offset */ /* or alternatively to alt1 */
+3: .byte 0xe9 /* near jump with 32bit immediate */
+ .long \alt2-1b /* offset */ /* or alternatively to alt2 */
.previous
+
.section .altinstructions,"a"
- .align 8
- .quad 0b
- .quad 2b
- .word \feature /* when feature is set */
- .byte 5
- .byte 5
+ altinstruction_entry 0b,2b,\feature1,5,5
+ altinstruction_entry 0b,3b,\feature2,5,5
.previous
.endm
@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
addq %rdx,%rcx
jc bad_to_user
cmpq TI_addr_limit(%rax),%rcx
- jae bad_to_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ ja bad_to_user
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
+ copy_user_generic_unrolled,copy_user_generic_string, \
+ copy_user_enhanced_fast_string
CFI_ENDPROC
ENDPROC(_copy_to_user)
@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
addq %rdx,%rcx
jc bad_from_user
cmpq TI_addr_limit(%rax),%rcx
- jae bad_from_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ ja bad_from_user
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
+ copy_user_generic_unrolled,copy_user_generic_string, \
+ copy_user_enhanced_fast_string
CFI_ENDPROC
ENDPROC(_copy_from_user)
@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
.previous
CFI_ENDPROC
ENDPROC(copy_user_generic_string)
+
+/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_enhanced_fast_string)
+ CFI_STARTPROC
+ andl %edx,%edx
+ jz 2f
+ movl %edx,%ecx
+1: rep
+ movsb
+2: xorl %eax,%eax
+ ret
+
+ .section .fixup,"ax"
+12: movl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
+ .previous
+
+ .section __ex_table,"a"
+ .align 8
+ .quad 1b,12b
+ .previous
+ CFI_ENDPROC
+ENDPROC(copy_user_enhanced_fast_string)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 2a560bb2573b..efbf2a0ecdea 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
/*
* memcpy - Copy a memory block.
@@ -37,6 +38,23 @@
.Lmemcpy_e:
.previous
+/*
+ * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
+ * memcpy_c. Use memcpy_c_e when possible.
+ *
+ * This gets patched over the unrolled variant (below) via the
+ * alternative instructions framework:
+ */
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_c_e:
+ movq %rdi, %rax
+
+ movl %edx, %ecx
+ rep movsb
+ ret
+.Lmemcpy_e_e:
+ .previous
+
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
@@ -171,21 +189,22 @@ ENDPROC(memcpy)
ENDPROC(__memcpy)
/*
- * Some CPUs run faster using the string copy instructions.
- * It is also a lot simpler. Use this when possible:
- */
-
- .section .altinstructions, "a"
- .align 8
- .quad memcpy
- .quad .Lmemcpy_c
- .word X86_FEATURE_REP_GOOD
-
- /*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB feature
+ * If the feature is supported, memcpy_c_e() is the first choice.
+ * If enhanced rep movsb copy is not available, use fast string copy
+ * memcpy_c() when possible. This is faster and code is simpler than
+ * original memcpy().
+ * Otherwise, original memcpy() is used.
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ *
* Replace only beginning, memcpy is used to apply alternatives,
* so it is silly to overwrite itself with nops - reboot is the
* only outcome...
*/
- .byte .Lmemcpy_e - .Lmemcpy_c
- .byte .Lmemcpy_e - .Lmemcpy_c
+ .section .altinstructions, "a"
+ altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+ .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+ altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+ .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 0ecb8433e5a8..d0ec9c2936d7 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,6 +8,7 @@
#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
#undef memmove
@@ -24,6 +25,7 @@
*/
ENTRY(memmove)
CFI_STARTPROC
+
/* Handle more 32bytes in loop */
mov %rdi, %rax
cmp $0x20, %rdx
@@ -31,8 +33,13 @@ ENTRY(memmove)
/* Decide forward/backward copy mode */
cmp %rdi, %rsi
- jb 2f
+ jge .Lmemmove_begin_forward
+ mov %rsi, %r8
+ add %rdx, %r8
+ cmp %rdi, %r8
+ jg 2f
+.Lmemmove_begin_forward:
/*
* movsq instruction have many startup latency
* so we handle small size by general register.
@@ -78,6 +85,8 @@ ENTRY(memmove)
rep movsq
movq %r11, (%r10)
jmp 13f
+.Lmemmove_end_forward:
+
/*
* Handle data backward by movsq.
*/
@@ -194,4 +203,22 @@ ENTRY(memmove)
13:
retq
CFI_ENDPROC
+
+ .section .altinstr_replacement,"ax"
+.Lmemmove_begin_forward_efs:
+ /* Forward moving data. */
+ movq %rdx, %rcx
+ rep movsb
+ retq
+.Lmemmove_end_forward_efs:
+ .previous
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad .Lmemmove_begin_forward
+ .quad .Lmemmove_begin_forward_efs
+ .word X86_FEATURE_ERMS
+ .byte .Lmemmove_end_forward-.Lmemmove_begin_forward
+ .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+ .previous
ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 09d344269652..79bd454b78a3 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
/*
- * ISO C memset - set a memory block to a byte value.
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the orignal function as well.
*
* rdi destination
* rsi value (char)
@@ -31,6 +35,28 @@
.Lmemset_e:
.previous
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemset_c_e:
+ movq %rdi,%r9
+ movb %sil,%al
+ movl %edx,%ecx
+ rep stosb
+ movq %r9,%rax
+ ret
+.Lmemset_e_e:
+ .previous
+
ENTRY(memset)
ENTRY(__memset)
CFI_STARTPROC
@@ -112,16 +138,20 @@ ENTRY(__memset)
ENDPROC(memset)
ENDPROC(__memset)
- /* Some CPUs run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
+ /* Some CPUs support enhanced REP MOVSB/STOSB feature.
+ * It is recommended to use this when possible.
+ *
+ * If enhanced REP MOVSB/STOSB feature is not available, use fast string
+ * instructions.
+ *
+ * Otherwise, use original memset function.
+ *
+ * In .altinstructions section, ERMS feature is placed after REG_GOOD
+ * feature to implement the right patch order.
+ */
.section .altinstructions,"a"
- .align 8
- .quad memset
- .quad .Lmemset_c
- .word X86_FEATURE_REP_GOOD
- .byte .Lfinal - memset
- .byte .Lmemset_e - .Lmemset_c
+ altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+ .Lfinal-memset,.Lmemset_e-.Lmemset_c
+ altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+ .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
.previous