summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/x86/include/asm/alternative-asm.h43
-rw-r--r--arch/x86/include/asm/alternative.h65
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/barrier.h6
-rw-r--r--arch/x86/include/asm/cpufeature.h30
-rw-r--r--arch/x86/include/asm/processor.h16
-rw-r--r--arch/x86/include/asm/smap.h30
-rw-r--r--arch/x86/kernel/alternative.c158
-rw-r--r--arch/x86/kernel/cpu/amd.c5
-rw-r--r--arch/x86/kernel/entry_32.S12
-rw-r--r--arch/x86/lib/clear_page_64.S66
-rw-r--r--arch/x86/lib/copy_page_64.S37
-rw-r--r--arch/x86/lib/copy_user_64.S46
-rw-r--r--arch/x86/lib/memcpy_64.S68
-rw-r--r--arch/x86/lib/memmove_64.S19
-rw-r--r--arch/x86/lib/memset_64.S61
-rw-r--r--arch/x86/um/asm/barrier.h4
-rw-r--r--tools/perf/bench/mem-memcpy-x86-64-asm-def.h6
-rw-r--r--tools/perf/bench/mem-memcpy-x86-64-asm.S2
-rw-r--r--tools/perf/bench/mem-memcpy.c128
-rw-r--r--tools/perf/bench/mem-memset-x86-64-asm-def.h6
-rw-r--r--tools/perf/bench/mem-memset-x86-64-asm.S2
-rw-r--r--tools/perf/util/include/asm/alternative-asm.h1
23 files changed, 433 insertions, 380 deletions
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 372231c22a47..524bddce0b76 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,12 +18,53 @@
.endm
#endif
-.macro altinstruction_entry orig alt feature orig_len alt_len
+.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
.long \orig - .
.long \alt - .
.word \feature
.byte \orig_len
.byte \alt_len
+ .byte \pad_len
+.endm
+
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+ \oldinstr
+141:
+ .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr
+144:
+ .popsection
+.endm
+
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+ \oldinstr
+141:
+ .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+ .skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
+ altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr1
+144:
+ \newinstr2
+145:
+ .popsection
.endm
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 473bdbee378a..5aef6a97d80e 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -48,8 +48,9 @@ struct alt_instr {
s32 repl_offset; /* offset to replacement instruction */
u16 cpuid; /* cpuid bit set for replacement */
u8 instrlen; /* length of original instruction */
- u8 replacementlen; /* length of new instruction, <= instrlen */
-};
+ u8 replacementlen; /* length of new instruction */
+ u8 padlen; /* length of build-time padding */
+} __packed;
extern void alternative_instructions(void);
extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
@@ -76,50 +77,61 @@ static inline int alternatives_text_reserved(void *start, void *end)
}
#endif /* CONFIG_SMP */
-#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n"
+#define b_replacement(num) "664"#num
+#define e_replacement(num) "665"#num
-#define b_replacement(number) "663"#number
-#define e_replacement(number) "664"#number
+#define alt_end_marker "663"
+#define alt_slen "662b-661b"
+#define alt_pad_len alt_end_marker"b-662b"
+#define alt_total_slen alt_end_marker"b-661b"
+#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
-#define alt_slen "662b-661b"
-#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
+#define __OLDINSTR(oldinstr, num) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \
+ "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
-#define ALTINSTR_ENTRY(feature, number) \
+#define OLDINSTR(oldinstr, num) \
+ __OLDINSTR(oldinstr, num) \
+ alt_end_marker ":\n"
+
+/*
+ * Pad the second replacement alternative with additional NOPs if it is
+ * additionally longer than the first replacement alternative.
+ */
+#define OLDINSTR_2(oldinstr, num1, num2) \
+ __OLDINSTR(oldinstr, num1) \
+ ".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \
+ "((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n" \
+ alt_end_marker ":\n"
+
+#define ALTINSTR_ENTRY(feature, num) \
" .long 661b - .\n" /* label */ \
- " .long " b_replacement(number)"f - .\n" /* new instruction */ \
+ " .long " b_replacement(num)"f - .\n" /* new instruction */ \
" .word " __stringify(feature) "\n" /* feature bit */ \
- " .byte " alt_slen "\n" /* source len */ \
- " .byte " alt_rlen(number) "\n" /* replacement len */
+ " .byte " alt_total_slen "\n" /* source len */ \
+ " .byte " alt_rlen(num) "\n" /* replacement len */ \
+ " .byte " alt_pad_len "\n" /* pad len */
-#define DISCARD_ENTRY(number) /* rlen <= slen */ \
- " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
-
-#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \
- b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
+#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
+ b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
/* alternative assembly primitive: */
#define ALTERNATIVE(oldinstr, newinstr, feature) \
- OLDINSTR(oldinstr) \
+ OLDINSTR(oldinstr, 1) \
".pushsection .altinstructions,\"a\"\n" \
ALTINSTR_ENTRY(feature, 1) \
".popsection\n" \
- ".pushsection .discard,\"aw\",@progbits\n" \
- DISCARD_ENTRY(1) \
- ".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
".popsection"
#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
- OLDINSTR(oldinstr) \
+ OLDINSTR_2(oldinstr, 1, 2) \
".pushsection .altinstructions,\"a\"\n" \
ALTINSTR_ENTRY(feature1, 1) \
ALTINSTR_ENTRY(feature2, 2) \
".popsection\n" \
- ".pushsection .discard,\"aw\",@progbits\n" \
- DISCARD_ENTRY(1) \
- DISCARD_ENTRY(2) \
- ".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
@@ -146,6 +158,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alternative(oldinstr, newinstr, feature) \
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+ asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+
/*
* Alternative inline assembly with input.
*
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index efc3b22d896e..8118e94d50ab 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
{
volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
- alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
+ alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
ASM_OUTPUT2("=r" (v), "=m" (*addr)),
ASM_OUTPUT2("0" (v), "m" (*addr)));
}
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 2ab1eb33106e..959e45b81fe2 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -95,13 +95,11 @@ do { \
* Stop RDTSC speculation. This is needed when you need to use RDTSC
* (or get_cycles or vread that possibly accesses the TSC) in a defined
* code region.
- *
- * (Could use an alternative three way for this if there was one.)
*/
static __always_inline void rdtsc_barrier(void)
{
- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
- alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+ alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+ "lfence", X86_FEATURE_LFENCE_RDTSC);
}
#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index d6428ea5d316..0f7a5a1a8db2 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -419,6 +419,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
" .word %P0\n" /* 1: do replace */
" .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */
+ " .byte 0\n" /* pad len */
".previous\n"
/* skipping size check since replacement size = 0 */
: : "i" (X86_FEATURE_ALWAYS) : : t_warn);
@@ -433,6 +434,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
" .word %P0\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */
+ " .byte 0\n" /* pad len */
".previous\n"
/* skipping size check since replacement size = 0 */
: : "i" (bit) : : t_no);
@@ -458,6 +460,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
" .word %P1\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */
+ " .byte 0\n" /* pad len */
".previous\n"
".section .discard,\"aw\",@progbits\n"
" .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -484,31 +487,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
{
#ifdef CC_HAVE_ASM_GOTO
-/*
- * We need to spell the jumps to the compiler because, depending on the offset,
- * the replacement jump can be bigger than the original jump, and this we cannot
- * have. Thus, we force the jump to the widest, 4-byte, signed relative
- * offset even though the last would often fit in less bytes.
- */
- asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
+ asm_volatile_goto("1: jmp %l[t_dynamic]\n"
"2:\n"
+ ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+ "((5f-4f) - (2b-1b)),0x90\n"
+ "3:\n"
".section .altinstructions,\"a\"\n"
" .long 1b - .\n" /* src offset */
- " .long 3f - .\n" /* repl offset */
+ " .long 4f - .\n" /* repl offset */
" .word %P1\n" /* always replace */
- " .byte 2b - 1b\n" /* src len */
- " .byte 4f - 3f\n" /* repl len */
+ " .byte 3b - 1b\n" /* src len */
+ " .byte 5f - 4f\n" /* repl len */
+ " .byte 3b - 2b\n" /* pad len */
".previous\n"
".section .altinstr_replacement,\"ax\"\n"
- "3: .byte 0xe9\n .long %l[t_no] - 2b\n"
- "4:\n"
+ "4: jmp %l[t_no]\n"
+ "5:\n"
".previous\n"
".section .altinstructions,\"a\"\n"
" .long 1b - .\n" /* src offset */
" .long 0\n" /* no replacement */
" .word %P0\n" /* feature bit */
- " .byte 2b - 1b\n" /* src len */
+ " .byte 3b - 1b\n" /* src len */
" .byte 0\n" /* repl len */
+ " .byte 0\n" /* pad len */
".previous\n"
: : "i" (bit), "i" (X86_FEATURE_ALWAYS)
: : t_dynamic, t_no);
@@ -528,6 +530,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
" .word %P2\n" /* always replace */
" .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */
+ " .byte 0\n" /* pad len */
".previous\n"
".section .discard,\"aw\",@progbits\n"
" .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -542,6 +545,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
" .word %P1\n" /* feature bit */
" .byte 4b - 3b\n" /* src len */
" .byte 6f - 5f\n" /* repl len */
+ " .byte 0\n" /* pad len */
".previous\n"
".section .discard,\"aw\",@progbits\n"
" .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ec1c93588cef..7be2c9a6caba 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -761,10 +761,10 @@ extern char ignore_fpu_irq;
#define ARCH_HAS_SPINLOCK_PREFETCH
#ifdef CONFIG_X86_32
-# define BASE_PREFETCH ASM_NOP4
+# define BASE_PREFETCH ""
# define ARCH_HAS_PREFETCH
#else
-# define BASE_PREFETCH "prefetcht0 (%1)"
+# define BASE_PREFETCH "prefetcht0 %P1"
#endif
/*
@@ -775,10 +775,9 @@ extern char ignore_fpu_irq;
*/
static inline void prefetch(const void *x)
{
- alternative_input(BASE_PREFETCH,
- "prefetchnta (%1)",
+ alternative_input(BASE_PREFETCH, "prefetchnta %P1",
X86_FEATURE_XMM,
- "r" (x));
+ "m" (*(const char *)x));
}
/*
@@ -788,10 +787,9 @@ static inline void prefetch(const void *x)
*/
static inline void prefetchw(const void *x)
{
- alternative_input(BASE_PREFETCH,
- "prefetchw (%1)",
- X86_FEATURE_3DNOW,
- "r" (x));
+ alternative_input(BASE_PREFETCH, "prefetchw %P1",
+ X86_FEATURE_3DNOWPREFETCH,
+ "m" (*(const char *)x));
}
static inline void spin_lock_prefetch(const void *x)
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 8d3120f4e270..ba665ebd17bb 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -27,23 +27,11 @@
#ifdef CONFIG_X86_SMAP
-#define ASM_CLAC \
- 661: ASM_NOP3 ; \
- .pushsection .altinstr_replacement, "ax" ; \
- 662: __ASM_CLAC ; \
- .popsection ; \
- .pushsection .altinstructions, "a" ; \
- altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
- .popsection
-
-#define ASM_STAC \
- 661: ASM_NOP3 ; \
- .pushsection .altinstr_replacement, "ax" ; \
- 662: __ASM_STAC ; \
- .popsection ; \
- .pushsection .altinstructions, "a" ; \
- altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
- .popsection
+#define ASM_CLAC \
+ ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
+
+#define ASM_STAC \
+ ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
#else /* CONFIG_X86_SMAP */
@@ -61,20 +49,20 @@
static __always_inline void clac(void)
{
/* Note: a barrier is implicit in alternative() */
- alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
+ alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
}
static __always_inline void stac(void)
{
/* Note: a barrier is implicit in alternative() */
- alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP);
+ alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
}
/* These macros can be used in asm() statements */
#define ASM_CLAC \
- ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
+ ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
#define ASM_STAC \
- ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP)
+ ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
#else /* CONFIG_X86_SMAP */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f469ec..af397cc98d05 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
__setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif
-#define DPRINTK(fmt, ...) \
-do { \
- if (debug_alternative) \
- printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+#define DPRINTK(fmt, args...) \
+do { \
+ if (debug_alternative) \
+ printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
+} while (0)
+
+#define DUMP_BYTES(buf, len, fmt, args...) \
+do { \
+ if (unlikely(debug_alternative)) { \
+ int j; \
+ \
+ if (!(len)) \
+ break; \
+ \
+ printk(KERN_DEBUG fmt, ##args); \
+ for (j = 0; j < (len) - 1; j++) \
+ printk(KERN_CONT "%02hhx ", buf[j]); \
+ printk(KERN_CONT "%02hhx\n", buf[j]); \
+ } \
} while (0)
/*
@@ -243,12 +258,86 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void *text_poke_early(void *addr, const void *opcode, size_t len);
-/* Replace instructions with better alternatives for this CPU type.
- This runs before SMP is initialized to avoid SMP problems with
- self modifying code. This implies that asymmetric systems where
- APs have less capabilities than the boot processor are not handled.
- Tough. Make sure you disable such features by hand. */
+/*
+ * Are we looking at a near JMP with a 1 or 4-byte displacement.
+ */
+static inline bool is_jmp(const u8 opcode)
+{
+ return opcode == 0xeb || opcode == 0xe9;
+}
+
+static void __init_or_module
+recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+{
+ u8 *next_rip, *tgt_rip;
+ s32 n_dspl, o_dspl;
+ int repl_len;
+
+ if (a->replacementlen != 5)
+ return;
+
+ o_dspl = *(s32 *)(insnbuf + 1);
+
+ /* next_rip of the replacement JMP */
+ next_rip = repl_insn + a->replacementlen;
+ /* target rip of the replacement JMP */
+ tgt_rip = next_rip + o_dspl;
+ n_dspl = tgt_rip - orig_insn;
+
+ DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
+
+ if (tgt_rip - orig_insn >= 0) {
+ if (n_dspl - 2 <= 127)
+ goto two_byte_jmp;
+ else
+ goto five_byte_jmp;
+ /* negative offset */
+ } else {
+ if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
+ goto two_byte_jmp;
+ else
+ goto five_byte_jmp;
+ }
+
+two_byte_jmp:
+ n_dspl -= 2;
+
+ insnbuf[0] = 0xeb;
+ insnbuf[1] = (s8)n_dspl;
+ add_nops(insnbuf + 2, 3);
+
+ repl_len = 2;
+ goto done;
+
+five_byte_jmp:
+ n_dspl -= 5;
+ insnbuf[0] = 0xe9;
+ *(s32 *)&insnbuf[1] = n_dspl;
+
+ repl_len = 5;
+
+done:
+
+ DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+ n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+}
+
+static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
+{
+ add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+
+ DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
+ instr, a->instrlen - a->padlen, a->padlen);
+}
+
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self modifying code.
+ * This implies that asymmetric systems where APs have less capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ */
void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
@@ -256,10 +345,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN];
- DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+ DPRINTK("alt table %p -> %p", start, end);
/*
* The scan order should be from start to end. A later scanned
- * alternative code can overwrite a previous scanned alternative code.
+ * alternative code can overwrite previously scanned alternative code.
* Some kernel functions (e.g. memcpy, memset, etc) use this order to
* patch code.
*
@@ -267,29 +356,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
* order.
*/
for (a = start; a < end; a++) {
+ int insnbuf_sz = 0;
+
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
- BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
- if (!boot_cpu_has(a->cpuid))
+ if (!boot_cpu_has(a->cpuid)) {
+ if (a->padlen > 1)
+ optimize_nops(a, instr);
+
continue;
+ }
+
+ DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)",
+ a->cpuid >> 5,
+ a->cpuid & 0x1f,
+ instr, a->instrlen,
+ replacement, a->replacementlen);
+
+ DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
+ DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
memcpy(insnbuf, replacement, a->replacementlen);
+ insnbuf_sz = a->replacementlen;
/* 0xe8 is a relative jump; fix the offset. */
- if (*insnbuf == 0xe8 && a->replacementlen == 5)
- *(s32 *)(insnbuf + 1) += replacement - instr;
+ if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+ *(s32 *)(insnbuf + 1) += replacement - instr;
+ DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+ *(s32 *)(insnbuf + 1),
+ (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
+ }
+
+ if (a->replacementlen && is_jmp(replacement[0]))
+ recompute_jump(a, instr, replacement, insnbuf);
- add_nops(insnbuf + a->replacementlen,
- a->instrlen - a->replacementlen);
+ if (a->instrlen > a->replacementlen) {
+ add_nops(insnbuf + a->replacementlen,
+ a->instrlen - a->replacementlen);
+ insnbuf_sz += a->instrlen - a->replacementlen;
+ }
+ DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
- text_poke_early(instr, insnbuf, a->instrlen);
+ text_poke_early(instr, insnbuf, insnbuf_sz);
}
}
#ifdef CONFIG_SMP
-
static void alternatives_smp_lock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
{
@@ -371,8 +485,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
smp->locks_end = locks_end;
smp->text = text;
smp->text_end = text_end;
- DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
- __func__, smp->locks, smp->locks_end,
+ DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+ smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +554,7 @@ int alternatives_text_reserved(void *start, void *end)
return 0;
}
-#endif
+#endif /* CONFIG_SMP */
#ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..dd9e50500297 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+ /* 3DNow or LM implies PREFETCHW */
+ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+ if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+ set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 31e2d5bf3e38..7e0323cc0b7d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -816,15 +816,9 @@ ENTRY(simd_coprocessor_error)
pushl_cfi $0
#ifdef CONFIG_X86_INVD_BUG
/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661: pushl_cfi $do_general_protection
-662:
-.section .altinstructions,"a"
- altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
-.previous
-.section .altinstr_replacement,"ax"
-663: pushl $do_simd_coprocessor_error
-664:
-.previous
+ ALTERNATIVE "pushl_cfi $do_general_protection", \
+ "pushl $do_simd_coprocessor_error", \
+ X86_FEATURE_XMM
#else
pushl_cfi $do_simd_coprocessor_error
#endif
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index f2145cfa12a6..e67e579c93bd 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,31 +1,35 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
/*
- * Zero a page.
- * rdi page
- */
-ENTRY(clear_page_c)
+ * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
+ * recommended to use this when possible and we do use them by default.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original.
+ */
+
+/*
+ * Zero a page.
+ * %rdi - page
+ */
+ENTRY(clear_page)
CFI_STARTPROC
+
+ ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp clear_page_c_e", X86_FEATURE_ERMS
+
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
ret
CFI_ENDPROC
-ENDPROC(clear_page_c)
+ENDPROC(clear_page)
-ENTRY(clear_page_c_e)
+ENTRY(clear_page_orig)
CFI_STARTPROC
- movl $4096,%ecx
- xorl %eax,%eax
- rep stosb
- ret
- CFI_ENDPROC
-ENDPROC(clear_page_c_e)
-ENTRY(clear_page)
- CFI_STARTPROC
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
@@ -45,29 +49,13 @@ ENTRY(clear_page)
nop
ret
CFI_ENDPROC
-.Lclear_page_end:
-ENDPROC(clear_page)
-
- /*
- * Some CPUs support enhanced REP MOVSB/STOSB instructions.
- * It is recommended to use this when possible.
- * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
- * Otherwise, use original function.
- *
- */
+ENDPROC(clear_page_orig)
-#include <asm/cpufeature.h>
-
- .section .altinstr_replacement,"ax"
-1: .byte 0xeb /* jmp <disp8> */
- .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
-2: .byte 0xeb /* jmp <disp8> */
- .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
-3:
- .previous
- .section .altinstructions,"a"
- altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
- .Lclear_page_end-clear_page, 2b-1b
- altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
- .Lclear_page_end-clear_page,3b-2b
- .previous
+ENTRY(clear_page_c_e)
+ CFI_STARTPROC
+ movl $4096,%ecx
+ xorl %eax,%eax
+ rep stosb
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page_c_e)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 176cca67212b..8239dbcbf984 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,23 +2,26 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
+/*
+ * Some CPUs run faster using the string copy instructions (sane microcode).
+ * It is also a lot simpler. Use this when possible. But, don't use streaming
+ * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
+ * prefetch distance based on SMP/UP.
+ */
ALIGN
-copy_page_rep:
+ENTRY(copy_page)
CFI_STARTPROC
+ ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
movl $4096/8, %ecx
rep movsq
ret
CFI_ENDPROC
-ENDPROC(copy_page_rep)
-
-/*
- * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
- * Could vary the prefetch distance based on SMP/UP.
-*/
+ENDPROC(copy_page)
-ENTRY(copy_page)
+ENTRY(copy_page_regs)
CFI_STARTPROC
subq $2*8, %rsp
CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@ ENTRY(copy_page)
addq $2*8, %rsp
CFI_ADJUST_CFA_OFFSET -2*8
ret
-.Lcopy_page_end:
CFI_ENDPROC
-ENDPROC(copy_page)
-
- /* Some CPUs run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
- .section .altinstr_replacement,"ax"
-1: .byte 0xeb /* jmp <disp8> */
- .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */
-2:
- .previous
- .section .altinstructions,"a"
- altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
- .Lcopy_page_end-copy_page, 2b-1b
- .previous
+ENDPROC(copy_page_regs)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dee945d55594..fa997dfaef24 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -8,9 +8,6 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
@@ -19,33 +16,7 @@
#include <asm/asm.h>
#include <asm/smap.h>
-/*
- * By placing feature2 after feature1 in altinstructions section, we logically
- * implement:
- * If CPU has feature2, jmp to alt2 is used
- * else if CPU has feature1, jmp to alt1 is used
- * else jmp to orig is used.
- */
- .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
-0:
- .byte 0xe9 /* 32bit jump */
- .long \orig-1f /* by default jump to orig */
-1:
- .section .altinstr_replacement,"ax"
-2: .byte 0xe9 /* near jump with 32bit immediate */
- .long \alt1-1b /* offset */ /* or alternatively to alt1 */
-3: .byte 0xe9 /* near jump with 32bit immediate */
- .long \alt2-1b /* offset */ /* or alternatively to alt2 */
- .previous
-
- .section .altinstructions,"a"
- altinstruction_entry 0b,2b,\feature1,5,5
- altinstruction_entry 0b,3b,\feature2,5,5
- .previous
- .endm
-
.macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
/* check for bad alignment of destination */
movl %edi,%ecx
andl $7,%ecx
@@ -67,7 +38,6 @@
_ASM_EXTABLE(100b,103b)
_ASM_EXTABLE(101b,103b)
-#endif
.endm
/* Standard copy_to_user with segment limit checking */
@@ -79,9 +49,11 @@ ENTRY(_copy_to_user)
jc bad_to_user
cmpq TI_addr_limit(%rax),%rcx
ja bad_to_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
- copy_user_generic_unrolled,copy_user_generic_string, \
- copy_user_enhanced_fast_string
+ ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
+ "jmp copy_user_generic_string", \
+ X86_FEATURE_REP_GOOD, \
+ "jmp copy_user_enhanced_fast_string", \
+ X86_FEATURE_ERMS
CFI_ENDPROC
ENDPROC(_copy_to_user)
@@ -94,9 +66,11 @@ ENTRY(_copy_from_user)
jc bad_from_user
cmpq TI_addr_limit(%rax),%rcx
ja bad_from_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
- copy_user_generic_unrolled,copy_user_generic_string, \
- copy_user_enhanced_fast_string
+ ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
+ "jmp copy_user_generic_string", \
+ X86_FEATURE_REP_GOOD, \
+ "jmp copy_user_enhanced_fast_string", \
+ X86_FEATURE_ERMS
CFI_ENDPROC
ENDPROC(_copy_from_user)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 89b53c9968e7..b046664f5a1c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,12 +1,20 @@
/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
-
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>
/*
+ * We build a jump to memcpy_orig by default which gets NOPped out on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
+ * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
+ * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
+ */
+
+.weak memcpy
+
+/*
* memcpy - Copy a memory block.
*
* Input:
@@ -17,15 +25,11 @@
* Output:
* rax original destination
*/
+ENTRY(__memcpy)
+ENTRY(memcpy)
+ ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp memcpy_erms", X86_FEATURE_ERMS
-/*
- * memcpy_c() - fast string ops (REP MOVSQ) based variant.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
- */
- .section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c:
movq %rdi, %rax
movq %rdx, %rcx
shrq $3, %rcx
@@ -34,29 +38,21 @@
movl %edx, %ecx
rep movsb
ret
-.Lmemcpy_e:
- .previous
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
/*
- * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
- * memcpy_c. Use memcpy_c_e when possible.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
+ * memcpy_erms() - enhanced fast string memcpy. This is faster and
+ * simpler than memcpy. Use memcpy_erms when possible.
*/
- .section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c_e:
+ENTRY(memcpy_erms)
movq %rdi, %rax
movq %rdx, %rcx
rep movsb
ret
-.Lmemcpy_e_e:
- .previous
-
-.weak memcpy
+ENDPROC(memcpy_erms)
-ENTRY(__memcpy)
-ENTRY(memcpy)
+ENTRY(memcpy_orig)
CFI_STARTPROC
movq %rdi, %rax
@@ -183,26 +179,4 @@ ENTRY(memcpy)
.Lend:
retq
CFI_ENDPROC
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-
- /*
- * Some CPUs are adding enhanced REP MOVSB/STOSB feature
- * If the feature is supported, memcpy_c_e() is the first choice.
- * If enhanced rep movsb copy is not available, use fast string copy
- * memcpy_c() when possible. This is faster and code is simpler than
- * original memcpy().
- * Otherwise, original memcpy() is used.
- * In .altinstructions section, ERMS feature is placed after REG_GOOD
- * feature to implement the right patch order.
- *
- * Replace only beginning, memcpy is used to apply alternatives,
- * so it is silly to overwrite itself with nops - reboot is the
- * only outcome...
- */
- .section .altinstructions, "a"
- altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
- .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
- altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
- .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
- .previous
+ENDPROC(memcpy_orig)
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 9c4b530575da..0f8a0d0331b9 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -5,7 +5,6 @@
* This assembly file is re-written from memmove_64.c file.
* - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
*/
-#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
@@ -44,6 +43,8 @@ ENTRY(__memmove)
jg 2f
.Lmemmove_begin_forward:
+ ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+
/*
* movsq instruction have many startup latency
* so we handle small size by general register.
@@ -207,21 +208,5 @@ ENTRY(__memmove)
13:
retq
CFI_ENDPROC
-
- .section .altinstr_replacement,"ax"
-.Lmemmove_begin_forward_efs:
- /* Forward moving data. */
- movq %rdx, %rcx
- rep movsb
- retq
-.Lmemmove_end_forward_efs:
- .previous
-
- .section .altinstructions,"a"
- altinstruction_entry .Lmemmove_begin_forward, \
- .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
- .Lmemmove_end_forward-.Lmemmove_begin_forward, \
- .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
- .previous
ENDPROC(__memmove)
ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 6f44935c6a60..93118fb23976 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -5,19 +5,30 @@
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
+.weak memset
+
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
* simpler and shorter than the orignal function as well.
- *
+ *
* rdi destination
- * rsi value (char)
- * rdx count (bytes)
- *
+ * rsi value (char)
+ * rdx count (bytes)
+ *
* rax original destination
- */
- .section .altinstr_replacement, "ax", @progbits
-.Lmemset_c:
+ */
+ENTRY(memset)
+ENTRY(__memset)
+ /*
+ * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
+ * to use it when possible. If not available, use fast string instructions.
+ *
+ * Otherwise, use original memset function.
+ */
+ ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp memset_erms", X86_FEATURE_ERMS
+
movq %rdi,%r9
movq %rdx,%rcx
andl $7,%edx
@@ -31,8 +42,8 @@
rep stosb
movq %r9,%rax
ret
-.Lmemset_e:
- .previous
+ENDPROC(memset)
+ENDPROC(__memset)
/*
* ISO C memset - set a memory block to a byte value. This function uses
@@ -45,21 +56,16 @@
*
* rax original destination
*/
- .section .altinstr_replacement, "ax", @progbits
-.Lmemset_c_e:
+ENTRY(memset_erms)
movq %rdi,%r9
movb %sil,%al
movq %rdx,%rcx
rep stosb
movq %r9,%rax
ret
-.Lmemset_e_e:
- .previous
-
-.weak memset
+ENDPROC(memset_erms)
-ENTRY(memset)
-ENTRY(__memset)
+ENTRY(memset_orig)
CFI_STARTPROC
movq %rdi,%r10
@@ -134,23 +140,4 @@ ENTRY(__memset)
jmp .Lafter_bad_alignment
.Lfinal:
CFI_ENDPROC
-ENDPROC(memset)
-ENDPROC(__memset)
-
- /* Some CPUs support enhanced REP MOVSB/STOSB feature.
- * It is recommended to use this when possible.
- *
- * If enhanced REP MOVSB/STOSB feature is not available, use fast string
- * instructions.
- *
- * Otherwise, use original memset function.
- *
- * In .altinstructions section, ERMS feature is placed after REG_GOOD
- * feature to implement the right patch order.
- */
- .section .altinstructions,"a"
- altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
- .Lfinal-__memset,.Lmemset_e-.Lmemset_c
- altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
- .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
- .previous
+ENDPROC(memset_orig)
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 2d7d9a1f5b53..8ffd2146fa6a 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -64,8 +64,8 @@
*/
static inline void rdtsc_barrier(void)
{
- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
- alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+ alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+ "lfence", X86_FEATURE_LFENCE_RDTSC);
}
#endif
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
index d66ab799b35f..8c0c1a2770c8 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -1,12 +1,12 @@
-MEMCPY_FN(__memcpy,
+MEMCPY_FN(memcpy_orig,
"x86-64-unrolled",
"unrolled memcpy() in arch/x86/lib/memcpy_64.S")
-MEMCPY_FN(memcpy_c,
+MEMCPY_FN(__memcpy,
"x86-64-movsq",
"movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
-MEMCPY_FN(memcpy_c_e,
+MEMCPY_FN(memcpy_erms,
"x86-64-movsb",
"movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
index fcd9cf00600a..e4c2c30143b9 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm.S
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -1,8 +1,6 @@
#define memcpy MEMCPY /* don't hide glibc's memcpy() */
#define altinstr_replacement text
#define globl p2align 4; .globl
-#define Lmemcpy_c globl memcpy_c; memcpy_c
-#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e
#include "../../../arch/x86/lib/memcpy_64.S"
/*
* We need to provide note.GNU-stack section, saying that we want
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index db1d3a29d97f..d3dfb7936dcd 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -36,7 +36,7 @@ static const struct option options[] = {
"Specify length of memory to copy. "
"Available units: B, KB, MB, GB and TB (upper and lower)"),
OPT_STRING('r', "routine", &routine, "default",
- "Specify routine to copy"),
+ "Specify routine to copy, \"all\" runs all available routines"),
OPT_INTEGER('i', "iterations", &iterations,
"repeat memcpy() invocation this number of times"),
OPT_BOOLEAN('c', "cycle", &use_cycle,
@@ -135,55 +135,16 @@ struct bench_mem_info {
const char *const *usage;
};
-static int bench_mem_common(int argc, const char **argv,
- const char *prefix __maybe_unused,
- struct bench_mem_info *info)
+static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen)
{
- int i;
- size_t len;
- double totallen;
+ const struct routine *r = &info->routines[r_idx];
double result_bps[2];
u64 result_cycle[2];
- argc = parse_options(argc, argv, options,
- info->usage, 0);
-
- if (no_prefault && only_prefault) {
- fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
- return 1;
- }
-
- if (use_cycle)
- init_cycle();
-
- len = (size_t)perf_atoll((char *)length_str);
- totallen = (double)len * iterations;
-
result_cycle[0] = result_cycle[1] = 0ULL;
result_bps[0] = result_bps[1] = 0.0;
- if ((s64)len <= 0) {
- fprintf(stderr, "Invalid length:%s\n", length_str);
- return 1;
- }
-
- /* same to without specifying either of prefault and no-prefault */
- if (only_prefault && no_prefault)
- only_prefault = no_prefault = false;
-
- for (i = 0; info->routines[i].name; i++) {
- if (!strcmp(info->routines[i].name, routine))
- break;
- }
- if (!info->routines[i].name) {
- printf("Unknown routine:%s\n", routine);
- printf("Available routines...\n");
- for (i = 0; info->routines[i].name; i++) {
- printf("\t%s ... %s\n",
- info->routines[i].name, info->routines[i].desc);
- }
- return 1;
- }
+ printf("Routine %s (%s)\n", r->name, r->desc);
if (bench_format == BENCH_FORMAT_DEFAULT)
printf("# Copying %s Bytes ...\n\n", length_str);
@@ -191,28 +152,17 @@ static int bench_mem_common(int argc, const char **argv,
if (!only_prefault && !no_prefault) {
/* show both of results */
if (use_cycle) {
- result_cycle[0] =
- info->do_cycle(&info->routines[i], len, false);
- result_cycle[1] =
- info->do_cycle(&info->routines[i], len, true);
+ result_cycle[0] = info->do_cycle(r, len, false);
+ result_cycle[1] = info->do_cycle(r, len, true);
} else {
- result_bps[0] =
- info->do_gettimeofday(&info->routines[i],
- len, false);
- result_bps[1] =
- info->do_gettimeofday(&info->routines[i],
- len, true);
+ result_bps[0] = info->do_gettimeofday(r, len, false);
+ result_bps[1] = info->do_gettimeofday(r, len, true);
}
} else {
- if (use_cycle) {
- result_cycle[pf] =
- info->do_cycle(&info->routines[i],
- len, only_prefault);
- } else {
- result_bps[pf] =
- info->do_gettimeofday(&info->routines[i],
- len, only_prefault);
- }
+ if (use_cycle)
+ result_cycle[pf] = info->do_cycle(r, len, only_prefault);
+ else
+ result_bps[pf] = info->do_gettimeofday(r, len, only_prefault);
}
switch (bench_format) {
@@ -265,6 +215,60 @@ static int bench_mem_common(int argc, const char **argv,
die("unknown format: %d\n", bench_format);
break;
}
+}
+
+static int bench_mem_common(int argc, const char **argv,
+ const char *prefix __maybe_unused,
+ struct bench_mem_info *info)
+{
+ int i;
+ size_t len;
+ double totallen;
+
+ argc = parse_options(argc, argv, options,
+ info->usage, 0);
+
+ if (no_prefault && only_prefault) {
+ fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
+ return 1;
+ }
+
+ if (use_cycle)
+ init_cycle();
+
+ len = (size_t)perf_atoll((char *)length_str);
+ totallen = (double)len * iterations;
+
+ if ((s64)len <= 0) {
+ fprintf(stderr, "Invalid length:%s\n", length_str);
+ return 1;
+ }
+
+ /* same to without specifying either of prefault and no-prefault */
+ if (only_prefault && no_prefault)
+ only_prefault = no_prefault = false;
+
+ if (!strncmp(routine, "all", 3)) {
+ for (i = 0; info->routines[i].name; i++)
+ __bench_mem_routine(info, i, len, totallen);
+ return 0;
+ }
+
+ for (i = 0; info->routines[i].name; i++) {
+ if (!strcmp(info->routines[i].name, routine))
+ break;
+ }
+ if (!info->routines[i].name) {
+ printf("Unknown routine:%s\n", routine);
+ printf("Available routines...\n");
+ for (i = 0; info->routines[i].name; i++) {
+ printf("\t%s ... %s\n",
+ info->routines[i].name, info->routines[i].desc);
+ }
+ return 1;
+ }
+
+ __bench_mem_routine(info, i, len, totallen);
return 0;
}
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h
index a71dff97c1f5..f02d028771d9 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h
@@ -1,12 +1,12 @@
-MEMSET_FN(__memset,
+MEMSET_FN(memset_orig,
"x86-64-unrolled",
"unrolled memset() in arch/x86/lib/memset_64.S")
-MEMSET_FN(memset_c,
+MEMSET_FN(__memset,
"x86-64-stosq",
"movsq-based memset() in arch/x86/lib/memset_64.S")
-MEMSET_FN(memset_c_e,
+MEMSET_FN(memset_erms,
"x86-64-stosb",
"movsb-based memset() in arch/x86/lib/memset_64.S")
diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S
index 9e5af89ed13a..de278784c866 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm.S
+++ b/tools/perf/bench/mem-memset-x86-64-asm.S
@@ -1,8 +1,6 @@
#define memset MEMSET /* don't hide glibc's memset() */
#define altinstr_replacement text
#define globl p2align 4; .globl
-#define Lmemset_c globl memset_c; memset_c
-#define Lmemset_c_e globl memset_c_e; memset_c_e
#include "../../../arch/x86/lib/memset_64.S"
/*
diff --git a/tools/perf/util/include/asm/alternative-asm.h b/tools/perf/util/include/asm/alternative-asm.h
index 6789d788d494..3a3a0f16456a 100644
--- a/tools/perf/util/include/asm/alternative-asm.h
+++ b/tools/perf/util/include/asm/alternative-asm.h
@@ -4,5 +4,6 @@
/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
#define altinstruction_entry #
+#define ALTERNATIVE_2 #
#endif