From 5a51467b146ab7948d2f6812892eac120a30529c Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 18 Jan 2012 20:07:54 -0600 Subject: x86/uv: Fix uv_gpa_to_soc_phys_ram() shift uv_gpa_to_soc_phys_ram() was inadvertently ignoring the shift values. This fix takes the shift into account. Signed-off-by: Russ Anderson Cc: Link: http://lkml.kernel.org/r/20120119020753.GA7228@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 54a13aaebc40..21f7385badb8 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -318,13 +318,13 @@ uv_gpa_in_mmr_space(unsigned long gpa) /* UV global physical address --> socket phys RAM */ static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) { - unsigned long paddr = gpa & uv_hub_info->gpa_mask; + unsigned long paddr; unsigned long remap_base = uv_hub_info->lowmem_remap_base; unsigned long remap_top = uv_hub_info->lowmem_remap_top; gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val); - gpa = gpa & uv_hub_info->gpa_mask; + paddr = gpa & uv_hub_info->gpa_mask; if (paddr >= remap_base && paddr < remap_base + remap_top) paddr -= remap_base; return paddr; -- cgit v1.2.3 From 652847aa449cfe364d40018849223f57f31a38e2 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 20 Jan 2012 17:38:23 +0100 Subject: x86/amd: Add missing feature flag for fam15h models 10h-1fh processors That is the last one missing for those CPUs. Others were recently added with commits fb215366b3c7320ac25dca766a0152df16534932 (KVM: expose latest Intel cpu new features (BMI1/BMI2/FMA/AVX2) to guest) and commit 969df4b82904a30fef19a67398a0c854d223ea67 (x86: Report cpb and eff_freq_ro flags correctly) Signed-off-by: Andreas Herrmann Link: http://lkml.kernel.org/r/20120120163823.GC24508@alberich.amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 17c5d4bdee5e..8d67d428b0f9 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -159,6 +159,7 @@ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ #define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ #define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ +#define X86_FEATURE_TCE (6*32+17) /* translation cache extension */ #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ #define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ #define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ -- cgit v1.2.3 From fc395b9291925b1880e0afc61274fe2f6ddc1269 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 26 Jan 2012 15:47:37 +0000 Subject: x86: Properly parenthesize cmpxchg() macro arguments Quite oddly, all of the arguments passed through from the top level macros to the second level which didn't need parentheses had them, while the only expression (involving a parameter) needing them didn't. Very recently I got bitten by the lack thereof when using something like "array + index" for the first operand, with "array" being an array more narrow than int. Signed-off-by: Jan Beulich Cc: Linus Torvalds Link: http://lkml.kernel.org/r/4F2183A9020000780006F3E6@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cmpxchg.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 0c9fa2745f13..b3b733262909 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -145,13 +145,13 @@ extern void __add_wrong_size(void) #ifdef __HAVE_ARCH_CMPXCHG #define cmpxchg(ptr, old, new) \ - __cmpxchg((ptr), (old), (new), sizeof(*ptr)) + __cmpxchg(ptr, old, new, sizeof(*(ptr))) #define sync_cmpxchg(ptr, old, new) \ - __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) + __sync_cmpxchg(ptr, old, new, sizeof(*(ptr))) #define cmpxchg_local(ptr, old, new) \ - __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) + __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) #endif /* -- cgit v1.2.3 From bdb42f5afebe208eae90406959383856ae2caf2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stephan=20B=C3=A4rwolf?= Date: Thu, 12 Jan 2012 16:43:03 +0100 Subject: KVM: x86: extend "struct x86_emulate_ops" with "get_cpuid" In order to be able to proceed checks on CPU-specific properties within the emulator, function "get_cpuid" is introduced. With "get_cpuid" it is possible to virtually call the guests "cpuid"-opcode without changing the VM's context. [mtosatti: cleanup/beautify code] Signed-off-by: Stephan Baerwolf Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index ab4092e3214e..c8b28689eeeb 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -190,6 +190,9 @@ struct x86_emulate_ops { int (*intercept)(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage); + + bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); }; typedef u32 __attribute__((vector_size(16))) sse128_t; -- cgit v1.2.3 From c2226fc9e87ba3da060e47333657cd6616652b84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stephan=20B=C3=A4rwolf?= Date: Thu, 12 Jan 2012 16:43:04 +0100 Subject: KVM: x86: fix missing checks in syscall emulation On hosts without this patch, 32bit guests will crash (and 64bit guests may behave in a wrong way) for example by simply executing following nasm-demo-application: [bits 32] global _start SECTION .text _start: syscall (I tested it with winxp and linux - both always crashed) Disassembly of section .text: 00000000 <_start>: 0: 0f 05 syscall The reason seems a missing "invalid opcode"-trap (int6) for the syscall opcode "0f05", which is not available on Intel CPUs within non-longmodes, as also on some AMD CPUs within legacy-mode. (depending on CPU vendor, MSR_EFER and cpuid) Because previous mentioned OSs may not engage corresponding syscall target-registers (STAR, LSTAR, CSTAR), they remain NULL and (non trapping) syscalls are leading to multiple faults and finally crashs. Depending on the architecture (AMD or Intel) pretended by guests, various checks according to vendor's documentation are implemented to overcome the current issue and behave like the CPUs physical counterparts. [mtosatti: cleanup/beautify code] Signed-off-by: Stephan Baerwolf Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c8b28689eeeb..7b9cfc4878af 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -301,6 +301,19 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \ X86EMUL_MODE_PROT64) +/* CPUID vendors */ +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65 + +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx 0x69444d41 +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574 +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273 + +#define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547 +#define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e +#define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69 + enum x86_intercept_stage { X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */ X86_ICPT_PRE_EXCEPT, -- cgit v1.2.3 From be98c2cdb15ba26148cd2bd58a857d4f7759ed38 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 13 Feb 2012 13:47:25 -0800 Subject: i387: math_state_restore() isn't called from asm It was marked asmlinkage for some really old and stale legacy reasons. Fix that and the equally stale comment. Noticed when debugging the irq_fpu_usable() bugs. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 6919e936345b..a5c7ae504176 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -29,7 +29,7 @@ extern unsigned int sig_xstate_size; extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); -extern asmlinkage void math_state_restore(void); +extern void math_state_restore(void); extern void __math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); -- cgit v1.2.3 From 5b1cbac37798805c1fee18c8cebe5c0a13975b17 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 13 Feb 2012 13:56:14 -0800 Subject: i387: make irq_fpu_usable() tests more robust Some code - especially the crypto layer - wants to use the x86 FP/MMX/AVX register set in what may be interrupt (typically softirq) context. That *can* be ok, but the tests for when it was ok were somewhat suspect. We cannot touch the thread-specific status bits either, so we'd better check that we're not going to try to save FP state or anything like that. Now, it may be that the TS bit is always cleared *before* we set the USEDFPU bit (and only set when we had already cleared the USEDFP before), so the TS bit test may actually have been sufficient, but it certainly was not obviously so. So this explicitly verifies that we will not touch the TS_USEDFPU bit, and adds a few related sanity-checks. Because it seems that somehow AES-NI is corrupting user FP state. The cause is not clear, and this patch doesn't fix it, but while debugging it I really wanted the code to be more obviously correct and robust. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 54 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 8 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a5c7ae504176..a29571821b99 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -307,9 +307,54 @@ static inline void __clear_fpu(struct task_struct *tsk) } } +/* + * Were we in an interrupt that interrupted kernel mode? + * + * We can do a kernel_fpu_begin/end() pair *ONLY* if that + * pair does nothing at all: TS_USEDFPU must be clear (so + * that we don't try to save the FPU state), and TS must + * be set (so that the clts/stts pair does nothing that is + * visible in the interrupted kernel thread). + */ +static inline bool interrupted_kernel_fpu_idle(void) +{ + return !(current_thread_info()->status & TS_USEDFPU) && + (read_cr0() & X86_CR0_TS); +} + +/* + * Were we in user mode (or vm86 mode) when we were + * interrupted? + * + * Doing kernel_fpu_begin/end() is ok if we are running + * in an interrupt context from user mode - we'll just + * save the FPU state as required. + */ +static inline bool interrupted_user_mode(void) +{ + struct pt_regs *regs = get_irq_regs(); + return regs && user_mode_vm(regs); +} + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + * + * It's always ok in process context (ie "not interrupt") + * but it is sometimes ok even from an irq. + */ +static inline bool irq_fpu_usable(void) +{ + return !in_interrupt() || + interrupted_user_mode() || + interrupted_kernel_fpu_idle(); +} + static inline void kernel_fpu_begin(void) { struct thread_info *me = current_thread_info(); + + WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); if (me->status & TS_USEDFPU) __save_init_fpu(me->task); @@ -323,14 +368,6 @@ static inline void kernel_fpu_end(void) preempt_enable(); } -static inline bool irq_fpu_usable(void) -{ - struct pt_regs *regs; - - return !in_interrupt() || !(regs = get_irq_regs()) || \ - user_mode(regs) || (read_cr0() & X86_CR0_TS); -} - /* * Some instructions like VIA's padlock instructions generate a spurious * DNA fault but don't modify SSE registers. And these instructions @@ -367,6 +404,7 @@ static inline void irq_ts_restore(int TS_state) */ static inline void save_init_fpu(struct task_struct *tsk) { + WARN_ON_ONCE(task_thread_info(tsk)->status & TS_USEDFPU); preempt_disable(); __save_init_fpu(tsk); stts(); -- cgit v1.2.3 From c38e23456278e967f094b08247ffc3711b1029b2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 15 Feb 2012 08:05:18 -0800 Subject: i387: fix sense of sanity check The check for save_init_fpu() (introduced in commit 5b1cbac37798: "i387: make irq_fpu_usable() tests more robust") was the wrong way around, but I hadn't noticed, because my "tests" were bogus: the FPU exceptions are disabled by default, so even doing a divide by zero never actually triggers this code at all unless you do extra work to enable them. So if anybody did enable them, they'd get one spurious warning. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a29571821b99..727c1dd84899 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -404,7 +404,7 @@ static inline void irq_ts_restore(int TS_state) */ static inline void save_init_fpu(struct task_struct *tsk) { - WARN_ON_ONCE(task_thread_info(tsk)->status & TS_USEDFPU); + WARN_ON_ONCE(!(task_thread_info(tsk)->status & TS_USEDFPU)); preempt_disable(); __save_init_fpu(tsk); stts(); -- cgit v1.2.3 From 15d8791cae75dca27bfda8ecfe87dca9379d6bb0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2012 09:15:04 -0800 Subject: i387: fix x86-64 preemption-unsafe user stack save/restore Commit 5b1cbac37798 ("i387: make irq_fpu_usable() tests more robust") added a sanity check to the #NM handler to verify that we never cause the "Device Not Available" exception in kernel mode. However, that check actually pinpointed a (fundamental) race where we do cause that exception as part of the signal stack FPU state save/restore code. Because we use the floating point instructions themselves to save and restore state directly from user mode, we cannot do that atomically with testing the TS_USEDFPU bit: the user mode access itself may cause a page fault, which causes a task switch, which saves and restores the FP/MMX state from the kernel buffers. This kind of "recursive" FP state save is fine per se, but it means that when the signal stack save/restore gets restarted, it will now take the '#NM' exception we originally tried to avoid. With preemption this can happen even without the page fault - but because of the user access, we cannot just disable preemption around the save/restore instruction. There are various ways to solve this, including using the "enable/disable_page_fault()" helpers to not allow page faults at all during the sequence, and fall back to copying things by hand without the use of the native FP state save/restore instructions. However, the simplest thing to do is to just allow the #NM from kernel space, but fix the race in setting and clearing CR0.TS that this all exposed: the TS bit changes and the TS_USEDFPU bit absolutely have to be atomic wrt scheduling, so while the actual state save/restore can be interrupted and restarted, the act of actually clearing/setting CR0.TS and the TS_USEDFPU bit together must not. Instead of just adding random "preempt_disable/enable()" calls to what is already excessively ugly code, this introduces some helper functions that mostly mirror the "kernel_fpu_begin/end()" functionality, just for the user state instead. Those helper functions should probably eventually replace the other ad-hoc CR0.TS and TS_USEDFPU tests too, but I'll need to think about it some more: the task switching functionality in particular needs to expose the difference between the 'prev' and 'next' threads, while the new helper functions intentionally were written to only work with 'current'. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 727c1dd84899..f704be239883 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -399,6 +399,48 @@ static inline void irq_ts_restore(int TS_state) stts(); } +/* + * The question "does this thread have fpu access?" + * is slightly racy, since preemption could come in + * and revoke it immediately after the test. + * + * However, even in that very unlikely scenario, + * we can just assume we have FPU access - typically + * to save the FP state - we'll just take a #NM + * fault and get the FPU access back. + * + * The actual user_fpu_begin/end() functions + * need to be preemption-safe, though. + * + * NOTE! user_fpu_end() must be used only after you + * have saved the FP state, and user_fpu_begin() must + * be used only immediately before restoring it. + * These functions do not do any save/restore on + * their own. + */ +static inline int user_has_fpu(void) +{ + return current_thread_info()->status & TS_USEDFPU; +} + +static inline void user_fpu_end(void) +{ + preempt_disable(); + current_thread_info()->status &= ~TS_USEDFPU; + stts(); + preempt_enable(); +} + +static inline void user_fpu_begin(void) +{ + preempt_disable(); + if (!user_has_fpu()) { + clts(); + current_thread_info()->status |= TS_USEDFPU; + } + preempt_enable(); +} + /* * These disable preemption on their own and are safe */ -- cgit v1.2.3 From b6c66418dcad0fcf83cd1d0a39482db37bf4fc41 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2012 12:22:48 -0800 Subject: i387: move TS_USEDFPU clearing out of __save_init_fpu and into callers Touching TS_USEDFPU without touching CR0.TS is confusing, so don't do it. By moving it into the callers, we always do the TS_USEDFPU next to the CR0.TS accesses in the source code, and it's much easier to see how the two go hand in hand. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index f704be239883..1e12c2d087e4 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -259,7 +259,6 @@ static inline void fpu_save_init(struct fpu *fpu) static inline void __save_init_fpu(struct task_struct *tsk) { fpu_save_init(&tsk->thread.fpu); - task_thread_info(tsk)->status &= ~TS_USEDFPU; } static inline int fpu_fxrstor_checking(struct fpu *fpu) @@ -290,6 +289,7 @@ static inline void __unlazy_fpu(struct task_struct *tsk) { if (task_thread_info(tsk)->status & TS_USEDFPU) { __save_init_fpu(tsk); + task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); } else tsk->fpu_counter = 0; @@ -356,9 +356,11 @@ static inline void kernel_fpu_begin(void) WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); - if (me->status & TS_USEDFPU) + if (me->status & TS_USEDFPU) { __save_init_fpu(me->task); - else + me->status &= ~TS_USEDFPU; + /* We do 'stts()' in kernel_fpu_end() */ + } else clts(); } @@ -449,6 +451,7 @@ static inline void save_init_fpu(struct task_struct *tsk) WARN_ON_ONCE(!(task_thread_info(tsk)->status & TS_USEDFPU)); preempt_disable(); __save_init_fpu(tsk); + task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); preempt_enable(); } -- cgit v1.2.3 From 6d59d7a9f5b723a7ac1925c136e93ec83c0c3043 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2012 13:33:12 -0800 Subject: i387: don't ever touch TS_USEDFPU directly, use helper functions This creates three helper functions that do the TS_USEDFPU accesses, and makes everybody that used to do it by hand use those helpers instead. In addition, there's a couple of helper functions for the "change both CR0.TS and TS_USEDFPU at the same time" case, and the places that do that together have been changed to use those. That means that we have fewer random places that open-code this situation. The intent is partly to clarify the code without actually changing any semantics yet (since we clearly still have some hard to reproduce bug in this area), but also to make it much easier to use another approach entirely to caching the CR0.TS bit for software accesses. Right now we use a bit in the thread-info 'status' variable (this patch does not change that), but we might want to make it a full field of its own or even make it a per-cpu variable. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 75 +++++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 20 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 1e12c2d087e4..548b2c07ac9a 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -279,6 +279,47 @@ static inline int restore_fpu_checking(struct task_struct *tsk) return fpu_restore_checking(&tsk->thread.fpu); } +/* + * Software FPU state helpers. Careful: these need to + * be preemption protection *and* they need to be + * properly paired with the CR0.TS changes! + */ +static inline int __thread_has_fpu(struct thread_info *ti) +{ + return ti->status & TS_USEDFPU; +} + +/* Must be paired with an 'stts' after! */ +static inline void __thread_clear_has_fpu(struct thread_info *ti) +{ + ti->status &= ~TS_USEDFPU; +} + +/* Must be paired with a 'clts' before! */ +static inline void __thread_set_has_fpu(struct thread_info *ti) +{ + ti->status |= TS_USEDFPU; +} + +/* + * Encapsulate the CR0.TS handling together with the + * software flag. + * + * These generally need preemption protection to work, + * do try to avoid using these on their own. + */ +static inline void __thread_fpu_end(struct thread_info *ti) +{ + __thread_clear_has_fpu(ti); + stts(); +} + +static inline void __thread_fpu_begin(struct thread_info *ti) +{ + clts(); + __thread_set_has_fpu(ti); +} + /* * Signal frame handlers... */ @@ -287,23 +328,21 @@ extern int restore_i387_xstate(void __user *buf); static inline void __unlazy_fpu(struct task_struct *tsk) { - if (task_thread_info(tsk)->status & TS_USEDFPU) { + if (__thread_has_fpu(task_thread_info(tsk))) { __save_init_fpu(tsk); - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); + __thread_fpu_end(task_thread_info(tsk)); } else tsk->fpu_counter = 0; } static inline void __clear_fpu(struct task_struct *tsk) { - if (task_thread_info(tsk)->status & TS_USEDFPU) { + if (__thread_has_fpu(task_thread_info(tsk))) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); + __thread_fpu_end(task_thread_info(tsk)); } } @@ -311,14 +350,14 @@ static inline void __clear_fpu(struct task_struct *tsk) * Were we in an interrupt that interrupted kernel mode? * * We can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: TS_USEDFPU must be clear (so + * pair does nothing at all: the thread must not have fpu (so * that we don't try to save the FPU state), and TS must * be set (so that the clts/stts pair does nothing that is * visible in the interrupted kernel thread). */ static inline bool interrupted_kernel_fpu_idle(void) { - return !(current_thread_info()->status & TS_USEDFPU) && + return !__thread_has_fpu(current_thread_info()) && (read_cr0() & X86_CR0_TS); } @@ -356,9 +395,9 @@ static inline void kernel_fpu_begin(void) WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); - if (me->status & TS_USEDFPU) { + if (__thread_has_fpu(me)) { __save_init_fpu(me->task); - me->status &= ~TS_USEDFPU; + __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ } else clts(); @@ -422,24 +461,21 @@ static inline void irq_ts_restore(int TS_state) */ static inline int user_has_fpu(void) { - return current_thread_info()->status & TS_USEDFPU; + return __thread_has_fpu(current_thread_info()); } static inline void user_fpu_end(void) { preempt_disable(); - current_thread_info()->status &= ~TS_USEDFPU; - stts(); + __thread_fpu_end(current_thread_info()); preempt_enable(); } static inline void user_fpu_begin(void) { preempt_disable(); - if (!user_has_fpu()) { - clts(); - current_thread_info()->status |= TS_USEDFPU; - } + if (!user_has_fpu()) + __thread_fpu_begin(current_thread_info()); preempt_enable(); } @@ -448,11 +484,10 @@ static inline void user_fpu_begin(void) */ static inline void save_init_fpu(struct task_struct *tsk) { - WARN_ON_ONCE(!(task_thread_info(tsk)->status & TS_USEDFPU)); + WARN_ON_ONCE(!__thread_has_fpu(task_thread_info(tsk))); preempt_disable(); __save_init_fpu(tsk); - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); + __thread_fpu_end(task_thread_info(tsk)); preempt_enable(); } -- cgit v1.2.3 From b3b0870ef3ffed72b92415423da864f440f57ad6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2012 15:45:23 -0800 Subject: i387: do not preload FPU state at task switch time Yes, taking the trap to re-load the FPU/MMX state is expensive, but so is spending several days looking for a bug in the state save/restore code. And the preload code has some rather subtle interactions with both paravirtualization support and segment state restore, so it's not nearly as simple as it should be. Also, now that we no longer necessarily depend on a single bit (ie TS_USEDFPU) for keeping track of the state of the FPU, we migth be able to do better. If we are really switching between two processes that keep touching the FP state, save/restore is inevitable, but in the case of having one process that does most of the FPU usage, we may actually be able to do much better than the preloading. In particular, we may be able to keep track of which CPU the process ran on last, and also per CPU keep track of which process' FP state that CPU has. For modern CPU's that don't destroy the FPU contents on save time, that would allow us to do a lazy restore by just re-enabling the existing FPU state - with no restore cost at all! Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 548b2c07ac9a..86974c72d0d0 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -30,7 +30,6 @@ extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); extern void math_state_restore(void); -extern void __math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern user_regset_active_fn fpregs_active, xfpregs_active; -- cgit v1.2.3 From 4903062b5485f0e2c286a23b44c9b59d9b017d53 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 16 Feb 2012 19:11:15 -0800 Subject: i387: move AMD K7/K8 fpu fxsave/fxrstor workaround from save to restore The AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. In order to not leak FIP state from one process to another, we need to do a floating point load after the fxsave of the old process, and before the fxrstor of the new FPU state. That resets the state to the (uninteresting) kernel load, rather than some potentially sensitive user information. We used to do this directly after the FPU state save, but that is actually very inconvenient, since it (a) corrupts what is potentially perfectly good FPU state that we might want to lazy avoid restoring later and (b) on x86-64 it resulted in a very annoying ordering constraint, where "__unlazy_fpu()" in the task switch needs to be delayed until after the DS segment has been reloaded just to get the new DS value. Coupling it to the fxrstor instead of the fxsave automatically avoids both of these issues, and also ensures that we only do it when actually necessary (the FP state after a save may never actually get used). It's simply a much more natural place for the leaked state cleanup. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 86974c72d0d0..01b115d86770 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -211,15 +211,6 @@ static inline void fpu_fxsave(struct fpu *fpu) #endif /* CONFIG_X86_64 */ -/* We need a safe address that is cheap to find and that is already - in L1 during context switch. The best choices are unfortunately - different for UP and SMP */ -#ifdef CONFIG_SMP -#define safe_address (__per_cpu_offset[0]) -#else -#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER]) -#endif - /* * These must be called with preempt disabled */ @@ -243,16 +234,6 @@ static inline void fpu_save_init(struct fpu *fpu) if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) asm volatile("fnclex"); - - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. safe_address is a random variable that should be in L1 */ - alternative_input( - ASM_NOP8 ASM_NOP2, - "emms\n\t" /* clear stack tags */ - "fildl %P[addr]", /* set F?P to defined value */ - X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (safe_address)); } static inline void __save_init_fpu(struct task_struct *tsk) -- cgit v1.2.3 From f94edacf998516ac9d849f7bc6949a703977a7f3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 17 Feb 2012 21:48:54 -0800 Subject: i387: move TS_USEDFPU flag from thread_info to task_struct This moves the bit that indicates whether a thread has ownership of the FPU from the TS_USEDFPU bit in thread_info->status to a word of its own (called 'has_fpu') in task_struct->thread.has_fpu. This fixes two independent bugs at the same time: - changing 'thread_info->status' from the scheduler causes nasty problems for the other users of that variable, since it is defined to be thread-synchronous (that's what the "TS_" part of the naming was supposed to indicate). So perfectly valid code could (and did) do ti->status |= TS_RESTORE_SIGMASK; and the compiler was free to do that as separate load, or and store instructions. Which can cause problems with preemption, since a task switch could happen in between, and change the TS_USEDFPU bit. The change to TS_USEDFPU would be overwritten by the final store. In practice, this seldom happened, though, because the 'status' field was seldom used more than once, so gcc would generally tend to generate code that used a read-modify-write instruction and thus happened to avoid this problem - RMW instructions are naturally low fat and preemption-safe. - On x86-32, the current_thread_info() pointer would, during interrupts and softirqs, point to a *copy* of the real thread_info, because x86-32 uses %esp to calculate the thread_info address, and thus the separate irq (and softirq) stacks would cause these kinds of odd thread_info copy aliases. This is normally not a problem, since interrupts aren't supposed to look at thread information anyway (what thread is running at interrupt time really isn't very well-defined), but it confused the heck out of irq_fpu_usable() and the code that tried to squirrel away the FPU state. (It also caused untold confusion for us poor kernel developers). It also turns out that using 'task_struct' is actually much more natural for most of the call sites that care about the FPU state, since they tend to work with the task struct for other reasons anyway (ie scheduling). And the FPU data that we are going to save/restore is found there too. Thanks to Arjan Van De Ven for pointing us to the %esp issue. Cc: Arjan van de Ven Reported-and-tested-by: Raphael Prevost Acked-and-tested-by: Suresh Siddha Tested-by: Peter Anvin Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 44 +++++++++++++++++++------------------- arch/x86/include/asm/processor.h | 1 + arch/x86/include/asm/thread_info.h | 2 -- 3 files changed, 23 insertions(+), 24 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 01b115d86770..f5376676f89c 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -264,21 +264,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk) * be preemption protection *and* they need to be * properly paired with the CR0.TS changes! */ -static inline int __thread_has_fpu(struct thread_info *ti) +static inline int __thread_has_fpu(struct task_struct *tsk) { - return ti->status & TS_USEDFPU; + return tsk->thread.has_fpu; } /* Must be paired with an 'stts' after! */ -static inline void __thread_clear_has_fpu(struct thread_info *ti) +static inline void __thread_clear_has_fpu(struct task_struct *tsk) { - ti->status &= ~TS_USEDFPU; + tsk->thread.has_fpu = 0; } /* Must be paired with a 'clts' before! */ -static inline void __thread_set_has_fpu(struct thread_info *ti) +static inline void __thread_set_has_fpu(struct task_struct *tsk) { - ti->status |= TS_USEDFPU; + tsk->thread.has_fpu = 1; } /* @@ -288,16 +288,16 @@ static inline void __thread_set_has_fpu(struct thread_info *ti) * These generally need preemption protection to work, * do try to avoid using these on their own. */ -static inline void __thread_fpu_end(struct thread_info *ti) +static inline void __thread_fpu_end(struct task_struct *tsk) { - __thread_clear_has_fpu(ti); + __thread_clear_has_fpu(tsk); stts(); } -static inline void __thread_fpu_begin(struct thread_info *ti) +static inline void __thread_fpu_begin(struct task_struct *tsk) { clts(); - __thread_set_has_fpu(ti); + __thread_set_has_fpu(tsk); } /* @@ -308,21 +308,21 @@ extern int restore_i387_xstate(void __user *buf); static inline void __unlazy_fpu(struct task_struct *tsk) { - if (__thread_has_fpu(task_thread_info(tsk))) { + if (__thread_has_fpu(tsk)) { __save_init_fpu(tsk); - __thread_fpu_end(task_thread_info(tsk)); + __thread_fpu_end(tsk); } else tsk->fpu_counter = 0; } static inline void __clear_fpu(struct task_struct *tsk) { - if (__thread_has_fpu(task_thread_info(tsk))) { + if (__thread_has_fpu(tsk)) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(task_thread_info(tsk)); + __thread_fpu_end(tsk); } } @@ -337,7 +337,7 @@ static inline void __clear_fpu(struct task_struct *tsk) */ static inline bool interrupted_kernel_fpu_idle(void) { - return !__thread_has_fpu(current_thread_info()) && + return !__thread_has_fpu(current) && (read_cr0() & X86_CR0_TS); } @@ -371,12 +371,12 @@ static inline bool irq_fpu_usable(void) static inline void kernel_fpu_begin(void) { - struct thread_info *me = current_thread_info(); + struct task_struct *me = current; WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); if (__thread_has_fpu(me)) { - __save_init_fpu(me->task); + __save_init_fpu(me); __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ } else @@ -441,13 +441,13 @@ static inline void irq_ts_restore(int TS_state) */ static inline int user_has_fpu(void) { - return __thread_has_fpu(current_thread_info()); + return __thread_has_fpu(current); } static inline void user_fpu_end(void) { preempt_disable(); - __thread_fpu_end(current_thread_info()); + __thread_fpu_end(current); preempt_enable(); } @@ -455,7 +455,7 @@ static inline void user_fpu_begin(void) { preempt_disable(); if (!user_has_fpu()) - __thread_fpu_begin(current_thread_info()); + __thread_fpu_begin(current); preempt_enable(); } @@ -464,10 +464,10 @@ static inline void user_fpu_begin(void) */ static inline void save_init_fpu(struct task_struct *tsk) { - WARN_ON_ONCE(!__thread_has_fpu(task_thread_info(tsk))); + WARN_ON_ONCE(!__thread_has_fpu(tsk)); preempt_disable(); __save_init_fpu(tsk); - __thread_fpu_end(task_thread_info(tsk)); + __thread_fpu_end(tsk); preempt_enable(); } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index aa9088c26931..f7c89e231c6c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -454,6 +454,7 @@ struct thread_struct { unsigned long trap_no; unsigned long error_code; /* floating point and extended processor state */ + unsigned long has_fpu; struct fpu fpu; #ifdef CONFIG_X86_32 /* Virtual 86 mode info */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index bc817cd8b443..cfd8144d5527 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -247,8 +247,6 @@ static inline struct thread_info *current_thread_info(void) * ever touches our thread-synchronous status, so we don't * have to worry about atomic accesses. */ -#define TS_USEDFPU 0x0001 /* FPU was used by this task - this quantum (SMP) */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ #define TS_POLLING 0x0004 /* idle task polling need_resched, skip sending interrupt */ -- cgit v1.2.3 From 34ddc81a230b15c0e345b6b253049db731499f7e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 18 Feb 2012 12:56:35 -0800 Subject: i387: re-introduce FPU state preloading at context switch time After all the FPU state cleanups and finally finding the problem that caused all our FPU save/restore problems, this re-introduces the preloading of FPU state that was removed in commit b3b0870ef3ff ("i387: do not preload FPU state at task switch time"). However, instead of simply reverting the removal, this reimplements preloading with several fixes, most notably - properly abstracted as a true FPU state switch, rather than as open-coded save and restore with various hacks. In particular, implementing it as a proper FPU state switch allows us to optimize the CR0.TS flag accesses: there is no reason to set the TS bit only to then almost immediately clear it again. CR0 accesses are quite slow and expensive, don't flip the bit back and forth for no good reason. - Make sure that the same model works for both x86-32 and x86-64, so that there are no gratuitous differences between the two due to the way they save and restore segment state differently due to architectural differences that really don't matter to the FPU state. - Avoid exposing the "preload" state to the context switch routines, and in particular allow the concept of lazy state restore: if nothing else has used the FPU in the meantime, and the process is still on the same CPU, we can avoid restoring state from memory entirely, just re-expose the state that is still in the FPU unit. That optimized lazy restore isn't actually implemented here, but the infrastructure is set up for it. Of course, older CPU's that use 'fnsave' to save the state cannot take advantage of this, since the state saving also trashes the state. In other words, there is now an actual _design_ to the FPU state saving, rather than just random historical baggage. Hopefully it's easier to follow as a result. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 110 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 93 insertions(+), 17 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index f5376676f89c..a850b4d8d14d 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -29,6 +29,7 @@ extern unsigned int sig_xstate_size; extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); +extern void __math_state_restore(struct task_struct *); extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); @@ -212,9 +213,10 @@ static inline void fpu_fxsave(struct fpu *fpu) #endif /* CONFIG_X86_64 */ /* - * These must be called with preempt disabled + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact. */ -static inline void fpu_save_init(struct fpu *fpu) +static inline int fpu_save_init(struct fpu *fpu) { if (use_xsave()) { fpu_xsave(fpu); @@ -223,22 +225,33 @@ static inline void fpu_save_init(struct fpu *fpu) * xsave header may indicate the init state of the FP. */ if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) - return; + return 1; } else if (use_fxsr()) { fpu_fxsave(fpu); } else { asm volatile("fnsave %[fx]; fwait" : [fx] "=m" (fpu->state->fsave)); - return; + return 0; } - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) + /* + * If exceptions are pending, we need to clear them so + * that we don't randomly get exceptions later. + * + * FIXME! Is this perhaps only true for the old-style + * irq13 case? Maybe we could leave the x87 state + * intact otherwise? + */ + if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { asm volatile("fnclex"); + return 0; + } + return 1; } -static inline void __save_init_fpu(struct task_struct *tsk) +static inline int __save_init_fpu(struct task_struct *tsk) { - fpu_save_init(&tsk->thread.fpu); + return fpu_save_init(&tsk->thread.fpu); } static inline int fpu_fxrstor_checking(struct fpu *fpu) @@ -301,20 +314,79 @@ static inline void __thread_fpu_begin(struct task_struct *tsk) } /* - * Signal frame handlers... + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state and + * sets the new state of the CR0.TS bit. This is + * done within the context of the old process. + * + * - switch_fpu_finish() restores the new state as + * necessary. */ -extern int save_i387_xstate(void __user *buf); -extern int restore_i387_xstate(void __user *buf); +typedef struct { int preload; } fpu_switch_t; + +/* + * FIXME! We could do a totally lazy restore, but we need to + * add a per-cpu "this was the task that last touched the FPU + * on this CPU" variable, and the task needs to have a "I last + * touched the FPU on this CPU" and check them. + * + * We don't do that yet, so "fpu_lazy_restore()" always returns + * false, but some day.. + */ +#define fpu_lazy_restore(tsk) (0) +#define fpu_lazy_state_intact(tsk) do { } while (0) + +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new) +{ + fpu_switch_t fpu; + + fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; + if (__thread_has_fpu(old)) { + if (__save_init_fpu(old)) + fpu_lazy_state_intact(old); + __thread_clear_has_fpu(old); + old->fpu_counter++; + + /* Don't change CR0.TS if we just switch! */ + if (fpu.preload) { + __thread_set_has_fpu(new); + prefetch(new->thread.fpu.state); + } else + stts(); + } else { + old->fpu_counter = 0; + if (fpu.preload) { + if (fpu_lazy_restore(new)) + fpu.preload = 0; + else + prefetch(new->thread.fpu.state); + __thread_fpu_begin(new); + } + } + return fpu; +} -static inline void __unlazy_fpu(struct task_struct *tsk) +/* + * By the time this gets called, we've already cleared CR0.TS and + * given the process the FPU if we are going to preload the FPU + * state - all we need to do is to conditionally restore the register + * state itself. + */ +static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) { - if (__thread_has_fpu(tsk)) { - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - } else - tsk->fpu_counter = 0; + if (fpu.preload) + __math_state_restore(new); } +/* + * Signal frame handlers... + */ +extern int save_i387_xstate(void __user *buf); +extern int restore_i387_xstate(void __user *buf); + static inline void __clear_fpu(struct task_struct *tsk) { if (__thread_has_fpu(tsk)) { @@ -474,7 +546,11 @@ static inline void save_init_fpu(struct task_struct *tsk) static inline void unlazy_fpu(struct task_struct *tsk) { preempt_disable(); - __unlazy_fpu(tsk); + if (__thread_has_fpu(tsk)) { + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + } else + tsk->fpu_counter = 0; preempt_enable(); } -- cgit v1.2.3 From cea20ca3f3181fc36788a15bc65d1062b96a0a6c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 20 Feb 2012 10:24:09 -0800 Subject: i387: fix up some fpu_counter confusion This makes sure we clear the FPU usage counter for newly created tasks, just so that we start off in a known state (for example, don't try to preload the FPU state on the first task switch etc). It also fixes a thinko in when we increment the fpu_counter at task switch time, introduced by commit 34ddc81a230b ("i387: re-introduce FPU state preloading at context switch time"). We should increment the *new* task fpu_counter, not the old task, and only if we decide to use that state (whether lazily or preloaded). Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a850b4d8d14d..8df95849721d 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -348,10 +348,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta if (__save_init_fpu(old)) fpu_lazy_state_intact(old); __thread_clear_has_fpu(old); - old->fpu_counter++; /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { + new->fpu_counter++; __thread_set_has_fpu(new); prefetch(new->thread.fpu.state); } else @@ -359,6 +359,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta } else { old->fpu_counter = 0; if (fpu.preload) { + new->fpu_counter++; if (fpu_lazy_restore(new)) fpu.preload = 0; else -- cgit v1.2.3 From 80ab6f1e8c981b1b6604b2f22e36c917526235cd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Feb 2012 11:48:44 -0800 Subject: i387: use 'restore_fpu_checking()' directly in task switching code This inlines what is usually just a couple of instructions, but more importantly it also fixes the theoretical error case (can that FPU restore really ever fail? Maybe we should remove the checking). We can't start sending signals from within the scheduler, we're much too deep in the kernel and are holding the runqueue lock etc. So don't bother even trying. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 8df95849721d..74c607b37e87 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -29,7 +29,6 @@ extern unsigned int sig_xstate_size; extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); -extern void __math_state_restore(struct task_struct *); extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); @@ -269,6 +268,16 @@ static inline int fpu_restore_checking(struct fpu *fpu) static inline int restore_fpu_checking(struct task_struct *tsk) { + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception + is pending. Clear the x87 state here by setting it to fixed + values. "m" is a random variable that should be in L1 */ + alternative_input( + ASM_NOP8 ASM_NOP2, + "emms\n\t" /* clear stack tags */ + "fildl %P[addr]", /* set F?P to defined value */ + X86_FEATURE_FXSAVE_LEAK, + [addr] "m" (tsk->thread.has_fpu)); + return fpu_restore_checking(&tsk->thread.fpu); } @@ -378,8 +387,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta */ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) { - if (fpu.preload) - __math_state_restore(new); + if (fpu.preload) { + if (unlikely(restore_fpu_checking(new))) + __thread_fpu_end(new); + } } /* -- cgit v1.2.3 From 7e16838d94b566a17b65231073d179bc04d590c8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Feb 2012 13:27:00 -0800 Subject: i387: support lazy restore of FPU state This makes us recognize when we try to restore FPU state that matches what we already have in the FPU on this CPU, and avoids the restore entirely if so. To do this, we add two new data fields: - a percpu 'fpu_owner_task' variable that gets written any time we update the "has_fpu" field, and thus acts as a kind of back-pointer to the task that owns the CPU. The exception is when we save the FPU state as part of a context switch - if the save can keep the FPU state around, we leave the 'fpu_owner_task' variable pointing at the task whose FP state still remains on the CPU. - a per-thread 'last_cpu' field, that indicates which CPU that thread used its FPU on last. We update this on every context switch (writing an invalid CPU number if the last context switch didn't leave the FPU in a lazily usable state), so we know that *that* thread has done nothing else with the FPU since. These two fields together can be used when next switching back to the task to see if the CPU still matches: if 'fpu_owner_task' matches the task we are switching to, we know that no other task (or kernel FPU usage) touched the FPU on this CPU in the meantime, and if the current CPU number matches the 'last_cpu' field, we know that this thread did no other FP work on any other CPU, so the FPU state on the CPU must match what was saved on last context switch. In that case, we can avoid the 'f[x]rstor' entirely, and just clear the CR0.TS bit. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 35 +++++++++++++++++++++++------------ arch/x86/include/asm/processor.h | 3 ++- 2 files changed, 25 insertions(+), 13 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 74c607b37e87..247904945d3f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -32,6 +32,8 @@ extern int init_fpu(struct task_struct *child); extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); +DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); + extern user_regset_active_fn fpregs_active, xfpregs_active; extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, xstateregs_get; @@ -276,7 +278,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk) "emms\n\t" /* clear stack tags */ "fildl %P[addr]", /* set F?P to defined value */ X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (tsk->thread.has_fpu)); + [addr] "m" (tsk->thread.fpu.has_fpu)); return fpu_restore_checking(&tsk->thread.fpu); } @@ -288,19 +290,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk) */ static inline int __thread_has_fpu(struct task_struct *tsk) { - return tsk->thread.has_fpu; + return tsk->thread.fpu.has_fpu; } /* Must be paired with an 'stts' after! */ static inline void __thread_clear_has_fpu(struct task_struct *tsk) { - tsk->thread.has_fpu = 0; + tsk->thread.fpu.has_fpu = 0; + percpu_write(fpu_owner_task, NULL); } /* Must be paired with a 'clts' before! */ static inline void __thread_set_has_fpu(struct task_struct *tsk) { - tsk->thread.has_fpu = 1; + tsk->thread.fpu.has_fpu = 1; + percpu_write(fpu_owner_task, tsk); } /* @@ -345,18 +349,22 @@ typedef struct { int preload; } fpu_switch_t; * We don't do that yet, so "fpu_lazy_restore()" always returns * false, but some day.. */ -#define fpu_lazy_restore(tsk) (0) -#define fpu_lazy_state_intact(tsk) do { } while (0) +static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +{ + return new == percpu_read_stable(fpu_owner_task) && + cpu == new->thread.fpu.last_cpu; +} -static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new) +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) { fpu_switch_t fpu; fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; if (__thread_has_fpu(old)) { - if (__save_init_fpu(old)) - fpu_lazy_state_intact(old); - __thread_clear_has_fpu(old); + if (!__save_init_fpu(old)) + cpu = ~0; + old->thread.fpu.last_cpu = cpu; + old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { @@ -367,9 +375,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta stts(); } else { old->fpu_counter = 0; + old->thread.fpu.last_cpu = ~0; if (fpu.preload) { new->fpu_counter++; - if (fpu_lazy_restore(new)) + if (fpu_lazy_restore(new, cpu)) fpu.preload = 0; else prefetch(new->thread.fpu.state); @@ -463,8 +472,10 @@ static inline void kernel_fpu_begin(void) __save_init_fpu(me); __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ - } else + } else { + percpu_write(fpu_owner_task, NULL); clts(); + } } static inline void kernel_fpu_end(void) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f7c89e231c6c..58545c97d071 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -374,6 +374,8 @@ union thread_xstate { }; struct fpu { + unsigned int last_cpu; + unsigned int has_fpu; union thread_xstate *state; }; @@ -454,7 +456,6 @@ struct thread_struct { unsigned long trap_no; unsigned long error_code; /* floating point and extended processor state */ - unsigned long has_fpu; struct fpu fpu; #ifdef CONFIG_X86_32 /* Virtual 86 mode info */ -- cgit v1.2.3 From 1018faa6cf23b256bf25919ef203cd7c129f06f2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 29 Feb 2012 14:57:32 +0100 Subject: perf/x86/kvm: Fix Host-Only/Guest-Only counting with SVM disabled It turned out that a performance counter on AMD does not count at all when the GO or HO bit is set in the control register and SVM is disabled in EFER. This patch works around this issue by masking out the HO bit in the performance counter control register when SVM is not enabled. The GO bit is not touched because it is only set when the user wants to count in guest-mode only. So when SVM is disabled the counter should not run at all and the not-counting is the intended behaviour. Signed-off-by: Joerg Roedel Signed-off-by: Peter Zijlstra Cc: Avi Kivity Cc: Stephane Eranian Cc: David Ahern Cc: Gleb Natapov Cc: Robert Richter Cc: stable@vger.kernel.org # v3.2 Link: http://lkml.kernel.org/r/1330523852-19566-1-git-send-email-joerg.roedel@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 096c975e099f..461ce432b1c2 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -242,4 +242,12 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) static inline void perf_events_lapic_init(void) { } #endif +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) + extern void amd_pmu_enable_virt(void); + extern void amd_pmu_disable_virt(void); +#else + static inline void amd_pmu_enable_virt(void) { } + static inline void amd_pmu_disable_virt(void) { } +#endif + #endif /* _ASM_X86_PERF_EVENT_H */ -- cgit v1.2.3