diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-05-24 11:00:45 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-05-24 11:00:45 -0700 |
| commit | 3526d7462355c4df269d4a87eed53dbd48a1df02 (patch) | |
| tree | 9eee3b40fda02b288aa3cbcb27bab3fe5e2523a7 | |
| parent | a674bf74b31079782d7d8f333c8d832374f0b65c (diff) | |
| parent | fd948c3f96b18ff9ba7d3e8eae13d196593e1aaf (diff) | |
Merge tag 'x86-urgent-2026-05-24' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fixes from Ingo Molnar:
- On SEV guests, handle set_memory_{encrypted,decrypted}() failures
more conservatively by assuming that all affected pages are
unencrypted (Carlos López)
- Disable broadcast TLB flush when PCID is disabled (Tom Lendacky)
- Fix VMX vs. hrtimer_rearm_deferred() regression (Peter Zijlstra)
- Move IRQ/NMI dispatch code from KVM into x86 core, to prepare for a
KVM x2apic fix (Peter Zijlstra)
- Fix incorrect munmap() size on map_vdso() failure (Guilherme Giacomo
Simoes)
* tag 'x86-urgent-2026-05-24' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
virt: sev-guest: Explicitly leak pages in unknown state
x86/mm: Disable broadcast TLB flush when PCID is disabled
x86/kvm/vmx: Fix VMX vs hrtimer_rearm_deferred()
x86/kvm/vmx: Move IRQ/NMI dispatch from KVM into x86 core
x86/vdso: Fix incorrect size in munmap() on map_vdso() failure
| -rw-r--r-- | arch/x86/entry/Makefile | 2 | ||||
| -rw-r--r-- | arch/x86/entry/common.c | 61 | ||||
| -rw-r--r-- | arch/x86/entry/entry.S | 46 | ||||
| -rw-r--r-- | arch/x86/entry/entry_64_fred.S | 1 | ||||
| -rw-r--r-- | arch/x86/entry/vdso/vma.c | 2 | ||||
| -rw-r--r-- | arch/x86/include/asm/desc.h | 4 | ||||
| -rw-r--r-- | arch/x86/include/asm/desc_defs.h | 2 | ||||
| -rw-r--r-- | arch/x86/include/asm/entry-common.h | 2 | ||||
| -rw-r--r-- | arch/x86/include/asm/fred.h | 1 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/cpuid-deps.c | 1 | ||||
| -rw-r--r-- | arch/x86/kernel/idt.c | 15 | ||||
| -rw-r--r-- | arch/x86/kernel/nmi.c | 1 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/vmenter.S | 46 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 19 | ||||
| -rw-r--r-- | drivers/virt/coco/sev-guest/sev-guest.c | 10 |
15 files changed, 141 insertions, 72 deletions
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 72cae8e0ce85..83b4762d6ecb 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -13,7 +13,7 @@ CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) CFLAGS_syscall_32.o += -fno-stack-protector CFLAGS_syscall_64.o += -fno-stack-protector -obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o +obj-y := entry.o entry_$(BITS).o syscall_$(BITS).o common.o obj-y += vdso/ obj-y += vsyscall/ diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c new file mode 100644 index 000000000000..06c7c6ebd6f9 --- /dev/null +++ b/arch/x86/entry/common.c @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/entry-common.h> +#include <linux/kvm_types.h> +#include <linux/hrtimer_rearm.h> +#include <asm/fred.h> +#include <asm/desc.h> + +#if IS_ENABLED(CONFIG_KVM_INTEL) +/* + * On VMX, NMIs and IRQs (as configured by KVM) are acknowledged by hardware as + * part of the VM-Exit, i.e. the event itself is consumed as part the VM-Exit. + * x86_entry_from_kvm() is invoked by KVM to effectively forward NMIs and IRQs + * to the kernel for servicing. On SVM, a.k.a. AMD, the NMI/IRQ VM-Exit is + * purely a signal that an NMI/IRQ is pending, i.e. the event that triggered + * the VM-Exit is held pending until it's unblocked in the host. + */ +noinstr void x86_entry_from_kvm(unsigned int event_type, unsigned int vector) +{ + if (event_type == EVENT_TYPE_EXTINT) { +#ifdef CONFIG_X86_64 + /* + * Use FRED dispatch, even when running IDT. The dispatch + * tables are kept in sync between FRED and IDT, and the FRED + * dispatch works well with CFI. + */ + fred_entry_from_kvm(event_type, vector); +#else + idt_entry_from_kvm(vector); +#endif + /* + * Strictly speaking, only the NMI path requires noinstr. + */ + instrumentation_begin(); + /* + * KVM/VMX will dispatch from IRQ-disabled but for a context + * that will have IRQs-enabled. This confuses the entry code + * and it will not have reprogrammed the timer. Do so now. + */ + hrtimer_rearm_deferred(); + instrumentation_end(); + + return; + } + + WARN_ON_ONCE(event_type != EVENT_TYPE_NMI); + +#ifdef CONFIG_X86_64 + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return fred_entry_from_kvm(event_type, vector); +#endif + + /* + * Notably, we must use IDT dispatch for NMI when running in IDT mode. + * The FRED NMI context is significantly different and will not work + * right (specifically FRED fixed the NMI recursion issue). + */ + idt_entry_from_kvm(vector); +} +EXPORT_SYMBOL_FOR_KVM(x86_entry_from_kvm); +#endif diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 6ba2b3adcef0..a56e043b266d 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -75,3 +75,49 @@ THUNK warn_thunk_thunk, __warn_thunk #if defined(CONFIG_STACKPROTECTOR) && defined(CONFIG_SMP) EXPORT_SYMBOL(__ref_stack_chk_guard); #endif + +#if IS_ENABLED(CONFIG_KVM_INTEL) +.macro IDT_DO_EVENT_IRQOFF call_insn call_target + /* + * Unconditionally create a stack frame, getting the correct RSP on the + * stack (for x86-64) would take two instructions anyways, and RBP can + * be used to restore RSP to make objtool happy (see below). + */ + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + +#ifdef CONFIG_X86_64 + /* + * Align RSP to a 16-byte boundary (to emulate CPU behavior) before + * creating the synthetic interrupt stack frame for the IRQ/NMI. + */ + and $-16, %rsp + push $__KERNEL_DS + push %rbp +#endif + pushf + push $__KERNEL_CS + \call_insn \call_target + + /* + * "Restore" RSP from RBP, even though IRET has already unwound RSP to + * the correct value. objtool doesn't know the callee will IRET and, + * without the explicit restore, thinks the stack is getting walloped. + * Using an unwind hint is problematic due to x86-64's dynamic alignment. + */ + leave + RET +.endm + +.pushsection .text, "ax" +SYM_FUNC_START(idt_do_interrupt_irqoff) + IDT_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 +SYM_FUNC_END(idt_do_interrupt_irqoff) +.popsection + +.pushsection .noinstr.text, "ax" +SYM_FUNC_START(idt_do_nmi_irqoff) + IDT_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx +SYM_FUNC_END(idt_do_nmi_irqoff) +.popsection +#endif diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index 894f7f16eb80..0d2768ab836c 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -147,5 +147,4 @@ SYM_FUNC_START(asm_fred_entry_from_kvm) RET SYM_FUNC_END(asm_fred_entry_from_kvm) -EXPORT_SYMBOL_FOR_KVM(asm_fred_entry_from_kvm); #endif diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index a6bfcc8243cd..d903bce24f15 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -178,7 +178,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); - do_munmap(mm, addr, image->size, NULL); + do_munmap(mm, addr, VDSO_NR_PAGES * PAGE_SIZE, NULL); goto up_fail; } diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index ec95fe44fa3a..00aeae843529 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -438,6 +438,10 @@ extern void idt_setup_traps(void); extern void idt_setup_apic_and_irq_gates(void); extern bool idt_is_f00f_address(unsigned long address); +extern void idt_do_interrupt_irqoff(unsigned long address); +extern void idt_do_nmi_irqoff(void); +extern void idt_entry_from_kvm(unsigned int vector); + #ifdef CONFIG_X86_64 extern void idt_setup_early_pf(void); #else diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 7e6b9314758a..2f2ce8aadf07 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -145,7 +145,7 @@ struct gate_struct { typedef struct gate_struct gate_desc; #ifndef _SETUP -static inline unsigned long gate_offset(const gate_desc *g) +static __always_inline unsigned long gate_offset(const gate_desc *g) { #ifdef CONFIG_X86_64 return g->offset_low | ((unsigned long)g->offset_middle << 16) | diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 7535131c711b..eca24b5e07f4 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -97,4 +97,6 @@ static __always_inline void arch_exit_to_user_mode(void) } #define arch_exit_to_user_mode arch_exit_to_user_mode +extern void x86_entry_from_kvm(unsigned int entry_type, unsigned int vector); + #endif diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index 2bb65677c079..18a2f811c358 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -110,7 +110,6 @@ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { ret static inline void cpu_init_fred_exceptions(void) { } static inline void cpu_init_fred_rsps(void) { } static inline void fred_complete_exception_setup(void) { } -static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } static inline void fred_sync_rsp0(unsigned long rsp0) { } static inline void fred_update_rsp0(void) { } #endif /* CONFIG_X86_FRED */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 146f6f8b0650..99801e844b30 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -92,6 +92,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_FRED, X86_FEATURE_LKGS }, { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL }, { X86_FEATURE_LASS, X86_FEATURE_SMAP }, + { X86_FEATURE_INVLPGB, X86_FEATURE_PCID }, {} }; diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 260456588756..7bcf1decc034 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -268,6 +268,21 @@ void __init idt_setup_early_pf(void) } #endif +#if IS_ENABLED(CONFIG_KVM_INTEL) +noinstr void idt_entry_from_kvm(unsigned int vector) +{ + if (vector == NMI_VECTOR) + return idt_do_nmi_irqoff(); + + /* + * Only the NMI path requires noinstr. + */ + instrumentation_begin(); + idt_do_interrupt_irqoff(gate_offset(idt_table + vector)); + instrumentation_end(); +} +#endif + static void __init idt_map_in_cea(void) { /* diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 3d239ed12744..52a3afb1b79e 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -614,7 +614,6 @@ DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx) { exc_nmi(regs); } -EXPORT_SYMBOL_FOR_KVM(asm_exc_nmi_kvm_vmx); #endif #ifdef CONFIG_NMI_CHECK_CPU diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 8a481dae9cae..ff1f254a0ef4 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -31,38 +31,6 @@ #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE #endif -.macro VMX_DO_EVENT_IRQOFF call_insn call_target - /* - * Unconditionally create a stack frame, getting the correct RSP on the - * stack (for x86-64) would take two instructions anyways, and RBP can - * be used to restore RSP to make objtool happy (see below). - */ - push %_ASM_BP - mov %_ASM_SP, %_ASM_BP - -#ifdef CONFIG_X86_64 - /* - * Align RSP to a 16-byte boundary (to emulate CPU behavior) before - * creating the synthetic interrupt stack frame for the IRQ/NMI. - */ - and $-16, %rsp - push $__KERNEL_DS - push %rbp -#endif - pushf - push $__KERNEL_CS - \call_insn \call_target - - /* - * "Restore" RSP from RBP, even though IRET has already unwound RSP to - * the correct value. objtool doesn't know the callee will IRET and, - * without the explicit restore, thinks the stack is getting walloped. - * Using an unwind hint is problematic due to x86-64's dynamic alignment. - */ - leave - RET -.endm - .section .noinstr.text, "ax" /** @@ -320,10 +288,6 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) SYM_FUNC_END(__vmx_vcpu_run) -SYM_FUNC_START(vmx_do_nmi_irqoff) - VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx -SYM_FUNC_END(vmx_do_nmi_irqoff) - #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT /** @@ -375,13 +339,3 @@ SYM_FUNC_START(vmread_error_trampoline) RET SYM_FUNC_END(vmread_error_trampoline) #endif - -.section .text, "ax" - -#ifndef CONFIG_X86_FRED - -SYM_FUNC_START(vmx_do_interrupt_irqoff) - VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 -SYM_FUNC_END(vmx_do_interrupt_irqoff) - -#endif diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 49feecb286b2..b9103de01428 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7117,9 +7117,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } -void vmx_do_interrupt_irqoff(unsigned long entry); -void vmx_do_nmi_irqoff(void); - static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) { /* @@ -7161,17 +7158,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, "unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; - /* - * Invoke the kernel's IRQ handler for the vector. Use the FRED path - * when it's available even if FRED isn't fully enabled, e.g. even if - * FRED isn't supported in hardware, in order to avoid the indirect - * CALL in the non-FRED path. - */ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); - if (IS_ENABLED(CONFIG_X86_FRED)) - fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector); - else - vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector)); + x86_entry_from_kvm(EVENT_TYPE_EXTINT, vector); kvm_after_interrupt(vcpu); vcpu->arch.at_instruction_boundary = true; @@ -7481,10 +7469,7 @@ noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu) return; kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - if (cpu_feature_enabled(X86_FEATURE_FRED)) - fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); - else - vmx_do_nmi_irqoff(); + x86_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); kvm_after_interrupt(vcpu); } diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index 910a1de0d5a7..d186ae55cf63 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -176,6 +176,7 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques struct snp_guest_req req = {}; int ret, npages = 0, resp_len; sockptr_t certs_address; + u64 pfn; if (sockptr_is_null(io->req_data) || sockptr_is_null(io->resp_data)) return -EINVAL; @@ -215,10 +216,11 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques if (!req.certs_data) return -ENOMEM; + pfn = PHYS_PFN(virt_to_phys(req.certs_data)); ret = set_memory_decrypted((unsigned long)req.certs_data, npages); if (ret) { pr_err("failed to mark page shared, ret=%d\n", ret); - free_pages_exact(req.certs_data, npages << PAGE_SHIFT); + snp_leak_pages(pfn, npages); return -EFAULT; } @@ -272,10 +274,12 @@ e_free: kfree(report_resp); e_free_data: if (npages) { - if (set_memory_encrypted((unsigned long)req.certs_data, npages)) + if (set_memory_encrypted((unsigned long)req.certs_data, npages)) { WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n"); - else + snp_leak_pages(pfn, npages); + } else { free_pages_exact(req.certs_data, npages << PAGE_SHIFT); + } } return ret; } |
