From fc3ba56385d03501eb582e4b86691ba378e556f9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 16 Dec 2025 08:17:54 -0800 Subject: KVM: nSVM: Remove a user-triggerable WARN on nested_svm_load_cr3() succeeding Drop the WARN in svm_set_nested_state() on nested_svm_load_cr3() failing as it is trivially easy to trigger from userspace by modifying CPUID after loading CR3. E.g. modifying the state restoration selftest like so: --- tools/testing/selftests/kvm/x86/state_test.c +++ tools/testing/selftests/kvm/x86/state_test.c @@ -280,7 +280,16 @@ int main(int argc, char *argv[]) /* Restore state in a new VM. */ vcpu = vm_recreate_with_one_vcpu(vm); - vcpu_load_state(vcpu, state); + + if (stage == 4) { + state->sregs.cr3 = BIT(44); + vcpu_load_state(vcpu, state); + + vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, 36); + __vcpu_nested_state_set(vcpu, &state->nested); + } else { + vcpu_load_state(vcpu, state); + } /* * Restore XSAVE state in a dummy vCPU, first without doing generates: WARNING: CPU: 30 PID: 938 at arch/x86/kvm/svm/nested.c:1877 svm_set_nested_state+0x34a/0x360 [kvm_amd] Modules linked in: kvm_amd kvm irqbypass [last unloaded: kvm] CPU: 30 UID: 1000 PID: 938 Comm: state_test Tainted: G W 6.18.0-rc7-58e10b63777d-next-vm Tainted: [W]=WARN Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:svm_set_nested_state+0x34a/0x360 [kvm_amd] Call Trace: kvm_arch_vcpu_ioctl+0xf33/0x1700 [kvm] kvm_vcpu_ioctl+0x4e6/0x8f0 [kvm] __x64_sys_ioctl+0x8f/0xd0 do_syscall_64+0x61/0xad0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Simply delete the WARN instead of trying to prevent userspace from shoving "illegal" state into CR3. For better or worse, KVM's ABI allows userspace to set CPUID after SREGS, and vice versa, and KVM is very permissive when it comes to guest CPUID. I.e. attempting to enforce the virtual CPU model when setting CPUID could break userspace. 
Given that the WARN doesn't provide any meaningful protection for KVM or benefit for userspace, simply drop it even though the odds of breaking userspace are minuscule. Opportunistically delete a spurious newline. Fixes: b222b0b88162 ("KVM: nSVM: refactor the CR3 reload on migration") Cc: stable@vger.kernel.org Cc: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251216161755.1775409-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index ba0f11c68372..9be67040e94d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1870,10 +1870,9 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * thus MMU might not be initialized correctly. * Set it again to fix this. */ - ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, nested_npt_enabled(svm), false); - if (WARN_ON_ONCE(ret)) + if (ret) goto out_free; svm->nested.force_msr_bitmap_recalc = true; -- cgit v1.2.3 From 737f2a382f89f2ff3d9d6a737004d97bfb98dc56 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 14:16:41 -0800 Subject: KVM: SVM: Rename "fault_address" to "gpa" in npf_interception() Rename "fault_address" to "gpa" in KVM's #NPF handler and track it as a gpa_t to more precisely document what type of address is being captured, and because "gpa" is much more succinct. No functional change intended. 
Link: https://patch.msgid.link/20251113221642.1673023-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 24d59ccfa40d..af018c1196b5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1867,8 +1867,8 @@ static int npf_interception(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); int rc; - u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; + gpa_t gpa = svm->vmcb->control.exit_info_2; /* * WARN if hardware generates a fault with an error code that collides @@ -1882,14 +1882,14 @@ static int npf_interception(struct kvm_vcpu *vcpu) if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) error_code |= PFERR_PRIVATE_ACCESS; - trace_kvm_page_fault(vcpu, fault_address, error_code); - rc = kvm_mmu_page_fault(vcpu, fault_address, error_code, + trace_kvm_page_fault(vcpu, gpa, error_code); + rc = kvm_mmu_page_fault(vcpu, gpa, error_code, static_cpu_has(X86_FEATURE_DECODEASSISTS) ? svm->vmcb->control.insn_bytes : NULL, svm->vmcb->control.insn_len); if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK) - sev_handle_rmp_fault(vcpu, fault_address, error_code); + sev_handle_rmp_fault(vcpu, gpa, error_code); return rc; } -- cgit v1.2.3 From 01cde4eaaecaf5df158234f0a52b4a1c55796858 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Nov 2025 14:16:42 -0800 Subject: KVM: SVM: Add support for expedited writes to the fast MMIO bus Wire up SVM's #NPF handler to fast MMIO. While SVM doesn't provide a dedicated exit reason, it's trivial to key off PFERR_RSVD_MASK. Like VMX, restrict the fast path to L1 to avoid having to deal with nGPA=>GPA translations. For simplicity, use the fast path if and only if the next RIP is known. While KVM could utilize EMULTYPE_SKIP, doing so would require additional logic to deal with SEV guests, e.g. 
to go down the slow path if the instruction buffer is empty. All modern CPUs support next RIP, and in practice the next RIP will be available for any guest fast path. Copy+paste the kvm_io_bus_write() + trace_kvm_fast_mmio() logic even though KVM would ideally provide a small helper, as such a helper would need to either be a macro or non-inline to avoid including trace.h in a header (trace.h must not be included by x86.c prior to CREATE_TRACE_POINTS being defined). Link: https://patch.msgid.link/20251113221642.1673023-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index af018c1196b5..d1ff23e02ecd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1862,6 +1862,9 @@ static int pf_interception(struct kvm_vcpu *vcpu) svm->vmcb->control.insn_len); } +static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); + static int npf_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1879,6 +1882,24 @@ static int npf_interception(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) error_code &= ~PFERR_SYNTHETIC_MASK; + /* + * Expedite fast MMIO kicks if the next RIP is known and KVM is allowed + * to emulate a page fault, e.g. skipping the current instruction is wrong + * if the #NPF occurred while vectoring an event. 
+ */ + if ((error_code & PFERR_RSVD_MASK) && !is_guest_mode(vcpu)) { + const int emul_type = EMULTYPE_PF | EMULTYPE_NO_DECODE; + + if (svm_check_emulate_instruction(vcpu, emul_type, NULL, 0)) + return 1; + + if (nrips && svm->vmcb->control.next_rip && + !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + trace_kvm_fast_mmio(gpa); + return kvm_skip_emulated_instruction(vcpu); + } + } + if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) error_code |= PFERR_PRIVATE_ACCESS; -- cgit v1.2.3 From 1d1722e52fcd70deb53d8c192f958fe34be14f5e Mon Sep 17 00:00:00 2001 From: Kevin Cheng Date: Mon, 15 Dec 2025 19:25:10 +0000 Subject: KVM: SVM: Don't allow L1 intercepts for instructions not advertised If a feature is not advertised in the guest's CPUID, prevent L1 from intercepting the unsupported instructions by clearing the corresponding intercept in KVM's cached vmcb12. When an L2 guest executes an instruction that is not advertised to L1, we expect a #UD exception to be injected by L0. However, the nested svm exit handler first checks if the instruction intercept is set in vmcb12, and if so, synthesizes an exit from L2 to L1 instead of a #UD exception. If a feature is not advertised, the L1 intercept should be ignored. While creating KVM's cached vmcb12, sanitize the intercepts for instructions that are not advertised in the guest CPUID. This effectively ignores the L1 intercept on nested vm exit handling. It also ignores the L1 intercept when computing the intercepts in vmcb02, so if L0 (for some reason) does not intercept the instruction, KVM won't intercept it at all. 
Signed-off-by: Kevin Cheng Co-developed-by: Sean Christopherson Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251215192510.2300816-1-chengkev@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 19 +++++++++++++++++++ arch/x86/kvm/svm/svm.h | 35 +++++++++++++++++++++++++++-------- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 9be67040e94d..aa1bea134ace 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -403,6 +403,19 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) return __nested_vmcb_check_controls(vcpu, ctl); } +/* + * If a feature is not advertised to L1, clear the corresponding vmcb12 + * intercept. + */ +#define __nested_svm_sanitize_intercept(__vcpu, __control, fname, iname) \ +do { \ + if (!guest_cpu_cap_has(__vcpu, X86_FEATURE_##fname)) \ + vmcb12_clr_intercept(__control, INTERCEPT_##iname); \ +} while (0) + +#define nested_svm_sanitize_intercept(__vcpu, __control, name) \ + __nested_svm_sanitize_intercept(__vcpu, __control, name, name) + static void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, struct vmcb_ctrl_area_cached *to, @@ -413,6 +426,12 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, for (i = 0; i < MAX_INTERCEPT; i++) to->intercepts[i] = from->intercepts[i]; + __nested_svm_sanitize_intercept(vcpu, to, XSAVE, XSETBV); + nested_svm_sanitize_intercept(vcpu, to, INVPCID); + nested_svm_sanitize_intercept(vcpu, to, RDTSCP); + nested_svm_sanitize_intercept(vcpu, to, SKINIT); + nested_svm_sanitize_intercept(vcpu, to, RDPRU); + to->iopm_base_pa = from->iopm_base_pa; to->msrpm_base_pa = from->msrpm_base_pa; to->tsc_offset = from->tsc_offset; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 01be93a53d07..806e68ba821b 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -434,28 +434,47 @@ static __always_inline struct vcpu_svm *to_svm(struct 
kvm_vcpu *vcpu) */ #define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR) -static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) +static inline void __vmcb_set_intercept(unsigned long *intercepts, u32 bit) { WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); - __set_bit(bit, (unsigned long *)&control->intercepts); + __set_bit(bit, intercepts); } -static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit) +static inline void __vmcb_clr_intercept(unsigned long *intercepts, u32 bit) { WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); - __clear_bit(bit, (unsigned long *)&control->intercepts); + __clear_bit(bit, intercepts); } -static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) +static inline bool __vmcb_is_intercept(unsigned long *intercepts, u32 bit) { WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); - return test_bit(bit, (unsigned long *)&control->intercepts); + return test_bit(bit, intercepts); +} + +static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) +{ + __vmcb_set_intercept((unsigned long *)&control->intercepts, bit); +} + +static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit) +{ + __vmcb_clr_intercept((unsigned long *)&control->intercepts, bit); +} + +static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) +{ + return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit); +} + +static inline void vmcb12_clr_intercept(struct vmcb_ctrl_area_cached *control, u32 bit) +{ + __vmcb_clr_intercept((unsigned long *)&control->intercepts, bit); } static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit) { - WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); - return test_bit(bit, (unsigned long *)&control->intercepts); + return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit); } static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) -- cgit v1.2.3 From 
db5e82496492b4890b1c3356581c016767ed527f Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Fri, 7 Nov 2025 10:32:39 +0100 Subject: KVM: SVM: Virtualize and advertise support for ERAPS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AMD CPUs with the Enhanced Return Address Predictor Security (ERAPS) feature (available on Zen5+) obviate the need for FILL_RETURN_BUFFER sequences right after VMEXITs. ERAPS adds guest/host tags to entries in the RSB (a.k.a. RAP). This helps with speculation protection across the VM boundary, and it also preserves host and guest entries in the RSB that can improve software performance (which would otherwise be flushed due to the FILL_RETURN_BUFFER sequences). Importantly, ERAPS also improves cross-domain security by clearing the RAP in certain situations. Specifically, the RAP is cleared in response to actions that are typically tied to software context switching between tasks. Per the APM: The ERAPS feature eliminates the need to execute CALL instructions to clear the return address predictor in most cases. On processors that support ERAPS, return addresses from CALL instructions executed in host mode are not used in guest mode, and vice versa. Additionally, the return address predictor is cleared in all cases when the TLB is implicitly invalidated and in the following cases: • MOV CR3 instruction • INVPCID other than single address invalidation (operation type 0) ERAPS also allows CPUs to extend the size of the RSB/RAP from the older standard (of 32 entries) to a new size, enumerated in CPUID leaf 0x80000021:EBX bits 23:16 (64 entries in Zen5 CPUs). In hardware, ERAPS is always-on, when running in host context, the CPU uses the full RSB/RAP size without any software changes necessary. 
However, when running in guest context, the CPU utilizes the full size of the RSB/RAP if and only if the new ALLOW_LARGER_RAP flag is set in the VMCB; if the flag is not set, the CPU limits itself to the historical size of 32 entries. Requiring software to opt-in for guest usage of RAPs larger than 32 entries allows hypervisors, i.e. KVM, to emulate the aforementioned conditions in which the RAP is cleared as well as the guest/host split. E.g. if the CPU unconditionally used the full RAP for guests, failure to clear the RAP on transitions between L1 and L2, or on emulated guest TLB flushes, would expose the guest to RAP-based attacks as a guest without support for ERAPS wouldn't know that its FILL_RETURN_BUFFER sequence is insufficient. Address the ~two broad categories of ERAPS emulation, and advertise ERAPS support to userspace, along with the RAP size enumerated in CPUID. 1. Architectural RAP clearing: as above, CPUs with ERAPS clear RAP entries on several conditions, including CR3 updates. To handle scenarios where a relevant operation is handled in common code (emulation of INVPCID and to a lesser extent MOV CR3), piggyback VCPU_EXREG_CR3 and create an alias, VCPU_EXREG_ERAPS. SVM doesn't utilize CR3 dirty tracking, and so for all intents and purposes VCPU_EXREG_CR3 is unused. Aliasing VCPU_EXREG_ERAPS ensures that any flow that writes CR3 will also clear the guest's RAP, and allows common x86 to mark ERAPS vCPUs as needing a RAP clear without having to add a new request (or other mechanism). 2. Nested guests: the ERAPS feature adds host/guest tagging to entries in the RSB, but does not distinguish between the guest ASIDs. To prevent the case of an L2 guest poisoning the RSB to attack the L1 guest, the CPU exposes a new VMCB bit (CLEAR_RAP). The next VMRUN with a VMCB that has this bit set causes the CPU to flush the RSB before entering the guest context. 
Set the bit in VMCB01 after a nested #VMEXIT to ensure the next time the L1 guest runs, its RSB contents aren't polluted by the L2's contents. Similarly, before entry into a nested guest, set the bit for VMCB02, so that the L1 guest's RSB contents are not leaked/used in the L2 context. Enable ALLOW_LARGER_RAP (and emulate RAP clears) if and only if ERAPS is exposed to the guest. Enabling ALLOW_LARGER_RAP unconditionally wouldn't cause any functional issues, but ignoring userspace's (and L1's) desires would put KVM into a grey area, which is especially undesirable due to the potential security implications. E.g. if a use case wants to have L1 do manual RAP clearing even when ERAPS is present in hardware, enabling ALLOW_LARGER_RAP could result in L1 leaving stale entries in the RAP. ERAPS is documented in AMD APM Vol 2 (Pub 24593), in revisions 3.43 and later. Signed-off-by: Amit Shah Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Amit Shah Link: https://patch.msgid.link/aR913X8EqO6meCqa@google.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/kvm_host.h | 8 ++++++++ arch/x86/include/asm/svm.h | 6 +++++- arch/x86/kvm/cpuid.c | 9 ++++++++- arch/x86/kvm/svm/nested.c | 18 ++++++++++++++++++ arch/x86/kvm/svm/svm.c | 25 ++++++++++++++++++++++++- arch/x86/kvm/svm/svm.h | 1 + arch/x86/kvm/x86.c | 12 ++++++++++++ 8 files changed, 77 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c3b53beb1300..81f7b3b91986 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -472,6 +472,7 @@ #define X86_FEATURE_GP_ON_USER_CPUID (20*32+17) /* User CPUID faulting */ #define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */ +#define X86_FEATURE_ERAPS (20*32+24) /* Enhanced Return Address Predictor Security */ #define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ #define 
X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5a3bfa293e8b..0353d8b6988c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -195,7 +195,15 @@ enum kvm_reg { VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR0, + /* + * Alias AMD's ERAPS (not a real register) to CR3 so that common code + * can trigger emulation of the RAP (Return Address Predictor) with + * minimal support required in common code. Piggyback CR3 as the RAP + * is cleared on writes to CR3, i.e. marking CR3 dirty will naturally + * mark ERAPS dirty as well. + */ VCPU_EXREG_CR3, + VCPU_EXREG_ERAPS = VCPU_EXREG_CR3, VCPU_EXREG_CR4, VCPU_EXREG_RFLAGS, VCPU_EXREG_SEGMENTS, diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 56aa99503dc4..50ece197c98a 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -131,7 +131,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 tsc_offset; u32 asid; u8 tlb_ctl; - u8 reserved_2[3]; + u8 erap_ctl; + u8 reserved_2[2]; u32 int_ctl; u32 int_vector; u32 int_state; @@ -182,6 +183,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define TLB_CONTROL_FLUSH_ASID 3 #define TLB_CONTROL_FLUSH_ASID_LOCAL 7 +#define ERAP_CONTROL_ALLOW_LARGER_RAP BIT(0) +#define ERAP_CONTROL_CLEAR_RAP BIT(1) + #define V_TPR_MASK 0x0f #define V_IRQ_SHIFT 8 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 88a5426674a1..c590a5bd3196 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1223,6 +1223,7 @@ void kvm_set_cpu_caps(void) /* PrefetchCtlMsr */ /* GpOnUserCpuid */ /* EPSF */ + F(ERAPS), SYNTHESIZED_F(SBPB), SYNTHESIZED_F(IBPB_BRTYPE), SYNTHESIZED_F(SRSO_NO), @@ -1803,8 +1804,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax 
= entry->ebx = entry->ecx = entry->edx = 0; break; case 0x80000021: - entry->ebx = entry->edx = 0; + entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0021_EAX); + + if (kvm_cpu_cap_has(X86_FEATURE_ERAPS)) + entry->ebx &= GENMASK(23, 16); + else + entry->ebx = 0; + cpuid_entry_override(entry, CPUID_8000_0021_ECX); break; /* AMD Extended Performance Monitoring and Debug */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index aa1bea134ace..5a1e1164c197 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -436,6 +436,7 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->msrpm_base_pa = from->msrpm_base_pa; to->tsc_offset = from->tsc_offset; to->tlb_ctl = from->tlb_ctl; + to->erap_ctl = from->erap_ctl; to->int_ctl = from->int_ctl; to->int_vector = from->int_vector; to->int_state = from->int_state; @@ -885,6 +886,19 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, } } + /* + * Take ALLOW_LARGER_RAP from vmcb12 even though it should be safe to + * let L2 use a larger RAP since KVM will emulate the necessary clears, + * as it's possible L1 deliberately wants to restrict L2 to the legacy + * RAP size. Unconditionally clear the RAP on nested VMRUN, as KVM is + * responsible for emulating the host vs. guest tags (L1 is the "host", + * L2 is the "guest"). + */ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) + vmcb02->control.erap_ctl = (svm->nested.ctl.erap_ctl & + ERAP_CONTROL_ALLOW_LARGER_RAP) | + ERAP_CONTROL_CLEAR_RAP; + /* * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. 
@@ -1180,6 +1194,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_nested_vmexit_handle_ibrs(vcpu); + if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) + vmcb01->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP; + svm_switch_vmcb(svm, &svm->vmcb01); /* @@ -1686,6 +1703,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->tsc_offset = from->tsc_offset; dst->asid = from->asid; dst->tlb_ctl = from->tlb_ctl; + dst->erap_ctl = from->erap_ctl; dst->int_ctl = from->int_ctl; dst->int_vector = from->int_vector; dst->int_state = from->int_state; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d1ff23e02ecd..34c8a94b1b81 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1141,6 +1141,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) svm_clr_intercept(svm, INTERCEPT_PAUSE); } + if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) + svm->vmcb->control.erap_ctl |= ERAP_CONTROL_ALLOW_LARGER_RAP; + if (kvm_vcpu_apicv_active(vcpu)) avic_init_vmcb(svm, vmcb); @@ -3293,6 +3296,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); pr_err("%-20s%d\n", "asid:", control->asid); pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); + pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl); pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); pr_err("%-20s%08x\n", "int_vector:", control->int_vector); pr_err("%-20s%08x\n", "int_state:", control->int_state); @@ -4004,6 +4008,13 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) invlpga(gva, svm->vmcb->control.asid); } +static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS); + + svm_flush_tlb_asid(vcpu); +} + static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4262,6 +4273,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) } svm->vmcb->save.cr2 = vcpu->arch.cr2; + 
if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS) && + kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS)) + svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP; + svm_hv_update_vp_id(svm->vmcb, vcpu); /* @@ -4339,6 +4354,14 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) } svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + + /* + * Unconditionally mask off the CLEAR_RAP bit, the AND is just as cheap + * as the TEST+Jcc to avoid it. + */ + if (cpu_feature_enabled(X86_FEATURE_ERAPS)) + svm->vmcb->control.erap_ctl &= ~ERAP_CONTROL_CLEAR_RAP; + vmcb_mark_all_clean(svm->vmcb); /* if exit due to PF check for async PF */ @@ -5094,7 +5117,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = { .flush_tlb_all = svm_flush_tlb_all, .flush_tlb_current = svm_flush_tlb_current, .flush_tlb_gva = svm_flush_tlb_gva, - .flush_tlb_guest = svm_flush_tlb_asid, + .flush_tlb_guest = svm_flush_tlb_guest, .vcpu_pre_run = svm_vcpu_pre_run, .vcpu_run = svm_vcpu_run, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 806e68ba821b..7d28a739865f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -156,6 +156,7 @@ struct vmcb_ctrl_area_cached { u64 tsc_offset; u32 asid; u8 tlb_ctl; + u8 erap_ctl; u32 int_ctl; u32 int_vector; u32 int_state; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ff8812f3a129..e013392fe20c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -14130,6 +14130,13 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return 1; } + /* + * When ERAPS is supported, invalidating a specific PCID clears + * the RAP (Return Address Predictor). 
+ */ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) + kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS); + kvm_invalidate_pcid(vcpu, operand.pcid); return kvm_skip_emulated_instruction(vcpu); @@ -14143,6 +14150,11 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) fallthrough; case INVPCID_TYPE_ALL_INCL_GLOBAL: + /* + * Don't bother marking VCPU_EXREG_ERAPS dirty, SVM will take + * care of doing so when emulating the full guest TLB flush + * (the RAP is cleared on all implicit TLB flushes). + */ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return kvm_skip_emulated_instruction(vcpu); -- cgit v1.2.3 From 8312f1b9dd71340b5fff65e56c6c163187bfa5d0 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 21 Nov 2025 20:48:01 +0000 Subject: KVM: SVM: Don't set GIF when clearing EFER.SVME Clearing EFER.SVME is not architected to set GIF. Don't set GIF when emulating a change to EFER that clears EFER.SVME. However, keep setting GIF if clearing EFER.SVME causes force-leaving the nested guest through svm_leave_nested(), to maintain a sane behavior of not leaving GIF cleared after exiting the guest. In every other path, setting GIF is either correct/desirable, or irrelevant because the caller immediately and unconditionally sets/clears GIF. This is more-or-less KVM defining HW behavior, but leaving GIF cleared would also be defining HW behavior anyway. Note that if force-leaving the nested guest is considered a SHUTDOWN, then this could violate the APM-specified behavior: If the processor enters the shutdown state (due to a triple fault for instance) while GIF is clear, it can only be restarted by means of a RESET. However, a SHUTDOWN leaves the VMCB undefined, so there's not a lot that KVM can do in this case. Also, if vGIF is enabled on SHUTDOWN, KVM has no way of finding out if GIF was cleared. 
The only way for KVM to handle this without making up HW behavior is to completely terminate the VM, so settle for doing the relatively "sane" thing of setting GIF when force-leaving nested. Fixes: c513f484c558 ("KVM: nSVM: leave guest mode when clearing EFER.SVME") Signed-off-by: Jim Mattson Co-developed-by: Sean Christopherson Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251121204803.991707-3-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 2 ++ arch/x86/kvm/svm/svm.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 5a1e1164c197..47e8ce7d360a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1399,6 +1399,8 @@ void svm_leave_nested(struct kvm_vcpu *vcpu) nested_svm_uninit_mmu_context(vcpu); vmcb_mark_all_dirty(svm->vmcb); + svm_set_gif(svm, true); + if (kvm_apicv_activated(vcpu->kvm)) kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 34c8a94b1b81..c7bd78f5a2c7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -215,7 +215,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { if (!(efer & EFER_SVME)) { svm_leave_nested(vcpu); - svm_set_gif(svm, true); /* #GP intercept is still needed for vmware backdoor */ if (!enable_vmware_backdoor) clr_exception_intercept(svm, GP_VECTOR); -- cgit v1.2.3 From 6f4d3ebc24c6ef92e196ebbd389a3f2bfdc7a144 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 21 Nov 2025 20:48:00 +0000 Subject: KVM: SVM: Allow KVM_SET_NESTED_STATE to clear GIF when SVME==0 GIF==0 together with EFER.SVME==0 is a valid architectural state. Don't return -EINVAL for KVM_SET_NESTED_STATE when this combination is specified. 
Fixes: cc440cdad5b7 ("KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE") Signed-off-by: Jim Mattson Reviewed-by: Yosry Ahmed Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251121204803.991707-2-yosry.ahmed@linux.dev [sean: disallow KVM_STATE_NESTED_RUN_PENDING with SVME=0] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 47e8ce7d360a..5b741f8ed170 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1821,12 +1821,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, /* * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed. + * If SVME is disabled, the only valid states are "none" and GIF=1 + * (clearing SVME does NOT set GIF, i.e. GIF=0 is allowed). */ - if (!(vcpu->arch.efer & EFER_SVME)) { - /* GIF=1 and no guest mode are required if SVME=0. */ - if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET) - return -EINVAL; - } + if (!(vcpu->arch.efer & EFER_SVME) && kvm_state->flags && + kvm_state->flags != KVM_STATE_NESTED_GIF_SET) + return -EINVAL; /* SMM temporarily disables SVM, so we cannot be in guest mode. */ if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) -- cgit v1.2.3 From bda6ae6f29664b659671f872a2adda3c1c2f5dd6 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Nov 2025 20:48:02 +0000 Subject: KVM: selftests: Use TEST_ASSERT_EQ() in test_vmx_nested_state() The assert messages do not add much value, so use TEST_ASSERT_EQ(), which also nicely displays the addresses in hex. While at it, also assert the values of state->flags. 
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251121204803.991707-4-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c index 67a62a5a8895..b59a8a17084d 100644 --- a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c @@ -241,8 +241,10 @@ void test_vmx_nested_state(struct kvm_vcpu *vcpu) TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, "Size must be between %ld and %d. The size returned was %d.", sizeof(*state), state_sz, state->size); - TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull."); - TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull."); + + TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull); + TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull); + TEST_ASSERT_EQ(state->flags, 0); free(state); } -- cgit v1.2.3 From ca2eccb953fd33ef38701e33e660b21f7e84aa14 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Nov 2025 20:48:03 +0000 Subject: KVM: selftests: Extend vmx_set_nested_state_test to cover SVM Add test cases for the validation checks in svm_set_nested_state(), and allow the test to run with SVM as well as VMX. The SVM test also makes sure that KVM_SET_NESTED_STATE accepts GIF being set or cleared if EFER.SVME is cleared, verifying a recently fixed bug where GIF was incorrectly expected to always be set when EFER.SVME is cleared. 
Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20251121204803.991707-5-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 2 +- .../selftests/kvm/x86/nested_set_state_test.c | 406 +++++++++++++++++++++ .../selftests/kvm/x86/vmx_set_nested_state_test.c | 306 ---------------- 3 files changed, 407 insertions(+), 307 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86/nested_set_state_test.c delete mode 100644 tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index ba5c2b643efa..4ddece4ee365 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -92,6 +92,7 @@ TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test +TEST_GEN_PROGS_x86 += x86/nested_set_state_test TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/platform_info_test @@ -120,7 +121,6 @@ TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state TEST_GEN_PROGS_x86 += x86/vmx_nested_la57_state_test -TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test TEST_GEN_PROGS_x86 += x86/xapic_state_test diff --git a/tools/testing/selftests/kvm/x86/nested_set_state_test.c b/tools/testing/selftests/kvm/x86/nested_set_state_test.c new file mode 100644 index 000000000000..0f2102b43629 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_set_state_test.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019, Google LLC. 
+ * + * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +#include +#include +#include +#include +#include + +/* + * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value + * changes this should be updated. + */ +#define VMCS12_REVISION 0x11e57ed0 + +bool have_evmcs; + +void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) +{ + vcpu_nested_state_set(vcpu, state); +} + +void test_nested_state_expect_errno(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state, + int expected_errno) +{ + int rv; + + rv = __vcpu_nested_state_set(vcpu, state); + TEST_ASSERT(rv == -1 && errno == expected_errno, + "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", + strerror(expected_errno), expected_errno, rv, strerror(errno), + errno); +} + +void test_nested_state_expect_einval(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + test_nested_state_expect_errno(vcpu, state, EINVAL); +} + +void test_nested_state_expect_efault(struct kvm_vcpu *vcpu, + struct kvm_nested_state *state) +{ + test_nested_state_expect_errno(vcpu, state, EFAULT); +} + +void set_revision_id_for_vmcs12(struct kvm_nested_state *state, + u32 vmcs12_revision) +{ + /* Set revision_id in vmcs12 to vmcs12_revision. 
*/ + memcpy(&state->data, &vmcs12_revision, sizeof(u32)); +} + +void set_default_state(struct kvm_nested_state *state) +{ + memset(state, 0, sizeof(*state)); + state->flags = KVM_STATE_NESTED_RUN_PENDING | + KVM_STATE_NESTED_GUEST_MODE; + state->format = 0; + state->size = sizeof(*state); +} + +void set_default_vmx_state(struct kvm_nested_state *state, int size) +{ + memset(state, 0, size); + if (have_evmcs) + state->flags = KVM_STATE_NESTED_EVMCS; + state->format = 0; + state->size = size; + state->hdr.vmx.vmxon_pa = 0x1000; + state->hdr.vmx.vmcs12_pa = 0x2000; + state->hdr.vmx.smm.flags = 0; + set_revision_id_for_vmcs12(state, VMCS12_REVISION); +} + +void test_vmx_nested_state(struct kvm_vcpu *vcpu) +{ + /* Add a page for VMCS12. */ + const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); + struct kvm_nested_state *state = + (struct kvm_nested_state *)malloc(state_sz); + + /* The format must be set to 0. 0 for VMX, 1 for SVM. */ + set_default_vmx_state(state, state_sz); + state->format = 1; + test_nested_state_expect_einval(vcpu, state); + + /* + * We cannot virtualize anything if the guest does not have VMX + * enabled. + */ + set_default_vmx_state(state, state_sz); + test_nested_state_expect_einval(vcpu, state); + + /* + * We cannot virtualize anything if the guest does not have VMX + * enabled. We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa + * is set to -1ull, but the flags must be zero. + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + test_nested_state_expect_einval(vcpu, state); + + state->hdr.vmx.vmcs12_pa = -1ull; + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + + state->flags = 0; + test_nested_state(vcpu, state); + + /* Enable VMX in the guest CPUID. */ + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX); + + /* + * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without + * setting the nested state. 
When the eVMCS flag is not set, the + * expected return value is '0'. + */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + state->hdr.vmx.vmxon_pa = -1ull; + state->hdr.vmx.vmcs12_pa = -1ull; + test_nested_state(vcpu, state); + + /* + * When eVMCS is supported, the eVMCS flag can only be set if the + * enlightened VMCS capability has been enabled. + */ + if (have_evmcs) { + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + vcpu_enable_evmcs(vcpu); + test_nested_state(vcpu, state); + } + + /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ + state->hdr.vmx.smm.flags = 1; + test_nested_state_expect_einval(vcpu, state); + + /* Invalid flags are rejected. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.flags = ~0; + test_nested_state_expect_einval(vcpu, state); + + /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + state->flags = 0; + test_nested_state_expect_einval(vcpu, state); + + /* It is invalid to have vmxon_pa set to a non-page aligned address. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = 1; + test_nested_state_expect_einval(vcpu, state); + + /* + * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and + * KVM_STATE_NESTED_GUEST_MODE set together. + */ + set_default_vmx_state(state, state_sz); + state->flags = KVM_STATE_NESTED_GUEST_MODE | + KVM_STATE_NESTED_RUN_PENDING; + state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * It is invalid to have any of the SMM flags set besides: + * KVM_STATE_NESTED_SMM_GUEST_MODE + * KVM_STATE_NESTED_SMM_VMXON + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE | + KVM_STATE_NESTED_SMM_VMXON); + test_nested_state_expect_einval(vcpu, state); + + /* Outside SMM, SMM flags must be zero. 
*/ + set_default_vmx_state(state, state_sz); + state->flags = 0; + state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * Size must be large enough to fit kvm_nested_state and vmcs12 + * if VMCS12 physical address is set + */ + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + test_nested_state_expect_einval(vcpu, state); + + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + state->hdr.vmx.vmcs12_pa = -1; + test_nested_state(vcpu, state); + + /* + * KVM_SET_NESTED_STATE succeeds with invalid VMCS + * contents but L2 not running. + */ + set_default_vmx_state(state, state_sz); + state->flags = 0; + test_nested_state(vcpu, state); + + /* Invalid flags are rejected, even if no VMCS loaded. */ + set_default_vmx_state(state, state_sz); + state->size = sizeof(*state); + state->flags = 0; + state->hdr.vmx.vmcs12_pa = -1; + state->hdr.vmx.flags = ~0; + test_nested_state_expect_einval(vcpu, state); + + /* vmxon_pa cannot be the same address as vmcs_pa. */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = 0; + state->hdr.vmx.vmcs12_pa = 0; + test_nested_state_expect_einval(vcpu, state); + + /* + * Test that if we leave nesting the state reflects that when we get + * it again. + */ + set_default_vmx_state(state, state_sz); + state->hdr.vmx.vmxon_pa = -1ull; + state->hdr.vmx.vmcs12_pa = -1ull; + state->flags = 0; + test_nested_state(vcpu, state); + vcpu_nested_state_get(vcpu, state); + TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, + "Size must be between %ld and %d. 
The size returned was %d.", + sizeof(*state), state_sz, state->size); + + TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull); + TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull); + TEST_ASSERT_EQ(state->flags, 0); + + free(state); +} + +static void vcpu_efer_enable_svm(struct kvm_vcpu *vcpu) +{ + uint64_t old_efer = vcpu_get_msr(vcpu, MSR_EFER); + + vcpu_set_msr(vcpu, MSR_EFER, old_efer | EFER_SVME); +} + +static void vcpu_efer_disable_svm(struct kvm_vcpu *vcpu) +{ + uint64_t old_efer = vcpu_get_msr(vcpu, MSR_EFER); + + vcpu_set_msr(vcpu, MSR_EFER, old_efer & ~EFER_SVME); +} + +void set_default_svm_state(struct kvm_nested_state *state, int size) +{ + memset(state, 0, size); + state->format = 1; + state->size = size; + state->hdr.svm.vmcb_pa = 0x3000; +} + +void test_svm_nested_state(struct kvm_vcpu *vcpu) +{ + /* Add a page for VMCB. */ + const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); + struct kvm_nested_state *state = + (struct kvm_nested_state *)malloc(state_sz); + + vcpu_set_cpuid_feature(vcpu, X86_FEATURE_SVM); + + /* The format must be set to 1. 0 for VMX, 1 for SVM. */ + set_default_svm_state(state, state_sz); + state->format = 0; + test_nested_state_expect_einval(vcpu, state); + + /* Invalid flags are rejected, KVM_STATE_NESTED_EVMCS is VMX-only */ + set_default_svm_state(state, state_sz); + state->flags = KVM_STATE_NESTED_EVMCS; + test_nested_state_expect_einval(vcpu, state); + + /* + * If EFER.SVME is clear, guest mode is disallowed and GIF can be set or + * cleared. + */ + vcpu_efer_disable_svm(vcpu); + + set_default_svm_state(state, state_sz); + state->flags = KVM_STATE_NESTED_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + state->flags = 0; + test_nested_state(vcpu, state); + + state->flags = KVM_STATE_NESTED_GIF_SET; + test_nested_state(vcpu, state); + + /* Enable SVM in the guest EFER. 
*/ + vcpu_efer_enable_svm(vcpu); + + /* Setting vmcb_pa to a non-aligned address is only fine when not entering guest mode */ + set_default_svm_state(state, state_sz); + state->hdr.svm.vmcb_pa = -1ull; + state->flags = 0; + test_nested_state(vcpu, state); + state->flags = KVM_STATE_NESTED_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * Size must be large enough to fit kvm_nested_state and VMCB + * only when entering guest mode. + */ + set_default_svm_state(state, state_sz/2); + state->flags = 0; + test_nested_state(vcpu, state); + state->flags = KVM_STATE_NESTED_GUEST_MODE; + test_nested_state_expect_einval(vcpu, state); + + /* + * Test that if we leave nesting the state reflects that when we get it + * again, except for vmcb_pa, which is always returned as 0 when not in + * guest mode. + */ + set_default_svm_state(state, state_sz); + state->hdr.svm.vmcb_pa = -1ull; + state->flags = KVM_STATE_NESTED_GIF_SET; + test_nested_state(vcpu, state); + vcpu_nested_state_get(vcpu, state); + TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, + "Size must be between %ld and %d. The size returned was %d.", + sizeof(*state), state_sz, state->size); + + TEST_ASSERT_EQ(state->hdr.svm.vmcb_pa, 0); + TEST_ASSERT_EQ(state->flags, KVM_STATE_NESTED_GIF_SET); + + free(state); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_nested_state state; + struct kvm_vcpu *vcpu; + + have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + /* + * First run tests with VMX/SVM disabled to check error handling. + * test_{vmx/svm}_nested_state() will re-enable as needed. 
+ */ + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX); + else + vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_SVM); + + /* Passing a NULL kvm_nested_state causes a EFAULT. */ + test_nested_state_expect_efault(vcpu, NULL); + + /* 'size' cannot be smaller than sizeof(kvm_nested_state). */ + set_default_state(&state); + state.size = 0; + test_nested_state_expect_einval(vcpu, &state); + + /* + * Setting the flags 0xf fails the flags check. The only flags that + * can be used are: + * KVM_STATE_NESTED_GUEST_MODE + * KVM_STATE_NESTED_RUN_PENDING + * KVM_STATE_NESTED_EVMCS + */ + set_default_state(&state); + state.flags = 0xf; + test_nested_state_expect_einval(vcpu, &state); + + /* + * If KVM_STATE_NESTED_RUN_PENDING is set then + * KVM_STATE_NESTED_GUEST_MODE has to be set as well. + */ + set_default_state(&state); + state.flags = KVM_STATE_NESTED_RUN_PENDING; + test_nested_state_expect_einval(vcpu, &state); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + test_vmx_nested_state(vcpu); + else + test_svm_nested_state(vcpu); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c deleted file mode 100644 index b59a8a17084d..000000000000 --- a/tools/testing/selftests/kvm/x86/vmx_set_nested_state_test.c +++ /dev/null @@ -1,306 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * vmx_set_nested_state_test - * - * Copyright (C) 2019, Google LLC. - * - * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE. - */ - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" -#include "vmx.h" - -#include -#include -#include -#include -#include - -/* - * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value - * changes this should be updated. 
- */ -#define VMCS12_REVISION 0x11e57ed0 - -bool have_evmcs; - -void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state) -{ - vcpu_nested_state_set(vcpu, state); -} - -void test_nested_state_expect_errno(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state, - int expected_errno) -{ - int rv; - - rv = __vcpu_nested_state_set(vcpu, state); - TEST_ASSERT(rv == -1 && errno == expected_errno, - "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)", - strerror(expected_errno), expected_errno, rv, strerror(errno), - errno); -} - -void test_nested_state_expect_einval(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - test_nested_state_expect_errno(vcpu, state, EINVAL); -} - -void test_nested_state_expect_efault(struct kvm_vcpu *vcpu, - struct kvm_nested_state *state) -{ - test_nested_state_expect_errno(vcpu, state, EFAULT); -} - -void set_revision_id_for_vmcs12(struct kvm_nested_state *state, - u32 vmcs12_revision) -{ - /* Set revision_id in vmcs12 to vmcs12_revision. */ - memcpy(&state->data, &vmcs12_revision, sizeof(u32)); -} - -void set_default_state(struct kvm_nested_state *state) -{ - memset(state, 0, sizeof(*state)); - state->flags = KVM_STATE_NESTED_RUN_PENDING | - KVM_STATE_NESTED_GUEST_MODE; - state->format = 0; - state->size = sizeof(*state); -} - -void set_default_vmx_state(struct kvm_nested_state *state, int size) -{ - memset(state, 0, size); - if (have_evmcs) - state->flags = KVM_STATE_NESTED_EVMCS; - state->format = 0; - state->size = size; - state->hdr.vmx.vmxon_pa = 0x1000; - state->hdr.vmx.vmcs12_pa = 0x2000; - state->hdr.vmx.smm.flags = 0; - set_revision_id_for_vmcs12(state, VMCS12_REVISION); -} - -void test_vmx_nested_state(struct kvm_vcpu *vcpu) -{ - /* Add a page for VMCS12. */ - const int state_sz = sizeof(struct kvm_nested_state) + getpagesize(); - struct kvm_nested_state *state = - (struct kvm_nested_state *)malloc(state_sz); - - /* The format must be set to 0. 0 for VMX, 1 for SVM. 
*/ - set_default_vmx_state(state, state_sz); - state->format = 1; - test_nested_state_expect_einval(vcpu, state); - - /* - * We cannot virtualize anything if the guest does not have VMX - * enabled. - */ - set_default_vmx_state(state, state_sz); - test_nested_state_expect_einval(vcpu, state); - - /* - * We cannot virtualize anything if the guest does not have VMX - * enabled. We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa - * is set to -1ull, but the flags must be zero. - */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = -1ull; - test_nested_state_expect_einval(vcpu, state); - - state->hdr.vmx.vmcs12_pa = -1ull; - state->flags = KVM_STATE_NESTED_EVMCS; - test_nested_state_expect_einval(vcpu, state); - - state->flags = 0; - test_nested_state(vcpu, state); - - /* Enable VMX in the guest CPUID. */ - vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX); - - /* - * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without - * setting the nested state. When the eVMCS flag is not set, the - * expected return value is '0'. - */ - set_default_vmx_state(state, state_sz); - state->flags = 0; - state->hdr.vmx.vmxon_pa = -1ull; - state->hdr.vmx.vmcs12_pa = -1ull; - test_nested_state(vcpu, state); - - /* - * When eVMCS is supported, the eVMCS flag can only be set if the - * enlightened VMCS capability has been enabled. - */ - if (have_evmcs) { - state->flags = KVM_STATE_NESTED_EVMCS; - test_nested_state_expect_einval(vcpu, state); - vcpu_enable_evmcs(vcpu); - test_nested_state(vcpu, state); - } - - /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */ - state->hdr.vmx.smm.flags = 1; - test_nested_state_expect_einval(vcpu, state); - - /* Invalid flags are rejected. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.flags = ~0; - test_nested_state_expect_einval(vcpu, state); - - /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. 
*/ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = -1ull; - state->flags = 0; - test_nested_state_expect_einval(vcpu, state); - - /* It is invalid to have vmxon_pa set to a non-page aligned address. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = 1; - test_nested_state_expect_einval(vcpu, state); - - /* - * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and - * KVM_STATE_NESTED_GUEST_MODE set together. - */ - set_default_vmx_state(state, state_sz); - state->flags = KVM_STATE_NESTED_GUEST_MODE | - KVM_STATE_NESTED_RUN_PENDING; - state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; - test_nested_state_expect_einval(vcpu, state); - - /* - * It is invalid to have any of the SMM flags set besides: - * KVM_STATE_NESTED_SMM_GUEST_MODE - * KVM_STATE_NESTED_SMM_VMXON - */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE | - KVM_STATE_NESTED_SMM_VMXON); - test_nested_state_expect_einval(vcpu, state); - - /* Outside SMM, SMM flags must be zero. */ - set_default_vmx_state(state, state_sz); - state->flags = 0; - state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE; - test_nested_state_expect_einval(vcpu, state); - - /* - * Size must be large enough to fit kvm_nested_state and vmcs12 - * if VMCS12 physical address is set - */ - set_default_vmx_state(state, state_sz); - state->size = sizeof(*state); - state->flags = 0; - test_nested_state_expect_einval(vcpu, state); - - set_default_vmx_state(state, state_sz); - state->size = sizeof(*state); - state->flags = 0; - state->hdr.vmx.vmcs12_pa = -1; - test_nested_state(vcpu, state); - - /* - * KVM_SET_NESTED_STATE succeeds with invalid VMCS - * contents but L2 not running. - */ - set_default_vmx_state(state, state_sz); - state->flags = 0; - test_nested_state(vcpu, state); - - /* Invalid flags are rejected, even if no VMCS loaded. 
*/ - set_default_vmx_state(state, state_sz); - state->size = sizeof(*state); - state->flags = 0; - state->hdr.vmx.vmcs12_pa = -1; - state->hdr.vmx.flags = ~0; - test_nested_state_expect_einval(vcpu, state); - - /* vmxon_pa cannot be the same address as vmcs_pa. */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = 0; - state->hdr.vmx.vmcs12_pa = 0; - test_nested_state_expect_einval(vcpu, state); - - /* - * Test that if we leave nesting the state reflects that when we get - * it again. - */ - set_default_vmx_state(state, state_sz); - state->hdr.vmx.vmxon_pa = -1ull; - state->hdr.vmx.vmcs12_pa = -1ull; - state->flags = 0; - test_nested_state(vcpu, state); - vcpu_nested_state_get(vcpu, state); - TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz, - "Size must be between %ld and %d. The size returned was %d.", - sizeof(*state), state_sz, state->size); - - TEST_ASSERT_EQ(state->hdr.vmx.vmxon_pa, -1ull); - TEST_ASSERT_EQ(state->hdr.vmx.vmcs12_pa, -1ull); - TEST_ASSERT_EQ(state->flags, 0); - - free(state); -} - -int main(int argc, char *argv[]) -{ - struct kvm_vm *vm; - struct kvm_nested_state state; - struct kvm_vcpu *vcpu; - - have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); - - TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); - - /* - * AMD currently does not implement set_nested_state, so for now we - * just early out. - */ - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); - - vm = vm_create_with_one_vcpu(&vcpu, NULL); - - /* - * First run tests with VMX disabled to check error handling. - */ - vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX); - - /* Passing a NULL kvm_nested_state causes a EFAULT. */ - test_nested_state_expect_efault(vcpu, NULL); - - /* 'size' cannot be smaller than sizeof(kvm_nested_state). */ - set_default_state(&state); - state.size = 0; - test_nested_state_expect_einval(vcpu, &state); - - /* - * Setting the flags 0xf fails the flags check. 
The only flags that - * can be used are: - * KVM_STATE_NESTED_GUEST_MODE - * KVM_STATE_NESTED_RUN_PENDING - * KVM_STATE_NESTED_EVMCS - */ - set_default_state(&state); - state.flags = 0xf; - test_nested_state_expect_einval(vcpu, &state); - - /* - * If KVM_STATE_NESTED_RUN_PENDING is set then - * KVM_STATE_NESTED_GUEST_MODE has to be set as well. - */ - set_default_state(&state); - state.flags = KVM_STATE_NESTED_RUN_PENDING; - test_nested_state_expect_einval(vcpu, &state); - - test_vmx_nested_state(vcpu); - - kvm_vm_free(vm); - return 0; -} -- cgit v1.2.3 From 9587dd7a7ebd7be3c36815a4c4f90f7e2cedbe03 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:31:00 -0800 Subject: KVM: SVM: Drop the module param to control SEV-ES DebugSwap Rip out the DebugSwap module param, as the sequence of events that led to its inclusion was one big mistake, the param no longer serves any purpose. Commit d1f85fbe836e ("KVM: SEV: Enable data breakpoints in SEV-ES") goofed by not adding a way for the userspace VMM to control the feature. Functionally, that was fine, but it broke attestation signatures because SEV_FEATURES are included in the signature. Commit 5abf6dceb066 ("SEV: disable SEV-ES DebugSwap by default") fixed that issue, but the underlying flaw of userspace not having a way to control SEV_FEATURES was still there. That flaw was addressed by commit 4f5defae7089 ("KVM: SEV: introduce KVM_SEV_INIT2 operation"), and so then 4dd5ecacb9a4 ("KVM: SEV: allow SEV-ES DebugSwap again") re-enabled DebugSwap by default. Now that the dust has settled, the module param doesn't serve any meaningful purpose. 
Cc: Tom Lendacky Reviewed-by: Tom Lendacky Link: https://patch.msgid.link/20260109033101.1005769-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f59c65abe3cf..9b92f0cccfe6 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -53,9 +53,6 @@ module_param_named(sev_es, sev_es_enabled, bool, 0444); static bool sev_snp_enabled = true; module_param_named(sev_snp, sev_snp_enabled, bool, 0444); -/* enable/disable SEV-ES DebugSwap support */ -static bool sev_es_debug_swap_enabled = true; -module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); static u64 sev_supported_vmsa_features; static unsigned int nr_ciphertext_hiding_asids; @@ -3150,12 +3147,10 @@ out: sev_es_enabled = sev_es_supported; sev_snp_enabled = sev_snp_supported; - if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) || - !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) - sev_es_debug_swap_enabled = false; - sev_supported_vmsa_features = 0; - if (sev_es_debug_swap_enabled) + + if (sev_es_enabled && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) && + cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; if (sev_snp_enabled && tsc_khz && cpu_feature_enabled(X86_FEATURE_SNP_SECURE_TSC)) -- cgit v1.2.3 From d23051f59a5b4eb1f6163cf27e07b8cfcaeb4758 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:31:01 -0800 Subject: KVM: SVM: Tag sev_supported_vmsa_features as read-only after init Tag sev_supported_vmsa_features with __ro_after_init as it's configured by sev_hardware_setup() and never written after initial configuration (and if it were, that'd be a blatant bug). Opportunistically relocate the variable out of the module params area now that sev_es_debug_swap_enabled is gone (which largely motivated its original location). 
Reviewed-by: Tom Lendacky Link: https://patch.msgid.link/20260109033101.1005769-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 9b92f0cccfe6..28150506b18c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -53,8 +53,6 @@ module_param_named(sev_es, sev_es_enabled, bool, 0444); static bool sev_snp_enabled = true; module_param_named(sev_snp, sev_snp_enabled, bool, 0444); -static u64 sev_supported_vmsa_features; - static unsigned int nr_ciphertext_hiding_asids; module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 0444); @@ -81,6 +79,8 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04 static u64 snp_supported_policy_bits __ro_after_init; +static u64 sev_supported_vmsa_features __ro_after_init; + #define INITIAL_VMSA_GPA 0xFFFFFFFFF000 static u8 sev_enc_bit; -- cgit v1.2.3 From 217463aa329ea9a2efafd1bbfa6787e8df9091b9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:40 -0800 Subject: KVM: SVM: Add a helper to detect VMRUN failures Add a helper to detect VMRUN failures so that KVM can guard against its own long-standing bug, where KVM neglects to set exitcode[63:32] when synthesizing a nested VMFAIL_INVALID VM-Exit. This will allow fixing KVM's mess of treating exitcode as two separate 32-bit values without breaking KVM-on-KVM when running on an older, unfixed KVM. 
Cc: Jim Mattson Cc: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230211347.4099600-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 16 +++++++--------- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/svm/svm.h | 5 +++++ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 5b741f8ed170..666b5a36c15d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1167,7 +1167,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1; vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2; - if (vmcb12->control.exit_code != SVM_EXIT_ERR) + if (!svm_is_vmrun_failure(vmcb12->control.exit_code)) nested_save_pending_event_to_vmcb12(svm, vmcb12); if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) @@ -1463,6 +1463,9 @@ static int nested_svm_intercept(struct vcpu_svm *svm) u32 exit_code = svm->vmcb->control.exit_code; int vmexit = NESTED_EXIT_HOST; + if (svm_is_vmrun_failure(exit_code)) + return NESTED_EXIT_DONE; + switch (exit_code) { case SVM_EXIT_MSR: vmexit = nested_svm_exit_handled_msr(svm); @@ -1470,7 +1473,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) case SVM_EXIT_IOIO: vmexit = nested_svm_intercept_ioio(svm); break; - case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { + case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: /* * Host-intercepted exceptions have been checked already in * nested_svm_exit_special. 
There is nothing to do here, @@ -1478,15 +1481,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm) */ vmexit = NESTED_EXIT_DONE; break; - } - case SVM_EXIT_ERR: { - vmexit = NESTED_EXIT_DONE; - break; - } - default: { + default: if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; - } + break; } return vmexit; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c7bd78f5a2c7..e20b40f346af 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3564,7 +3564,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return 1; } - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { + if (svm_is_vmrun_failure(svm->vmcb->control.exit_code)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; @@ -4346,7 +4346,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) /* Track VMRUNs that have made past consistency checking */ if (svm->nested.nested_run_pending && - svm->vmcb->control.exit_code != SVM_EXIT_ERR) + !svm_is_vmrun_failure(svm->vmcb->control.exit_code)) ++vcpu->stat.nested_run; svm->nested.nested_run_pending = 0; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7d28a739865f..3360ac36e071 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -425,6 +425,11 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_svm, vcpu); } +static inline bool svm_is_vmrun_failure(u64 exit_code) +{ + return (u32)exit_code == (u32)SVM_EXIT_ERR; +} + /* * Only the PDPTRs are loaded on demand into the shadow MMU. All other * fields are synchronized on VM-Exit, because accessing the VMCB is cheap. 
-- cgit v1.2.3 From 2450c9774510e45c506df4a1b46d129435993ff6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:41 -0800 Subject: KVM: SVM: Open code handling of unexpected exits in svm_invoke_exit_handler() Fold svm_check_exit_valid() and svm_handle_invalid_exit() into their sole caller, svm_invoke_exit_handler(), as having tiny single-use helpers makes the code unnecessarily difficult to follow. This will also allow for additional cleanups in svm_invoke_exit_handler(). No functional change intended. Suggested-by: Yosry Ahmed Reviewed-by: Yosry Ahmed Reviewed-by: Pankaj Gupta Link: https://patch.msgid.link/20251230211347.4099600-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e20b40f346af..ddb07c6408de 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3467,23 +3467,13 @@ no_vmsa: sev_free_decrypted_vmsa(vcpu, save); } -static bool svm_check_exit_valid(u64 exit_code) -{ - return (exit_code < ARRAY_SIZE(svm_exit_handlers) && - svm_exit_handlers[exit_code]); -} - -static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) -{ - dump_vmcb(vcpu); - kvm_prepare_unexpected_reason_exit(vcpu, exit_code); - return 0; -} - int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (!svm_check_exit_valid(exit_code)) - return svm_handle_invalid_exit(vcpu, exit_code); + if (exit_code >= ARRAY_SIZE(svm_exit_handlers)) + goto unexpected_vmexit; + + if (!svm_exit_handlers[exit_code]) + goto unexpected_vmexit; #ifdef CONFIG_MITIGATION_RETPOLINE if (exit_code == SVM_EXIT_MSR) @@ -3502,6 +3492,11 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) #endif #endif return svm_exit_handlers[exit_code](vcpu); + +unexpected_vmexit: + dump_vmcb(vcpu); + kvm_prepare_unexpected_reason_exit(vcpu, exit_code); + return 0; }
static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, -- cgit v1.2.3 From 194c17bf5ebadd2fcf52ac641793e3d755a7af55 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:42 -0800 Subject: KVM: SVM: Check for an unexpected VM-Exit after RETPOLINE "fast" handling Check for an unexpected/unhandled VM-Exit after the manual RETPOLINE=y handling. The entire point of the RETPOLINE checks is to optimize for common VM-Exits, i.e. checking for the rare case of an unsupported VM-Exit is counter-productive. This also aligns SVM and VMX exit handling. No functional change intended. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230211347.4099600-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ddb07c6408de..d2f997965a96 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3469,12 +3469,6 @@ no_vmsa: int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (exit_code >= ARRAY_SIZE(svm_exit_handlers)) - goto unexpected_vmexit; - - if (!svm_exit_handlers[exit_code]) - goto unexpected_vmexit; - #ifdef CONFIG_MITIGATION_RETPOLINE if (exit_code == SVM_EXIT_MSR) return msr_interception(vcpu); @@ -3491,6 +3485,12 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return sev_handle_vmgexit(vcpu); #endif #endif + if (exit_code >= ARRAY_SIZE(svm_exit_handlers)) + goto unexpected_vmexit; + + if (!svm_exit_handlers[exit_code]) + goto unexpected_vmexit; + return svm_exit_handlers[exit_code](vcpu); unexpected_vmexit: -- cgit v1.2.3 From 405fce694bd1589082a7ffd500b5a4b841c22f0d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:43 -0800 Subject: KVM: SVM: Filter out 64-bit exit codes when invoking exit handlers on bare metal Explicitly filter out 64-bit exit codes when invoking exit handlers, as 
svm_exit_handlers[] will never be sized with entries that use bits 63:32. Processing the non-failing exit code as a 32-bit value will allow tracking exit_code as a single 64-bit value (which it is, architecturally). This will also allow hardening KVM against Spectre-like attacks without needing to do silly things to avoid build failures on 32-bit kernels (array_index_nospec() rightly asserts that the index fits in an "unsigned long"). Omit the check when running as a VM, as KVM has historically failed to set bits 63:32 appropriately when synthesizing VM-Exits, i.e. KVM could get false positives when running as a VM on an older, broken KVM/kernel. From a functional perspective, omitting the check is "fine", as any unwanted collision between e.g. VMEXIT_INVALID and a 32-bit exit code will be fatal to KVM-on-KVM regardless of what KVM-as-L1 does. Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230211347.4099600-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d2f997965a96..3caf7a21679f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3467,8 +3467,22 @@ no_vmsa: sev_free_decrypted_vmsa(vcpu, save); } -int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) +int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code) { + u32 exit_code = __exit_code; + + /* + * SVM uses negative values, i.e. 64-bit values, to indicate that VMRUN + * failed. Report all such errors to userspace (note, VMEXIT_INVALID, + * a.k.a. SVM_EXIT_ERR, is special cased by svm_handle_exit()). Skip + * the check when running as a VM, as KVM has historically left garbage + * in bits 63:32, i.e. running KVM-on-KVM would hit false positives if + * the underlying kernel is buggy. 
+ */ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR) && + (u64)exit_code != __exit_code) + goto unexpected_vmexit; + #ifdef CONFIG_MITIGATION_RETPOLINE if (exit_code == SVM_EXIT_MSR) return msr_interception(vcpu); @@ -3495,7 +3509,7 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) unexpected_vmexit: dump_vmcb(vcpu); - kvm_prepare_unexpected_reason_exit(vcpu, exit_code); + kvm_prepare_unexpected_reason_exit(vcpu, __exit_code); return 0; } -- cgit v1.2.3 From d7507a94a07202234236d7f94bed6015ca645ae6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:44 -0800 Subject: KVM: SVM: Treat exit_code as an unsigned 64-bit value through all of KVM Fix KVM's long-standing buggy handling of SVM's exit_code as a 32-bit value. Per the APM and Xen commit d1bd157fbc ("Big merge the HVM full-virtualisation abstractions.") (which is arguably more trustworthy than KVM), offset 0x70 is a single 64-bit value: 070h 63:0 EXITCODE Track exit_code as a single u64 to prevent reintroducing bugs where KVM neglects to correctly set bits 63:32. 
Fixes: 6aa8b732ca01 ("[PATCH] kvm: userspace interface") Cc: Jim Mattson Cc: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230211347.4099600-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 3 +- arch/x86/include/uapi/asm/svm.h | 32 +++++++++---------- arch/x86/kvm/svm/hyperv.c | 1 - arch/x86/kvm/svm/nested.c | 13 ++------ arch/x86/kvm/svm/sev.c | 36 ++++++++-------------- arch/x86/kvm/svm/svm.c | 7 ++--- arch/x86/kvm/svm/svm.h | 4 +-- arch/x86/kvm/trace.h | 6 ++-- include/hyperv/hvgdk.h | 2 +- tools/testing/selftests/kvm/include/x86/svm.h | 3 +- .../kvm/x86/svm_nested_soft_inject_test.c | 4 +-- 11 files changed, 42 insertions(+), 69 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 50ece197c98a..edde36097ddc 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -137,8 +137,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 int_vector; u32 int_state; u8 reserved_3[4]; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 650e3256ea7d..010a45c9f614 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -103,38 +103,38 @@ #define SVM_EXIT_VMGEXIT 0x403 /* SEV-ES software-defined VMGEXIT events */ -#define SVM_VMGEXIT_MMIO_READ 0x80000001 -#define SVM_VMGEXIT_MMIO_WRITE 0x80000002 -#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003 -#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004 -#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005 +#define SVM_VMGEXIT_MMIO_READ 0x80000001ull +#define SVM_VMGEXIT_MMIO_WRITE 0x80000002ull +#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003ull +#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004ull +#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005ull #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 -#define SVM_VMGEXIT_PSC 0x80000010 -#define 
SVM_VMGEXIT_GUEST_REQUEST 0x80000011 -#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012 -#define SVM_VMGEXIT_AP_CREATION 0x80000013 +#define SVM_VMGEXIT_PSC 0x80000010ull +#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011ull +#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012ull +#define SVM_VMGEXIT_AP_CREATION 0x80000013ull #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 -#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 -#define SVM_VMGEXIT_SAVIC 0x8000001a +#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018ull +#define SVM_VMGEXIT_SAVIC 0x8000001aull #define SVM_VMGEXIT_SAVIC_REGISTER_GPA 0 #define SVM_VMGEXIT_SAVIC_UNREGISTER_GPA 1 #define SVM_VMGEXIT_SAVIC_SELF_GPA ~0ULL -#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd -#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe +#define SVM_VMGEXIT_HV_FEATURES 0x8000fffdull +#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffeull #define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ /* SW_EXITINFO1[3:0] */ \ (((((u64)reason_set) & 0xf)) | \ /* SW_EXITINFO1[11:4] */ \ ((((u64)reason_code) & 0xff) << 4)) -#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff +#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffffull /* Exit code reserved for hypervisor/software use */ -#define SVM_EXIT_SW 0xf0000000 +#define SVM_EXIT_SW 0xf0000000ull -#define SVM_EXIT_ERR -1 +#define SVM_EXIT_ERR -1ull #define SVM_EXIT_REASONS \ { SVM_EXIT_READ_CR0, "read_cr0" }, \ diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c index 088f6429b24c..3ec580d687f5 100644 --- a/arch/x86/kvm/svm/hyperv.c +++ b/arch/x86/kvm/svm/hyperv.c @@ -11,7 +11,6 @@ void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL; - svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH; svm->vmcb->control.exit_info_2 = 0; nested_svm_vmexit(svm); diff --git a/arch/x86/kvm/svm/nested.c 
b/arch/x86/kvm/svm/nested.c index 666b5a36c15d..5aa0512e09c9 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -45,7 +45,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, * correctly fill in the high bits of exit_info_1. */ vmcb->control.exit_code = SVM_EXIT_NPF; - vmcb->control.exit_code_hi = 0; vmcb->control.exit_info_1 = (1ULL << 32); vmcb->control.exit_info_2 = fault->address; } @@ -441,7 +440,6 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->int_vector = from->int_vector; to->int_state = from->int_state; to->exit_code = from->exit_code; - to->exit_code_hi = from->exit_code_hi; to->exit_info_1 = from->exit_info_1; to->exit_info_2 = from->exit_info_2; to->exit_int_info = from->exit_int_info; @@ -747,8 +745,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, enter_guest_mode(vcpu); /* - * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, - * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. + * Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info, + * exit_int_info_err, next_rip, insn_len, insn_bytes. 
*/ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) && @@ -1018,7 +1016,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (!nested_vmcb_check_save(vcpu) || !nested_vmcb_check_controls(vcpu)) { vmcb12->control.exit_code = SVM_EXIT_ERR; - vmcb12->control.exit_code_hi = -1u; vmcb12->control.exit_info_1 = 0; vmcb12->control.exit_info_2 = 0; goto out; @@ -1051,7 +1048,6 @@ out_exit_err: svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = -1u; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; @@ -1163,7 +1159,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.int_state = vmcb02->control.int_state; vmcb12->control.exit_code = vmcb02->control.exit_code; - vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi; vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1; vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2; @@ -1460,7 +1455,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) static int nested_svm_intercept(struct vcpu_svm *svm) { - u32 exit_code = svm->vmcb->control.exit_code; + u64 exit_code = svm->vmcb->control.exit_code; int vmexit = NESTED_EXIT_HOST; if (svm_is_vmrun_failure(exit_code)) @@ -1532,7 +1527,6 @@ static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector; - vmcb->control.exit_code_hi = 0; if (ex->has_error_code) vmcb->control.exit_info_1 = ex->error_code; @@ -1708,7 +1702,6 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->int_vector = from->int_vector; dst->int_state = from->int_state; dst->exit_code = from->exit_code; - dst->exit_code_hi = from->exit_code_hi; dst->exit_info_1 = from->exit_info_1; dst->exit_info_2 = from->exit_info_2; dst->exit_int_info = from->exit_int_info; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 28150506b18c..f67525007089 100644 --- 
a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3270,11 +3270,6 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } -static u64 kvm_get_cached_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static void dump_ghcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3296,7 +3291,7 @@ static void dump_ghcb(struct vcpu_svm *svm) */ pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - kvm_get_cached_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); + control->exit_code, kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", @@ -3330,7 +3325,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; struct ghcb *ghcb = svm->sev_es.ghcb; - u64 exit_code; /* * The GHCB protocol so far allows for the following data @@ -3364,9 +3358,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm)); /* Copy the GHCB exit information into the VMCB fields */ - exit_code = kvm_ghcb_get_sw_exit_code(svm); - control->exit_code = lower_32_bits(exit_code); - control->exit_code_hi = upper_32_bits(exit_code); + control->exit_code = kvm_ghcb_get_sw_exit_code(svm); control->exit_info_1 = kvm_ghcb_get_sw_exit_info_1(svm); control->exit_info_2 = kvm_ghcb_get_sw_exit_info_2(svm); svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm); @@ -3379,15 +3371,8 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - u64 exit_code; u64 reason; - /* - * Retrieve the exit code now even 
though it may not be marked valid - * as it could help with debugging. - */ - exit_code = kvm_get_cached_sw_exit_code(control); - /* Only GHCB Usage code 0 is supported */ if (svm->sev_es.ghcb->ghcb_usage) { reason = GHCB_ERR_INVALID_USAGE; @@ -3401,7 +3386,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) !kvm_ghcb_sw_exit_info_2_is_valid(svm)) goto vmgexit_err; - switch (exit_code) { + switch (control->exit_code) { case SVM_EXIT_READ_DR7: break; case SVM_EXIT_WRITE_DR7: @@ -3502,15 +3487,19 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) return 0; vmgexit_err: + /* + * Print the exit code even though it may not be marked valid as it + * could help with debugging. + */ if (reason == GHCB_ERR_INVALID_USAGE) { vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", svm->sev_es.ghcb->ghcb_usage); } else if (reason == GHCB_ERR_INVALID_EVENT) { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", - exit_code); + control->exit_code); } else { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", - exit_code); + control->exit_code); dump_ghcb(svm); } @@ -4349,7 +4338,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; - u64 ghcb_gpa, exit_code; + u64 ghcb_gpa; int ret; /* Validate the GHCB */ @@ -4391,8 +4380,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm_vmgexit_success(svm, 0); - exit_code = kvm_get_cached_sw_exit_code(control); - switch (exit_code) { + switch (control->exit_code) { case SVM_VMGEXIT_MMIO_READ: ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); if (ret) @@ -4484,7 +4472,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = -EINVAL; break; default: - ret = svm_invoke_exit_handler(vcpu, exit_code); + ret = svm_invoke_exit_handler(vcpu, control->exit_code); } return ret; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3caf7a21679f..a28cd61d87ea 100644 --- 
a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2466,7 +2466,6 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, if (cr0 ^ val) { svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - svm->vmcb->control.exit_code_hi = 0; ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); } @@ -3299,7 +3298,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); pr_err("%-20s%08x\n", "int_vector:", control->int_vector); pr_err("%-20s%08x\n", "int_state:", control->int_state); - pr_err("%-20s%08x\n", "exit_code:", control->exit_code); + pr_err("%-20s%016llx\n", "exit_code:", control->exit_code); pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); @@ -3549,7 +3548,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; - u32 exit_code = svm->vmcb->control.exit_code; /* SEV-ES guests must use the CR write traps to track CR registers. */ if (!sev_es_guest(vcpu->kvm)) { @@ -3585,7 +3583,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - return svm_invoke_exit_handler(vcpu, exit_code); + return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code); } static int pre_svm_run(struct kvm_vcpu *vcpu) @@ -4670,7 +4668,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, if (static_cpu_has(X86_FEATURE_NRIPS)) vmcb->control.next_rip = info->next_rip; vmcb->control.exit_code = icpt_info.exit_code; - vmcb->control.exit_code_hi = 0; vmexit = nested_svm_exit_handled(svm); ret = (vmexit == NESTED_EXIT_DONE) ? 
X86EMUL_INTERCEPTED diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 3360ac36e071..a22433680c73 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -160,8 +160,7 @@ struct vmcb_ctrl_area_cached { u32 int_ctl; u32 int_vector; u32 int_state; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; @@ -787,7 +786,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm); static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) { svm->vmcb->control.exit_code = exit_code; - svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; return nested_svm_vmexit(svm); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e79bc9cb7162..e7fdbe9efc90 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -383,10 +383,10 @@ TRACE_EVENT(kvm_apic, #define kvm_print_exit_reason(exit_reason, isa) \ (isa == KVM_ISA_VMX) ? \ __print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) : \ - __print_symbolic(exit_reason, SVM_EXIT_REASONS), \ + __print_symbolic_u64(exit_reason, SVM_EXIT_REASONS), \ (isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "", \ (isa == KVM_ISA_VMX) ? 
\ - __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" + __print_flags_u64(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" #define TRACE_EVENT_KVM_EXIT(name) \ TRACE_EVENT(name, \ @@ -781,7 +781,7 @@ TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit); * Tracepoint for #VMEXIT reinjected to the guest */ TRACE_EVENT(kvm_nested_vmexit_inject, - TP_PROTO(__u32 exit_code, + TP_PROTO(__u64 exit_code, __u64 exit_info1, __u64 exit_info2, __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), TP_ARGS(exit_code, exit_info1, exit_info2, diff --git a/include/hyperv/hvgdk.h b/include/hyperv/hvgdk.h index dd6d4939ea29..384c3f3ff4a5 100644 --- a/include/hyperv/hvgdk.h +++ b/include/hyperv/hvgdk.h @@ -281,7 +281,7 @@ struct hv_vmcb_enlightenments { #define HV_VMCB_NESTED_ENLIGHTENMENTS 31 /* Synthetic VM-Exit */ -#define HV_SVM_EXITCODE_ENL 0xf0000000 +#define HV_SVM_EXITCODE_ENL 0xf0000000ull #define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) /* VM_PARTITION_ASSIST_PAGE */ diff --git a/tools/testing/selftests/kvm/include/x86/svm.h b/tools/testing/selftests/kvm/include/x86/svm.h index 29cffd0a9181..10b30b38bb3f 100644 --- a/tools/testing/selftests/kvm/include/x86/svm.h +++ b/tools/testing/selftests/kvm/include/x86/svm.h @@ -92,8 +92,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 int_vector; u32 int_state; u8 reserved_3[4]; - u32 exit_code; - u32 exit_code_hi; + u64 exit_code; u64 exit_info_1; u64 exit_info_2; u32 exit_int_info; diff --git a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c index 7b6481d6c0d3..4bd1655f9e6d 100644 --- a/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c +++ b/tools/testing/selftests/kvm/x86/svm_nested_soft_inject_test.c @@ -103,7 +103,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i run_guest(vmcb, svm->vmcb_gpa); __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL, - "Expected VMMCAL 
#VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + "Expected VMMCAL #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'", vmcb->control.exit_code, vmcb->control.exit_info_1, vmcb->control.exit_info_2); @@ -133,7 +133,7 @@ static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t i run_guest(vmcb, svm->vmcb_gpa); __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT, - "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%lx, info2 = '0x%lx'", + "Expected HLT #VMEXIT, got '0x%lx', info1 = '0x%lx, info2 = '0x%lx'", vmcb->control.exit_code, vmcb->control.exit_info_1, vmcb->control.exit_info_2); -- cgit v1.2.3 From a08ca6691fd3ab40e40eb6600193672d50c7a7ba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:45 -0800 Subject: KVM: SVM: Limit incorrect check on SVM_EXIT_ERR to running as a VM Limit KVM's incorrect check for VMEXIT_INVALID, a.k.a. SVM_EXIT_ERR, to running as a VM, as detected by X86_FEATURE_HYPERVISOR. The exit_code and all failure codes, e.g. VMEXIT_INVALID, are 64-bit values, and so checking only bits 31:0 could result in false positives when running on non-broken hardware, e.g. in the extremely unlikely scenario exit code 0xffffffffull is ever generated by hardware. Keep the 32-bit check to play nice with running on broken KVM (for years, KVM has not set bits 63:32 when synthesizing nested SVM VM-Exits). 
Reviewed-by: Yosry Ahmed Link: https://patch.msgid.link/20251230211347.4099600-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a22433680c73..338fc4f5cc4c 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -426,7 +426,10 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) static inline bool svm_is_vmrun_failure(u64 exit_code) { - return (u32)exit_code == (u32)SVM_EXIT_ERR; + if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return (u32)exit_code == (u32)SVM_EXIT_ERR; + + return exit_code == SVM_EXIT_ERR; } /* -- cgit v1.2.3 From 1e3dddafeceeb8d2cd182b78456cb9ca9d042a01 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:46 -0800 Subject: KVM: SVM: Harden exit_code against being used in Spectre-like attacks Explicitly clamp the exit code used to index KVM's exit handlers to guard against Spectre-like attacks, mainly to provide consistency between VMX and SVM (VMX was given the same treatment by commit c926f2f7230b ("KVM: x86: Protect exit_reason from being used in Spectre-v1/L1TF attacks"). For normal VMs, it's _extremely_ unlikely the exit code could be used to exploit a speculation vulnerability, as the exit code is set by hardware and unexpected/unknown exit codes should be quite well bounded (as is/was the case with VMX). But with SEV-ES+, the exit code is guest-controlled as it comes from the GHCB, not from hardware, i.e. an attack from the guest is at least somewhat plausible. Irrespective of SEV-ES+, hardening KVM is easy and inexpensive, and such an attack is theoretically possible. 
Link: https://patch.msgid.link/20251230211347.4099600-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a28cd61d87ea..e454ae095cf7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3501,6 +3501,7 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 __exit_code) if (exit_code >= ARRAY_SIZE(svm_exit_handlers)) goto unexpected_vmexit; + exit_code = array_index_nospec(exit_code, ARRAY_SIZE(svm_exit_handlers)); if (!svm_exit_handlers[exit_code]) goto unexpected_vmexit; -- cgit v1.2.3 From d6c20d19f7d3de14d02b47221988cdb19504bb84 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 30 Dec 2025 13:13:47 -0800 Subject: KVM: SVM: Assert that Hyper-V's HV_SVM_EXITCODE_ENL == SVM_EXIT_SW Add a build-time assertion that Hyper-V's "enlightened" exit code is the same as the AMD-defined "Reserved for Host" exit code, mostly to help readers connect the dots and understand why synthesizing a software-defined exit code is safe/ok. Reviewed-by: Vitaly Kuznetsov Link: https://patch.msgid.link/20251230211347.4099600-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/hyperv.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c index 3ec580d687f5..4f24dcb45116 100644 --- a/arch/x86/kvm/svm/hyperv.c +++ b/arch/x86/kvm/svm/hyperv.c @@ -10,6 +10,12 @@ void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + /* + * The exit code used by Hyper-V for software-defined exits is reserved + * by AMD specifically for such use cases. 
+ */ + BUILD_BUG_ON(HV_SVM_EXITCODE_ENL != SVM_EXIT_SW); + svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL; svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH; svm->vmcb->control.exit_info_2 = 0; -- cgit v1.2.3 From 69555130dccb39df4d40f90fafc7fc79a5d55b8a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 8 Jan 2026 19:50:37 -0800 Subject: KVM: SVM: Fix an off-by-one typo in the comment for enabling AVIC by default Fix a goof in the comment that documents KVM's logic for enabling AVIC by default to reference Zen5+ as family 0x1A (Zen5), not family 0x19 (Zen4). The code is correct (checks for _greater_ than 0x19), only the comment is flawed. Opportunistically tweak the check too, even though it's already correct, so that both the comment and the code reference 0x1A, and so that the checks are "ascending", i.e. check Zen4 and then Zen5+. No functional change intended. Fixes: ca2967de5a5b ("KVM: SVM: Enable AVIC by default for Zen4+ if x2AVIC is support") Acked-by: Naveen N Rao (AMD) Link: https://patch.msgid.link/20260109035037.1015073-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 6b77b2033208..e8acac56da5b 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1224,13 +1224,13 @@ static bool __init avic_want_avic_enabled(void) * In "auto" mode, enable AVIC by default for Zen4+ if x2AVIC is * supported (to avoid enabling partial support by default, and because * x2AVIC should be supported by all Zen4+ CPUs). Explicitly check for - * family 0x19 and later (Zen5+), as the kernel's synthetic ZenX flags + * family 0x1A and later (Zen5+), as the kernel's synthetic ZenX flags * aren't inclusive of previous generations, i.e. the kernel will set * at most one ZenX feature flag. 
*/ if (avic == AVIC_AUTO_MODE) avic = boot_cpu_has(X86_FEATURE_X2AVIC) && - (boot_cpu_data.x86 > 0x19 || cpu_feature_enabled(X86_FEATURE_ZEN4)); + (cpu_feature_enabled(X86_FEATURE_ZEN4) || boot_cpu_data.x86 >= 0x1A); if (!avic || !npt_enabled) return false; -- cgit v1.2.3 From f00ccdede3c84df2287e59b546fd92d58b7e07af Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 13 Jan 2026 17:28:07 +0000 Subject: KVM: nSVM: Drop redundant/wrong comment in nested_vmcb02_prepare_save() The comment above DR6 and DR7 initializations is redundant, because the entire function follows the same pattern of only initializing the fields in vmcb02 if the vmcb12 changed or the fields are dirty, which handles the first execution case. Also, the comment refers to new_vmcb12 as new_vmcs12. Just drop the comment. No functional change intended. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260113172807.2178526-1-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 5aa0512e09c9..79cb85b8a156 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -681,7 +681,6 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb02->save.rsp = vmcb12->save.rsp; vmcb02->save.rip = vmcb12->save.rip; - /* These bits will be set properly on the first execution when new_vmc12 is true */ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW; -- cgit v1.2.3 From 127ccae2c185f62e6ecb4bf24f9cb307e9b9c619 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 10 Jan 2026 00:48:18 +0000 Subject: KVM: nSVM: Always use vmcb01 in VMLOAD/VMSAVE emulation Commit cc3ed80ae69f ("KVM: nSVM: always use vmcb01 to for vmsave/vmload of guest state") made KVM always use vmcb01 for the fields controlled by 
VMSAVE/VMLOAD, but it missed updating the VMLOAD/VMSAVE emulation code to always use vmcb01. As a result, if VMSAVE/VMLOAD is executed by an L2 guest and is not intercepted by L1, KVM will mistakenly use vmcb02. Always use vmcb01 instead of the current VMCB. Fixes: cc3ed80ae69f ("KVM: nSVM: always use vmcb01 to for vmsave/vmload of guest state") Cc: Maxim Levitsky Cc: stable@vger.kernel.org Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260110004821.3411245-2-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e454ae095cf7..f1a5b61bdb5b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2122,12 +2122,13 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) ret = kvm_skip_emulated_instruction(vcpu); + /* KVM always performs VMLOAD/VMSAVE on VMCB01 (see __svm_vcpu_run()) */ if (vmload) { - svm_copy_vmloadsave_state(svm->vmcb, vmcb12); + svm_copy_vmloadsave_state(svm->vmcb01.ptr, vmcb12); svm->sysenter_eip_hi = 0; svm->sysenter_esp_hi = 0; } else { - svm_copy_vmloadsave_state(vmcb12, svm->vmcb); + svm_copy_vmloadsave_state(vmcb12, svm->vmcb01.ptr); } kvm_vcpu_unmap(vcpu, &map); -- cgit v1.2.3 From 55780d8a1dcc93d2c4b33c565ada88df12c9f206 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Sat, 10 Jan 2026 00:48:19 +0000 Subject: KVM: SVM: Stop toggling virtual VMSAVE/VMLOAD on intercept recalc Virtual VMSAVE/VMLOAD enablement (i.e. VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK) is set/cleared by svm_recalc_instruction_intercepts() when the intercepts are cleared/set. This is unnecessary because the bit is meaningless when intercepts are set and KVM emulates the instructions. Initialize the bit in vmcb01 based on vls, and keep it unchanged. This is similar-ish to how vGIF is handled. 
It is enabled in init_vmcb() if vgif=1 and remains unchanged when the STGI intercept is enabled (e.g. for NMI windows). This fixes a bug in svm_recalc_instruction_intercepts(). The intercepts for VMSAVE/VMLOAD are always toggled in vmcb01, but VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK is toggled in the current VMCB, which could be vmcb02 instead of vmcb01 if L2 is active. Virtual VMSAVE/VMLOAD enablement in vmcb02 is separately controlled by nested_vmcb02_prepare_control() based on the vCPU features and VMCB12, and if intercepts are needed they are set by recalc_intercepts(). The bug is benign though. Not toggling the bit for vmcb01 is harmless because it's useless anyway. For vmcb02: - The bit could be incorrectly cleared when intercepts are set in vmcb01. This is harmless because VMSAVE/VMLOAD will be emulated by KVM anyway. - The bit could be incorrectly set when the intercepts are cleared in vmcb01. However, if the bit was originally clear in vmcb02, then recalc_intercepts() will enable the intercepts in vmcb02 anyway and VMSAVE/VMLOAD will be emulated by KVM. Signed-off-by: Yosry Ahmed Link: https://patch.msgid.link/20260110004821.3411245-3-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f1a5b61bdb5b..5eadecc5246c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -995,10 +995,14 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_RDTSCP); } + /* + * No need to toggle VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK here, it is + * always set if vls is enabled. If the intercepts are set, the bit is + * meaningless anyway. 
+ */ if (guest_cpuid_is_intel_compatible(vcpu)) { svm_set_intercept(svm, INTERCEPT_VMLOAD); svm_set_intercept(svm, INTERCEPT_VMSAVE); - svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it @@ -1007,7 +1011,6 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) if (vls) { svm_clr_intercept(svm, INTERCEPT_VMLOAD); svm_clr_intercept(svm, INTERCEPT_VMSAVE); - svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; } } } @@ -1155,6 +1158,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (vls) + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + if (vcpu->kvm->arch.bus_lock_detection_enabled) svm_set_intercept(svm, INTERCEPT_BUSLOCK); -- cgit v1.2.3 From fa9893fadbc245e179cb17f3c371c67471b5a8a8 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Fri, 9 Jan 2026 17:17:32 -0600 Subject: KVM: Introduce KVM_EXIT_SNP_REQ_CERTS for SNP certificate-fetching For SEV-SNP, the host can optionally provide a certificate table to the guest when it issues an attestation request to firmware (see GHCB 2.0 specification regarding "SNP Extended Guest Requests"). This certificate table can then be used to verify the endorsement key used by firmware to sign the attestation report. While it is possible for guests to obtain the certificates through other means, handling it via the host provides more flexibility in being able to keep the certificate data in sync with the endorsement key throughout host-side operations that might result in the endorsement key changing. In the case of KVM, userspace will be responsible for fetching the certificate table and keeping it in sync with any modifications to the endorsement key by other userspace management tools.
Define a new KVM_EXIT_SNP_REQ_CERTS event where userspace is provided with the GPA of the buffer the guest has provided as part of the attestation request so that userspace can write the certificate data into it while relying on filesystem-based locking to keep the certificates up-to-date relative to the endorsement keys installed/utilized by firmware at the time the certificates are fetched. [Melody: Update the documentation scheme about how file locking is expected to happen.] Reviewed-by: Liam Merwick Tested-by: Liam Merwick Tested-by: Dionna Glaze Signed-off-by: Michael Roth Signed-off-by: Melody Wang Signed-off-by: Michael Roth Link: https://patch.msgid.link/20260109231732.1160759-2-michael.roth@amd.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 44 ++++++++++++++++++++++++++++++ arch/x86/kvm/svm/sev.c | 62 ++++++++++++++++++++++++++++++++++++++---- arch/x86/kvm/svm/svm.h | 1 + include/uapi/linux/kvm.h | 9 ++++++ 4 files changed, 110 insertions(+), 6 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 01a3abef8abb..428d7d9cb4d6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7353,6 +7353,50 @@ Please note that the kernel is allowed to use the kvm_run structure as the primary storage for certain register types. Therefore, the kernel may use the values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. +:: + + /* KVM_EXIT_SNP_REQ_CERTS */ + struct kvm_exit_snp_req_certs { + __u64 gpa; + __u64 npages; + __u64 ret; + }; + +KVM_EXIT_SNP_REQ_CERTS indicates an SEV-SNP guest with certificate-fetching +enabled (see KVM_SEV_SNP_ENABLE_REQ_CERTS) has generated an Extended Guest +Request NAE #VMGEXIT (SNP_GUEST_REQUEST) with message type MSG_REPORT_REQ, +i.e. 
has requested an attestation report from firmware, and would like the +certificate data corresponding to the attestation report signature to be +provided by the hypervisor as part of the request. + +To allow for userspace to provide the certificate, the 'gpa' and 'npages' +are forwarded verbatim from the guest request (the RAX and RBX GHCB fields +respectively). 'ret' is not an "output" from KVM, and is always '0' on +exit. KVM verifies the 'gpa' is 4KiB aligned prior to exiting to userspace, +but otherwise the information from the guest isn't validated. + +Upon the next KVM_RUN, e.g. after userspace has serviced the request (or not), +KVM will complete the #VMGEXIT, using the 'ret' field to determine whether to +signal success or failure to the guest, and on failure, what reason code will +be communicated via SW_EXITINFO2. If 'ret' is set to an unsupported value (see +the table below), KVM_RUN will fail with -EINVAL. For a 'ret' of 'ENOSPC', KVM +also consumes the 'npages' field, i.e. userspace can use the field to inform +the guest of the number of pages needed to hold all the certificate data. + +The supported 'ret' values and their respective SW_EXITINFO2 encodings: + + ====== ============================================================= + 0 0x0, i.e. success. KVM will emit an SNP_GUEST_REQUEST command + to SNP firmware. + ENOSPC 0x0000000100000000, i.e. not enough guest pages to hold the + certificate table and certificate data. KVM will also set the + RBX field in the GHCB to 'npages'. + EAGAIN 0x0000000200000000, i.e. the host is busy and the guest should + retry the request. + EIO 0xffffffff00000000, for all other errors (this return code is + a KVM-defined hypervisor value, as allowed by the GHCB) + ====== ============================================================= + ..
_cap_enable: diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index f67525007089..9e6a78e448f2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -41,6 +41,16 @@ #define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) +/* + * The GHCB spec essentially states that all non-zero error codes other than + * those explicitly defined above should be treated as an error by the guest. + * Define a generic error to cover that case, and choose a value that is not + * likely to overlap with new explicit error codes should more be added to + * the GHCB spec later. KVM will use this to report generic errors when + * handling SNP guest requests. + */ +#define SNP_GUEST_VMM_ERR_GENERIC (~0U) + /* enable/disable SEV support */ static bool sev_enabled = true; module_param_named(sev, sev_enabled, bool, 0444); @@ -4139,6 +4149,36 @@ out_unlock: return ret; } +static int snp_req_certs_err(struct vcpu_svm *svm, u32 vmm_error) +{ + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, SNP_GUEST_ERR(vmm_error, 0)); + + return 1; /* resume guest */ +} + +static int snp_complete_req_certs(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + switch (READ_ONCE(vcpu->run->snp_req_certs.ret)) { + case 0: + return snp_handle_guest_req(svm, control->exit_info_1, + control->exit_info_2); + case ENOSPC: + vcpu->arch.regs[VCPU_REGS_RBX] = vcpu->run->snp_req_certs.npages; + return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_INVALID_LEN); + case EAGAIN: + return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_BUSY); + case EIO: + return snp_req_certs_err(svm, SNP_GUEST_VMM_ERR_GENERIC); + default: + break; + } + + return -EINVAL; +} + static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) { struct kvm *kvm = svm->vcpu.kvm; @@ -4154,14 +4194,15 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r /* * As per GHCB spec, requests of type 
MSG_REPORT_REQ also allow for * additional certificate data to be provided alongside the attestation - * report via the guest-provided data pages indicated by RAX/RBX. The - * certificate data is optional and requires additional KVM enablement - * to provide an interface for userspace to provide it, but KVM still - * needs to be able to handle extended guest requests either way. So - * provide a stub implementation that will always return an empty - * certificate table in the guest-provided data pages. + * report via the guest-provided data pages indicated by RAX/RBX. If + * userspace enables KVM_EXIT_SNP_REQ_CERTS, then exit to userspace + * to give userspace an opportunity to provide the certificate data + * before issuing/completing the attestation request. Otherwise, return + * an empty certificate table in the guest-provided data pages and + * handle the attestation request immediately. */ if (msg_type == SNP_MSG_REPORT_REQ) { + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_vcpu *vcpu = &svm->vcpu; u64 data_npages; gpa_t data_gpa; @@ -4175,6 +4216,15 @@ static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t r if (!PAGE_ALIGNED(data_gpa)) goto request_invalid; + if (sev->snp_certs_enabled) { + vcpu->run->exit_reason = KVM_EXIT_SNP_REQ_CERTS; + vcpu->run->snp_req_certs.gpa = data_gpa; + vcpu->run->snp_req_certs.npages = data_npages; + vcpu->run->snp_req_certs.ret = 0; + vcpu->arch.complete_userspace_io = snp_complete_req_certs; + return 0; + } + /* * As per GHCB spec (see "SNP Extended Guest Request"), the * certificate table is terminated by 24-bytes of zeroes. 
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 338fc4f5cc4c..ebd7b36b1ceb 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -115,6 +115,7 @@ struct kvm_sev_info { void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */ struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ + bool snp_certs_enabled; /* SNP certificate-fetching support. */ }; struct kvm_svm { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index dddb781b0507..8cd107cdcf0b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -135,6 +135,12 @@ struct kvm_xen_exit { } u; }; +struct kvm_exit_snp_req_certs { + __u64 gpa; + __u64 npages; + __u64 ret; +}; + #define KVM_S390_GET_SKEYS_NONE 1 #define KVM_S390_SKEYS_MAX 1048576 @@ -180,6 +186,7 @@ struct kvm_xen_exit { #define KVM_EXIT_MEMORY_FAULT 39 #define KVM_EXIT_TDX 40 #define KVM_EXIT_ARM_SEA 41 +#define KVM_EXIT_SNP_REQ_CERTS 42 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -482,6 +489,8 @@ struct kvm_run { __u64 gva; __u64 gpa; } arm_sea; + /* KVM_EXIT_SNP_REQ_CERTS */ + struct kvm_exit_snp_req_certs snp_req_certs; /* Fix the size of the union. */ char padding[256]; }; -- cgit v1.2.3 From 20c3c4108d58f87c711bf44cb0b498b3ac5af6bf Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Fri, 9 Jan 2026 17:17:33 -0600 Subject: KVM: SEV: Add KVM_SEV_SNP_ENABLE_REQ_CERTS command Introduce a new command for KVM_MEMORY_ENCRYPT_OP ioctl that can be used to enable fetching of endorsement key certificates from userspace via the new KVM_EXIT_SNP_REQ_CERTS exit type. Also introduce a new KVM_X86_SEV_SNP_REQ_CERTS KVM device attribute so that userspace can query whether the kernel supports the new command/exit. 
Suggested-by: Sean Christopherson Reviewed-by: Liam Merwick Tested-by: Liam Merwick Signed-off-by: Michael Roth Link: https://patch.msgid.link/20260109231732.1160759-3-michael.roth@amd.com Signed-off-by: Sean Christopherson --- .../virt/kvm/x86/amd-memory-encryption.rst | 52 +++++++++++++++++++++- arch/x86/include/uapi/asm/kvm.h | 2 + arch/x86/kvm/svm/sev.c | 16 +++++++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index 1ddb6a86ce7f..543b5e5dd8d4 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -572,6 +572,52 @@ Returns: 0 on success, -negative on error See SNP_LAUNCH_FINISH in the SEV-SNP specification [snp-fw-abi]_ for further details on the input parameters in ``struct kvm_sev_snp_launch_finish``. +21. KVM_SEV_SNP_ENABLE_REQ_CERTS +-------------------------------- + +The KVM_SEV_SNP_ENABLE_REQ_CERTS command will configure KVM to exit to +userspace with a ``KVM_EXIT_SNP_REQ_CERTS`` exit type as part of handling +a guest attestation report, which will allow userspace to provide a +certificate corresponding to the endorsement key used by firmware to sign +that attestation report. + +Returns: 0 on success, -negative on error + +NOTE: The endorsement key used by firmware may change as a result of +management activities like updating SEV-SNP firmware or loading new +endorsement keys, so some care should be taken to keep the returned +certificate data in sync with the actual endorsement key in use by +firmware at the time the attestation request is sent to SNP firmware. The +recommended scheme to do this is to use file locking (e.g.
via fcntl()'s +F_OFD_SETLK) in the following manner: + + - Prior to obtaining/providing certificate data as part of servicing an + exit type of ``KVM_EXIT_SNP_REQ_CERTS``, the VMM should obtain a + shared/read or exclusive/write lock on the certificate blob file before + reading it and returning it to KVM, and continue to hold the lock until + the attestation request is actually sent to firmware. To facilitate + this, the VMM can set the ``immediate_exit`` flag of kvm_run just after + supplying the certificate data, and just before resuming the vCPU. + This will ensure the vCPU will exit again to userspace with ``-EINTR`` + after it finishes fetching the attestation request from firmware, at + which point the VMM can safely drop the file lock. + + - Tools/libraries that perform updates to SNP firmware TCB values or + endorsement keys (e.g. via /dev/sev interfaces such as ``SNP_COMMIT``, + ``SNP_SET_CONFIG``, or ``SNP_VLEK_LOAD``, see + Documentation/virt/coco/sev-guest.rst for more details) in such a way + that the certificate blob needs to be updated, should similarly take an + exclusive lock on the certificate blob for the duration of any updates + to endorsement keys or the certificate blob contents to ensure that + VMMs using the above scheme will not return certificate blob data that + is out of sync with the endorsement key used by firmware at the time + the attestation request is actually issued. + +This scheme is recommended so that tools can use a fairly generic/natural +approach to synchronizing firmware/certificate updates via file-locking, +which should make it easier to maintain interoperability across +tools/VMMs/vendors. + Device attribute API ==================== @@ -579,11 +625,15 @@ Attributes of the SEV implementation can be retrieved through the ``KVM_HAS_DEVICE_ATTR`` and ``KVM_GET_DEVICE_ATTR`` ioctls on the ``/dev/kvm`` device node, using group ``KVM_X86_GRP_SEV``. 
-Currently only one attribute is implemented: +The following attributes are currently implemented: * ``KVM_X86_SEV_VMSA_FEATURES``: return the set of all bits that are accepted in the ``vmsa_features`` of ``KVM_SEV_INIT2``. +* ``KVM_X86_SEV_SNP_REQ_CERTS``: return a value of 1 if the kernel supports the + ``KVM_EXIT_SNP_REQ_CERTS`` exit, which allows for fetching endorsement key + certificates from userspace for each SNP attestation request the guest issues. + Firmware Management =================== diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 7ceff6583652..b2c928c5965d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -503,6 +503,7 @@ struct kvm_sync_regs { #define KVM_X86_GRP_SEV 1 # define KVM_X86_SEV_VMSA_FEATURES 0 # define KVM_X86_SNP_POLICY_BITS 1 +# define KVM_X86_SEV_SNP_REQ_CERTS 2 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; @@ -743,6 +744,7 @@ enum sev_cmd_id { KVM_SEV_SNP_LAUNCH_START = 100, KVM_SEV_SNP_LAUNCH_UPDATE, KVM_SEV_SNP_LAUNCH_FINISH, + KVM_SEV_SNP_ENABLE_REQ_CERTS, KVM_SEV_NR_MAX, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 9e6a78e448f2..f9aad5c1447e 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2158,6 +2158,9 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val) *val = snp_supported_policy_bits; return 0; + case KVM_X86_SEV_SNP_REQ_CERTS: + *val = sev_snp_enabled ? 
1 : 0; + return 0; default: return -ENXIO; } @@ -2574,6 +2577,16 @@ e_free: return ret; } +static int snp_enable_certs(struct kvm *kvm) +{ + if (kvm->created_vcpus || !sev_snp_guest(kvm)) + return -EINVAL; + + to_kvm_sev_info(kvm)->snp_certs_enabled = true; + + return 0; +} + int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -2679,6 +2692,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_SNP_LAUNCH_FINISH: r = snp_launch_finish(kvm, &sev_cmd); break; + case KVM_SEV_SNP_ENABLE_REQ_CERTS: + r = snp_enable_certs(kvm); + break; default: r = -EINVAL; goto out; -- cgit v1.2.3