From 9a7cb00a8ff7380a09fa75287a3f2642c472d562 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Feb 2025 16:33:50 +0000 Subject: x86/cpufeatures: Define X86_FEATURE_AMD_IBRS_SAME_MODE Per the APM [1]: Some processors, identified by CPUID Fn8000_0008_EBX[IbrsSameMode] (bit 19) = 1, provide additional speculation limits. For these processors, when IBRS is set, indirect branch predictions are not influenced by any prior indirect branches, regardless of mode (CPL and guest/host) and regardless of whether the prior indirect branches occurred before or after the setting of IBRS. This is referred to as Same Mode IBRS. Define this feature bit, which will be used by KVM to determine if an IBPB is required on nested VM-exits in SVM. [1] AMD64 Architecture Programmer's Manual Pub. 40332, Rev 4.08 - April 2024, Volume 2, 3.2.9 Speculation Control MSRs Signed-off-by: Yosry Ahmed Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20250221163352.3818347-2-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + tools/arch/x86/include/asm/cpufeatures.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6c2c152d8a67..97af896de2ce 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -336,6 +336,7 @@ #define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ +#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection */ #define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ #define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 9e3fa7942e7d..bd800123311f 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -335,6 +335,7 @@ #define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ +#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection */ #define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ #define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ -- cgit v1.2.3 From 65ca2872015c232d6743b497e3c08ff96596b917 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Feb 2025 16:33:51 +0000 Subject: KVM: x86: Propagate AMD's IbrsSameMode to the guest If IBRS provides same mode (kernel/user or host/guest) protection on the host, then by definition it also provides same mode protection in the guest. In fact, all different modes from the guest's perspective are the same mode from the host's perspective anyway. Propagate IbrsSameMode to the guests.
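For illustration only (not part of the patch): the IbrsSameMode enumeration quoted above can be probed from user space with a plain CPUID check. This is a minimal sketch assuming a GCC/clang toolchain that provides <cpuid.h>; the kernel itself consumes the bit via X86_FEATURE_AMD_IBRS_SAME_MODE.

#include <cpuid.h>
#include <stdbool.h>
#include <stdio.h>

/* CPUID Fn8000_0008_EBX[IbrsSameMode] is bit 19, per the APM text above. */
static bool cpu_has_ibrs_same_mode(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		return false;

	return !!(ebx & (1u << 19));
}

int main(void)
{
	printf("IbrsSameMode: %s\n", cpu_has_ibrs_same_mode() ? "yes" : "no");
	return 0;
}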
Signed-off-by: Yosry Ahmed Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20250221163352.3818347-3-yosry.ahmed@linux.dev Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e92263fbbad6..f678cbda6ebf 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1093,6 +1093,7 @@ void kvm_set_cpu_caps(void) F(AMD_SSB_NO), F(AMD_STIBP), F(AMD_STIBP_ALWAYS_ON), + F(AMD_IBRS_SAME_MODE), F(AMD_PSFD), F(AMD_IBPB_RET), ); -- cgit v1.2.3 From 656d9624bd21d35499eaa5ee97fda6def62901c8 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 21 Feb 2025 16:33:52 +0000 Subject: KVM: x86: Generalize IBRS virtualization on emulated VM-exit Commit 2e7eab81425a ("KVM: VMX: Execute IBPB on emulated VM-exit when guest has IBRS") added an IBPB in the emulated VM-exit path on Intel to properly virtualize IBRS by providing separate predictor modes for L1 and L2. AMD requires similar handling, except when IbrsSameMode is enumerated by the host CPU (which is the case on most/all AMD CPUs). With IbrsSameMode, hardware IBRS is sufficient and no extra handling is needed from KVM. Generalize the handling in nested_vmx_vmexit() by moving it into a generic function, add the AMD handling, and use it in nested_svm_vmexit() too. The main reason for using a generic function is to have a single place to park the huge comment about virtualizing IBRS. Signed-off-by: Yosry Ahmed Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20250221163352.3818347-4-yosry.ahmed@linux.dev [sean: use kvm_nested_vmexit_handle_ibrs() for the helper] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 2 ++ arch/x86/kvm/vmx/nested.c | 11 +---------- arch/x86/kvm/x86.h | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 834b67672d50..6cd2cf7f6f68 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1041,6 +1041,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm) nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); + kvm_nested_vmexit_handle_ibrs(vcpu); + svm_switch_vmcb(svm, &svm->vmcb01); /* diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e073e3008b16..ebeb183270cf 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5020,16 +5020,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_switch_vmcs(vcpu, &vmx->vmcs01); - /* - * If IBRS is advertised to the vCPU, KVM must flush the indirect - * branch predictors when transitioning from L2 to L1, as L1 expects - * hardware (KVM in this case) to provide separate predictor modes. - * Bare metal isolates VMX root (host) from VMX non-root (guest), but - * doesn't isolate different VMCSs, i.e. in this case, doesn't provide - * separate modes for L2 vs L1.
- */ - if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL)) - indirect_branch_prediction_barrier(); + kvm_nested_vmexit_handle_ibrs(vcpu); /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 88a9475899c8..832f0faf4779 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -121,6 +121,24 @@ static inline void kvm_leave_nested(struct kvm_vcpu *vcpu) kvm_x86_ops.nested_ops->leave_nested(vcpu); } +/* + * If IBRS is advertised to the vCPU, KVM must flush the indirect branch + * predictors when transitioning from L2 to L1, as L1 expects hardware (KVM in + * this case) to provide separate predictor modes. Bare metal isolates the host + * from the guest, but doesn't isolate different guests from one another (in + * this case L1 and L2). The exception is if bare metal supports same mode IBRS, + * which offers protection within the same mode, and hence protects L1 from L2. + */ +static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu) +{ + if (cpu_feature_enabled(X86_FEATURE_AMD_IBRS_SAME_MODE)) + return; + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS)) + indirect_branch_prediction_barrier(); +} + static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) { return vcpu->arch.last_vmentry_cpu != -1; -- cgit v1.2.3 From 3fa0fc95db6df904dc812fa806a55ea6bafa65c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 26 Feb 2025 17:01:10 -0800 Subject: x86/msr: Rename the WRMSRNS opcode macro to ASM_WRMSRNS (for KVM) Rename the WRMSRNS instruction opcode macro so that it doesn't collide with X86_FEATURE_WRMSRNS when using token pasting to generate references to X86_FEATURE_WRMSRNS. KVM heavily uses token pasting to generate KVM's set of supported feature bits, and adding WRMSRNS support in KVM will run afoul of the opcode macro. arch/x86/kvm/cpuid.c:719:37: error: pasting "X86_FEATURE_" and "" "" does not give a valid preprocessing token 719 | u32 __leaf = __feature_leaf(X86_FEATURE_##name); \ | ^~~~~~~~~~~~ KVM has worked around one such collision in the past by #undef'ing the problematic macro in order to avoid blocking a KVM rework, but such games are generally undesirable, e.g. requires bleeding macro details into KVM, risks weird behavior if what KVM is #undef'ing changes, etc. Reviewed-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20250227010111.3222742-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/msr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 9397a319d165..7b16501912ad 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -300,7 +300,7 @@ do { \ #endif /* !CONFIG_PARAVIRT_XXL */ /* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */ -#define WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6) +#define ASM_WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6) /* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */ static __always_inline void wrmsrns(u32 msr, u64 val) { /* * WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant * DS prefix to avoid a trailing NOP.
*/ - asm volatile("1: " ALTERNATIVE("ds wrmsr", WRMSRNS, X86_FEATURE_WRMSRNS) + asm volatile("1: " ALTERNATIVE("ds wrmsr", ASM_WRMSRNS, X86_FEATURE_WRMSRNS) "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))); } -- cgit v1.2.3 From ead4dac16de22081956f7af1c795bfbebd2d5866 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 26 Feb 2025 17:01:11 -0800 Subject: KVM: x86: Advertise support for WRMSRNS Advertise support for WRMSRNS (WRMSR non-serializing) to userspace if the instruction is supported by the underlying CPU. From a virtualization perspective, the only difference between WRMSRNS and WRMSR is that VM-Exits due to WRMSRNS set EXIT_QUALIFICATION to '1'. WRMSRNS doesn't require a new enabling control, shares the same basic exit reason, and behaves the same as WRMSR with respect to MSR interception. WRMSR and WRMSRNS use the same basic exit reason (see Appendix C). For WRMSR, the exit qualification is 0, while for WRMSRNS it is 1. Don't do anything different when emulating WRMSRNS vs. WRMSR, as KVM can't do anything less, i.e. can't make emulation non-serializing. The motivation for the guest to use WRMSRNS instead of WRMSR is to avoid immediately serializing the CPU when the necessary serialization is guaranteed by some other mechanism, i.e. WRMSRNS being fully serializing isn't guest-visible, just less performant. Suggested-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20250227010111.3222742-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index f678cbda6ebf..3e7a8d5813c2 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -978,6 +978,7 @@ void kvm_set_cpu_caps(void) F(FZRM), F(FSRS), F(FSRC), + F(WRMSRNS), F(AMX_FP16), F(AVX_IFMA), F(LAM), -- cgit v1.2.3 From f804dc6aa20f2ce504456ffbaafc95db646c211b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 24 Mar 2025 13:53:39 +0300 Subject: KVM: x86: clean up a return Returning a literal X86EMUL_CONTINUE is slightly clearer than returning rc. Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/7604cbbf-15e6-45a8-afec-cf5be46c2924@stanley.mountain Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f6ce044b090a..e31dc57e1afa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8021,7 +8021,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, return rc; if (!vcpu->mmio_nr_fragments) - return rc; + return X86EMUL_CONTINUE; gpa = vcpu->mmio_fragments[0].gpa; -- cgit v1.2.3 From 49c140d5af127ef4faf19f06a89a0714edf0316f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 24 Mar 2025 17:06:17 +0100 Subject: KVM: x86: Sort CPUID_8000_0021_EAX leaf bits properly WRMSR_XX_BASE_NS is bit 1 so put it there, add some new bits as comments only. 
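An aside on the WRMSRNS -> ASM_WRMSRNS rename a few commits up: the token-pasting failure can be reproduced in isolation. In this sketch, PASTE() and WRAP() are hypothetical stand-ins for KVM's layered feature macros, and the opcode macro body is simplified to a bare string literal:

#define WRMSRNS ".byte 0x0f,0x01,0xc6"		/* simplified old opcode macro */
#define X86_FEATURE_WRMSRNS 123			/* stand-in for the real feature bit */

#define PASTE(name)	X86_FEATURE_##name	/* '##' suppresses expansion of 'name' */
#define WRAP(name)	PASTE(name)		/* ...but this layer expands 'name' first */

int a = PASTE(WRMSRNS);	/* OK: pastes to the identifier X86_FEATURE_WRMSRNS */
/*
 * int b = WRAP(WRMSRNS);
 *
 * Fails: the extra macro layer expands WRMSRNS to the string literal
 * before PASTE() runs, so the preprocessor tries to paste
 * "X86_FEATURE_" and a string literal -- exactly the "does not give a
 * valid preprocessing token" error quoted above. Renaming the opcode
 * macro removes the collision without any #undef games.
 */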
Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250324160617.15379-1-bp@kernel.org [sean: skip the FSRS/FSRC placeholders to avoid confusion] Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3e7a8d5813c2..5d4de19b7c22 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1152,6 +1152,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_init(CPUID_8000_0021_EAX, F(NO_NESTED_DATA_BP), + F(WRMSR_XX_BASE_NS), /* * Synthesize "LFENCE is serializing" into the AMD-defined entry * in KVM's supported CPUID, i.e. if the feature is reported as * SYNTHESIZED_F(LFENCE_RDTSC), /* SmmPgCfgLock */ F(NULL_SEL_CLR_BASE), + /* UpperAddressIgnore */ F(AUTOIBRS), EMULATED_F(NO_SMM_CTL_MSR), /* PrefetchCtlMsr */ - F(WRMSR_XX_BASE_NS), + /* GpOnUserCpuid */ + /* EPSF */ SYNTHESIZED_F(SBPB), SYNTHESIZED_F(IBPB_BRTYPE), SYNTHESIZED_F(SRSO_NO), -- cgit v1.2.3 From d88bb2ded2efdc3857f0706da097261523aa78b2 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Tue, 8 Apr 2025 17:57:09 -0500 Subject: KVM: x86: Advertise support for AMD's PREFETCHI The latest AMD platform has introduced a new instruction called PREFETCHI. This instruction loads a cache line from a specified memory address into the indicated data or instruction cache level, based on locality reference hints. Feature bit definition: CPUID_Fn80000021_EAX [bit 20] - Indicates support for IC prefetch. This feature is analogous to Intel's PREFETCHITI (CPUID.(EAX=7,ECX=1):EDX), though the CPUID bit definitions differ between AMD and Intel. Advertise support to userspace, as no additional enabling is necessary (PREFETCHI can't be intercepted as there's no instruction specific behavior that needs to be virtualized). The feature is documented in Processor Programming Reference (PPR) for AMD Family 1Ah Model 02h, Revision C1 (Link below).
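As a hedged sketch (not from the patch): kernel-side code could read the raw leaf directly, though the proper interface is the X86_FEATURE_PREFETCHI bit defined below.

/* CPUID_Fn80000021_EAX[bit 20] == IC prefetch support, per the PPR. */
static bool cpu_has_ic_prefetch(void)
{
	return cpuid_eax(0x80000021) & BIT(20);
}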
Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 Signed-off-by: Babu Moger Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/ee1c08fc400bb574a2b8f2c6a0bd9def10a29d35.1744130533.git.babu.moger@amd.com [sean: rewrite shortlog to highlight the KVM functionality] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kvm/cpuid.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 97af896de2ce..389d8a6b96bb 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -458,6 +458,7 @@ #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */ #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */ +#define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */ #define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5d4de19b7c22..41df8729c167 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1168,6 +1168,7 @@ void kvm_set_cpu_caps(void) F(NULL_SEL_CLR_BASE), /* UpperAddressIgnore */ F(AUTOIBRS), + F(PREFETCHI), EMULATED_F(NO_SMM_CTL_MSR), /* PrefetchCtlMsr */ /* GpOnUserCpuid */ -- cgit v1.2.3 From b1f7723a5a5b018f4bc3fb8e234510be7c44ad00 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Mar 2025 17:33:33 -0800 Subject: KVM: x86: Isolate edge vs. level check in userspace I/O APIC route scanning Extract and isolate the trigger mode check in kvm_scan_ioapic_routes() in anticipation of moving destination matching logic to a common helper (for userspace vs. in-kernel I/O APIC emulation). No functional change intended. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20250304013335.4155703-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/irq_comm.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8136695f7b96..866f84392797 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -424,10 +424,12 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm_set_msi_irq(vcpu->kvm, entry, &irq); - if (irq.trig_mode && - (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, - irq.dest_id, irq.dest_mode) || - kvm_apic_pending_eoi(vcpu, irq.vector))) + if (!irq.trig_mode) + continue; + + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, + irq.dest_id, irq.dest_mode) || + kvm_apic_pending_eoi(vcpu, irq.vector)) __set_bit(irq.vector, ioapic_handled_vectors); } } -- cgit v1.2.3 From c2207bbc0c0f4b6ae8dbb73ec26e17aa0c45a3ca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Mar 2025 17:33:34 -0800 Subject: KVM: x86: Add a helper to deduplicate I/O APIC EOI interception logic Extract the vCPU specific EOI interception logic for I/O APIC emulation into a common helper for userspace and in-kernel emulation in anticipation of optimizing the "pending EOI" case. No functional change intended. 
Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20250304013335.4155703-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/ioapic.c | 7 ++----- arch/x86/kvm/ioapic.h | 2 ++ arch/x86/kvm/irq_comm.c | 21 +++++++++++++++++---- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 995eb5054360..45dae2d5d2f1 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -296,11 +296,8 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) index == RTC_GSI) { u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); - if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, - e->fields.dest_id, dm) || - kvm_apic_pending_eoi(vcpu, e->fields.vector)) - __set_bit(e->fields.vector, - ioapic_handled_vectors); + kvm_scan_ioapic_irq(vcpu, e->fields.dest_id, dm, + e->fields.vector, ioapic_handled_vectors); } } spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 539333ac4b38..aa8cb4ac0479 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -120,4 +120,6 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); +void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, + u8 vector, unsigned long *ioapic_handled_vectors); #endif diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 866f84392797..14590d9c4a37 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -402,6 +402,21 @@ void kvm_arch_post_irq_routing_update(struct kvm *kvm) kvm_make_scan_ioapic_request(kvm); } +void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, + u8 vector, unsigned long *ioapic_handled_vectors) +{ + /* + * Intercept EOI if the vCPU is the target of the new IRQ routing, or + * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU + * may receive a level-triggered IRQ in the future, or already received + * level-triggered IRQ. The EOI needs to be intercepted and forwarded + * to I/O APIC emulation so that the IRQ can be de-asserted. + */ + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode) || + kvm_apic_pending_eoi(vcpu, vector)) + __set_bit(vector, ioapic_handled_vectors); +} + void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) { @@ -427,10 +442,8 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, if (!irq.trig_mode) continue; - if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, - irq.dest_id, irq.dest_mode) || - kvm_apic_pending_eoi(vcpu, irq.vector)) - __set_bit(irq.vector, ioapic_handled_vectors); + kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode, + irq.vector, ioapic_handled_vectors); } } srcu_read_unlock(&kvm->irq_srcu, idx); -- cgit v1.2.3 From 87e4951e250bbccac47a8822f6f023a1de8b96ec Mon Sep 17 00:00:00 2001 From: weizijie Date: Mon, 3 Mar 2025 17:33:35 -0800 Subject: KVM: x86: Rescan I/O APIC routes after EOI interception for old routing Rescan I/O APIC routes for a vCPU after handling an intercepted I/O APIC EOI for an IRQ that is not targeting said vCPU, i.e. after handling what's effectively a stale EOI VM-Exit. If a level-triggered IRQ is in-flight when IRQ routing changes, e.g. because the guest changes routing from its IRQ handler, then KVM intercepts EOIs on both the new and old target vCPUs, so that the in-flight IRQ can be de-asserted when it's EOI'd. 
However, only the EOI for the in-flight IRQ needs to be intercepted, as IRQs on the same vector with the new routing are coincidental, i.e. occur only if the guest is reusing the vector for multiple interrupt sources. If the I/O APIC routes aren't rescanned, KVM will unnecessarily intercept EOIs for the vector and negatively impact the vCPU's interrupt performance. Note, both commit db2bdcbbbd32 ("KVM: x86: fix edge EOI and IOAPIC reconfig race") and commit 0fc5a36dd6b3 ("KVM: x86: ioapic: Fix level-triggered EOI and IOAPIC reconfigure race") mentioned this issue, but it was considered a "rare" occurrence thus was not addressed. However, in real environments, this issue can happen even in a well-behaved guest. Cc: Kai Huang Co-developed-by: xuyun Signed-off-by: xuyun Signed-off-by: weizijie [sean: massage changelog and comments, use int/-1, reset at scan] Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20250304013335.4155703-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/irq_comm.c | 16 ++++++++++++++-- arch/x86/kvm/lapic.c | 8 ++++++++ arch/x86/kvm/x86.c | 1 + 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6c06f3d6e081..89102b65827f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1034,6 +1034,7 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; + int highest_stale_pending_ioapic_eoi; /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 14590d9c4a37..d6d792b5d1bd 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -412,9 +412,21 @@ void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, * level-triggered IRQ. The EOI needs to be intercepted and forwarded * to I/O APIC emulation so that the IRQ can be de-asserted. */ - if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode) || - kvm_apic_pending_eoi(vcpu, vector)) + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) { __set_bit(vector, ioapic_handled_vectors); + } else if (kvm_apic_pending_eoi(vcpu, vector)) { + __set_bit(vector, ioapic_handled_vectors); + + /* + * Track the highest pending EOI for which the vCPU is NOT the + * target in the new routing. Only the EOI for the IRQ that is + * in-flight (for the old routing) needs to be intercepted, any + * future IRQs that arrive on this vCPU will be coincidental to + * the level-triggered routing and don't need to be intercepted. + */ + if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi) + vcpu->arch.highest_stale_pending_ioapic_eoi = vector; + } } void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c9de81cc27e1..3defbe520aed 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1459,6 +1459,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) if (!kvm_ioapic_handles_vector(apic, vector)) return; + /* + * If the intercepted EOI is for an IRQ that was pending from previous + * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely + * no longer need to be intercepted. + */ + if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector) + kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu); + /* Request a KVM exit to inform the userspace IOAPIC.
*/ if (irqchip_split(apic->vcpu->kvm)) { apic->vcpu->arch.pending_ioapic_eoi = vector; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e31dc57e1afa..e15af894f414 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10692,6 +10692,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) return; bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); + vcpu->arch.highest_stale_pending_ioapic_eoi = -1; kvm_x86_call(sync_pir_to_irr)(vcpu); -- cgit v1.2.3 From c364baad3e4f114284581c35d4b9006d59d2629a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 09:18:02 -0700 Subject: KVM: VMX: Don't send UNBLOCK when starting device assignment without APICv When starting device assignment, i.e. potential IRQ bypass, don't blast KVM_REQ_UNBLOCK if APICv is disabled/unsupported. There is no need to wake vCPUs if they can never use VT-d posted IRQs (sending UNBLOCK guards against races between vCPUs blocking and devices starting IRQ bypass). Opportunistically use kvm_arch_has_irq_bypass() for all relevant checks in the VMX Posted Interrupt code so that all checks in KVM x86 incorporate the same information (once AMD/AVIC is given similar treatment). Cc: Yosry Ahmed Link: https://lore.kernel.org/r/20250401161804.842968-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/posted_intr.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 99d1d599ff8c..4cf5bf5b33d7 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -148,9 +148,8 @@ after_clear_sn: static bool vmx_can_use_vtd_pi(struct kvm *kvm) { - return irqchip_in_kernel(kvm) && enable_apicv && - kvm_arch_has_assigned_device(kvm) && - irq_remapping_cap(IRQ_POSTING_CAP); + return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() && + kvm_arch_has_assigned_device(kvm); } /* @@ -281,7 +280,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) */ void vmx_pi_start_assignment(struct kvm *kvm) { - if (!irq_remapping_cap(IRQ_POSTING_CAP)) + if (!kvm_arch_has_irq_bypass()) return; kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); -- cgit v1.2.3 From 459074cff66f77af3f327e2c1f9256cdb146d798 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 09:18:04 -0700 Subject: KVM: x86: Add module param to control and enumerate device posted IRQs Add a module param to each KVM vendor module to allow disabling device posted interrupts without having to sacrifice all of APICv/AVIC, and to also effectively enumerate to userspace whether or not KVM may be utilizing device posted IRQs. Disabling device posted interrupts is very desirable for testing, and can even be desirable for production environments, e.g. if the host kernel wants to interpose on device interrupts. Put the module param in kvm-{amd,intel}.ko instead of kvm.ko to match the overall APICv/AVIC controls, and to avoid complications with said controls. E.g. if the param is in kvm.ko, KVM needs to snapshot the original user-defined value to play nice with a vendor module being reloaded with different enable_apicv settings.
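A usage sketch for the new knob (hypothetical host session; the parameter name and its 0444 permissions come from the diff below, which also makes the effective value visible via sysfs):

# Disable device posted IRQs while keeping APICv/AVIC:
modprobe kvm-amd enable_device_posted_irqs=0
# or, on Intel:
modprobe kvm-intel enable_device_posted_irqs=0

# Read back the effective value after load:
cat /sys/module/kvm_amd/parameters/enable_device_posted_irqs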
Link: https://lore.kernel.org/r/20250401161804.842968-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/x86.c | 6 ++++++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 89102b65827f..1609cc46323a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1942,6 +1942,7 @@ struct kvm_arch_async_pf { extern u32 __read_mostly kvm_nr_uret_msrs; extern bool __read_mostly allow_smaller_maxphyaddr; extern bool __read_mostly enable_apicv; +extern bool __read_mostly enable_device_posted_irqs; extern struct kvm_x86_ops kvm_x86_ops; #define kvm_x86_call(func) static_call(kvm_x86_##func) @@ -2445,7 +2446,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); static inline bool kvm_arch_has_irq_bypass(void) { - return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); + return enable_device_posted_irqs; } #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cc1c721ba067..8eb482ca3359 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -231,6 +231,8 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ef2d7208dd20..ba08a7edc712 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -116,6 +116,8 @@ module_param(enable_apicv, bool, 0444); bool __read_mostly enable_ipiv = true; module_param(enable_ipiv, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e15af894f414..f35f4456e388 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -226,6 +226,9 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); bool __read_mostly enable_apicv = true; EXPORT_SYMBOL_GPL(enable_apicv); +bool __read_mostly enable_device_posted_irqs = true; +EXPORT_SYMBOL_GPL(enable_device_posted_irqs); + const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), STATS_DESC_COUNTER(VM, mmu_shadow_zapped), @@ -9809,6 +9812,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (r != 0) goto out_mmu_exit; + enable_device_posted_irqs &= enable_apicv && + irq_remapping_cap(IRQ_POSTING_CAP); + kvm_ops_update(ops); for_each_online_cpu(cpu) { -- cgit v1.2.3 From 1bee4838eb3a2c689f23c7170ea66ae87ea7d93a Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 29 Apr 2025 08:32:15 -0700 Subject: KVM: SVM: Clear current_vmcb during vCPU free for all *possible* CPUs When freeing a vCPU and thus its VMCB, clear current_vmcb for all possible CPUs, not just online CPUs, as it's theoretically possible a CPU could go offline and come back online in conjunction with KVM reusing the page for a new VMCB. 
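A note on the pattern in the diff below: cmpxchg() clears a CPU's tracking entry only if it still points at the VMCB being freed, so entries that have since moved on to a different VMCB are left intact.

	/* Clear 'vmcb' from every possible CPU's tracking; entries that
	 * already point elsewhere are deliberately left untouched. */
	for_each_possible_cpu(i)
		cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);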
Link: https://lore.kernel.org/all/20250320013759.3965869-1-yosry.ahmed@linux.dev Fixes: fd65d3142f73 ("kvm: svm: Ensure an IBPB on all affected CPUs when freeing a vmcb") Cc: stable@vger.kernel.org Cc: Jim Mattson Signed-off-by: Yosry Ahmed [sean: split to separate patch, write changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8eb482ca3359..e6802e33c54d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1496,7 +1496,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb) { int i; - for_each_online_cpu(i) + for_each_possible_cpu(i) cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); } -- cgit v1.2.3 From 54a1a24fea1997861622320eada7de65e0ceb614 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 29 Apr 2025 08:37:19 -0700 Subject: KVM: x86: Unify cross-vCPU IBPB Both SVM and VMX have similar implementations for executing an IBPB between running different vCPUs on the same CPU to create separate prediction domains for different vCPUs. For VMX, when the currently loaded VMCS is changed in vmx_vcpu_load_vmcs(), an IBPB is executed if there is no 'buddy', which is the case on vCPU load. The intention is to execute an IBPB when switching vCPUs, but not when switching the VMCS within the same vCPU. Executing an IBPB on nested transitions within the same vCPU is handled separately and conditionally in nested_vmx_vmexit(). For SVM, the current VMCB is tracked on vCPU load and an IBPB is executed when it is changed. The intention is also to execute an IBPB when switching vCPUs, although it is possible that in some cases an IBPB is executed when switching VMCBs for the same vCPU. Executing an IBPB on nested transitions should be handled separately, and is proposed at [1]. Unify the logic by tracking the last loaded vCPU and executing the IBPB on vCPU change in kvm_arch_vcpu_load() instead. When a vCPU is destroyed, make sure all references to it are removed from any CPU. This is similar to how SVM clears the current_vmcb tracking on vCPU destruction. Remove the current VMCB tracking in SVM as it is no longer required, as well as the 'buddy' parameter to vmx_vcpu_load_vmcs(). [1] https://lore.kernel.org/lkml/20250221163352.3818347-4-yosry.ahmed@linux.dev Link: https://lore.kernel.org/all/20250320013759.3965869-1-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed [sean: tweak comment to stay at/under 80 columns] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 25 ------------------------- arch/x86/kvm/svm/svm.h | 2 -- arch/x86/kvm/vmx/nested.c | 6 +++--- arch/x86/kvm/vmx/vmx.c | 16 ++--------------- arch/x86/kvm/vmx/vmx.h | 3 +-- arch/x86/kvm/x86.c | 20 +++++++++++++++++++- 6 files changed, 25 insertions(+), 47 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e6802e33c54d..7d5f5ea9794f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1492,25 +1492,10 @@ out: return err; } -static void svm_clear_current_vmcb(struct vmcb *vmcb) -{ - int i; - - for_each_possible_cpu(i) - cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); -} - static void svm_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So, ensure that no logical CPU has this - * vmcb page recorded as its current vmcb.
- */ - svm_clear_current_vmcb(svm->vmcb); - svm_leave_nested(vcpu); svm_free_nested(svm); @@ -1562,19 +1547,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); - if (sd->current_vmcb != svm->vmcb) { - sd->current_vmcb = svm->vmcb; - - if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) && - static_branch_likely(&switch_vcpu_ibpb)) - indirect_branch_prediction_barrier(); - } if (kvm_vcpu_apicv_active(vcpu)) avic_vcpu_load(vcpu, cpu); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index d4490eaed55d..c4584cee8f71 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -338,8 +338,6 @@ struct svm_cpu_data { struct vmcb *save_area; unsigned long save_area_pa; - struct vmcb *current_vmcb; - /* index = sev_asid, value = vmcb pointer */ struct vmcb **sev_vmcbs; }; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ebeb183270cf..4b3fe437a5a1 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -301,7 +301,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) cpu = get_cpu(); prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; - vmx_vcpu_load_vmcs(vcpu, cpu, prev); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); @@ -4520,12 +4520,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, cpu = get_cpu(); vmx->loaded_vmcs = &vmx->nested.vmcs02; - vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); + vmx_vcpu_load_vmcs(vcpu, cpu); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); + vmx_vcpu_load_vmcs(vcpu, cpu); put_cpu(); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ba08a7edc712..914d69c0bd9d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1445,8 +1445,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, - struct loaded_vmcs *buddy) +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; @@ -1473,17 +1472,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, if (prev != vmx->loaded_vmcs->vmcs) { per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; vmcs_load(vmx->loaded_vmcs->vmcs); - - /* - * No indirect branch prediction barrier needed when switching - * the active VMCS within a vCPU, unless IBRS is advertised to - * the vCPU. To minimize the number of IBPBs executed, KVM - * performs IBPB on nested VM-Exit (a single nested transition - * may switch the active VMCS multiple times). 
- */ - if (static_branch_likely(&switch_vcpu_ibpb) && - (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))) - indirect_branch_prediction_barrier(); } if (!already_loaded) { @@ -1522,7 +1510,7 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); - vmx_vcpu_load_vmcs(vcpu, cpu, NULL); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_vcpu_pi_load(vcpu, cpu); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 6d1e40ecc024..b5758c33c60f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -354,8 +354,7 @@ static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) return vt->exit_intr_info; } -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, - struct loaded_vmcs *buddy); +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu); int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f35f4456e388..b45e9d6b5261 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4991,6 +4991,8 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) return kvm_arch_has_noncoherent_dma(vcpu->kvm); } +static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu); + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -5013,6 +5015,19 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_call(vcpu_load)(vcpu, cpu); + if (vcpu != per_cpu(last_vcpu, cpu)) { + /* + * Flush the branch predictor when switching vCPUs on the same + * physical CPU, as each vCPU needs its own branch prediction + * domain. No IBPB is needed when switching between L1 and L2 + * on the same vCPU unless IBRS is advertised to the vCPU; that + * is handled on the nested VM-Exit path. + */ + if (static_branch_likely(&switch_vcpu_ibpb)) + indirect_branch_prediction_barrier(); + per_cpu(last_vcpu, cpu) = vcpu; + } + /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); @@ -12424,13 +12439,16 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - int idx; + int idx, cpu; kvm_clear_async_pf_completion_queue(vcpu); kvm_mmu_unload(vcpu); kvmclock_reset(vcpu); + for_each_possible_cpu(cpu) + cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL); + kvm_x86_call(vcpu_free)(vcpu); kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); -- cgit v1.2.3 From 37d8bad41d2b0a7d269affb85979a8e4114e177a Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 6 May 2025 09:22:51 +0800 Subject: KVM: Remove obsolete comment about locking for kvm_io_bus_read/write Nobody is actually calling these functions with slots_lock held, so the comments are no longer needed. The srcu_dereference() in kvm_io_bus_read/write() precisely communicates both what is being protected, and what provides the protection.
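For reference, a condensed sketch of the read-side pattern the message describes (based on kvm_io_bus_read() in virt/kvm/kvm_main.c; the kvm_io_range setup and the actual dispatch are elided):

int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
		    int len, void *val)
{
	struct kvm_io_bus *bus;

	/*
	 * Readers run inside an SRCU read-side critical section on
	 * kvm->srcu, not under kvm->slots_lock; srcu_dereference() names
	 * both the protected pointer and what protects it, and is
	 * checked (unlike a comment).
	 */
	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
	if (!bus)
		return -ENOMEM;

	/* ... build a kvm_io_range from addr/len and dispatch ... */
}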
Suggested-by: Sean Christopherson Signed-off-by: Li RongQing Link: https://lore.kernel.org/r/20250506012251.2613-1-lirongqing@baidu.com Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 69782df3617f..3a0b1486da31 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5765,7 +5765,6 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -/* kvm_io_bus_write - called under kvm->slots_lock */ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { @@ -5786,7 +5785,6 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, } EXPORT_SYMBOL_GPL(kvm_io_bus_write); -/* kvm_io_bus_write_cookie - called under kvm->slots_lock */ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie) { @@ -5836,7 +5834,6 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -/* kvm_io_bus_read - called under kvm->slots_lock */ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val) { -- cgit v1.2.3