From 6d3790bc689de9f18fae01c21f02e7d6d425534c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 27 Apr 2026 18:25:03 -0700 Subject: KVM: selftests: Include sys/mman.h *and* linux/mman.h, via kvm_syscalls.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include both linux/mman.h (the kernel provided version) and sys/mman.h (the libc provided version) throughout KVM selftests, by way of kvm_syscalls.h (which should have been including sys/mman.h anyways). Pulling in the kernel's version fixes compilation errors with the guest_memfd test on older versions of libc due to a recent commit adding MADV_COLLAPSE testing. In file included from include/kvm_util.h:8, from guest_memfd_test.c:21: guest_memfd_test.c: In function ‘test_collapse’: guest_memfd_test.c:219:47: error: ‘MADV_COLLAPSE’ undeclared (first use in this function); did you mean ‘MADV_COLD’? 219 | TEST_ASSERT_EQ(madvise(mem, pmd_size, MADV_COLLAPSE), -1); | ^~~~~~~~~~~~~ include/test_util.h:62:16: note: in definition of macro ‘TEST_ASSERT_EQ’ 62 | typeof(a) __a = (a); \ | ^ guest_memfd_test.c:219:47: note: each undeclared identifier is reported only once for each function it appears in 219 | TEST_ASSERT_EQ(madvise(mem, pmd_size, MADV_COLLAPSE), -1); | ^~~~~~~~~~~~~ include/test_util.h:62:16: note: in definition of macro ‘TEST_ASSERT_EQ’ 62 | typeof(a) __a = (a); \ | ^ Route the includes through kvm_syscalls.h to try and avoid a future game of whack-a-mole, i.e. so that future expansion of test coverage doesn't run into the same problem. To discourage use of sys/mman.h, opportunistically include the kernel's version of mman.h in test_util.h as it only needs MAP_SHARED, i.e. only needs the full set of kernel defs, not the libc syscall wrappers. 
Fixes: 9830209b4ae8 ("KVM: selftests: Test MADV_COLLAPSE on guest_memfd") Reported-by: Rick Edgecombe Closes: https://lore.kernel.org/all/20260427204313.50741-1-rick.p.edgecombe@intel.com Link: https://patch.msgid.link/20260428012503.1213654-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/access_tracking_perf_test.c | 2 +- tools/testing/selftests/kvm/guest_memfd_test.c | 2 +- tools/testing/selftests/kvm/include/kvm_syscalls.h | 10 ++++++++++ tools/testing/selftests/kvm/include/test_util.h | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- tools/testing/selftests/kvm/memslot_perf_test.c | 2 +- tools/testing/selftests/kvm/s390/shared_zeropage_test.c | 3 +-- tools/testing/selftests/kvm/s390/tprot.c | 2 +- tools/testing/selftests/kvm/set_memory_region_test.c | 2 +- 9 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index e5bbdb5bbdc3..4415c94b2866 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -41,10 +41,10 @@ #include #include #include -#include #include #include +#include "kvm_syscalls.h" #include "kvm_util.h" #include "test_util.h" #include "memstress.h" diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 253e748c1d4a..832ef4dfb99f 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -14,10 +14,10 @@ #include #include #include -#include #include #include +#include "kvm_syscalls.h" #include "kvm_util.h" #include "numaif.h" #include "test_util.h" diff --git a/tools/testing/selftests/kvm/include/kvm_syscalls.h b/tools/testing/selftests/kvm/include/kvm_syscalls.h index 843c9904c46f..067a4c9cf452 100644 --- a/tools/testing/selftests/kvm/include/kvm_syscalls.h +++ 
b/tools/testing/selftests/kvm/include/kvm_syscalls.h @@ -2,8 +2,18 @@ #ifndef SELFTEST_KVM_SYSCALLS_H #define SELFTEST_KVM_SYSCALLS_H +/* + * Include both the kernel and libc versions of mman.h. The kernel provides + * the most up-to-date flags and definitions, while libc provides the syscall + * wrappers tests expect. + */ +#include + +#include #include +#include + #define MAP_ARGS0(m,...) #define MAP_ARGS1(m,t,a,...) m(t,a) #define MAP_ARGS2(m,t,a,...) m(t,a), MAP_ARGS1(m,__VA_ARGS__) diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index d9b433b834f1..a56271c237ae 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -19,9 +19,9 @@ #include #include #include -#include #include "kselftest.h" +#include #include #define msecs_to_usecs(msec) ((msec) * 1000ULL) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 2a76eca7029d..e08967ef7b7b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -5,13 +5,13 @@ * Copyright (C) 2018, Google LLC. 
*/ #include "test_util.h" +#include "kvm_syscalls.h" #include "kvm_util.h" #include "processor.h" #include "ucall_common.h" #include #include -#include #include #include #include diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 3d02db371422..e977e979470f 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -23,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/tools/testing/selftests/kvm/s390/shared_zeropage_test.c b/tools/testing/selftests/kvm/s390/shared_zeropage_test.c index a9e5a01200b8..478381e6f84e 100644 --- a/tools/testing/selftests/kvm/s390/shared_zeropage_test.c +++ b/tools/testing/selftests/kvm/s390/shared_zeropage_test.c @@ -4,11 +4,10 @@ * * Copyright (C) 2024, Red Hat, Inc. */ -#include - #include #include "test_util.h" +#include "kvm_syscalls.h" #include "kvm_util.h" #include "kselftest.h" #include "ucall_common.h" diff --git a/tools/testing/selftests/kvm/s390/tprot.c b/tools/testing/selftests/kvm/s390/tprot.c index 8054d2b178f0..d86179827a18 100644 --- a/tools/testing/selftests/kvm/s390/tprot.c +++ b/tools/testing/selftests/kvm/s390/tprot.c @@ -4,8 +4,8 @@ * * Copyright IBM Corp. 
2021 */ -#include #include "test_util.h" +#include "kvm_syscalls.h" #include "kvm_util.h" #include "kselftest.h" #include "ucall_common.h" diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 9b919a231c93..e639a9db51ee 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -8,11 +8,11 @@ #include #include #include -#include #include #include +#include #include #include -- cgit v1.2.3 From fff82ea9d900b6bbebc58d34b7a63789de1ad10d Mon Sep 17 00:00:00 2001 From: Mikhail Gavrilov Date: Tue, 5 May 2026 04:54:35 +0500 Subject: x86/virt: Silence RCU lockdep splat in emergency virt callback path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit x86_virt_invoke_kvm_emergency_callback() reaches rcu_dereference() through machine_crash_shutdown() with IRQs disabled but with RCU not necessarily watching the crashing CPU, which triggers a suspicious RCU usage splat on debug kernels (CONFIG_PROVE_RCU=y) during panic/kdump: WARNING: suspicious RCU usage arch/x86/virt/hw.c:52 suspicious rcu_dereference_check() usage! 
rcu_scheduler_active = 2, debug_locks = 1 1 lock held by tee/11119: #0: ffff8881fa32c440 (sb_writers#3){.+.+}-{0:0}, at: ksys_write Call Trace: dump_stack_lvl+0x84/0xd0 lockdep_rcu_suspicious.cold+0x37/0x8f x86_virt_invoke_kvm_emergency_callback+0x5f/0x70 x86_svm_emergency_disable_virtualization_cpu+0x2a/0x30 x86_virt_emergency_disable_virtualization_cpu+0x6b/0x90 native_machine_crash_shutdown+0x72/0x170 __crash_kexec+0x137/0x280 panic+0xce/0xd0 sysrq_handle_crash+0x1f/0x20 __handle_sysrq.cold+0x192/0x335 write_sysrq_trigger+0x8c/0xc0 proc_reg_write+0x1c3/0x3c0 vfs_write+0x1d0/0xf80 ksys_write+0x116/0x250 do_syscall_64+0x11c/0x1480 entry_SYSCALL_64_after_hwframe+0x76/0x7e A truly correct fix is non-trivial: the RCU usage genuinely is wrong in panic context (RCU may ignore the crashing CPU during synchronization), and a concurrent KVM module unload could in principle race with the callback read; see commit 2baa33a8ddd6 ("KVM: x86: Leave user-return notifier registered on reboot/shutdown") which notes that nothing prevents module unload during panic/reboot. However, the alternatives are worse: - smp_store_release()/smp_load_acquire() handles ordering but not liveness; the kernel still needs to keep the module text alive while the callback is in flight. - Taking a lock in the panic path is risky — any lock could be held by a CPU that has already been NMI'd to a halt. Use rcu_dereference_raw() to silence the splat and accept the vanishingly small remaining race. Panic context inherently cannot guarantee complete correctness; the goal here is to keep debug builds quiet on the kdump path so the splat doesn't obscure the actual kernel state being captured. 
Reproducible on a debug kernel (CONFIG_PROVE_LOCKING=y, CONFIG_PROVE_RCU=y) with kvm_amd or kvm_intel loaded by triggering kdump: echo c > /proc/sysrq-trigger Suggested-by: Sean Christopherson Fixes: 428afac5a8ea ("KVM: x86: Move bulk of emergency virtualizaton logic to virt subsystem") Signed-off-by: Mikhail Gavrilov Acked-by: Sean Christopherson Link: https://patch.msgid.link/20260504235435.90957-1-mikhail.v.gavrilov@gmail.com Signed-off-by: Sean Christopherson --- arch/x86/virt/hw.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/virt/hw.c b/arch/x86/virt/hw.c index f647557d38ac..7e9091c640be 100644 --- a/arch/x86/virt/hw.c +++ b/arch/x86/virt/hw.c @@ -49,7 +49,20 @@ static void x86_virt_invoke_kvm_emergency_callback(void) { cpu_emergency_virt_cb *kvm_callback; - kvm_callback = rcu_dereference(kvm_emergency_callback); + /* + * RCU may not be watching the crashing CPU here, so rcu_dereference() + * triggers a suspicious-RCU-usage splat. In principle, a concurrent + * KVM module unload could race with this read; see commit 2baa33a8ddd6 + * ("KVM: x86: Leave user-return notifier registered on reboot/shutdown") + * which notes that nothing prevents module unload during panic/reboot. + * + * However, taking a lock here would be riskier than the current race: + * the system is going down via NMI shootdown, and any lock could be + * held by an already-stopped CPU. Use rcu_dereference_raw() to silence + * the lockdep splat and accept the comically small remaining race; + * panic context inherently cannot guarantee complete correctness. 
+ */ kvm_callback = rcu_dereference_raw(kvm_emergency_callback); if (kvm_callback) kvm_callback(); } -- cgit v1.2.3 From 8fe2e698fce4a95a3ac2c25fe59832a3c22534c6 Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Thu, 9 Apr 2026 22:22:26 +0800 Subject: KVM: x86: Rate-limit global clock updates on vCPU load Commit 446fcce2a52b ("Revert "x86: kvm: rate-limit global clock updates"") dropped the rate limiting for KVM_REQ_GLOBAL_CLOCK_UPDATE. As a result, kvm_arch_vcpu_load() can queue global clock update requests every time a vCPU is scheduled when the master clock is disabled or when the vCPU is loaded for the first time. Restore the throttling with a per-VM ratelimit state and gate KVM_REQ_GLOBAL_CLOCK_UPDATE through __ratelimit(), so frequent vCPU scheduling does not generate a steady stream of redundant clock update requests. Fixes: 446fcce2a52b ("Revert "x86: kvm: rate-limit global clock updates"") Signed-off-by: Lei Chen Reported-by: Jaroslav Pulchart Closes: https://lore.kernel.org/all/CAK8fFZ5gY8_Mw2A=iZVFNVKQNrXQzVsn-HTd+Me9K6ZfmdgA+Q@mail.gmail.com/ Link: https://patch.msgid.link/20260409142226.2581-1-lei.chen@smartx.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c470e40a00aa..f14009f25a3b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1504,6 +1504,7 @@ struct kvm_arch { bool use_master_clock; u64 master_kernel_ns; u64 master_cycle_now; + struct ratelimit_state kvmclock_update_rs; #ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0a1b63c63d1a..e01d6984ed04 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5227,8 +5227,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * On a host with synchronized TSC, there is no need to update * kvmclock on 
vcpu->cpu migration */ - if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) { + if (__ratelimit(&vcpu->kvm->arch.kvmclock_update_rs)) + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + else + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + if (vcpu->cpu != cpu) kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); vcpu->cpu = cpu; @@ -13366,6 +13371,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); + ratelimit_state_init(&kvm->arch.kvmclock_update_rs, HZ, 10); + ratelimit_set_flags(&kvm->arch.kvmclock_update_rs, RATELIMIT_MSG_ON_RELEASE); kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); -- cgit v1.2.3 From 34065a5f3cf94886e59e2a8b5db00515f32d6cf2 Mon Sep 17 00:00:00 2001 From: Hisam Mehboob Date: Thu, 9 Apr 2026 20:38:47 +0500 Subject: KVM: selftests: Guard execinfo.h inclusion for non-glibc builds The backtrace() function and execinfo.h are GNU extensions available in glibc but not in non-glibc C libraries such as musl. Building KVM selftests with musl-gcc fails with: lib/assert.c:9:10: fatal error: execinfo.h: No such file or directory Fix this by guarding the inclusion of execinfo.h and the stack dumping logic under #ifdef __GLIBC__. For non-glibc builds, provide a local stub for test_dump_stack(). 
Suggested-by: Aqib Faruqui Suggested-by: Sean Christopherson Signed-off-by: Hisam Mehboob Link: https://patch.msgid.link/20260409153846.1502656-2-hisamshar@gmail.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/assert.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c index b49690658c60..8be0d09ecf0f 100644 --- a/tools/testing/selftests/kvm/lib/assert.c +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -6,11 +6,14 @@ */ #include "test_util.h" -#include + #include #include "kselftest.h" +#ifdef __GLIBC__ +#include + /* Dumps the current stack trace to stderr. */ static void __attribute__((noinline)) test_dump_stack(void); static void test_dump_stack(void) @@ -57,6 +60,9 @@ static void test_dump_stack(void) system(cmd); #pragma GCC diagnostic pop } +#else +static void test_dump_stack(void) {} +#endif static pid_t _gettid(void) { -- cgit v1.2.3 From b60621c5121c9435eda99af7dc2100f5c0f88695 Mon Sep 17 00:00:00 2001 From: Emily Ehlert Date: Mon, 18 May 2026 13:59:56 +0000 Subject: KVM: x86: Fix ERAPS RAP clear on INVPCID single-context invalidation Use kvm_register_mark_dirty() instead of kvm_register_is_dirty() to actually mark VCPU_EXREG_ERAPS as dirty when emulating INVPCID_TYPE_SINGLE_CTXT. kvm_register_is_dirty() is a read-only predicate whose return value is discarded, making the call a no-op. Without this fix, a single-context INVPCID will not trigger a RAP clear on the next VMRUN, breaking the ERAPS security guarantee. 
Fixes: db5e82496492 ("KVM: SVM: Virtualize and advertise support for ERAPS") Signed-off-by: Emily Ehlert Link: https://patch.msgid.link/20260518135956.82569-1-ehemily@amazon.de Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e01d6984ed04..108318e1b3f0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -14330,7 +14330,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) * the RAP (Return Address Predicator). */ if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) - kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS); kvm_invalidate_pcid(vcpu, operand.pcid); return kvm_skip_emulated_instruction(vcpu); -- cgit v1.2.3 From a9e18aa3263f356edae305e29830e5fe63d8597a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 May 2026 10:15:36 -0700 Subject: KVM: SVM: Flush the current TLB when transitioning from xAVIC => x2AVIC Flush the current TLB when xAVIC *or* x2AVIC is activated, as KVM is (apparently) responsible for purging TLB entries when transitioning from xAVIC to x2AVIC. The APM says a whole lot of nothing about TLB flushing with respect to (x2)AVIC, but empirical data strongly suggests hardware also does a whole lot of nothing. Failure to flush the TLB when enabling x2AVIC can lead to guest accesses to the APIC base address getting incorrectly redirected to the virtual APIC page. The flaw most visibly manifests as failures in KVM-Unit-Test's verify_disabled_apic_mmio() testcase when x2APIC is enabled (though for reasons unknown, the test only reliably fails with EFI builds). 
Fixes: 0ccf3e7cb95a ("KVM: SVM: Flush the "current" TLB when activating AVIC") Fixes: 4d1d7942e36a ("KVM: SVM: Introduce logic to (de)activate x2AVIC mode") Cc: stable@vger.kernel.org Cc: Naveen N Rao (AMD) Link: https://patch.msgid.link/20260515171536.1841645-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index adf211860949..e8bd60156941 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -206,6 +206,35 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); + /* + * Flush the TLB when enabling (x2)AVIC and when transitioning between + * xAVIC and x2AVIC, as the CPU may have inserted a TLB entry for the + * "wrong" mapping. + * + * KVM uses a per-VM "scratch" page to back the APIC memslot, because + * KVM also uses per-VM page tables *and* maintains the page table (NPT + * or shadow page) mappings for said memslot even if one or more vCPUs + * have their local APIC hardware-disabled or are in x2APIC mode, i.e. + * even if one or more vCPUs' APIC MMIO BAR is effectively disabled. + * + * If xAVIC is fully enabled, hardware ignores the physical address in + * KVM's page tables, i.e. in the leaf SPTE for the APIC memslot, and + * instead redirects the access to the AVIC backing page, i.e. to the + * vCPU's virtual APIC page. If xAVIC is not enabled (APIC is either + * hardware-disabled or in x2APIC mode), then guest accesses will use + * the page table mapping verbatim, i.e. will access the per-VM scratch + * page, as normal memory. + * + * In both cases, the CPU is allowed to cache TLB entries for the APIC + * base GPA. So, KVM needs to flush the TLB when enabling xAVIC, as + * accesses need to be redirected to the virtual APIC page, but the TLB + * may contain entries pointing at the scratch page. 
KVM also needs to + * flush the TLB when enabling x2AVIC, as accesses need to go to the + * scratch page, but the TLB may contain entries tagged as xAVIC, i.e. + * entries pointing to the vCPU's virtual APIC page. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); + /* * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR * accesses, while interrupt injection to a running vCPU can be @@ -219,12 +248,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) /* Disabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, false); } else { - /* - * Flush the TLB, the guest may have inserted a non-APIC - * mapping into the TLB while AVIC was disabled. - */ - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); - /* Enabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, true); } -- cgit v1.2.3