summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2026-05-29 19:28:16 +0200
committerPaolo Bonzini <pbonzini@redhat.com>2026-05-29 19:28:16 +0200
commitb3978970165729eea7a80d10d52f17cc495672cd (patch)
treeadfa8ff418b8be54595f53d62f89603ce700c272
parente7ae89a0c97ce2b68b0983cd01eda67cf373517d (diff)
parenta9e18aa3263f356edae305e29830e5fe63d8597a (diff)
Merge tag 'kvm-x86-fixes-7.1-rc6' of https://github.com/kvm-x86/linux into HEAD
KVM x86 fixes for 7.1-rcN - Include the kernel's linux/mman.h in KVM selftests to ensure MADV_COLLAPSE is defined, as older libc versions may not provide it. - Include execinfo.h if and only if KVM selftests are building against glibc, and provide a test_dump_stack() for non-glibc builds. - Fudge around an RCU splat in the emegerncy reboot code that is technically a legitimate flaw, but in practice is a non-issue and fixing the flaw, e.g. by adding locking, would incur meaningful risk, i.e. do more harm than good. - Rate-limit global clock updates once again (but without delayed work), as KVM was subtly relying on the old rate-limiting for NPT correction to guard against "update storms" when running without a master clock on systems with overcommitted CPUs. - Fix a brown paper bag goof where KVM checked if ERAPS is "dirty" instead of marking it dirty when emulating INVPCID. - Flush the TLB when transitioning from xAVIC => x2AVIC to ensure the CPU TLB doesn't contain AVIC-tagged entries for the APIC base GPA.
-rw-r--r--arch/x86/include/asm/kvm_host.h1
-rw-r--r--arch/x86/kvm/svm/avic.c35
-rw-r--r--arch/x86/kvm/x86.c13
-rw-r--r--arch/x86/virt/hw.c15
-rw-r--r--tools/testing/selftests/kvm/access_tracking_perf_test.c2
-rw-r--r--tools/testing/selftests/kvm/guest_memfd_test.c2
-rw-r--r--tools/testing/selftests/kvm/include/kvm_syscalls.h10
-rw-r--r--tools/testing/selftests/kvm/include/test_util.h2
-rw-r--r--tools/testing/selftests/kvm/lib/assert.c8
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c2
-rw-r--r--tools/testing/selftests/kvm/memslot_perf_test.c2
-rw-r--r--tools/testing/selftests/kvm/s390/shared_zeropage_test.c3
-rw-r--r--tools/testing/selftests/kvm/s390/tprot.c2
-rw-r--r--tools/testing/selftests/kvm/set_memory_region_test.c2
14 files changed, 79 insertions, 20 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c470e40a00aa..f14009f25a3b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1504,6 +1504,7 @@ struct kvm_arch {
bool use_master_clock;
u64 master_kernel_ns;
u64 master_cycle_now;
+ struct ratelimit_state kvmclock_update_rs;
#ifdef CONFIG_KVM_HYPERV
struct kvm_hv hyperv;
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 993b551180fe..cdd5a6dc646f 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -207,6 +207,35 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
/*
+ * Flush the TLB when enabling (x2)AVIC and when transitioning between
+ * xAVIC and x2AVIC, as the CPU may have inserted a TLB entry for the
+ * "wrong" mapping.
+ *
+ * KVM uses a per-VM "scratch" page to back the APIC memslot, because
+ * KVM also uses per-VM page tables *and* maintains the page table (NPT
+ * or shadow page) mappings for said memslot even if one or more vCPUs
+ * have their local APIC hardware-disabled or are in x2APIC mode, i.e.
+ * even if one or more vCPUs' APIC MMIO BAR is effectively disabled.
+ *
+ * If xAVIC is fully enabled, hardware ignores the physical address in
+ * KVM's page tables, i.e. in the leaf SPTE for the APIC memslot, and
+ * instead redirects the access to the AVIC backing page, i.e. to the
+ * vCPU's virtual APIC page. If xAVIC is not enabled (APIC is either
+ * hardware-disabled or in x2APIC mode), then guest accesses will use
+ * the page table mapping verbatim, i.e. will access the per-VM scratch
+ * page, as normal memory.
+ *
+ * In both cases, the CPU is allowed to cache TLB entries for the APIC
+ * base GPA. So, KVM needs to flush the TLB when enabling xAVIC, as
+ * accesses need to be redirected to the virtual APIC page, but the TLB
+ * may contain entries pointing at the scratch page. KVM also needs to
+ * flush the TLB when enabling x2AVIC, as accesses need to go to the
+ * scratch page, but the TLB may contain entries tagged as xAVIC, i.e.
+ * entries pointing to the vCPU's virtual APIC page.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
+
+ /*
* Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
* accesses, while interrupt injection to a running vCPU can be
* achieved using AVIC doorbell. KVM disables the APIC access page
@@ -219,12 +248,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
/* Disabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, false);
} else {
- /*
- * Flush the TLB, the guest may have inserted a non-APIC
- * mapping into the TLB while AVIC was disabled.
- */
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
-
/* Enabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, true);
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c1a72d749084..0550359ed798 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5227,8 +5227,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* On a host with synchronized TSC, there is no need to update
* kvmclock on vcpu->cpu migration
*/
- if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
- kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) {
+ if (__ratelimit(&vcpu->kvm->arch.kvmclock_update_rs))
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+ else
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+
if (vcpu->cpu != cpu)
kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
vcpu->cpu = cpu;
@@ -13366,6 +13371,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
+ ratelimit_state_init(&kvm->arch.kvmclock_update_rs, HZ, 10);
+ ratelimit_set_flags(&kvm->arch.kvmclock_update_rs, RATELIMIT_MSG_ON_RELEASE);
kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -14323,7 +14330,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
* the RAP (Return Address Predicator).
*/
if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
- kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS);
kvm_invalidate_pcid(vcpu, operand.pcid);
return kvm_skip_emulated_instruction(vcpu);
diff --git a/arch/x86/virt/hw.c b/arch/x86/virt/hw.c
index f647557d38ac..7e9091c640be 100644
--- a/arch/x86/virt/hw.c
+++ b/arch/x86/virt/hw.c
@@ -49,7 +49,20 @@ static void x86_virt_invoke_kvm_emergency_callback(void)
{
cpu_emergency_virt_cb *kvm_callback;
- kvm_callback = rcu_dereference(kvm_emergency_callback);
+ /*
+ * RCU may not be watching the crashing CPU here, so rcu_dereference()
+ * triggers a suspicious-RCU-usage splat. In principle, a concurrent
+ * KVM module unload could race with this read; see commit 2baa33a8ddd6
+ * ("KVM: x86: Leave user-return notifier registered on reboot/shutdown")
+ * which notes that nothing prevents module unload during panic/reboot.
+ *
+ * However, taking a lock here would be riskier than the current race:
+ * the system is going down via NMI shootdown, and any lock could be
+ * held by an already-stopped CPU. Use rcu_dereference_raw() to silence
+ * the lockdep splat and accept the comically small remaining race;
+ * panic context inherently cannot guarantee complete correctness.
+ */
+ kvm_callback = rcu_dereference_raw(kvm_emergency_callback);
if (kvm_callback)
kvm_callback();
}
diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
index e5bbdb5bbdc3..4415c94b2866 100644
--- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
+++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
@@ -41,10 +41,10 @@
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
-#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
+#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "test_util.h"
#include "memstress.h"
diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
index 253e748c1d4a..832ef4dfb99f 100644
--- a/tools/testing/selftests/kvm/guest_memfd_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_test.c
@@ -14,10 +14,10 @@
#include <linux/bitmap.h>
#include <linux/falloc.h>
#include <linux/sizes.h>
-#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
+#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "numaif.h"
#include "test_util.h"
diff --git a/tools/testing/selftests/kvm/include/kvm_syscalls.h b/tools/testing/selftests/kvm/include/kvm_syscalls.h
index 843c9904c46f..067a4c9cf452 100644
--- a/tools/testing/selftests/kvm/include/kvm_syscalls.h
+++ b/tools/testing/selftests/kvm/include/kvm_syscalls.h
@@ -2,8 +2,18 @@
#ifndef SELFTEST_KVM_SYSCALLS_H
#define SELFTEST_KVM_SYSCALLS_H
+/*
+ * Include both the kernel and libc versions of mman.h. The kernel provides
+ * the most up-to-date flags and definitions, while libc provides the syscall
+ * wrappers tests expect.
+ */
+#include <linux/mman.h>
+
+#include <sys/mman.h>
#include <sys/syscall.h>
+#include <test_util.h>
+
#define MAP_ARGS0(m,...)
#define MAP_ARGS1(m,t,a,...) m(t,a)
#define MAP_ARGS2(m,t,a,...) m(t,a), MAP_ARGS1(m,__VA_ARGS__)
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index d9b433b834f1..a56271c237ae 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -19,9 +19,9 @@
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
-#include <sys/mman.h>
#include "kselftest.h"
+#include <linux/mman.h>
#include <linux/types.h>
#define msecs_to_usecs(msec) ((msec) * 1000ULL)
diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c
index b49690658c60..8be0d09ecf0f 100644
--- a/tools/testing/selftests/kvm/lib/assert.c
+++ b/tools/testing/selftests/kvm/lib/assert.c
@@ -6,11 +6,14 @@
*/
#include "test_util.h"
-#include <execinfo.h>
+
#include <sys/syscall.h>
#include "kselftest.h"
+#ifdef __GLIBC__
+#include <execinfo.h>
+
/* Dumps the current stack trace to stderr. */
static void __attribute__((noinline)) test_dump_stack(void);
static void test_dump_stack(void)
@@ -57,6 +60,9 @@ static void test_dump_stack(void)
system(cmd);
#pragma GCC diagnostic pop
}
+#else
+static void test_dump_stack(void) {}
+#endif
static pid_t _gettid(void)
{
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 2a76eca7029d..e08967ef7b7b 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -5,13 +5,13 @@
* Copyright (C) 2018, Google LLC.
*/
#include "test_util.h"
+#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "processor.h"
#include "ucall_common.h"
#include <assert.h>
#include <sched.h>
-#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/stat.h>
diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
index 3d02db371422..e977e979470f 100644
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -15,7 +15,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <sys/mman.h>
#include <time.h>
#include <unistd.h>
@@ -23,6 +22,7 @@
#include <linux/sizes.h>
#include <test_util.h>
+#include <kvm_syscalls.h>
#include <kvm_util.h>
#include <processor.h>
#include <ucall_common.h>
diff --git a/tools/testing/selftests/kvm/s390/shared_zeropage_test.c b/tools/testing/selftests/kvm/s390/shared_zeropage_test.c
index a9e5a01200b8..478381e6f84e 100644
--- a/tools/testing/selftests/kvm/s390/shared_zeropage_test.c
+++ b/tools/testing/selftests/kvm/s390/shared_zeropage_test.c
@@ -4,11 +4,10 @@
*
* Copyright (C) 2024, Red Hat, Inc.
*/
-#include <sys/mman.h>
-
#include <linux/fs.h>
#include "test_util.h"
+#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "kselftest.h"
#include "ucall_common.h"
diff --git a/tools/testing/selftests/kvm/s390/tprot.c b/tools/testing/selftests/kvm/s390/tprot.c
index 8054d2b178f0..d86179827a18 100644
--- a/tools/testing/selftests/kvm/s390/tprot.c
+++ b/tools/testing/selftests/kvm/s390/tprot.c
@@ -4,8 +4,8 @@
*
* Copyright IBM Corp. 2021
*/
-#include <sys/mman.h>
#include "test_util.h"
+#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "kselftest.h"
#include "ucall_common.h"
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
index 9b919a231c93..e639a9db51ee 100644
--- a/tools/testing/selftests/kvm/set_memory_region_test.c
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -8,11 +8,11 @@
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
-#include <sys/mman.h>
#include <linux/compiler.h>
#include <test_util.h>
+#include <kvm_syscalls.h>
#include <kvm_util.h>
#include <processor.h>