summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/hyperv.c6
-rw-r--r--arch/x86/kvm/mmu/mmu.c379
-rw-r--r--arch/x86/kvm/mmu/spte.c31
-rw-r--r--arch/x86/kvm/mmu/spte.h2
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h34
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c45
-rw-r--r--arch/x86/kvm/svm/nested.c10
-rw-r--r--arch/x86/kvm/svm/sev.c34
-rw-r--r--arch/x86/kvm/svm/svm.c62
-rw-r--r--arch/x86/kvm/svm/svm.h2
-rw-r--r--arch/x86/kvm/svm/vmenter.S10
-rw-r--r--arch/x86/kvm/vmx/main.c1
-rw-r--r--arch/x86/kvm/vmx/nested.c11
-rw-r--r--arch/x86/kvm/vmx/vmx.c18
-rw-r--r--arch/x86/kvm/vmx/vmx.h2
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h1
-rw-r--r--arch/x86/kvm/x86.c7
19 files changed, 461 insertions, 197 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ea2c4f21c1ca..fe8ea8c097de 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -22,6 +22,7 @@ config KVM_X86
select KVM_COMMON
select KVM_GENERIC_MMU_NOTIFIER
select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
+ select KVM_MMU_LOCKLESS_AGING
select HAVE_KVM_IRQCHIP
select HAVE_KVM_PFNCACHE
select HAVE_KVM_DIRTY_RING_TSO
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 668e8fac1f6e..f863a6f93a1a 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1767,7 +1767,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->ecx = entry->edx = 0;
if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
- entry->eax = entry->ebx;
+ entry->eax = entry->ebx = 0;
break;
}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 6a6dd5a84f22..6ebeb6cea6c0 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -2226,6 +2226,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
u32 vector;
bool all_cpus;
+ if (!lapic_in_kernel(vcpu))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
if (hc->code == HVCALL_SEND_IPI) {
if (!hc->fast) {
if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi,
@@ -2852,7 +2855,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
- ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED;
+ if (!vcpu || lapic_in_kernel(vcpu))
+ ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED;
ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
if (evmcs_ver)
ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 74c20dbb92da..63bb77ee1bb1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -501,7 +501,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
return false;
}
- if (!spte_has_volatile_bits(old_spte))
+ if (!spte_needs_atomic_update(old_spte))
__update_clear_spte_fast(sptep, new_spte);
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
@@ -524,7 +524,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
int level = sptep_to_sp(sptep)->role.level;
if (!is_shadow_present_pte(old_spte) ||
- !spte_has_volatile_bits(old_spte))
+ !spte_needs_atomic_update(old_spte))
__update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
else
old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
@@ -853,32 +853,173 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
* About rmap_head encoding:
*
* If the bit zero of rmap_head->val is clear, then it points to the only spte
- * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
+ * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct
* pte_list_desc containing more mappings.
*/
#define KVM_RMAP_MANY BIT(0)
/*
+ * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always
+ * operates with mmu_lock held for write), but rmaps can be walked without
+ * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain
+ * being zapped/dropped _while the rmap is locked_.
+ *
+ * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be
+ * done while holding mmu_lock for write. This allows a task walking rmaps
+ * without holding mmu_lock to concurrently walk the same entries as a task
+ * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify
+ * the rmaps, thus the walks are stable.
+ *
+ * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED,
+ * only the rmap chains themselves are protected. E.g. holding an rmap's lock
+ * ensures all "struct pte_list_desc" fields are stable.
+ */
+#define KVM_RMAP_LOCKED BIT(1)
+
+static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
+{
+ unsigned long old_val, new_val;
+
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * Elide the lock if the rmap is empty, as lockless walkers (read-only
+ * mode) don't need to (and can't) walk an empty rmap, nor can they add
+ * entries to the rmap. I.e. the only paths that process empty rmaps
+ * do so while holding mmu_lock for write, and are mutually exclusive.
+ */
+ old_val = atomic_long_read(&rmap_head->val);
+ if (!old_val)
+ return 0;
+
+ do {
+ /*
+ * If the rmap is locked, wait for it to be unlocked before
+ * trying acquire the lock, e.g. to avoid bouncing the cache
+ * line.
+ */
+ while (old_val & KVM_RMAP_LOCKED) {
+ cpu_relax();
+ old_val = atomic_long_read(&rmap_head->val);
+ }
+
+ /*
+ * Recheck for an empty rmap, it may have been purged by the
+ * task that held the lock.
+ */
+ if (!old_val)
+ return 0;
+
+ new_val = old_val | KVM_RMAP_LOCKED;
+ /*
+ * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
+ * from being reordered outside of the critical section created by
+ * __kvm_rmap_lock().
+ *
+ * Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
+ *
+ * For the !old_val case, no ordering is needed, as there is no rmap
+ * to walk.
+ */
+ } while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
+
+ /*
+ * Return the old value, i.e. _without_ the LOCKED bit set. It's
+ * impossible for the return value to be 0 (see above), i.e. the read-
+ * only unlock flow can't get a false positive and fail to unlock.
+ */
+ return old_val;
+}
+
+static unsigned long kvm_rmap_lock(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ return __kvm_rmap_lock(rmap_head);
+}
+
+static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
+ unsigned long val)
+{
+ KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED);
+ /*
+ * Ensure that all accesses to the rmap have completed before unlocking
+ * the rmap.
+ *
+ * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
+ */
+ atomic_long_set_release(&rmap_head->val, val);
+}
+
+static void kvm_rmap_unlock(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ unsigned long new_val)
+{
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ __kvm_rmap_unlock(rmap_head, new_val);
+}
+
+static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
+{
+ return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
+}
+
+/*
+ * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The
+ * actual locking is the same, but the caller is disallowed from modifying the
+ * rmap, and so the unlock flow is a nop if the rmap is/was empty.
+ */
+static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
+{
+ unsigned long rmap_val;
+
+ preempt_disable();
+ rmap_val = __kvm_rmap_lock(rmap_head);
+
+ if (!rmap_val)
+ preempt_enable();
+
+ return rmap_val;
+}
+
+static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
+ unsigned long old_val)
+{
+ if (!old_val)
+ return;
+
+ KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));
+
+ __kvm_rmap_unlock(rmap_head, old_val);
+ preempt_enable();
+}
+
+/*
* Returns the number of pointers in the rmap chain, not counting the new one.
*/
-static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
- struct kvm_rmap_head *rmap_head)
+static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ u64 *spte, struct kvm_rmap_head *rmap_head)
{
+ unsigned long old_val, new_val;
struct pte_list_desc *desc;
int count = 0;
- if (!rmap_head->val) {
- rmap_head->val = (unsigned long)spte;
- } else if (!(rmap_head->val & KVM_RMAP_MANY)) {
+ old_val = kvm_rmap_lock(kvm, rmap_head);
+
+ if (!old_val) {
+ new_val = (unsigned long)spte;
+ } else if (!(old_val & KVM_RMAP_MANY)) {
desc = kvm_mmu_memory_cache_alloc(cache);
- desc->sptes[0] = (u64 *)rmap_head->val;
+ desc->sptes[0] = (u64 *)old_val;
desc->sptes[1] = spte;
desc->spte_count = 2;
desc->tail_count = 0;
- rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
+ new_val = (unsigned long)desc | KVM_RMAP_MANY;
++count;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
count = desc->tail_count + desc->spte_count;
/*
@@ -887,21 +1028,25 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
*/
if (desc->spte_count == PTE_LIST_EXT) {
desc = kvm_mmu_memory_cache_alloc(cache);
- desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY);
desc->spte_count = 0;
desc->tail_count = count;
- rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
+ new_val = (unsigned long)desc | KVM_RMAP_MANY;
+ } else {
+ new_val = old_val;
}
desc->sptes[desc->spte_count++] = spte;
}
+
+ kvm_rmap_unlock(kvm, rmap_head, new_val);
+
return count;
}
-static void pte_list_desc_remove_entry(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
+static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val,
struct pte_list_desc *desc, int i)
{
- struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY);
int j = head_desc->spte_count - 1;
/*
@@ -928,9 +1073,9 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
* head at the next descriptor, i.e. the new head.
*/
if (!head_desc->more)
- rmap_head->val = 0;
+ *rmap_val = 0;
else
- rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
+ *rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
mmu_free_pte_list_desc(head_desc);
}
@@ -938,24 +1083,26 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
+ unsigned long rmap_val;
int i;
- if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
- return;
+ rmap_val = kvm_rmap_lock(kvm, rmap_head);
+ if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
+ goto out;
- if (!(rmap_head->val & KVM_RMAP_MANY)) {
- if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
- return;
+ if (!(rmap_val & KVM_RMAP_MANY)) {
+ if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm))
+ goto out;
- rmap_head->val = 0;
+ rmap_val = 0;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
- pte_list_desc_remove_entry(kvm, rmap_head,
+ pte_list_desc_remove_entry(kvm, &rmap_val,
desc, i);
- return;
+ goto out;
}
}
desc = desc->more;
@@ -963,6 +1110,9 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
}
+
+out:
+ kvm_rmap_unlock(kvm, rmap_head, rmap_val);
}
static void kvm_zap_one_rmap_spte(struct kvm *kvm,
@@ -977,17 +1127,19 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc, *next;
+ unsigned long rmap_val;
int i;
- if (!rmap_head->val)
+ rmap_val = kvm_rmap_lock(kvm, rmap_head);
+ if (!rmap_val)
return false;
- if (!(rmap_head->val & KVM_RMAP_MANY)) {
- mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
+ if (!(rmap_val & KVM_RMAP_MANY)) {
+ mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val);
goto out;
}
- desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
for (; desc; desc = next) {
for (i = 0; i < desc->spte_count; i++)
@@ -997,20 +1149,21 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
}
out:
/* rmap_head is meaningless now, remember to reset it */
- rmap_head->val = 0;
+ kvm_rmap_unlock(kvm, rmap_head, 0);
return true;
}
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
+ unsigned long rmap_val = kvm_rmap_get(rmap_head);
struct pte_list_desc *desc;
- if (!rmap_head->val)
+ if (!rmap_val)
return 0;
- else if (!(rmap_head->val & KVM_RMAP_MANY))
+ else if (!(rmap_val & KVM_RMAP_MANY))
return 1;
- desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
return desc->tail_count + desc->spte_count;
}
@@ -1053,6 +1206,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
*/
struct rmap_iterator {
/* private fields */
+ struct rmap_head *head;
struct pte_list_desc *desc; /* holds the sptep if not NULL */
int pos; /* index of the sptep */
};
@@ -1067,23 +1221,19 @@ struct rmap_iterator {
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
struct rmap_iterator *iter)
{
- u64 *sptep;
+ unsigned long rmap_val = kvm_rmap_get(rmap_head);
- if (!rmap_head->val)
+ if (!rmap_val)
return NULL;
- if (!(rmap_head->val & KVM_RMAP_MANY)) {
+ if (!(rmap_val & KVM_RMAP_MANY)) {
iter->desc = NULL;
- sptep = (u64 *)rmap_head->val;
- goto out;
+ return (u64 *)rmap_val;
}
- iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
+ iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
iter->pos = 0;
- sptep = iter->desc->sptes[iter->pos];
-out:
- BUG_ON(!is_shadow_present_pte(*sptep));
- return sptep;
+ return iter->desc->sptes[iter->pos];
}
/*
@@ -1093,14 +1243,11 @@ out:
*/
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
- u64 *sptep;
-
if (iter->desc) {
if (iter->pos < PTE_LIST_EXT - 1) {
++iter->pos;
- sptep = iter->desc->sptes[iter->pos];
- if (sptep)
- goto out;
+ if (iter->desc->sptes[iter->pos])
+ return iter->desc->sptes[iter->pos];
}
iter->desc = iter->desc->more;
@@ -1108,20 +1255,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
if (iter->desc) {
iter->pos = 0;
/* desc->sptes[0] cannot be NULL */
- sptep = iter->desc->sptes[iter->pos];
- goto out;
+ return iter->desc->sptes[iter->pos];
}
}
return NULL;
-out:
- BUG_ON(!is_shadow_present_pte(*sptep));
- return sptep;
}
-#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
- for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
- _spte_; _spte_ = rmap_get_next(_iter_))
+#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \
+ _sptep_; _sptep_ = rmap_get_next(_iter_))
+
+#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \
+
+#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \
+ __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
+ if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(sptep)))
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
@@ -1207,12 +1358,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct rmap_iterator iter;
bool flush = false;
- for_each_rmap_spte(rmap_head, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
if (spte_ad_need_write_protect(*sptep))
flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
(unsigned long *)sptep);
else
flush |= spte_clear_dirty(sptep);
+ }
return flush;
}
@@ -1401,7 +1553,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
while (++iterator->rmap <= iterator->end_rmap) {
iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
- if (iterator->rmap->val)
+ if (atomic_long_read(&iterator->rmap->val))
return;
}
@@ -1533,7 +1685,7 @@ static void __rmap_add(struct kvm *kvm,
kvm_update_page_stats(kvm, sp->role.level, 1);
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
- rmap_count = pte_list_add(cache, spte, rmap_head);
+ rmap_count = pte_list_add(kvm, cache, spte, rmap_head);
if (rmap_count > kvm->stat.max_mmu_rmap_size)
kvm->stat.max_mmu_rmap_size = rmap_count;
@@ -1552,51 +1704,67 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
}
static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
- struct kvm_gfn_range *range, bool test_only)
+ struct kvm_gfn_range *range,
+ bool test_only)
{
- struct slot_rmap_walk_iterator iterator;
+ struct kvm_rmap_head *rmap_head;
struct rmap_iterator iter;
+ unsigned long rmap_val;
bool young = false;
u64 *sptep;
+ gfn_t gfn;
+ int level;
+ u64 spte;
- for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- range->start, range->end - 1, &iterator) {
- for_each_rmap_spte(iterator.rmap, &iter, sptep) {
- u64 spte = *sptep;
+ for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+ for (gfn = range->start; gfn < range->end;
+ gfn += KVM_PAGES_PER_HPAGE(level)) {
+ rmap_head = gfn_to_rmap(gfn, level, range->slot);
+ rmap_val = kvm_rmap_lock_readonly(rmap_head);
- if (!is_accessed_spte(spte))
- continue;
+ for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
+ if (!is_accessed_spte(spte))
+ continue;
- if (test_only)
- return true;
-
- if (spte_ad_enabled(spte)) {
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)sptep);
- } else {
- /*
- * WARN if mmu_spte_update() signals the need
- * for a TLB flush, as Access tracking a SPTE
- * should never trigger an _immediate_ flush.
- */
- spte = mark_spte_for_access_track(spte);
- WARN_ON_ONCE(mmu_spte_update(sptep, spte));
+ if (test_only) {
+ kvm_rmap_unlock_readonly(rmap_head, rmap_val);
+ return true;
+ }
+
+ if (spte_ad_enabled(spte))
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ else
+ /*
+ * If the following cmpxchg fails, the
+ * spte is being concurrently modified
+ * and should most likely stay young.
+ */
+ cmpxchg64(sptep, spte,
+ mark_spte_for_access_track(spte));
+ young = true;
}
- young = true;
+
+ kvm_rmap_unlock_readonly(rmap_head, rmap_val);
}
}
return young;
}
+static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm)
+{
+ return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages);
+}
+
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
- if (kvm_memslots_have_rmaps(kvm))
- young = kvm_rmap_age_gfn_range(kvm, range, false);
-
if (tdp_mmu_enabled)
- young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
+ young = kvm_tdp_mmu_age_gfn_range(kvm, range);
+
+ if (kvm_may_have_shadow_mmu_sptes(kvm))
+ young |= kvm_rmap_age_gfn_range(kvm, range, false);
return young;
}
@@ -1605,11 +1773,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
- if (kvm_memslots_have_rmaps(kvm))
- young = kvm_rmap_age_gfn_range(kvm, range, true);
-
if (tdp_mmu_enabled)
- young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
+ young = kvm_tdp_mmu_test_age_gfn(kvm, range);
+
+ if (young)
+ return young;
+
+ if (kvm_may_have_shadow_mmu_sptes(kvm))
+ young |= kvm_rmap_age_gfn_range(kvm, range, true);
return young;
}
@@ -1656,13 +1827,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}
-static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
+static void mmu_page_add_parent_pte(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache,
struct kvm_mmu_page *sp, u64 *parent_pte)
{
if (!parent_pte)
return;
- pte_list_add(cache, parent_pte, &sp->parent_ptes);
+ pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes);
}
static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -2352,7 +2524,7 @@ static void __link_shadow_page(struct kvm *kvm,
mmu_spte_set(sptep, spte);
- mmu_page_add_parent_pte(cache, sp, sptep);
+ mmu_page_add_parent_pte(kvm, cache, sp, sptep);
/*
* The non-direct sub-pagetable must be updated before linking. For
@@ -2416,7 +2588,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
* avoids retaining a large number of stale nested SPs.
*/
if (tdp_enabled && invalid_list &&
- child->role.guest_mode && !child->parent_ptes.val)
+ child->role.guest_mode &&
+ !atomic_long_read(&child->parent_ptes.val))
return kvm_mmu_prepare_zap_page(kvm, child,
invalid_list);
}
@@ -5540,7 +5713,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
union kvm_mmu_page_role root_role;
/* NPT requires CR0.PG=1. */
- WARN_ON_ONCE(cpu_role.base.direct);
+ WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode);
root_role = cpu_role.base;
root_role.level = kvm_mmu_get_tdp_level(vcpu);
@@ -7460,7 +7633,7 @@ static bool kvm_nx_huge_page_recovery_worker(void *data)
return true;
}
-static void kvm_mmu_start_lpage_recovery(struct once *once)
+static int kvm_mmu_start_lpage_recovery(struct once *once)
{
struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once);
struct kvm *kvm = container_of(ka, struct kvm, arch);
@@ -7471,13 +7644,14 @@ static void kvm_mmu_start_lpage_recovery(struct once *once)
kvm_nx_huge_page_recovery_worker_kill,
kvm, "kvm-nx-lpage-recovery");
- if (!nx_thread)
- return;
+ if (IS_ERR(nx_thread))
+ return PTR_ERR(nx_thread);
vhost_task_start(nx_thread);
/* Make the task visible only once it is fully started. */
WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread);
+ return 0;
}
int kvm_mmu_post_init_vm(struct kvm *kvm)
@@ -7485,10 +7659,7 @@ int kvm_mmu_post_init_vm(struct kvm *kvm)
if (nx_hugepage_mitigation_hard_disabled)
return 0;
- call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
- if (!kvm->arch.nx_huge_page_recovery_thread)
- return -ENOMEM;
- return 0;
+ return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery);
}
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 22551e2f1d00..0f9f47b4ab0e 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -129,25 +129,32 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
}
/*
- * Returns true if the SPTE has bits that may be set without holding mmu_lock.
- * The caller is responsible for checking if the SPTE is shadow-present, and
- * for determining whether or not the caller cares about non-leaf SPTEs.
+ * Returns true if the SPTE needs to be updated atomically due to having bits
+ * that may be changed without holding mmu_lock, and for which KVM must not
+ * lose information. E.g. KVM must not drop Dirty bit information. The caller
+ * is responsible for checking if the SPTE is shadow-present, and for
+ * determining whether or not the caller cares about non-leaf SPTEs.
*/
-bool spte_has_volatile_bits(u64 spte)
+bool spte_needs_atomic_update(u64 spte)
{
+ /* SPTEs can be made Writable bit by KVM's fast page fault handler. */
if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
return true;
- if (is_access_track_spte(spte))
+ /*
+ * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked
+ * SPTEs can be restored by KVM's fast page fault handler.
+ */
+ if (!spte_ad_enabled(spte))
return true;
- if (spte_ad_enabled(spte)) {
- if (!(spte & shadow_accessed_mask) ||
- (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
- return true;
- }
-
- return false;
+ /*
+ * Dirty and Accessed bits can be set by the CPU. Ignore the Accessed
+ * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't
+ * invalidate TLBs when aging SPTEs, and so it's safe to clobber the
+ * Accessed bit (and rare in practice).
+ */
+ return is_writable_pte(spte) && !(spte & shadow_dirty_mask);
}
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 59746854c0af..79cdceba9857 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -519,7 +519,7 @@ static inline u64 get_mmio_spte_generation(u64 spte)
return gen;
}
-bool spte_has_volatile_bits(u64 spte);
+bool spte_needs_atomic_update(u64 spte);
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 047b78333653..364c5da6c499 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -25,6 +25,13 @@ static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
return xchg(rcu_dereference(sptep), new_spte);
}
+static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask)
+{
+ atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
+
+ return (u64)atomic64_fetch_and(~mask, sptep_atomic);
+}
+
static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
{
KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte));
@@ -32,28 +39,21 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
}
/*
- * SPTEs must be modified atomically if they are shadow-present, leaf
- * SPTEs, and have volatile bits, i.e. has bits that can be set outside
- * of mmu_lock. The Writable bit can be set by KVM's fast page fault
- * handler, and Accessed and Dirty bits can be set by the CPU.
- *
- * Note, non-leaf SPTEs do have Accessed bits and those bits are
- * technically volatile, but KVM doesn't consume the Accessed bit of
- * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
- * logic needs to be reassessed if KVM were to use non-leaf Accessed
- * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+ * SPTEs must be modified atomically if they are shadow-present, leaf SPTEs,
+ * and have volatile bits (bits that can be set outside of mmu_lock) that
+ * must not be clobbered.
*/
-static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
+static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level)
{
return is_shadow_present_pte(old_spte) &&
is_last_spte(old_spte, level) &&
- spte_has_volatile_bits(old_spte);
+ spte_needs_atomic_update(old_spte);
}
static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
u64 new_spte, int level)
{
- if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
+ if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
__kvm_tdp_mmu_write_spte(sptep, new_spte);
@@ -63,12 +63,8 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
u64 mask, int level)
{
- atomic64_t *sptep_atomic;
-
- if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
- sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
- return (u64)atomic64_fetch_and(~mask, sptep_atomic);
- }
+ if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level))
+ return tdp_mmu_clear_spte_bits_atomic(sptep, mask);
__kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
return old_spte;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 046b6ba31197..7cc0564f5f97 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -193,6 +193,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
!tdp_mmu_root_match((_root), (_types)))) { \
} else
+/*
+ * Iterate over all TDP MMU roots in an RCU read-side critical section.
+ * It is safe to iterate over the SPTEs under the root, but their values will
+ * be unstable, so all writes must be atomic. As this routine is meant to be
+ * used without holding the mmu_lock at all, any bits that are flipped must
+ * be reflected in kvm_tdp_mmu_spte_need_atomic_write().
+ */
+#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \
+ list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
+ !tdp_mmu_root_match((_root), (_types))) { \
+ } else
+
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
@@ -774,9 +787,6 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
continue; \
else
-#define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end) \
- for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
-
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
struct tdp_iter *iter)
{
@@ -1235,7 +1245,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
rcu_read_lock();
- tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
+ for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
int r;
if (fault->nx_huge_page_workaround_enabled)
@@ -1332,21 +1342,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
* from the clear_young() or clear_flush_young() notifier, which uses the
* return value to determine if the page has been accessed.
*/
-static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
+static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
{
u64 new_spte;
if (spte_ad_enabled(iter->old_spte)) {
- iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
- iter->old_spte,
- shadow_accessed_mask,
- iter->level);
+ iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
+ shadow_accessed_mask);
new_spte = iter->old_spte & ~shadow_accessed_mask;
} else {
new_spte = mark_spte_for_access_track(iter->old_spte);
- iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
- iter->old_spte, new_spte,
- iter->level);
+ /*
+ * It is safe for the following cmpxchg to fail. Leave the
+ * Accessed bit set, as the spte is most likely young anyway.
+ */
+ if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
+ return;
}
trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
@@ -1371,9 +1382,9 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
* valid roots!
*/
WARN_ON(types & ~KVM_VALID_ROOTS);
- __for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
- guard(rcu)();
+ guard(rcu)();
+ for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
if (!is_accessed_spte(iter.old_spte))
continue;
@@ -1382,7 +1393,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
return true;
ret = true;
- kvm_tdp_mmu_age_spte(&iter);
+ kvm_tdp_mmu_age_spte(kvm, &iter);
}
}
@@ -1904,7 +1915,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*root_level = vcpu->arch.mmu->root_role.level;
- tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
+ for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
@@ -1931,7 +1942,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
struct tdp_iter iter;
tdp_ptep_t sptep = NULL;
- tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
+ for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
*spte = iter.old_spte;
sptep = iter.sptep;
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index c6b79c2e1e05..834b67672d50 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -646,6 +646,11 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
u32 pause_count12;
u32 pause_thresh12;
+ nested_svm_transition_tlb_flush(vcpu);
+
+ /* Enter Guest-Mode */
+ enter_guest_mode(vcpu);
+
/*
* Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
* exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
@@ -762,11 +767,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
}
}
- nested_svm_transition_tlb_flush(vcpu);
-
- /* Enter Guest-Mode */
- enter_guest_mode(vcpu);
-
/*
* Merge guest and host intercepts - must be called with vcpu in
* guest-mode to take effect.
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 42a6d8bb1024..596167d33544 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2972,6 +2972,16 @@ void __init sev_hardware_setup(void)
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID)))
goto out;
+ /*
+ * The kernel's initcall infrastructure lacks the ability to express
+ * dependencies between initcalls, whereas the modules infrastructure
+ * automatically handles dependencies via symbol loading. Ensure the
+ * PSP SEV driver is initialized before proceeding if KVM is built-in,
+ * as the dependency isn't handled by the initcall infrastructure.
+ */
+ if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init())
+ goto out;
+
/* Retrieve SEV CPUID information */
cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
@@ -4579,6 +4589,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
{
+ struct kvm *kvm = svm->vcpu.kvm;
+
/*
* All host state for SEV-ES guests is categorized into three swap types
* based on how it is handled by hardware during a world switch:
@@ -4602,14 +4614,22 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are
/*
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
- * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
- * saves and loads debug registers (Type-A).
+ * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
+ * not save or load debug registers. Sadly, KVM can't prevent SNP
+ * guests from lying about DebugSwap on secondary vCPUs, i.e. the
+ * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
+ * the guest has actually enabled (or not!) in the VMSA.
+ *
+ * If DebugSwap is *possible*, save the masks so that they're restored
+ * if the guest enables DebugSwap. But for the DRs themselves, do NOT
+ * rely on the CPU to restore the host values; KVM will restore them as
+ * needed in common code, via hw_breakpoint_restore(). Note, KVM does
+ * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
+ * don't need to be restored per se, KVM just needs to ensure they are
+ * loaded with the correct values *if* the CPU writes the MSRs.
*/
- if (sev_vcpu_has_debug_swap(svm)) {
- hostsa->dr0 = native_get_debugreg(0);
- hostsa->dr1 = native_get_debugreg(1);
- hostsa->dr2 = native_get_debugreg(2);
- hostsa->dr3 = native_get_debugreg(3);
+ if (sev_vcpu_has_debug_swap(svm) ||
+ (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 6bd7e44380e0..5176e2f6cc14 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1991,11 +1991,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid = sd->next_asid++;
}
-static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
{
- struct vmcb *vmcb = svm->vmcb;
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
- if (svm->vcpu.arch.guest_state_protected)
+ if (vcpu->arch.guest_state_protected)
return;
if (unlikely(value != vmcb->save.dr6)) {
@@ -3165,6 +3165,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
}
+
+ /*
+ * AMD changed the architectural behavior of bits 5:2. On CPUs
+ * without BusLockTrap, bits 5:2 control "external pins", but
+ * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap
+ * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed
+ * the guest to set bits 5:2 despite not actually virtualizing
+ * Performance-Monitoring/Breakpoint external pins. Drop bits
+ * 5:2 for backwards compatibility.
+ */
+ data &= ~GENMASK(5, 2);
+
+ /*
+ * Suppress BTF as KVM doesn't virtualize BTF, but there's no
+ * way to communicate lack of support to the guest.
+ */
+ if (data & DEBUGCTLMSR_BTF) {
+ kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
+ data &= ~DEBUGCTLMSR_BTF;
+ }
+
if (data & DEBUGCTL_RESERVED_BITS)
return 1;
@@ -4192,6 +4213,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_enter_irqoff();
+ /*
+ * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
+ * VMRUN controls whether or not physical IRQs are masked (KVM always
+ * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the
+ * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
+ * into guest state if delivery of an event during VMRUN triggers a
+ * #VMEXIT, and the guest_state transitions already tell lockdep that
+ * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of
+ * this path, so IRQs aren't actually unmasked while running host code.
+ */
+ raw_local_irq_enable();
+
amd_clear_divider();
if (sev_es_guest(vcpu->kvm))
@@ -4200,6 +4233,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
else
__svm_vcpu_run(svm, spec_ctrl_intercepted);
+ raw_local_irq_disable();
+
guest_state_exit_irqoff();
}
@@ -4250,14 +4285,22 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
* Run with all-zero DR6 unless needed, so that we can get the exact cause
* of a #DB.
*/
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- svm_set_dr6(svm, vcpu->arch.dr6);
- else
- svm_set_dr6(svm, DR6_ACTIVE_LOW);
+ if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
kvm_load_guest_xsave_state(vcpu);
+ /*
+ * Hardware only context switches DEBUGCTL if LBR virtualization is
+ * enabled. Manually load DEBUGCTL if necessary (and restore it after
+ * VM-Exit), as running with the host's DEBUGCTL can negatively affect
+ * guest state and can even be fatal, e.g. due to Bus Lock Detect.
+ */
+ if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+ vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+ update_debugctlmsr(svm->vmcb->save.dbgctl);
+
kvm_wait_lapic_expire(vcpu);
/*
@@ -4285,6 +4328,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
+ vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
+
kvm_load_host_xsave_state(vcpu);
stgi();
@@ -5046,6 +5093,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
+ .set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 9d7cdb8fbf87..ea44c1da5a7c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -584,7 +584,7 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
/* svm.c */
#define MSR_INVALID 0xffffffffU
-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)
extern bool dump_invalid_vmcb;
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 2ed80aea3bb1..0c61153b275f 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run)
mov VCPU_RDI(%_ASM_DI), %_ASM_DI
/* Enter guest mode */
- sti
-
3: vmrun %_ASM_AX
4:
- cli
-
/* Pop @svm to RAX while it's the only available register. */
pop %_ASM_AX
@@ -340,12 +336,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
mov KVM_VMCB_pa(%rax), %rax
/* Enter guest mode */
- sti
-
1: vmrun %rax
-
-2: cli
-
+2:
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 2427f918e763..43ee9ed11291 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt,
+ .set_dr6 = vmx_set_dr6,
.set_dr7 = vmx_set_dr7,
.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
.cache_reg = vmx_cache_reg,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index edc2d2039508..d06e50d9c0e7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5086,6 +5086,17 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
load_vmcs12_host_state(vcpu, vmcs12);
+ /*
+ * Process events if an injectable IRQ or NMI is pending, even
+ * if the event is blocked (RFLAGS.IF is cleared on VM-Exit).
+ * If an event became pending while L2 was active, KVM needs to
+ * either inject the event or request an IRQ/NMI window. SMIs
+ * don't need to be processed as SMM is mutually exclusive with
+ * non-root mode. INIT/SIPI don't need to be checked as INIT
+ * is blocked post-VMXON, and SIPIs are ignored.
+ */
+ if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 374f5a2ca38e..1e30adc7837e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1514,16 +1514,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
*/
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
vmx_vcpu_pi_load(vcpu, cpu);
-
- vmx->host_debugctlmsr = get_debugctlmsr();
}
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
@@ -5648,6 +5644,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ lockdep_assert_irqs_disabled();
+ set_debugreg(vcpu->arch.dr6, 6);
+}
+
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
@@ -7421,10 +7423,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- set_debugreg(vcpu->arch.dr6, 6);
-
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -7460,8 +7458,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
}
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
- if (vmx->host_debugctlmsr)
- update_debugctlmsr(vmx->host_debugctlmsr);
+ if (vcpu->arch.host_debugctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
#ifndef CONFIG_X86_64
/*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 8b111ce1087c..951e44dc9d0e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -340,8 +340,6 @@ struct vcpu_vmx {
/* apic deadline value in host tsc */
u64 hv_deadline_tsc;
- unsigned long host_debugctlmsr;
-
/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
* msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ce3295a67c04..430773a5ef8e 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bef63fd46d5a..82547da8faf3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10966,10 +10966,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
} else if (unlikely(hw_breakpoint_active())) {
set_debugreg(0, 7);
}
+ vcpu->arch.host_debugctl = get_debugctlmsr();
+
guest_timing_enter_irqoff();
for (;;) {
@@ -12876,11 +12881,11 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
mutex_unlock(&kvm->slots_lock);
}
kvm_unload_vcpu_mmus(kvm);
+ kvm_destroy_vcpus(kvm);
kvm_x86_call(vm_destroy)(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
- kvm_destroy_vcpus(kvm);
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);