diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/Kconfig | 1 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 379 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 31 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.h | 34 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 45 | ||||
-rw-r--r-- | arch/x86/kvm/svm/nested.c | 10 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 34 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 62 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/svm/vmenter.S | 10 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/main.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 11 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 18 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/x86_ops.h | 1 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 7 |
19 files changed, 461 insertions, 197 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ea2c4f21c1ca..fe8ea8c097de 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,6 +22,7 @@ config KVM_X86 select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER select KVM_ELIDE_TLB_FLUSH_IF_YOUNG + select KVM_MMU_LOCKLESS_AGING select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_DIRTY_RING_TSO diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 668e8fac1f6e..f863a6f93a1a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1767,7 +1767,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { - entry->eax = entry->ebx; + entry->eax = entry->ebx = 0; break; } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6a6dd5a84f22..6ebeb6cea6c0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2226,6 +2226,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) u32 vector; bool all_cpus; + if (!lapic_in_kernel(vcpu)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, @@ -2852,7 +2855,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; - ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + if (!vcpu || lapic_in_kernel(vcpu)) + ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 74c20dbb92da..63bb77ee1bb1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -501,7 +501,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) return false; } - if (!spte_has_volatile_bits(old_spte)) + if (!spte_needs_atomic_update(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); @@ -524,7 +524,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) int level = sptep_to_sp(sptep)->role.level; if (!is_shadow_present_pte(old_spte) || - !spte_has_volatile_bits(old_spte)) + !spte_needs_atomic_update(old_spte)) __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); else old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE); @@ -853,32 +853,173 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu * About rmap_head encoding: * * If the bit zero of rmap_head->val is clear, then it points to the only spte - * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct + * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct * pte_list_desc containing more mappings. */ #define KVM_RMAP_MANY BIT(0) /* + * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always + * operates with mmu_lock held for write), but rmaps can be walked without + * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain + * being zapped/dropped _while the rmap is locked_. + * + * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be + * done while holding mmu_lock for write. This allows a task walking rmaps + * without holding mmu_lock to concurrently walk the same entries as a task + * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify + * the rmaps, thus the walks are stable. + * + * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED, + * only the rmap chains themselves are protected. E.g. holding an rmap's lock + * ensures all "struct pte_list_desc" fields are stable. + */ +#define KVM_RMAP_LOCKED BIT(1) + +static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head) +{ + unsigned long old_val, new_val; + + lockdep_assert_preemption_disabled(); + + /* + * Elide the lock if the rmap is empty, as lockless walkers (read-only + * mode) don't need to (and can't) walk an empty rmap, nor can they add + * entries to the rmap. I.e. the only paths that process empty rmaps + * do so while holding mmu_lock for write, and are mutually exclusive. + */ + old_val = atomic_long_read(&rmap_head->val); + if (!old_val) + return 0; + + do { + /* + * If the rmap is locked, wait for it to be unlocked before + * trying acquire the lock, e.g. to avoid bouncing the cache + * line. + */ + while (old_val & KVM_RMAP_LOCKED) { + cpu_relax(); + old_val = atomic_long_read(&rmap_head->val); + } + + /* + * Recheck for an empty rmap, it may have been purged by the + * task that held the lock. + */ + if (!old_val) + return 0; + + new_val = old_val | KVM_RMAP_LOCKED; + /* + * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap + * from being reordered outside of the critical section created by + * __kvm_rmap_lock(). + * + * Pairs with the atomic_long_set_release() in kvm_rmap_unlock(). + * + * For the !old_val case, no ordering is needed, as there is no rmap + * to walk. + */ + } while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val)); + + /* + * Return the old value, i.e. _without_ the LOCKED bit set. It's + * impossible for the return value to be 0 (see above), i.e. the read- + * only unlock flow can't get a false positive and fail to unlock. + */ + return old_val; +} + +static unsigned long kvm_rmap_lock(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + return __kvm_rmap_lock(rmap_head); +} + +static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head, + unsigned long val) +{ + KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED); + /* + * Ensure that all accesses to the rmap have completed before unlocking + * the rmap. + * + * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock(). + */ + atomic_long_set_release(&rmap_head->val, val); +} + +static void kvm_rmap_unlock(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + unsigned long new_val) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + __kvm_rmap_unlock(rmap_head, new_val); +} + +static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head) +{ + return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED; +} + +/* + * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The + * actual locking is the same, but the caller is disallowed from modifying the + * rmap, and so the unlock flow is a nop if the rmap is/was empty. + */ +static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head) +{ + unsigned long rmap_val; + + preempt_disable(); + rmap_val = __kvm_rmap_lock(rmap_head); + + if (!rmap_val) + preempt_enable(); + + return rmap_val; +} + +static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head, + unsigned long old_val) +{ + if (!old_val) + return; + + KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head)); + + __kvm_rmap_unlock(rmap_head, old_val); + preempt_enable(); +} + +/* * Returns the number of pointers in the rmap chain, not counting the new one. */ -static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, - struct kvm_rmap_head *rmap_head) +static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + u64 *spte, struct kvm_rmap_head *rmap_head) { + unsigned long old_val, new_val; struct pte_list_desc *desc; int count = 0; - if (!rmap_head->val) { - rmap_head->val = (unsigned long)spte; - } else if (!(rmap_head->val & KVM_RMAP_MANY)) { + old_val = kvm_rmap_lock(kvm, rmap_head); + + if (!old_val) { + new_val = (unsigned long)spte; + } else if (!(old_val & KVM_RMAP_MANY)) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->sptes[0] = (u64 *)rmap_head->val; + desc->sptes[0] = (u64 *)old_val; desc->sptes[1] = spte; desc->spte_count = 2; desc->tail_count = 0; - rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; + new_val = (unsigned long)desc | KVM_RMAP_MANY; ++count; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY); count = desc->tail_count + desc->spte_count; /* @@ -887,21 +1028,25 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, */ if (desc->spte_count == PTE_LIST_EXT) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY); desc->spte_count = 0; desc->tail_count = count; - rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; + new_val = (unsigned long)desc | KVM_RMAP_MANY; + } else { + new_val = old_val; } desc->sptes[desc->spte_count++] = spte; } + + kvm_rmap_unlock(kvm, rmap_head, new_val); + return count; } -static void pte_list_desc_remove_entry(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, +static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val, struct pte_list_desc *desc, int i) { - struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY); int j = head_desc->spte_count - 1; /* @@ -928,9 +1073,9 @@ static void pte_list_desc_remove_entry(struct kvm *kvm, * head at the next descriptor, i.e. the new head. */ if (!head_desc->more) - rmap_head->val = 0; + *rmap_val = 0; else - rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY; + *rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY; mmu_free_pte_list_desc(head_desc); } @@ -938,24 +1083,26 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; + unsigned long rmap_val; int i; - if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) - return; + rmap_val = kvm_rmap_lock(kvm, rmap_head); + if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm)) + goto out; - if (!(rmap_head->val & KVM_RMAP_MANY)) { - if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) - return; + if (!(rmap_val & KVM_RMAP_MANY)) { + if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm)) + goto out; - rmap_head->val = 0; + rmap_val = 0; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(kvm, rmap_head, + pte_list_desc_remove_entry(kvm, &rmap_val, desc, i); - return; + goto out; } } desc = desc->more; @@ -963,6 +1110,9 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, KVM_BUG_ON_DATA_CORRUPTION(true, kvm); } + +out: + kvm_rmap_unlock(kvm, rmap_head, rmap_val); } static void kvm_zap_one_rmap_spte(struct kvm *kvm, @@ -977,17 +1127,19 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; + unsigned long rmap_val; int i; - if (!rmap_head->val) + rmap_val = kvm_rmap_lock(kvm, rmap_head); + if (!rmap_val) return false; - if (!(rmap_head->val & KVM_RMAP_MANY)) { - mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); + if (!(rmap_val & KVM_RMAP_MANY)) { + mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val); goto out; } - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) @@ -997,20 +1149,21 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, } out: /* rmap_head is meaningless now, remember to reset it */ - rmap_head->val = 0; + kvm_rmap_unlock(kvm, rmap_head, 0); return true; } unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { + unsigned long rmap_val = kvm_rmap_get(rmap_head); struct pte_list_desc *desc; - if (!rmap_head->val) + if (!rmap_val) return 0; - else if (!(rmap_head->val & KVM_RMAP_MANY)) + else if (!(rmap_val & KVM_RMAP_MANY)) return 1; - desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); return desc->tail_count + desc->spte_count; } @@ -1053,6 +1206,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) */ struct rmap_iterator { /* private fields */ + struct rmap_head *head; struct pte_list_desc *desc; /* holds the sptep if not NULL */ int pos; /* index of the sptep */ }; @@ -1067,23 +1221,19 @@ struct rmap_iterator { static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, struct rmap_iterator *iter) { - u64 *sptep; + unsigned long rmap_val = kvm_rmap_get(rmap_head); - if (!rmap_head->val) + if (!rmap_val) return NULL; - if (!(rmap_head->val & KVM_RMAP_MANY)) { + if (!(rmap_val & KVM_RMAP_MANY)) { iter->desc = NULL; - sptep = (u64 *)rmap_head->val; - goto out; + return (u64 *)rmap_val; } - iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); + iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); iter->pos = 0; - sptep = iter->desc->sptes[iter->pos]; -out: - BUG_ON(!is_shadow_present_pte(*sptep)); - return sptep; + return iter->desc->sptes[iter->pos]; } /* @@ -1093,14 +1243,11 @@ out: */ static u64 *rmap_get_next(struct rmap_iterator *iter) { - u64 *sptep; - if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) { ++iter->pos; - sptep = iter->desc->sptes[iter->pos]; - if (sptep) - goto out; + if (iter->desc->sptes[iter->pos]) + return iter->desc->sptes[iter->pos]; } iter->desc = iter->desc->more; @@ -1108,20 +1255,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) if (iter->desc) { iter->pos = 0; /* desc->sptes[0] cannot be NULL */ - sptep = iter->desc->sptes[iter->pos]; - goto out; + return iter->desc->sptes[iter->pos]; } } return NULL; -out: - BUG_ON(!is_shadow_present_pte(*sptep)); - return sptep; } -#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ - for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ - _spte_; _spte_ = rmap_get_next(_iter_)) +#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \ + _sptep_; _sptep_ = rmap_get_next(_iter_)) + +#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \ + +#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \ + __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(sptep))) static void drop_spte(struct kvm *kvm, u64 *sptep) { @@ -1207,12 +1358,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmap_head, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (spte_ad_need_write_protect(*sptep)) flush |= test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); else flush |= spte_clear_dirty(sptep); + } return flush; } @@ -1401,7 +1553,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) while (++iterator->rmap <= iterator->end_rmap) { iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level); - if (iterator->rmap->val) + if (atomic_long_read(&iterator->rmap->val)) return; } @@ -1533,7 +1685,7 @@ static void __rmap_add(struct kvm *kvm, kvm_update_page_stats(kvm, sp->role.level, 1); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - rmap_count = pte_list_add(cache, spte, rmap_head); + rmap_count = pte_list_add(kvm, cache, spte, rmap_head); if (rmap_count > kvm->stat.max_mmu_rmap_size) kvm->stat.max_mmu_rmap_size = rmap_count; @@ -1552,51 +1704,67 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, } static bool kvm_rmap_age_gfn_range(struct kvm *kvm, - struct kvm_gfn_range *range, bool test_only) + struct kvm_gfn_range *range, + bool test_only) { - struct slot_rmap_walk_iterator iterator; + struct kvm_rmap_head *rmap_head; struct rmap_iterator iter; + unsigned long rmap_val; bool young = false; u64 *sptep; + gfn_t gfn; + int level; + u64 spte; - for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - range->start, range->end - 1, &iterator) { - for_each_rmap_spte(iterator.rmap, &iter, sptep) { - u64 spte = *sptep; + for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + for (gfn = range->start; gfn < range->end; + gfn += KVM_PAGES_PER_HPAGE(level)) { + rmap_head = gfn_to_rmap(gfn, level, range->slot); + rmap_val = kvm_rmap_lock_readonly(rmap_head); - if (!is_accessed_spte(spte)) - continue; + for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) { + if (!is_accessed_spte(spte)) + continue; - if (test_only) - return true; - - if (spte_ad_enabled(spte)) { - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } else { - /* - * WARN if mmu_spte_update() signals the need - * for a TLB flush, as Access tracking a SPTE - * should never trigger an _immediate_ flush. - */ - spte = mark_spte_for_access_track(spte); - WARN_ON_ONCE(mmu_spte_update(sptep, spte)); + if (test_only) { + kvm_rmap_unlock_readonly(rmap_head, rmap_val); + return true; + } + + if (spte_ad_enabled(spte)) + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + else + /* + * If the following cmpxchg fails, the + * spte is being concurrently modified + * and should most likely stay young. + */ + cmpxchg64(sptep, spte, + mark_spte_for_access_track(spte)); + young = true; } - young = true; + + kvm_rmap_unlock_readonly(rmap_head, rmap_val); } } return young; } +static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm) +{ + return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages); +} + bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) - young = kvm_rmap_age_gfn_range(kvm, range, false); - if (tdp_mmu_enabled) - young |= kvm_tdp_mmu_age_gfn_range(kvm, range); + young = kvm_tdp_mmu_age_gfn_range(kvm, range); + + if (kvm_may_have_shadow_mmu_sptes(kvm)) + young |= kvm_rmap_age_gfn_range(kvm, range, false); return young; } @@ -1605,11 +1773,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) - young = kvm_rmap_age_gfn_range(kvm, range, true); - if (tdp_mmu_enabled) - young |= kvm_tdp_mmu_test_age_gfn(kvm, range); + young = kvm_tdp_mmu_test_age_gfn(kvm, range); + + if (young) + return young; + + if (kvm_may_have_shadow_mmu_sptes(kvm)) + young |= kvm_rmap_age_gfn_range(kvm, range, true); return young; } @@ -1656,13 +1827,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) return hash_64(gfn, KVM_MMU_HASH_SHIFT); } -static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, +static void mmu_page_add_parent_pte(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; - pte_list_add(cache, parent_pte, &sp->parent_ptes); + pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, @@ -2352,7 +2524,7 @@ static void __link_shadow_page(struct kvm *kvm, mmu_spte_set(sptep, spte); - mmu_page_add_parent_pte(cache, sp, sptep); + mmu_page_add_parent_pte(kvm, cache, sp, sptep); /* * The non-direct sub-pagetable must be updated before linking. For @@ -2416,7 +2588,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, * avoids retaining a large number of stale nested SPs. */ if (tdp_enabled && invalid_list && - child->role.guest_mode && !child->parent_ptes.val) + child->role.guest_mode && + !atomic_long_read(&child->parent_ptes.val)) return kvm_mmu_prepare_zap_page(kvm, child, invalid_list); } @@ -5540,7 +5713,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, union kvm_mmu_page_role root_role; /* NPT requires CR0.PG=1. */ - WARN_ON_ONCE(cpu_role.base.direct); + WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode); root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); @@ -7460,7 +7633,7 @@ static bool kvm_nx_huge_page_recovery_worker(void *data) return true; } -static void kvm_mmu_start_lpage_recovery(struct once *once) +static int kvm_mmu_start_lpage_recovery(struct once *once) { struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); struct kvm *kvm = container_of(ka, struct kvm, arch); @@ -7471,13 +7644,14 @@ static void kvm_mmu_start_lpage_recovery(struct once *once) kvm_nx_huge_page_recovery_worker_kill, kvm, "kvm-nx-lpage-recovery"); - if (!nx_thread) - return; + if (IS_ERR(nx_thread)) + return PTR_ERR(nx_thread); vhost_task_start(nx_thread); /* Make the task visible only once it is fully started. */ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); + return 0; } int kvm_mmu_post_init_vm(struct kvm *kvm) @@ -7485,10 +7659,7 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) if (nx_hugepage_mitigation_hard_disabled) return 0; - call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); - if (!kvm->arch.nx_huge_page_recovery_thread) - return -ENOMEM; - return 0; + return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 22551e2f1d00..0f9f47b4ab0e 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -129,25 +129,32 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) } /* - * Returns true if the SPTE has bits that may be set without holding mmu_lock. - * The caller is responsible for checking if the SPTE is shadow-present, and - * for determining whether or not the caller cares about non-leaf SPTEs. + * Returns true if the SPTE needs to be updated atomically due to having bits + * that may be changed without holding mmu_lock, and for which KVM must not + * lose information. E.g. KVM must not drop Dirty bit information. The caller + * is responsible for checking if the SPTE is shadow-present, and for + * determining whether or not the caller cares about non-leaf SPTEs. */ -bool spte_has_volatile_bits(u64 spte) +bool spte_needs_atomic_update(u64 spte) { + /* SPTEs can be made Writable bit by KVM's fast page fault handler. */ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) return true; - if (is_access_track_spte(spte)) + /* + * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked + * SPTEs can be restored by KVM's fast page fault handler. + */ + if (!spte_ad_enabled(spte)) return true; - if (spte_ad_enabled(spte)) { - if (!(spte & shadow_accessed_mask) || - (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) - return true; - } - - return false; + /* + * Dirty and Accessed bits can be set by the CPU. Ignore the Accessed + * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't + * invalidate TLBs when aging SPTEs, and so it's safe to clobber the + * Accessed bit (and rare in practice). + */ + return is_writable_pte(spte) && !(spte & shadow_dirty_mask); } bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 59746854c0af..79cdceba9857 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -519,7 +519,7 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } -bool spte_has_volatile_bits(u64 spte); +bool spte_needs_atomic_update(u64 spte); bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 047b78333653..364c5da6c499 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -25,6 +25,13 @@ static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte) return xchg(rcu_dereference(sptep), new_spte); } +static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask) +{ + atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep); + + return (u64)atomic64_fetch_and(~mask, sptep_atomic); +} + static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte)); @@ -32,28 +39,21 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) } /* - * SPTEs must be modified atomically if they are shadow-present, leaf - * SPTEs, and have volatile bits, i.e. has bits that can be set outside - * of mmu_lock. The Writable bit can be set by KVM's fast page fault - * handler, and Accessed and Dirty bits can be set by the CPU. - * - * Note, non-leaf SPTEs do have Accessed bits and those bits are - * technically volatile, but KVM doesn't consume the Accessed bit of - * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This - * logic needs to be reassessed if KVM were to use non-leaf Accessed - * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. + * SPTEs must be modified atomically if they are shadow-present, leaf SPTEs, + * and have volatile bits (bits that can be set outside of mmu_lock) that + * must not be clobbered. */ -static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level) +static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level) { return is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) && - spte_has_volatile_bits(old_spte); + spte_needs_atomic_update(old_spte); } static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, u64 new_spte, int level) { - if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) + if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level)) return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte); __kvm_tdp_mmu_write_spte(sptep, new_spte); @@ -63,12 +63,8 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte, u64 mask, int level) { - atomic64_t *sptep_atomic; - - if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) { - sptep_atomic = (atomic64_t *)rcu_dereference(sptep); - return (u64)atomic64_fetch_and(~mask, sptep_atomic); - } + if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level)) + return tdp_mmu_clear_spte_bits_atomic(sptep, mask); __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask); return old_spte; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 046b6ba31197..7cc0564f5f97 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -193,6 +193,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, !tdp_mmu_root_match((_root), (_types)))) { \ } else +/* + * Iterate over all TDP MMU roots in an RCU read-side critical section. + * It is safe to iterate over the SPTEs under the root, but their values will + * be unstable, so all writes must be atomic. As this routine is meant to be + * used without holding the mmu_lock at all, any bits that are flipped must + * be reflected in kvm_tdp_mmu_spte_need_atomic_write(). + */ +#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \ + list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \ + if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ + !tdp_mmu_root_match((_root), (_types))) { \ + } else + #define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \ __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS) @@ -774,9 +787,6 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, continue; \ else -#define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end) \ - for_each_tdp_pte(_iter, _kvm, _root, _start, _end) - static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm, struct tdp_iter *iter) { @@ -1235,7 +1245,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) rcu_read_lock(); - tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) { + for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) { int r; if (fault->nx_huge_page_workaround_enabled) @@ -1332,21 +1342,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, * from the clear_young() or clear_flush_young() notifier, which uses the * return value to determine if the page has been accessed. */ -static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter) +static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter) { u64 new_spte; if (spte_ad_enabled(iter->old_spte)) { - iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, - iter->old_spte, - shadow_accessed_mask, - iter->level); + iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep, + shadow_accessed_mask); new_spte = iter->old_spte & ~shadow_accessed_mask; } else { new_spte = mark_spte_for_access_track(iter->old_spte); - iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, - iter->old_spte, new_spte, - iter->level); + /* + * It is safe for the following cmpxchg to fail. Leave the + * Accessed bit set, as the spte is most likely young anyway. + */ + if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte)) + return; } trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, @@ -1371,9 +1382,9 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, * valid roots! */ WARN_ON(types & ~KVM_VALID_ROOTS); - __for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) { - guard(rcu)(); + guard(rcu)(); + for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) { tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) { if (!is_accessed_spte(iter.old_spte)) continue; @@ -1382,7 +1393,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, return true; ret = true; - kvm_tdp_mmu_age_spte(&iter); + kvm_tdp_mmu_age_spte(kvm, &iter); } } @@ -1904,7 +1915,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->root_role.level; - tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } @@ -1931,7 +1942,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, struct tdp_iter iter; tdp_ptep_t sptep = NULL; - tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { *spte = iter.old_spte; sptep = iter.sptep; } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index c6b79c2e1e05..834b67672d50 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -646,6 +646,11 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, u32 pause_count12; u32 pause_thresh12; + nested_svm_transition_tlb_flush(vcpu); + + /* Enter Guest-Mode */ + enter_guest_mode(vcpu); + /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. @@ -762,11 +767,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, } } - nested_svm_transition_tlb_flush(vcpu); - - /* Enter Guest-Mode */ - enter_guest_mode(vcpu); - /* * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 42a6d8bb1024..596167d33544 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2972,6 +2972,16 @@ void __init sev_hardware_setup(void) WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) goto out; + /* + * The kernel's initcall infrastructure lacks the ability to express + * dependencies between initcalls, whereas the modules infrastructure + * automatically handles dependencies via symbol loading. Ensure the + * PSP SEV driver is initialized before proceeding if KVM is built-in, + * as the dependency isn't handled by the initcall infrastructure. + */ + if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) + goto out; + /* Retrieve SEV CPUID information */ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); @@ -4579,6 +4589,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) { + struct kvm *kvm = svm->vcpu.kvm; + /* * All host state for SEV-ES guests is categorized into three swap types * based on how it is handled by hardware during a world switch: @@ -4602,14 +4614,22 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are /* * If DebugSwap is enabled, debug registers are loaded but NOT saved by - * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both - * saves and loads debug registers (Type-A). + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does + * not save or load debug registers. Sadly, KVM can't prevent SNP + * guests from lying about DebugSwap on secondary vCPUs, i.e. the + * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what + * the guest has actually enabled (or not!) in the VMSA. + * + * If DebugSwap is *possible*, save the masks so that they're restored + * if the guest enables DebugSwap. But for the DRs themselves, do NOT + * rely on the CPU to restore the host values; KVM will restore them as + * needed in common code, via hw_breakpoint_restore(). Note, KVM does + * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs + * don't need to be restored per se, KVM just needs to ensure they are + * loaded with the correct values *if* the CPU writes the MSRs. */ - if (sev_vcpu_has_debug_swap(svm)) { - hostsa->dr0 = native_get_debugreg(0); - hostsa->dr1 = native_get_debugreg(1); - hostsa->dr2 = native_get_debugreg(2); - hostsa->dr3 = native_get_debugreg(3); + if (sev_vcpu_has_debug_swap(svm) || + (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) { hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6bd7e44380e0..5176e2f6cc14 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1991,11 +1991,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -3165,6 +3165,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; } + + /* + * AMD changed the architectural behavior of bits 5:2. On CPUs + * without BusLockTrap, bits 5:2 control "external pins", but + * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap + * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed + * the guest to set bits 5:2 despite not actually virtualizing + * Performance-Monitoring/Breakpoint external pins. Drop bits + * 5:2 for backwards compatibility. + */ + data &= ~GENMASK(5, 2); + + /* + * Suppress BTF as KVM doesn't virtualize BTF, but there's no + * way to communicate lack of support to the guest. + */ + if (data & DEBUGCTLMSR_BTF) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + data &= ~DEBUGCTLMSR_BTF; + } + if (data & DEBUGCTL_RESERVED_BITS) return 1; @@ -4192,6 +4213,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_enter_irqoff(); + /* + * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of + * VMRUN controls whether or not physical IRQs are masked (KVM always + * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the + * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow + * into guest state if delivery of an event during VMRUN triggers a + * #VMEXIT, and the guest_state transitions already tell lockdep that + * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of + * this path, so IRQs aren't actually unmasked while running host code. + */ + raw_local_irq_enable(); + amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) @@ -4200,6 +4233,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in else __svm_vcpu_run(svm, spec_ctrl_intercepted); + raw_local_irq_disable(); + guest_state_exit_irqoff(); } @@ -4250,14 +4285,22 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); + /* + * Hardware only context switches DEBUGCTL if LBR virtualization is + * enabled. Manually load DEBUGCTL if necessary (and restore it after + * VM-Exit), as running with the host's DEBUGCTL can negatively affect + * guest state and can even be fatal, e.g. due to Bus Lock Detect. + */ + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(svm->vmcb->save.dbgctl); + kvm_wait_lapic_expire(vcpu); /* @@ -4285,6 +4328,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(vcpu->arch.host_debugctl); + kvm_load_host_xsave_state(vcpu); stgi(); @@ -5046,6 +5093,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9d7cdb8fbf87..ea44c1da5a7c 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -584,7 +584,7 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm) /* svm.c */ #define MSR_INVALID 0xffffffffU -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR) extern bool dump_invalid_vmcb; diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 2ed80aea3bb1..0c61153b275f 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run) mov VCPU_RDI(%_ASM_DI), %_ASM_DI /* Enter guest mode */ - sti - 3: vmrun %_ASM_AX 4: - cli - /* Pop @svm to RAX while it's the only available register. */ pop %_ASM_AX @@ -340,12 +336,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov KVM_VMCB_pa(%rax), %rax /* Enter guest mode */ - sti - 1: vmrun %rax - -2: cli - +2: /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 2427f918e763..43ee9ed11291 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, + .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index edc2d2039508..d06e50d9c0e7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5086,6 +5086,17 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, load_vmcs12_host_state(vcpu, vmcs12); + /* + * Process events if an injectable IRQ or NMI is pending, even + * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). + * If an event became pending while L2 was active, KVM needs to + * either inject the event or request an IRQ/NMI window. SMIs + * don't need to be processed as SMM is mutually exclusive with + * non-root mode. INIT/SIPI don't need to be checked as INIT + * is blocked post-VMXON, and SIPIs are ignored. + */ + if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); return; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 374f5a2ca38e..1e30adc7837e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1514,16 +1514,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, */ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); vmx_vcpu_load_vmcs(vcpu, cpu, NULL); vmx_vcpu_pi_load(vcpu, cpu); - - vmx->host_debugctlmsr = get_debugctlmsr(); } void vmx_vcpu_put(struct kvm_vcpu *vcpu) @@ -5648,6 +5644,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) set_debugreg(DR6_RESERVED, 6); } +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + lockdep_assert_irqs_disabled(); + set_debugreg(vcpu->arch.dr6, 6); +} + void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -7421,10 +7423,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) vmx->loaded_vmcs->host_state.cr4 = cr4; } - /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - set_debugreg(vcpu->arch.dr6, 6); - /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -7460,8 +7458,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) } /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (vmx->host_debugctlmsr) - update_debugctlmsr(vmx->host_debugctlmsr); + if (vcpu->arch.host_debugctl) + update_debugctlmsr(vcpu->arch.host_debugctl); #ifndef CONFIG_X86_64 /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 8b111ce1087c..951e44dc9d0e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -340,8 +340,6 @@ struct vcpu_vmx { /* apic deadline value in host tsc */ u64 hv_deadline_tsc; - unsigned long host_debugctlmsr; - /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index ce3295a67c04..430773a5ef8e 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val); void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val); void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu); void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bef63fd46d5a..82547da8faf3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10966,10 +10966,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } + vcpu->arch.host_debugctl = get_debugctlmsr(); + guest_timing_enter_irqoff(); for (;;) { @@ -12876,11 +12881,11 @@ void kvm_arch_destroy_vm(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); } kvm_unload_vcpu_mmus(kvm); + kvm_destroy_vcpus(kvm); kvm_x86_call(vm_destroy)(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); - kvm_destroy_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); |