diff options
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 206 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 26 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 46 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 10 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 136 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.h | 2 |
7 files changed, 256 insertions, 173 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8d74bdef68c1..901be9e420a4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -722,7 +722,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) if (sp->role.passthrough) return sp->gfn; - if (!sp->role.direct) + if (sp->shadowed_translation) return sp->shadowed_translation[index] >> PAGE_SHIFT; return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); @@ -736,7 +736,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) */ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { - if (sp_has_gptes(sp)) + if (sp->shadowed_translation) return sp->shadowed_translation[index] & ACC_ALL; /* @@ -757,7 +757,7 @@ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, gfn_t gfn, unsigned int access) { - if (sp_has_gptes(sp)) { + if (sp->shadowed_translation) { sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } @@ -1700,8 +1700,7 @@ static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); - if (!sp->role.direct) - free_page((unsigned long)sp->shadowed_translation); + free_page((unsigned long)sp->shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } @@ -2203,7 +2202,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); - if (!role.direct) + if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL) sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); @@ -3308,7 +3307,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, return RET_PF_CONTINUE; } -static bool page_fault_can_be_fast(struct kvm_page_fault *fault) +static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault) { /* * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only @@ -3320,6 +3319,26 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault) return false; /* + * For hardware-protected VMs, certain conditions like attempting to + * perform a write to a page which is not in the state that the guest + * expects it to be in can result in a nested/extended #PF. In this + * case, the below code might misconstrue this situation as being the + * result of a write-protected access, and treat it as a spurious case + * rather than taking any action to satisfy the real source of the #PF + * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the + * guest spinning on a #PF indefinitely, so don't attempt the fast path + * in this case. + * + * Note that the kvm_mem_is_private() check might race with an + * attribute update, but this will either result in the guest spinning + * on RET_PF_SPURIOUS until the update completes, or an actual spurious + * case might go down the slow path. Either case will resolve itself. + */ + if (kvm->arch.has_private_mem && + fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) + return false; + + /* * #PF can be fast if: * * 1. The shadow page table entry is not present and A/D bits are @@ -3419,7 +3438,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) u64 *sptep; uint retry_count = 0; - if (!page_fault_can_be_fast(fault)) + if (!page_fault_can_be_fast(vcpu->kvm, fault)) return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3428,7 +3447,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) u64 new_spte; if (tdp_mmu_enabled) - sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte); else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); @@ -3438,7 +3457,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * available as the vCPU holds a reference to its root(s). */ if (WARN_ON_ONCE(!sptep)) - spte = REMOVED_SPTE; + spte = FROZEN_SPTE; if (!is_shadow_present_pte(spte)) break; @@ -4271,7 +4290,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL); + r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, + true, NULL, NULL); + + /* + * Account fixed page faults, otherwise they'll never be counted, but + * ignore stats for all other return times. Page-ready "faults" aren't + * truly spurious and never trigger emulation + */ + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; } static inline u8 kvm_max_level_for_order(int order) @@ -4291,6 +4319,25 @@ static inline u8 kvm_max_level_for_order(int order) return PG_LEVEL_4K; } +static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, + u8 max_level, int gmem_order) +{ + u8 req_max_level; + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + max_level = min(kvm_max_level_for_order(gmem_order), max_level); + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn); + if (req_max_level) + max_level = min(max_level, req_max_level); + + return req_max_level; +} + static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { @@ -4308,9 +4355,9 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, return r; } - fault->max_level = min(kvm_max_level_for_order(max_order), - fault->max_level); fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); + fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, + fault->max_level, max_order); return RET_PF_CONTINUE; } @@ -4561,7 +4608,10 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, if (WARN_ON_ONCE(error_code >> 32)) error_code = lower_32_bits(error_code); - /* Ensure the above sanity check also covers KVM-defined flags. */ + /* + * Restrict KVM-defined flags to bits 63:32 so that it's impossible for + * them to conflict with #PF error codes, which are limited to 32 bits. + */ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); vcpu->arch.l1tf_flush_l1d = true; @@ -4621,38 +4671,23 @@ out_unlock: } #endif -bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma) +bool kvm_mmu_may_ignore_guest_pat(void) { /* - * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the - * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is - * to honor the memtype from the guest's MTRRs so that guest accesses - * to memory that is DMA'd aren't cached against the guest's wishes. - * - * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs, - * e.g. KVM will force UC memtype for host MMIO. + * When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does + * not support self-snoop (or is affected by an erratum), and the VM + * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to + * honor the memtype from the guest's PAT so that guest accesses to + * memory that is DMA'd aren't cached against the guest's wishes. As a + * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA, + * KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE + * bits in response to non-coherent device (un)registration. */ - return vm_has_noncoherent_dma && shadow_memtype_mask; + return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask; } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - /* - * If the guest's MTRRs may be used to compute the "real" memtype, - * restrict the mapping level to ensure KVM uses a consistent memtype - * across the entire mapping. - */ - if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) { - for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { - int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = gfn_round_for_level(fault->gfn, - fault->max_level); - - if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) - break; - } - } - #ifdef CONFIG_X86_64 if (tdp_mmu_enabled) return kvm_tdp_mmu_page_fault(vcpu, fault); @@ -4661,6 +4696,79 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } +static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, + u8 *level) +{ + int r; + + /* + * Restrict to TDP page fault, since that's the only case where the MMU + * is indexed by GPA. + */ + if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault) + return -EOPNOTSUPP; + + do { + if (signal_pending(current)) + return -EINTR; + cond_resched(); + r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); + } while (r == RET_PF_RETRY); + + if (r < 0) + return r; + + switch (r) { + case RET_PF_FIXED: + case RET_PF_SPURIOUS: + return 0; + + case RET_PF_EMULATE: + return -ENOENT; + + case RET_PF_RETRY: + case RET_PF_CONTINUE: + case RET_PF_INVALID: + default: + WARN_ONCE(1, "could not fix page fault during prefault"); + return -EIO; + } +} + +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range) +{ + u64 error_code = PFERR_GUEST_FINAL_MASK; + u8 level = PG_LEVEL_4K; + u64 end; + int r; + + /* + * reload is efficient when called repeatedly, so we can do it on + * every iteration. + */ + kvm_mmu_reload(vcpu); + + if (kvm_arch_has_private_mem(vcpu->kvm) && + kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) + error_code |= PFERR_PRIVATE_ACCESS; + + /* + * Shadow paging uses GVA for kvm page fault, so restrict to + * two-dimensional paging. + */ + r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level); + if (r < 0) + return r; + + /* + * If the mapping that covers range->gpa can use a huge page, it + * may start below it or end after range->gpa + range->size. + */ + end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level); + return min(range->size, end - range->gpa); +} + static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; @@ -4988,7 +5096,7 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, static inline u64 reserved_hpa_bits(void) { - return rsvd_bits(shadow_phys_bits, 63); + return rsvd_bits(kvm_host.maxphyaddr, 63); } /* @@ -5633,7 +5741,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) * stale entries. Flushing on alloc also allows KVM to skip the TLB * flush when freeing a root (see kvm_tdp_mmu_put_root()). */ - static_call(kvm_x86_flush_tlb_current)(vcpu); + kvm_x86_call(flush_tlb_current)(vcpu); out: return r; } @@ -5886,14 +5994,24 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err } if (r == RET_PF_INVALID) { + vcpu->stat.pf_taken++; + r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, - &emulation_type); + &emulation_type, NULL); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } if (r < 0) return r; + + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; + else if (r == RET_PF_EMULATE) + vcpu->stat.pf_emulate++; + else if (r == RET_PF_SPURIOUS) + vcpu->stat.pf_spurious++; + if (r != RET_PF_EMULATE) return 1; @@ -5995,7 +6113,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, if (is_noncanonical_address(addr, vcpu)) return; - static_call(kvm_x86_flush_tlb_gva)(vcpu, addr); + kvm_x86_call(flush_tlb_gva)(vcpu, addr); } if (!mmu->sync_spte) @@ -6787,6 +6905,7 @@ restart: return need_tlb_flush; } +EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) @@ -6917,7 +7036,6 @@ static unsigned long mmu_shrink_scan(struct shrinker *shrink, list_for_each_entry(kvm, &vm_list, vm_list) { int idx; - LIST_HEAD(invalid_list); /* * Never scan more than sc->nr_to_scan VM instances. diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ce2fcd19ba6b..1721d97743e9 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -288,7 +288,8 @@ static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, } static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u64 err, bool prefetch, int *emulation_type) + u64 err, bool prefetch, + int *emulation_type, u8 *level) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, @@ -318,14 +319,6 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); } - /* - * Async #PF "faults", a.k.a. prefetch faults, are not faults from the - * guest perspective and have already been counted at the time of the - * original fault. - */ - if (!prefetch) - vcpu->stat.pf_taken++; - if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp) r = kvm_tdp_page_fault(vcpu, &fault); else @@ -344,20 +337,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (fault.write_fault_to_shadow_pgtable && emulation_type) *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; + if (level) + *level = fault.goal_level; - /* - * Similar to above, prefetch faults aren't truly spurious, and the - * async #PF path doesn't do emulation. Do count faults that are fixed - * by the async #PF handler though, otherwise they'll never be counted. - */ - if (r == RET_PF_FIXED) - vcpu->stat.pf_fixed++; - else if (prefetch) - ; - else if (r == RET_PF_EMULATE) - vcpu->stat.pf_emulate++; - else if (r == RET_PF_SPURIOUS) - vcpu->stat.pf_spurious++; return r; } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index d3dbcf382ed2..69941cebb3a8 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -911,7 +911,8 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int gpa_t pte_gpa; gfn_t gfn; - if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE)) + if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE || + !sp->shadowed_translation)) return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index a5e014d7bc62..d4527965e48c 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -43,7 +43,25 @@ u64 __read_mostly shadow_acc_track_mask; u64 __read_mostly shadow_nonpresent_or_rsvd_mask; u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; -u8 __read_mostly shadow_phys_bits; +static u8 __init kvm_get_host_maxphyaddr(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR, + * when reasoning about CPU behavior with respect to MAXPHYADDR. + */ + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; + + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; +} void __init kvm_mmu_spte_module_init(void) { @@ -55,6 +73,8 @@ void __init kvm_mmu_spte_module_init(void) * will change when the vendor module is (re)loaded. */ allow_mmio_caching = enable_mmio_caching; + + kvm_host.maxphyaddr = kvm_get_host_maxphyaddr(); } static u64 generation_mmio_spte_mask(u64 gen) @@ -190,8 +210,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= PT_PAGE_SIZE_MASK; if (shadow_memtype_mask) - spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else @@ -271,18 +291,12 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, - int index) +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index) { - u64 child_spte; - - if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte))) - return 0; + u64 child_spte = huge_spte; - if (WARN_ON_ONCE(!is_large_pte(huge_spte))) - return 0; - - child_spte = huge_spte; + KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm); /* * The child_spte already has the base address of the huge page being @@ -383,7 +397,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) * not set any RWX bits. */ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || - WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value)) + WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value)) mmio_value = 0; if (!mmio_value) @@ -441,8 +455,6 @@ void kvm_mmu_reset_all_pte_masks(void) u8 low_phys_bits; u64 mask; - shadow_phys_bits = kvm_get_shadow_phys_bits(); - /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is @@ -494,7 +506,7 @@ void kvm_mmu_reset_all_pte_masks(void) * 52-bit physical addresses then there are no reserved PA bits in the * PTEs and so the reserved PA approach must be disabled. */ - if (shadow_phys_bits < 52) + if (kvm_host.maxphyaddr < 52) mask = BIT_ULL(51) | PT_PRESENT_MASK; else mask = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 52fa004a1fbc..ef793c459b05 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -202,7 +202,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; /* * If a thread running without exclusive control of the MMU lock must perform a - * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a + * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a * non-present intermediate value. Other threads which encounter this value * should not modify the SPTE. * @@ -212,14 +212,14 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; * * Only used by the TDP MMU. */ -#define REMOVED_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) +#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) /* Removed SPTEs must not be misconstrued as shadow present PTEs. */ -static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK)); +static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); -static inline bool is_removed_spte(u64 spte) +static inline bool is_frozen_spte(u64 spte) { - return spte == REMOVED_SPTE; + return spte == FROZEN_SPTE; } /* Get an SPTE's index into its parent's page table (and the spt array). */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 36539c1b36cd..c7dc49ee7388 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -365,8 +365,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * value to the removed SPTE value. */ for (;;) { - old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); - if (!is_removed_spte(old_spte)) + old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE); + if (!is_frozen_spte(old_spte)) break; cpu_relax(); } @@ -397,11 +397,11 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * No retry is needed in the atomic update path as the * sole concern is dropping a Dirty bit, i.e. no other * task can zap/remove the SPTE as mmu_lock is held for - * write. Marking the SPTE as a removed SPTE is not + * write. Marking the SPTE as a frozen SPTE is not * strictly necessary for the same reason, but using - * the remove SPTE value keeps the shared/exclusive + * the frozen SPTE value keeps the shared/exclusive * paths consistent and allows the handle_changed_spte() - * call below to hardcode the new value to REMOVED_SPTE. + * call below to hardcode the new value to FROZEN_SPTE. * * Note, even though dropping a Dirty bit is the only * scenario where a non-atomic update could result in a @@ -413,10 +413,10 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * it here. */ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, - REMOVED_SPTE, level); + FROZEN_SPTE, level); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_spte, REMOVED_SPTE, level, shared); + old_spte, FROZEN_SPTE, level, shared); } call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); @@ -490,19 +490,19 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, */ if (!was_present && !is_present) { /* - * If this change does not involve a MMIO SPTE or removed SPTE, + * If this change does not involve a MMIO SPTE or frozen SPTE, * it is unexpected. Log the change, though it should not * impact the guest since both the former and current SPTEs * are nonpresent. */ if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) && !is_mmio_spte(kvm, new_spte) && - !is_removed_spte(new_spte))) + !is_frozen_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" "different nonpresent SPTE, unless one or both\n" "are MMIO SPTEs, or the new SPTE is\n" - "a temporary removed SPTE.\n" + "a temporary frozen SPTE.\n" "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", as_id, gfn, old_spte, new_spte, level); return; @@ -530,7 +530,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } -static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte) +static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, + u64 new_spte) { u64 *sptep = rcu_dereference(iter->sptep); @@ -540,7 +541,7 @@ static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte) * and pre-checking before inserting a new SPTE is advantageous as it * avoids unnecessary work. */ - WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte)); + WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte)); /* * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and @@ -572,9 +573,9 @@ static inline int __tdp_mmu_set_spte_atomic(struct tdp_iter *iter, u64 new_spte) * no side-effects other than setting iter->old_spte to the last * known value of the spte. */ -static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { int ret; @@ -590,8 +591,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, return 0; } -static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter) +static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter) { int ret; @@ -603,26 +604,26 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * in its place before the TLBs are flushed. * * Delay processing of the zapped SPTE until after TLBs are flushed and - * the REMOVED_SPTE is replaced (see below). + * the FROZEN_SPTE is replaced (see below). */ - ret = __tdp_mmu_set_spte_atomic(iter, REMOVED_SPTE); + ret = __tdp_mmu_set_spte_atomic(iter, FROZEN_SPTE); if (ret) return ret; kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); /* - * No other thread can overwrite the removed SPTE as they must either + * No other thread can overwrite the frozen SPTE as they must either * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not - * overwrite the special removed SPTE value. Use the raw write helper to + * overwrite the special frozen SPTE value. Use the raw write helper to * avoid an unnecessary check on volatile bits. */ __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE); /* * Process the zapped SPTE after flushing TLBs, and after replacing - * REMOVED_SPTE with 0. This minimizes the amount of time vCPUs are - * blocked by the REMOVED_SPTE and reduces contention on the child + * FROZEN_SPTE with 0. This minimizes the amount of time vCPUs are + * blocked by the FROZEN_SPTE and reduces contention on the child * SPTEs. */ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, @@ -652,12 +653,12 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, /* * No thread should be using this function to set SPTEs to or from the - * temporary removed SPTE value. + * temporary frozen SPTE value. * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic * should be used. If operating under the MMU lock in write mode, the - * use of the removed SPTE should not be necessary. + * use of the frozen SPTE should not be necessary. */ - WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte)); + WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte)); old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); @@ -1126,7 +1127,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * If SPTE has been frozen by another thread, just give up and * retry, avoiding unnecessary page table allocation and free. */ - if (is_removed_spte(iter.old_spte)) + if (is_frozen_spte(iter.old_spte)) goto retry; if (iter.level == fault->goal_level) @@ -1339,17 +1340,15 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, return spte_set; } -static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) +static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void) { struct kvm_mmu_page *sp; - gfp |= __GFP_ZERO; - - sp = kmem_cache_alloc(mmu_page_header_cache, gfp); + sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT); if (!sp) return NULL; - sp->spt = (void *)__get_free_page(gfp); + sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!sp->spt) { kmem_cache_free(mmu_page_header_cache, sp); return NULL; @@ -1358,47 +1357,6 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) return sp; } -static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, - struct tdp_iter *iter, - bool shared) -{ - struct kvm_mmu_page *sp; - - kvm_lockdep_assert_mmu_lock_held(kvm, shared); - - /* - * Since we are allocating while under the MMU lock we have to be - * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct - * reclaim and to avoid making any filesystem callbacks (which can end - * up invoking KVM MMU notifiers, resulting in a deadlock). - * - * If this allocation fails we drop the lock and retry with reclaim - * allowed. - */ - sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); - if (sp) - return sp; - - rcu_read_unlock(); - - if (shared) - read_unlock(&kvm->mmu_lock); - else - write_unlock(&kvm->mmu_lock); - - iter->yielded = true; - sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); - - if (shared) - read_lock(&kvm->mmu_lock); - else - write_lock(&kvm->mmu_lock); - - rcu_read_lock(); - - return sp; -} - /* Note, the caller is responsible for initializing @sp. */ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) @@ -1445,7 +1403,6 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, { struct kvm_mmu_page *sp = NULL; struct tdp_iter iter; - int ret = 0; rcu_read_lock(); @@ -1469,17 +1426,31 @@ retry: continue; if (!sp) { - sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); + rcu_read_unlock(); + + if (shared) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); + + sp = tdp_mmu_alloc_sp_for_split(); + + if (shared) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); + if (!sp) { - ret = -ENOMEM; trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte, - iter.level, ret); - break; + iter.level, -ENOMEM); + return -ENOMEM; } - if (iter.yielded) - continue; + rcu_read_lock(); + + iter.yielded = true; + continue; } tdp_mmu_init_child_sp(sp, &iter); @@ -1500,7 +1471,7 @@ retry: if (sp) tdp_mmu_free_sp(sp); - return ret; + return 0; } @@ -1801,12 +1772,11 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, * * WARNING: This function is only intended to be called during fast_page_fault. */ -u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte) { struct tdp_iter iter; struct kvm_mmu *mmu = vcpu->arch.mmu; - gfn_t gfn = addr >> PAGE_SHIFT; tdp_ptep_t sptep = NULL; tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 58b55e61bd33..1b74e058a81c 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -64,7 +64,7 @@ static inline void kvm_tdp_mmu_walk_lockless_end(void) int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); -u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte); #ifdef CONFIG_X86_64 |