diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-05 10:38:45 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-05 10:38:45 -0700 |
| commit | 95b78879a80ab034cd56cc70f057c3cb4f2612a6 (patch) | |
| tree | 0bcf23729bccc2b27b6f88f863985294d93ef119 | |
| parent | d1b0937f0eadbc30b528d37589ec7fb6ce9f4114 (diff) | |
| parent | 7ec0360122d8f5033177ed0210d5ad71ec5b50c7 (diff) | |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
"arm64:
- Correctly drop the ITS translation cache reference when it actually
gets invalidated
- Take the SRCU lock for SW page table walks
- Restore POR_EL0 access to host EL0, avoiding POR_EL0 becoming
inaccessible from EL0 after running a guest
- Reassign nested_mmus array behind mmu_lock, ensuring that vcpu init
and MMU notifiers are mutually exclusive
- Correctly handle FEAT_XNX at stage-2
s390:
- More fixes for the new page table management and nested
virtualization
x86:
- More fixes for GHCB issues:
- Read start/end indices of page size change requests exactly once
per vmexit
- Unmap and unpin the GHCB as needed on vCPU free"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (23 commits)
KVM: arm64: Correctly identify executable PTEs at stage-2
KVM: arm64: nv: Fix handling of XN[0] when !FEAT_XNX
KVM: arm64: Reassign nested_mmus array behind mmu_lock
KVM: arm64: Restore POR_EL0 access to host EL0
KVM: arm64: Take the SRCU lock for page table walks in fault injection and AT emulation
KVM: arm64: vgic-its: Drop the translation cache reference only for the erased entry
KVM: SEV: Unmap and unpin the GHCB as needed on vCPU free
KVM: SEV: Decouple the need to sync the GHCB SA from the need to free the SA
KVM: SEV: Move sev_free_vcpu() down below sev_es_unmap_ghcb()
KVM: Don't WARN if memory is dirtied without a vCPU when the VM is dying
KVM: SEV: Read start/end indices of PSC requests exactly once per #VMGEXIT
KVM: SEV: Add an anonymous "psc" struct to track current PSC metadata
KVM: SEV: Make it more obvious when KVM is writing back the current PSC index
KVM: s390: Remove ptep_zap_softleaf_entry()
KVM: s390: Fix possible reference leak in fault-in code
KVM: s390: Prevent memslots outside the ASCE range
KVM: s390: Lock pte when making page secure
KVM: s390: Fix fault-in code
KVM: s390: vsie: Fix rmap handling in _do_shadow_crste()
KVM: s390: Fix guest / virtual address confusion in _essa_clear_cbrl()
...
| -rw-r--r-- | arch/arm64/include/asm/kvm_nested.h | 4 | ||||
| -rw-r--r-- | arch/arm64/kvm/at.c | 6 | ||||
| -rw-r--r-- | arch/arm64/kvm/hyp/include/hyp/switch.h | 2 | ||||
| -rw-r--r-- | arch/arm64/kvm/hyp/pgtable.c | 4 | ||||
| -rw-r--r-- | arch/arm64/kvm/nested.c | 33 | ||||
| -rw-r--r-- | arch/arm64/kvm/vgic/vgic-its.c | 6 | ||||
| -rw-r--r-- | arch/s390/include/asm/gmap_helpers.h | 1 | ||||
| -rw-r--r-- | arch/s390/kvm/faultin.c | 30 | ||||
| -rw-r--r-- | arch/s390/kvm/gaccess.c | 11 | ||||
| -rw-r--r-- | arch/s390/kvm/gmap.c | 19 | ||||
| -rw-r--r-- | arch/s390/kvm/gmap.h | 3 | ||||
| -rw-r--r-- | arch/s390/kvm/kvm-s390.c | 33 | ||||
| -rw-r--r-- | arch/s390/kvm/priv.c | 8 | ||||
| -rw-r--r-- | arch/s390/kvm/pv.c | 21 | ||||
| -rw-r--r-- | arch/s390/mm/gmap_helpers.c | 128 | ||||
| -rw-r--r-- | arch/x86/kvm/svm/sev.c | 238 | ||||
| -rw-r--r-- | arch/x86/kvm/svm/svm.h | 9 | ||||
| -rw-r--r-- | virt/kvm/kvm_main.c | 3 |
18 files changed, 337 insertions, 222 deletions
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index dc2957662ff2..cdf3e8422ea1 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -132,7 +132,7 @@ static inline bool kvm_s2_trans_exec_el0(struct kvm *kvm, struct kvm_s2_trans *t u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc); if (!kvm_has_xnx(kvm)) - xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10); + xn &= 0b10; switch (xn) { case 0b00: @@ -148,7 +148,7 @@ static inline bool kvm_s2_trans_exec_el1(struct kvm *kvm, struct kvm_s2_trans *t u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc); if (!kvm_has_xnx(kvm)) - xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10); + xn &= 0b10; switch (xn) { case 0b00: diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 9f8f0ae8e86e..889c2c15d7bd 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -1569,7 +1569,8 @@ int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) /* Do the stage-2 translation */ ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0)); out.esr = 0; - ret = kvm_walk_nested_s2(vcpu, ipa, &out); + scoped_guard(srcu, &vcpu->kvm->srcu) + ret = kvm_walk_nested_s2(vcpu, ipa, &out); if (ret < 0) return ret; @@ -1665,7 +1666,8 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) } /* Walk the guest's PT, looking for a match along the way */ - ret = walk_s1(vcpu, &wi, &wr, va); + scoped_guard(srcu, &vcpu->kvm->srcu) + ret = walk_s1(vcpu, &wi, &wr, va); switch (ret) { case -EINTR: /* We interrupted the walk on a match, return the level */ diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 320cd45d49c5..e9b36a3b27bb 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -181,6 +181,8 @@ static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu) val |= CPACR_EL1_ZEN; if (cpus_have_final_cap(ARM64_SME)) val |= CPACR_EL1_SMEN; + if (cpus_have_final_cap(ARM64_HAS_S1POE)) + val |= CPACR_EL1_E0POE; write_sysreg(val, cpacr_el1); } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 0c1defa5fb0f..91a7dfad6686 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -925,7 +925,9 @@ static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) static bool stage2_pte_executable(kvm_pte_t pte) { - return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); + enum kvm_pgtable_prot prot = kvm_pgtable_stage2_pte_prot(pte); + + return prot & (KVM_PGTABLE_PROT_UX | KVM_PGTABLE_PROT_PX); } static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 38f672e94087..6f7bc9a9992e 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -89,21 +89,28 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) * again, and there is no reason to affect the whole VM for this. */ num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU; - tmp = kvrealloc(kvm->arch.nested_mmus, - size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus), - GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!tmp) - return -ENOMEM; - swap(kvm->arch.nested_mmus, tmp); + if (num_mmus > kvm->arch.nested_mmus_size) { + tmp = kvcalloc(num_mmus, sizeof(*tmp), GFP_KERNEL_ACCOUNT); + if (!tmp) + return -ENOMEM; - /* - * If we went through a realocation, adjust the MMU back-pointers in - * the previously initialised kvm_pgtable structures. - */ - if (kvm->arch.nested_mmus != tmp) - for (int i = 0; i < kvm->arch.nested_mmus_size; i++) - kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i]; + write_lock(&kvm->mmu_lock); + + if (kvm->arch.nested_mmus_size) { + memcpy(tmp, kvm->arch.nested_mmus, + size_mul(sizeof(*tmp), kvm->arch.nested_mmus_size)); + + for (int i = 0; i < kvm->arch.nested_mmus_size; i++) + tmp[i].pgt->mmu = &tmp[i]; + } + + swap(kvm->arch.nested_mmus, tmp); + + write_unlock(&kvm->mmu_lock); + + kvfree(tmp); + } for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++) ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]); diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 1d7e5d560af4..1e3706ac3b8e 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -597,8 +597,10 @@ static void vgic_its_invalidate_cache(struct vgic_its *its) unsigned long idx; xa_for_each(&its->translation_cache, idx, irq) { - xa_erase(&its->translation_cache, idx); - vgic_put_irq(kvm, irq); + /* Only the context that erases the entry drops its cache ref. */ + irq = xa_erase(&its->translation_cache, idx); + if (irq) + vgic_put_irq(kvm, irq); } } diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h index 2d3ae421077e..d2b616604a46 100644 --- a/arch/s390/include/asm/gmap_helpers.h +++ b/arch/s390/include/asm/gmap_helpers.h @@ -12,5 +12,6 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr); void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end); int gmap_helper_disable_cow_sharing(void); void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr); +pte_t *try_get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); #endif /* _ASM_S390_GMAP_HELPERS_H */ diff --git a/arch/s390/kvm/faultin.c b/arch/s390/kvm/faultin.c index ddf0ca71f374..fee80047bd94 100644 --- a/arch/s390/kvm/faultin.c +++ b/arch/s390/kvm/faultin.c @@ -36,7 +36,8 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa struct kvm_s390_mmu_cache *mc = NULL; struct kvm_memory_slot *slot; unsigned long inv_seq; - int foll, rc = 0; + int rc = -EAGAIN; + int foll; foll = f->write_attempt ? FOLL_WRITE : 0; foll |= f->attempt_pfault ? FOLL_NOWAIT : 0; @@ -53,7 +54,14 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa return 0; } - while (1) { + if (!mc) { + local_mc = kvm_s390_new_mmu_cache(); + if (!local_mc) + return -ENOMEM; + mc = local_mc; + } + + while (rc == -EAGAIN) { f->valid = false; inv_seq = kvm->mmu_invalidate_seq; /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ @@ -93,14 +101,7 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa if (is_error_pfn(f->pfn)) return -EFAULT; - if (!mc) { - local_mc = kvm_s390_new_mmu_cache(); - if (!local_mc) - return -ENOMEM; - mc = local_mc; - } - - /* Loop, will automatically release the faulted page. */ + /* Loop, release the faulted page. */ if (mmu_invalidate_retry_gfn_unsafe(kvm, inv_seq, f->gfn)) { kvm_release_faultin_page(kvm, f->page, true, false); continue; @@ -110,20 +111,19 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa if (!mmu_invalidate_retry_gfn(kvm, inv_seq, f->gfn)) { f->valid = true; rc = gmap_link(mc, kvm->arch.gmap, f, slot); - kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt); - f->page = NULL; } + kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt); } - kvm_release_faultin_page(kvm, f->page, true, false); if (rc == -ENOMEM) { rc = kvm_s390_mmu_cache_topup(mc); if (rc) return rc; - } else if (rc != -EAGAIN) { - return rc; + rc = -EAGAIN; } } + + return rc; } int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 4f8d5592c9a9..20e28b183c1a 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1466,15 +1466,17 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni struct guest_fault *f, bool p) { union crste newcrste, oldcrste; - gfn_t gfn; + unsigned long mask; + gfn_t r_gfn; int rc; lockdep_assert_held(&sg->kvm->mmu_lock); lockdep_assert_held(&sg->parent->children_lock); - gfn = f->gfn & (is_pmd(*table) ? _SEGMENT_FR_MASK : _REGION3_FR_MASK); + mask = is_pmd(*table) ? _SEGMENT_FR_MASK : _REGION3_FR_MASK; + r_gfn = gpa_to_gfn(raddr) & mask; scoped_guard(spinlock, &sg->host_to_rmap_lock) - rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt); + rc = gmap_insert_rmap(sg, f->gfn & mask, r_gfn, host->h.tt); if (rc) return rc; @@ -1497,8 +1499,7 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni return -EAGAIN; newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p); - gfn = gpa_to_gfn(raddr); - while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce)) + while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, r_gfn, sg->asce)) ; return 0; } diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index 957126ab991c..52d55ddea8d4 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -395,15 +395,28 @@ static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct struct gmap_unmap_priv *priv = walk->priv; struct folio *folio = NULL; union crste old = *crstep; + bool ok; if (!old.h.fc) return 0; if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) folio = phys_to_folio(crste_origin_large(old)); - /* No races should happen because kvm->mmu_lock is held in write mode */ - KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn), - priv->gmap->kvm); + /* + * No races should happen because kvm->mmu_lock is held in write mode, + * but the unmap operation could have triggered an unshadow, which + * causes gmap_crstep_xchg_atomic() to return false and clear the + * vsie_notif bit. Allow the operation to fail once, if the old crste + * had the vsie_notif bit set. A second failure is not allowed, for + * the reasons above. + */ + ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn); + if (!ok) { + KVM_BUG_ON(!old.s.fc1.vsie_notif, priv->gmap->kvm); + old.s.fc1.vsie_notif = 0; + ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn); + KVM_BUG_ON(!ok, priv->gmap->kvm); + } if (folio) uv_convert_from_secure_folio(folio); diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h index 742e42a31744..5374f21aaf8d 100644 --- a/arch/s390/kvm/gmap.h +++ b/arch/s390/kvm/gmap.h @@ -273,11 +273,14 @@ static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, unio gmap_unmap_prefix(gmap, gfn, gfn + align); } if (crste_leaf(oldcrste) && crste_needs_unshadow(oldcrste, newcrste)) { + newcrste = oldcrste; newcrste.s.fc1.vsie_notif = 0; if (needs_lock) gmap_handle_vsie_unshadow_event(gmap, gfn); else _gmap_handle_vsie_unshadow_event(gmap, gfn); + dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce); + return false; } if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s) SetPageDirty(phys_to_page(crste_origin_large(newcrste))); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e09960c2e6ed..ffb20a64d328 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -999,7 +999,10 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att break; } case KVM_S390_VM_MEM_LIMIT_SIZE: { + struct kvm_memslots *slots; + struct kvm_memory_slot *ms; unsigned long new_limit; + int bkt; if (kvm_is_ucontrol(kvm)) return -EINVAL; @@ -1007,6 +1010,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (get_user(new_limit, (u64 __user *)attr->addr)) return -EFAULT; + guard(mutex)(&kvm->lock); + + new_limit = ALIGN(new_limit, HPAGE_SIZE); if (kvm->arch.mem_limit != KVM_S390_NO_MEM_LIMIT && new_limit > kvm->arch.mem_limit) return -E2BIG; @@ -1014,12 +1020,27 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - ret = -EBUSY; - if (!kvm->created_vcpus) - ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); + if (kvm->created_vcpus) + return -EBUSY; + + ret = 0; + scoped_guard(mutex, &kvm->slots_lock) { + slots = kvm_memslots(kvm); + if (slots && !kvm_memslots_empty(slots)) { + kvm_for_each_memslot(ms, bkt, slots) { + if (gpa_to_gfn(new_limit) < ms->base_gfn + ms->npages) { + ret = -EBUSY; + break; + } + } + } + if (!ret) + ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); + } + if (ret) + break; VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit); - VM_EVENT(kvm, 3, "New guest asce: 0x%p", - (void *)kvm->arch.gmap->asce.val); + VM_EVENT(kvm, 3, "New guest asce: 0x%p", (void *)kvm->arch.gmap->asce.val); break; } default: @@ -5672,6 +5693,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return -EINVAL; if ((new->base_gfn + new->npages) * PAGE_SIZE > kvm->arch.mem_limit) return -EINVAL; + if (!asce_contains_gfn(kvm->arch.gmap->asce, new->base_gfn + new->npages - 1)) + return -EINVAL; } if (!kvm->arch.migration_mode) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index cc0553da14cb..447ec7ed423d 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1188,6 +1188,7 @@ static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len union crste *crstep; union pgste pgste; union pte *ptep; + hva_t hva; int i; lockdep_assert_held(&vcpu->kvm->mmu_lock); @@ -1199,8 +1200,11 @@ static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len if (!ptep || ptep->s.pr) continue; pgste = pgste_get_lock(ptep); - if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) - gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]); + if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) { + hva = gpa_to_hva(vcpu->kvm, cbrl[i]); + if (!kvm_is_error_hva(hva)) + gmap_helper_zap_one_page(vcpu->kvm->mm, hva); + } pgste_set_unlock(ptep, pgste); } } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index c2dafd812a3b..4b865e75351c 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -17,6 +17,7 @@ #include <linux/pagewalk.h> #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> +#include <asm/gmap_helpers.h> #include "kvm-s390.h" #include "dat.h" #include "gaccess.h" @@ -73,6 +74,7 @@ static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_str struct pv_make_secure { void *uvcb; struct folio *folio; + struct kvm *kvm; int rc; bool needs_export; }; @@ -103,9 +105,21 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f) { struct pv_make_secure *priv = f->priv; struct folio *folio; + spinlock_t *ptl; /* pte lock from try_get_locked_pte() */ + pte_t *ptep; folio = pfn_folio(f->pfn); priv->rc = -EAGAIN; + + if (!mmap_read_trylock(priv->kvm->mm)) + return; + + ptep = try_get_locked_pte(priv->kvm->mm, gfn_to_hva(priv->kvm, f->gfn), &ptl); + if (IS_ERR_VALUE(ptep)) { + priv->rc = PTR_ERR(ptep); + goto out; + } + if (folio_trylock(folio)) { priv->rc = __kvm_s390_pv_make_secure(f, folio); if (priv->rc == -E2BIG || priv->rc == -EBUSY) { @@ -114,6 +128,11 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f) } folio_unlock(folio); } + + if (ptep) + pte_unmap_unlock(ptep, ptl); +out: + mmap_read_unlock(priv->kvm->mm); } /** @@ -127,7 +146,7 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f) */ int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb) { - struct pv_make_secure priv = { .uvcb = uvcb }; + struct pv_make_secure priv = { .uvcb = uvcb, .kvm = kvm, }; struct guest_fault f = { .write_attempt = true, .gfn = gpa_to_gfn(gaddr), diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index f8789ffcc05c..1cfe4724fbe2 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -17,22 +17,68 @@ #include <asm/gmap_helpers.h> /** - * ptep_zap_softleaf_entry() - discard a software leaf entry. + * try_get_locked_pte() - like get_locked_pte(), but atomic and with trylock * @mm: the mm - * @entry: the software leaf entry that needs to be zapped + * @vmaddr: the userspace virtual address whose pte is to be found + * @ptl: will be set to the pointer to the lock used to lock the pte in case + * of success. * - * Discards the given software leaf entry. If the leaf entry was an actual - * swap entry (and not a migration entry, for example), the actual swapped - * page is also discarded from swap. + * This function returns the pointer to the pte corresponding to @addr in @mm, + * similarly to get_locked_pte(). Unlike get_locked_pte(), no attempt is made + * to allocate missing page tables. If a missing or large entry is found, the + * function will return NULL. If the ptl lock is contended, %-EAGAIN is + * returned. + * + * In case of success, *@ptl will point to the locked pte lock for the returned + * pte, like get_locked_pte() does. + * + * Context: mmap_lock or vma lock for read or for write needs to be held. + * Return: + * * %NULL if the pte cannot be reached. + * * %-EAGAIN if the pte can be reached, but cannot be locked. + * * the pointer to the pte corresponding to @addr in @mm, if it can be reached + * and locked. */ -static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) +pte_t *try_get_locked_pte(struct mm_struct *mm, unsigned long vmaddr, spinlock_t **ptl) { - if (softleaf_is_swap(entry)) - dec_mm_counter(mm, MM_SWAPENTS); - else if (softleaf_is_migration(entry)) - dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); - swap_put_entries_direct(entry, 1); + pmd_t *pmdp, pmd, pmdval; + pud_t *pudp, pud; + p4d_t *p4dp, p4d; + pgd_t *pgdp, pgd; + pte_t *ptep; + + pgdp = pgd_offset(mm, vmaddr); + pgd = pgdp_get(pgdp); + if (pgd_none(pgd) || !pgd_present(pgd)) + return NULL; + p4dp = p4d_offset(pgdp, vmaddr); + p4d = p4dp_get(p4dp); + if (p4d_none(p4d) || !p4d_present(p4d)) + return NULL; + pudp = pud_offset(p4dp, vmaddr); + pud = pudp_get(pudp); + if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud)) + return NULL; + pmdp = pmd_offset(pudp, vmaddr); + pmd = pmdp_get_lockless(pmdp); + if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd)) + return NULL; + ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, ptl); + if (!ptep) + return NULL; + + if (spin_trylock(*ptl)) { + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmdp)))) { + pte_unmap_unlock(ptep, *ptl); + return ERR_PTR(-EAGAIN); + } + return ptep; + } + + pte_unmap(ptep); + return ERR_PTR(-EAGAIN); } +EXPORT_SYMBOL_GPL(try_get_locked_pte); /** * gmap_helper_zap_one_page() - discard a page if it was swapped. @@ -46,7 +92,8 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) { struct vm_area_struct *vma; - spinlock_t *ptl; + spinlock_t *ptl; /* Lock for the host (userspace) page table */ + softleaf_t sl; pte_t *ptep; mmap_assert_locked(mm); @@ -57,11 +104,13 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) return; /* Get pointer to the page table entry */ - ptep = get_locked_pte(mm, vmaddr, &ptl); - if (unlikely(!ptep)) + ptep = try_get_locked_pte(mm, vmaddr, &ptl); + if (IS_ERR_OR_NULL(ptep)) return; - if (pte_swap(*ptep)) { - ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); + sl = softleaf_from_pte(*ptep); + if (pte_swap(*ptep) && softleaf_is_swap(sl)) { + dec_mm_counter(mm, MM_SWAPENTS); + swap_put_entries_direct(sl, 1); pte_clear(mm, vmaddr, ptep); } pte_unmap_unlock(ptep, ptl); @@ -113,37 +162,9 @@ EXPORT_SYMBOL_GPL(gmap_helper_discard); */ void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr) { - pmd_t *pmdp, pmd, pmdval; - pud_t *pudp, pud; - p4d_t *p4dp, p4d; - pgd_t *pgdp, pgd; spinlock_t *ptl; /* Lock for the host (userspace) page table */ pte_t *ptep; - pgdp = pgd_offset(mm, vmaddr); - pgd = pgdp_get(pgdp); - if (pgd_none(pgd) || !pgd_present(pgd)) - return; - - p4dp = p4d_offset(pgdp, vmaddr); - p4d = p4dp_get(p4dp); - if (p4d_none(p4d) || !p4d_present(p4d)) - return; - - pudp = pud_offset(p4dp, vmaddr); - pud = pudp_get(pudp); - if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud)) - return; - - pmdp = pmd_offset(pudp, vmaddr); - pmd = pmdp_get_lockless(pmdp); - if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd)) - return; - - ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl); - if (!ptep) - return; - /* * Several paths exists that takes the ptl lock and then call the * mmu_notifier, which takes the mmu_lock. The unmap path, instead, @@ -156,21 +177,12 @@ void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr) * If the lock is contended the bit is not set and the deadlock is * avoided. */ - if (spin_trylock(ptl)) { - /* - * Make sure the pte we are touching is still the correct - * one. In theory this check should not be needed, but - * better safe than sorry. - * Disabling interrupts or holding the mmap lock is enough to - * guarantee that no concurrent updates to the page tables - * are possible. - */ - if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp)))) - __atomic64_or(_PAGE_UNUSED, (long *)ptep); - spin_unlock(ptl); - } + ptep = try_get_locked_pte(mm, vmaddr, &ptl); + if (IS_ERR_OR_NULL(ptep)) + return; - pte_unmap(ptep); + __atomic64_or(_PAGE_UNUSED, (long *)ptep); + pte_unmap_unlock(ptep, ptl); } EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 62b5befe0eed..6c6a6d663e29 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3313,37 +3313,6 @@ void sev_guest_memory_reclaimed(struct kvm *kvm) sev_writeback_caches(kvm); } -void sev_free_vcpu(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm; - - if (!is_sev_es_guest(vcpu)) - return; - - svm = to_svm(vcpu); - - /* - * If it's an SNP guest, then the VMSA was marked in the RMP table as - * a guest-owned page. Transition the page to hypervisor state before - * releasing it back to the system. - */ - if (is_sev_snp_guest(vcpu)) { - u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; - - if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) - goto skip_vmsa_free; - } - - if (vcpu->arch.guest_state_protected) - sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); - - __free_page(virt_to_page(svm->sev_es.vmsa)); - -skip_vmsa_free: - if (svm->sev_es.ghcb_sa_free) - kvfree(svm->sev_es.ghcb_sa); -} - static void dump_ghcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -3583,6 +3552,20 @@ vmgexit_err: return 1; } +static void __sev_es_unmap_ghcb(struct vcpu_svm *svm) +{ + if (svm->sev_es.ghcb_sa_free) { + kvfree(svm->sev_es.ghcb_sa); + svm->sev_es.ghcb_sa = NULL; + svm->sev_es.ghcb_sa_free = false; + } + + if (svm->sev_es.ghcb) { + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); + svm->sev_es.ghcb = NULL; + } +} + void sev_es_unmap_ghcb(struct vcpu_svm *svm) { /* Clear any indication that the vCPU is in a type of AP Reset Hold */ @@ -3591,31 +3574,51 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) if (!svm->sev_es.ghcb) return; - if (svm->sev_es.ghcb_sa_free) { - /* - * The scratch area lives outside the GHCB, so there is a - * buffer that, depending on the operation performed, may - * need to be synced, then freed. - */ - if (svm->sev_es.ghcb_sa_sync) { - kvm_write_guest(svm->vcpu.kvm, - svm->sev_es.sw_scratch, - svm->sev_es.ghcb_sa, - svm->sev_es.ghcb_sa_len); - svm->sev_es.ghcb_sa_sync = false; - } - - kvfree(svm->sev_es.ghcb_sa); - svm->sev_es.ghcb_sa = NULL; - svm->sev_es.ghcb_sa_free = false; + /* + * If the scratch area lives outside the GHCB, there's a buffer that, + * depending on the operation performed, may need to be synced. + */ + if (svm->sev_es.ghcb_sa_sync) { + kvm_write_guest(svm->vcpu.kvm, svm->sev_es.sw_scratch, + svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); + svm->sev_es.ghcb_sa_sync = false; } trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb); sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); - svm->sev_es.ghcb = NULL; + __sev_es_unmap_ghcb(svm); +} + +void sev_free_vcpu(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm; + + if (!is_sev_es_guest(vcpu)) + return; + + svm = to_svm(vcpu); + + /* + * If it's an SNP guest, then the VMSA was marked in the RMP table as + * a guest-owned page. Transition the page to hypervisor state before + * releasing it back to the system. + */ + if (is_sev_snp_guest(vcpu)) { + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) + goto skip_vmsa_free; + } + + if (vcpu->arch.guest_state_protected) + sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); + + __free_page(virt_to_page(svm->sev_es.vmsa)); + +skip_vmsa_free: + __sev_es_unmap_ghcb(svm); } int pre_sev_run(struct vcpu_svm *svm, int cpu) @@ -3685,6 +3688,8 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 min_len) goto e_scratch; } + WARN_ON_ONCE(svm->sev_es.ghcb_sa_sync || svm->sev_es.ghcb_sa_free); + if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) { /* Scratch area begins within GHCB */ ghcb_scratch_beg = control->ghcb_gpa + @@ -3706,6 +3711,8 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 min_len) scratch_va = (void *)svm->sev_es.ghcb; scratch_va += (scratch_gpa_beg - control->ghcb_gpa); + svm->sev_es.ghcb_sa_sync = false; + svm->sev_es.ghcb_sa_free = false; svm->sev_es.ghcb_sa_len = ghcb_scratch_end - scratch_gpa_beg; } else { /* GHCB v2 requires the scratch area to be within the GHCB. */ @@ -3841,13 +3848,11 @@ struct psc_buffer { struct psc_entry entries[]; } __packed; -static int snp_begin_psc(struct vcpu_svm *svm); +static int snp_do_psc(struct vcpu_svm *svm); static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) { - svm->sev_es.psc_inflight = 0; - svm->sev_es.psc_idx = 0; - svm->sev_es.psc_2m = false; + memset(&svm->sev_es.psc, 0, sizeof(svm->sev_es.psc)); /* * PSC requests always get a "no action" response in SW_EXITINFO1, with @@ -3860,9 +3865,8 @@ static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) static void __snp_complete_one_psc(struct vcpu_svm *svm) { - struct psc_buffer *psc = svm->sev_es.ghcb_sa; - struct psc_entry *entries = psc->entries; - struct psc_hdr *hdr = &psc->hdr; + struct vcpu_sev_es_state *sev_es = &svm->sev_es; + struct psc_buffer *guest_psc = sev_es->ghcb_sa; __u16 idx; /* @@ -3870,14 +3874,15 @@ static void __snp_complete_one_psc(struct vcpu_svm *svm) * corresponding entries in the guest's PSC buffer and zero out the * count of in-flight PSC entries. */ - for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight; - svm->sev_es.psc_inflight--, idx++) { - struct psc_entry entry = READ_ONCE(entries[idx]); + for (idx = sev_es->psc.cur_idx; sev_es->psc.batch_size; + sev_es->psc.batch_size--, idx++) { + struct psc_entry entry = READ_ONCE(guest_psc->entries[idx]); - entries[idx].cur_page = entry.pagesize ? 512 : 1; + guest_psc->entries[idx].cur_page = entry.pagesize ? 512 : 1; } - hdr->cur_entry = idx; + sev_es->psc.cur_idx = idx; + guest_psc->hdr.cur_entry = idx; } static int snp_complete_one_psc(struct kvm_vcpu *vcpu) @@ -3892,63 +3897,30 @@ static int snp_complete_one_psc(struct kvm_vcpu *vcpu) __snp_complete_one_psc(svm); /* Handle the next range (if any). */ - return snp_begin_psc(svm); + return snp_do_psc(svm); } -static int snp_begin_psc(struct vcpu_svm *svm) +static int snp_do_psc(struct vcpu_svm *svm) { struct vcpu_sev_es_state *sev_es = &svm->sev_es; - struct psc_buffer *psc = sev_es->ghcb_sa; - struct psc_entry *entries = psc->entries; + struct psc_buffer *guest_psc = sev_es->ghcb_sa; struct kvm_vcpu *vcpu = &svm->vcpu; - struct psc_hdr *hdr = &psc->hdr; struct psc_entry entry_start; - u16 idx, idx_start, idx_end, max_nr_entries; int npages; bool huge; u64 gfn; - - if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { - snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); - return 1; - } - - /* - * GHCB v2 requires the scratch area to reside within the GHCB itself, - * and PSC requests are only supported for GHCB v2+. Thus it should be - * impossible to exceed the max PSC entry count (which is derived from - * the size of the shared GHCB buffer). - */ - max_nr_entries = (sev_es->ghcb_sa_len - sizeof(struct psc_hdr)) / - sizeof(struct psc_entry); - if (WARN_ON_ONCE(max_nr_entries > VMGEXIT_PSC_MAX_COUNT)) { - snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); - return 1; - } + u16 idx; next_range: /* There should be no other PSCs in-flight at this point. */ - if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) { + if (WARN_ON_ONCE(svm->sev_es.psc.batch_size)) { snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); return 1; } - /* - * The PSC descriptor buffer can be modified by a misbehaved guest after - * validation, so take care to only use validated copies of values used - * for things like array indexing. - */ - idx_start = READ_ONCE(hdr->cur_entry); - idx_end = READ_ONCE(hdr->end_entry); - - if (idx_end >= max_nr_entries) { - snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); - return 1; - } - /* Find the start of the next range which needs processing. */ - for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) { - entry_start = READ_ONCE(entries[idx]); + for (idx = sev_es->psc.cur_idx; idx <= sev_es->psc.end_idx; idx++) { + entry_start = READ_ONCE(guest_psc->entries[idx]); gfn = entry_start.gfn; huge = entry_start.pagesize; @@ -3974,32 +3946,40 @@ next_range: if (npages) break; + + /* + * Increment the guest-visible index to communicate the current + * entry back to the guest, e.g. in case of failure. No need + * for READ_ONCE() as KVM doesn't consume the field, i.e. a + * misbehaving guest can only break itself. + */ + guest_psc->hdr.cur_entry++; } - if (idx > idx_end) { + if (idx > sev_es->psc.end_idx) { /* Nothing more to process. */ snp_complete_psc(svm, 0); return 1; } - svm->sev_es.psc_2m = huge; - svm->sev_es.psc_idx = idx; - svm->sev_es.psc_inflight = 1; + sev_es->psc.is_2m = huge; + sev_es->psc.cur_idx = idx; + sev_es->psc.batch_size = 1; /* * Find all subsequent PSC entries that contain adjacent GPA * ranges/operations and can be combined into a single * KVM_HC_MAP_GPA_RANGE exit. */ - while (++idx <= idx_end) { - struct psc_entry entry = READ_ONCE(entries[idx]); + while (++idx <= sev_es->psc.end_idx) { + struct psc_entry entry = READ_ONCE(guest_psc->entries[idx]); if (entry.operation != entry_start.operation || entry.gfn != entry_start.gfn + npages || entry.cur_page || !!entry.pagesize != huge) break; - svm->sev_es.psc_inflight++; + sev_es->psc.batch_size++; npages += huge ? 512 : 1; } @@ -4041,6 +4021,46 @@ next_range: BUG(); } +static int snp_begin_psc(struct vcpu_svm *svm) +{ + struct vcpu_sev_es_state *sev_es = &svm->sev_es; + struct psc_buffer *guest_psc = sev_es->ghcb_sa; + u16 max_nr_entries; + + if (!user_exit_on_hypercall(svm->vcpu.kvm, KVM_HC_MAP_GPA_RANGE)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + + /* + * GHCB v2 requires the scratch area to reside within the GHCB itself, + * and PSC requests are only supported for GHCB v2+. Thus it should be + * impossible to exceed the max PSC entry count (which is derived from + * the size of the shared GHCB buffer). + */ + max_nr_entries = (sev_es->ghcb_sa_len - sizeof(struct psc_hdr)) / + sizeof(struct psc_entry); + if (WARN_ON_ONCE(max_nr_entries > VMGEXIT_PSC_MAX_COUNT)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + + /* + * The PSC descriptor buffer can be modified by a misbehaved guest after + * validation, so take care to only use validated copies of values used + * for things like array indexing. + */ + sev_es->psc.cur_idx = READ_ONCE(guest_psc->hdr.cur_entry); + sev_es->psc.end_idx = READ_ONCE(guest_psc->hdr.end_entry); + + if (sev_es->psc.end_idx >= max_nr_entries) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); + return 1; + } + + return snp_do_psc(svm); +} + /* * Invoked as part of svm_vcpu_reset() processing of an init event. */ diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a10668d17a16..5137416be593 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -257,9 +257,12 @@ struct vcpu_sev_es_state { bool ghcb_sa_free; /* SNP Page-State-Change buffer entries currently being processed */ - u16 psc_idx; - u16 psc_inflight; - bool psc_2m; + struct { + u16 cur_idx; + u16 end_idx; + u16 batch_size; + bool is_2m; + } psc; u64 ghcb_registered_gpa; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 89489996fbc1..881f92d7a469 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3520,7 +3520,8 @@ void mark_page_dirty_in_slot(struct kvm *kvm, if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm)) return; - WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm)); + WARN_ON_ONCE(!vcpu && refcount_read(&kvm->users_count) && + !kvm_arch_allow_write_without_running_vcpu(kvm)); #endif if (memslot && kvm_slot_dirty_track_enabled(memslot)) { |
