diff options
| author | Paolo Bonzini <pbonzini@redhat.com> | 2026-06-03 16:46:31 +0200 |
|---|---|---|
| committer | Paolo Bonzini <pbonzini@redhat.com> | 2026-06-03 16:46:31 +0200 |
| commit | bd2e19cf8f3028620428c698f6783de5306a6342 (patch) | |
| tree | 0ed8b56f44881ba19b306b77cea74b20a5dbc010 | |
| parent | db38bcb3311053954f62b865cd2d86e164b04351 (diff) | |
| parent | c1edda54a0f713412f5914f9c9080856694bddca (diff) | |
Merge tag 'kvm-s390-master-7.1-3' of https://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into HEAD
KVM: s390: More gmap and vsie fixes
| -rw-r--r-- | arch/s390/include/asm/gmap_helpers.h | 1 | ||||
| -rw-r--r-- | arch/s390/kvm/faultin.c | 30 | ||||
| -rw-r--r-- | arch/s390/kvm/gaccess.c | 11 | ||||
| -rw-r--r-- | arch/s390/kvm/gmap.c | 19 | ||||
| -rw-r--r-- | arch/s390/kvm/gmap.h | 3 | ||||
| -rw-r--r-- | arch/s390/kvm/kvm-s390.c | 33 | ||||
| -rw-r--r-- | arch/s390/kvm/priv.c | 8 | ||||
| -rw-r--r-- | arch/s390/kvm/pv.c | 21 | ||||
| -rw-r--r-- | arch/s390/mm/gmap_helpers.c | 128 |
9 files changed, 165 insertions, 89 deletions
diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h index 2d3ae421077e..d2b616604a46 100644 --- a/arch/s390/include/asm/gmap_helpers.h +++ b/arch/s390/include/asm/gmap_helpers.h @@ -12,5 +12,6 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr); void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end); int gmap_helper_disable_cow_sharing(void); void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr); +pte_t *try_get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); #endif /* _ASM_S390_GMAP_HELPERS_H */ diff --git a/arch/s390/kvm/faultin.c b/arch/s390/kvm/faultin.c index ddf0ca71f374..fee80047bd94 100644 --- a/arch/s390/kvm/faultin.c +++ b/arch/s390/kvm/faultin.c @@ -36,7 +36,8 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa struct kvm_s390_mmu_cache *mc = NULL; struct kvm_memory_slot *slot; unsigned long inv_seq; - int foll, rc = 0; + int rc = -EAGAIN; + int foll; foll = f->write_attempt ? FOLL_WRITE : 0; foll |= f->attempt_pfault ? FOLL_NOWAIT : 0; @@ -53,7 +54,14 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa return 0; } - while (1) { + if (!mc) { + local_mc = kvm_s390_new_mmu_cache(); + if (!local_mc) + return -ENOMEM; + mc = local_mc; + } + + while (rc == -EAGAIN) { f->valid = false; inv_seq = kvm->mmu_invalidate_seq; /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ @@ -93,14 +101,7 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa if (is_error_pfn(f->pfn)) return -EFAULT; - if (!mc) { - local_mc = kvm_s390_new_mmu_cache(); - if (!local_mc) - return -ENOMEM; - mc = local_mc; - } - - /* Loop, will automatically release the faulted page. */ + /* Loop, release the faulted page. */ if (mmu_invalidate_retry_gfn_unsafe(kvm, inv_seq, f->gfn)) { kvm_release_faultin_page(kvm, f->page, true, false); continue; @@ -110,20 +111,19 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa if (!mmu_invalidate_retry_gfn(kvm, inv_seq, f->gfn)) { f->valid = true; rc = gmap_link(mc, kvm->arch.gmap, f, slot); - kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt); - f->page = NULL; } + kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt); } - kvm_release_faultin_page(kvm, f->page, true, false); if (rc == -ENOMEM) { rc = kvm_s390_mmu_cache_topup(mc); if (rc) return rc; - } else if (rc != -EAGAIN) { - return rc; + rc = -EAGAIN; } } + + return rc; } int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 4f8d5592c9a9..20e28b183c1a 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1466,15 +1466,17 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni struct guest_fault *f, bool p) { union crste newcrste, oldcrste; - gfn_t gfn; + unsigned long mask; + gfn_t r_gfn; int rc; lockdep_assert_held(&sg->kvm->mmu_lock); lockdep_assert_held(&sg->parent->children_lock); - gfn = f->gfn & (is_pmd(*table) ? _SEGMENT_FR_MASK : _REGION3_FR_MASK); + mask = is_pmd(*table) ? _SEGMENT_FR_MASK : _REGION3_FR_MASK; + r_gfn = gpa_to_gfn(raddr) & mask; scoped_guard(spinlock, &sg->host_to_rmap_lock) - rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt); + rc = gmap_insert_rmap(sg, f->gfn & mask, r_gfn, host->h.tt); if (rc) return rc; @@ -1497,8 +1499,7 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni return -EAGAIN; newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p); - gfn = gpa_to_gfn(raddr); - while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce)) + while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, r_gfn, sg->asce)) ; return 0; } diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index 957126ab991c..52d55ddea8d4 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -395,15 +395,28 @@ static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct struct gmap_unmap_priv *priv = walk->priv; struct folio *folio = NULL; union crste old = *crstep; + bool ok; if (!old.h.fc) return 0; if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) folio = phys_to_folio(crste_origin_large(old)); - /* No races should happen because kvm->mmu_lock is held in write mode */ - KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn), - priv->gmap->kvm); + /* + * No races should happen because kvm->mmu_lock is held in write mode, + * but the unmap operation could have triggered an unshadow, which + * causes gmap_crstep_xchg_atomic() to return false and clear the + * vsie_notif bit. Allow the operation to fail once, if the old crste + * had the vsie_notif bit set. A second failure is not allowed, for + * the reasons above. + */ + ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn); + if (!ok) { + KVM_BUG_ON(!old.s.fc1.vsie_notif, priv->gmap->kvm); + old.s.fc1.vsie_notif = 0; + ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn); + KVM_BUG_ON(!ok, priv->gmap->kvm); + } if (folio) uv_convert_from_secure_folio(folio); diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h index 742e42a31744..5374f21aaf8d 100644 --- a/arch/s390/kvm/gmap.h +++ b/arch/s390/kvm/gmap.h @@ -273,11 +273,14 @@ static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, unio gmap_unmap_prefix(gmap, gfn, gfn + align); } if (crste_leaf(oldcrste) && crste_needs_unshadow(oldcrste, newcrste)) { + newcrste = oldcrste; newcrste.s.fc1.vsie_notif = 0; if (needs_lock) gmap_handle_vsie_unshadow_event(gmap, gfn); else _gmap_handle_vsie_unshadow_event(gmap, gfn); + dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce); + return false; } if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s) SetPageDirty(phys_to_page(crste_origin_large(newcrste))); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e09960c2e6ed..ffb20a64d328 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -999,7 +999,10 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att break; } case KVM_S390_VM_MEM_LIMIT_SIZE: { + struct kvm_memslots *slots; + struct kvm_memory_slot *ms; unsigned long new_limit; + int bkt; if (kvm_is_ucontrol(kvm)) return -EINVAL; @@ -1007,6 +1010,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (get_user(new_limit, (u64 __user *)attr->addr)) return -EFAULT; + guard(mutex)(&kvm->lock); + + new_limit = ALIGN(new_limit, HPAGE_SIZE); if (kvm->arch.mem_limit != KVM_S390_NO_MEM_LIMIT && new_limit > kvm->arch.mem_limit) return -E2BIG; @@ -1014,12 +1020,27 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - ret = -EBUSY; - if (!kvm->created_vcpus) - ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); + if (kvm->created_vcpus) + return -EBUSY; + + ret = 0; + scoped_guard(mutex, &kvm->slots_lock) { + slots = kvm_memslots(kvm); + if (slots && !kvm_memslots_empty(slots)) { + kvm_for_each_memslot(ms, bkt, slots) { + if (gpa_to_gfn(new_limit) < ms->base_gfn + ms->npages) { + ret = -EBUSY; + break; + } + } + } + if (!ret) + ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); + } + if (ret) + break; VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit); - VM_EVENT(kvm, 3, "New guest asce: 0x%p", - (void *)kvm->arch.gmap->asce.val); + VM_EVENT(kvm, 3, "New guest asce: 0x%p", (void *)kvm->arch.gmap->asce.val); break; } default: @@ -5672,6 +5693,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return -EINVAL; if ((new->base_gfn + new->npages) * PAGE_SIZE > kvm->arch.mem_limit) return -EINVAL; + if (!asce_contains_gfn(kvm->arch.gmap->asce, new->base_gfn + new->npages - 1)) + return -EINVAL; } if (!kvm->arch.migration_mode) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index cc0553da14cb..447ec7ed423d 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1188,6 +1188,7 @@ static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len union crste *crstep; union pgste pgste; union pte *ptep; + hva_t hva; int i; lockdep_assert_held(&vcpu->kvm->mmu_lock); @@ -1199,8 +1200,11 @@ static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len if (!ptep || ptep->s.pr) continue; pgste = pgste_get_lock(ptep); - if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) - gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]); + if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) { + hva = gpa_to_hva(vcpu->kvm, cbrl[i]); + if (!kvm_is_error_hva(hva)) + gmap_helper_zap_one_page(vcpu->kvm->mm, hva); + } pgste_set_unlock(ptep, pgste); } } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index c2dafd812a3b..4b865e75351c 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -17,6 +17,7 @@ #include <linux/pagewalk.h> #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> +#include <asm/gmap_helpers.h> #include "kvm-s390.h" #include "dat.h" #include "gaccess.h" @@ -73,6 +74,7 @@ static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_str struct pv_make_secure { void *uvcb; struct folio *folio; + struct kvm *kvm; int rc; bool needs_export; }; @@ -103,9 +105,21 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f) { struct pv_make_secure *priv = f->priv; struct folio *folio; + spinlock_t *ptl; /* pte lock from try_get_locked_pte() */ + pte_t *ptep; folio = pfn_folio(f->pfn); priv->rc = -EAGAIN; + + if (!mmap_read_trylock(priv->kvm->mm)) + return; + + ptep = try_get_locked_pte(priv->kvm->mm, gfn_to_hva(priv->kvm, f->gfn), &ptl); + if (IS_ERR_VALUE(ptep)) { + priv->rc = PTR_ERR(ptep); + goto out; + } + if (folio_trylock(folio)) { priv->rc = __kvm_s390_pv_make_secure(f, folio); if (priv->rc == -E2BIG || priv->rc == -EBUSY) { @@ -114,6 +128,11 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f) } folio_unlock(folio); } + + if (ptep) + pte_unmap_unlock(ptep, ptl); +out: + mmap_read_unlock(priv->kvm->mm); } /** @@ -127,7 +146,7 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f) */ int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb) { - struct pv_make_secure priv = { .uvcb = uvcb }; + struct pv_make_secure priv = { .uvcb = uvcb, .kvm = kvm, }; struct guest_fault f = { .write_attempt = true, .gfn = gpa_to_gfn(gaddr), diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index f8789ffcc05c..1cfe4724fbe2 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -17,22 +17,68 @@ #include <asm/gmap_helpers.h> /** - * ptep_zap_softleaf_entry() - discard a software leaf entry. + * try_get_locked_pte() - like get_locked_pte(), but atomic and with trylock * @mm: the mm - * @entry: the software leaf entry that needs to be zapped + * @vmaddr: the userspace virtual address whose pte is to be found + * @ptl: will be set to the pointer to the lock used to lock the pte in case + * of success. * - * Discards the given software leaf entry. If the leaf entry was an actual - * swap entry (and not a migration entry, for example), the actual swapped - * page is also discarded from swap. + * This function returns the pointer to the pte corresponding to @addr in @mm, + * similarly to get_locked_pte(). Unlike get_locked_pte(), no attempt is made + * to allocate missing page tables. If a missing or large entry is found, the + * function will return NULL. If the ptl lock is contended, %-EAGAIN is + * returned. + * + * In case of success, *@ptl will point to the locked pte lock for the returned + * pte, like get_locked_pte() does. + * + * Context: mmap_lock or vma lock for read or for write needs to be held. + * Return: + * * %NULL if the pte cannot be reached. + * * %-EAGAIN if the pte can be reached, but cannot be locked. + * * the pointer to the pte corresponding to @addr in @mm, if it can be reached + * and locked. */ -static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) +pte_t *try_get_locked_pte(struct mm_struct *mm, unsigned long vmaddr, spinlock_t **ptl) { - if (softleaf_is_swap(entry)) - dec_mm_counter(mm, MM_SWAPENTS); - else if (softleaf_is_migration(entry)) - dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); - swap_put_entries_direct(entry, 1); + pmd_t *pmdp, pmd, pmdval; + pud_t *pudp, pud; + p4d_t *p4dp, p4d; + pgd_t *pgdp, pgd; + pte_t *ptep; + + pgdp = pgd_offset(mm, vmaddr); + pgd = pgdp_get(pgdp); + if (pgd_none(pgd) || !pgd_present(pgd)) + return NULL; + p4dp = p4d_offset(pgdp, vmaddr); + p4d = p4dp_get(p4dp); + if (p4d_none(p4d) || !p4d_present(p4d)) + return NULL; + pudp = pud_offset(p4dp, vmaddr); + pud = pudp_get(pudp); + if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud)) + return NULL; + pmdp = pmd_offset(pudp, vmaddr); + pmd = pmdp_get_lockless(pmdp); + if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd)) + return NULL; + ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, ptl); + if (!ptep) + return NULL; + + if (spin_trylock(*ptl)) { + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmdp)))) { + pte_unmap_unlock(ptep, *ptl); + return ERR_PTR(-EAGAIN); + } + return ptep; + } + + pte_unmap(ptep); + return ERR_PTR(-EAGAIN); } +EXPORT_SYMBOL_GPL(try_get_locked_pte); /** * gmap_helper_zap_one_page() - discard a page if it was swapped. @@ -46,7 +92,8 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) { struct vm_area_struct *vma; - spinlock_t *ptl; + spinlock_t *ptl; /* Lock for the host (userspace) page table */ + softleaf_t sl; pte_t *ptep; mmap_assert_locked(mm); @@ -57,11 +104,13 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) return; /* Get pointer to the page table entry */ - ptep = get_locked_pte(mm, vmaddr, &ptl); - if (unlikely(!ptep)) + ptep = try_get_locked_pte(mm, vmaddr, &ptl); + if (IS_ERR_OR_NULL(ptep)) return; - if (pte_swap(*ptep)) { - ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); + sl = softleaf_from_pte(*ptep); + if (pte_swap(*ptep) && softleaf_is_swap(sl)) { + dec_mm_counter(mm, MM_SWAPENTS); + swap_put_entries_direct(sl, 1); pte_clear(mm, vmaddr, ptep); } pte_unmap_unlock(ptep, ptl); @@ -113,37 +162,9 @@ EXPORT_SYMBOL_GPL(gmap_helper_discard); */ void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr) { - pmd_t *pmdp, pmd, pmdval; - pud_t *pudp, pud; - p4d_t *p4dp, p4d; - pgd_t *pgdp, pgd; spinlock_t *ptl; /* Lock for the host (userspace) page table */ pte_t *ptep; - pgdp = pgd_offset(mm, vmaddr); - pgd = pgdp_get(pgdp); - if (pgd_none(pgd) || !pgd_present(pgd)) - return; - - p4dp = p4d_offset(pgdp, vmaddr); - p4d = p4dp_get(p4dp); - if (p4d_none(p4d) || !p4d_present(p4d)) - return; - - pudp = pud_offset(p4dp, vmaddr); - pud = pudp_get(pudp); - if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud)) - return; - - pmdp = pmd_offset(pudp, vmaddr); - pmd = pmdp_get_lockless(pmdp); - if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd)) - return; - - ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl); - if (!ptep) - return; - /* * Several paths exists that takes the ptl lock and then call the * mmu_notifier, which takes the mmu_lock. The unmap path, instead, @@ -156,21 +177,12 @@ void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr) * If the lock is contended the bit is not set and the deadlock is * avoided. */ - if (spin_trylock(ptl)) { - /* - * Make sure the pte we are touching is still the correct - * one. In theory this check should not be needed, but - * better safe than sorry. - * Disabling interrupts or holding the mmap lock is enough to - * guarantee that no concurrent updates to the page tables - * are possible. - */ - if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp)))) - __atomic64_or(_PAGE_UNUSED, (long *)ptep); - spin_unlock(ptl); - } + ptep = try_get_locked_pte(mm, vmaddr, &ptl); + if (IS_ERR_OR_NULL(ptep)) + return; - pte_unmap(ptep); + __atomic64_or(_PAGE_UNUSED, (long *)ptep); + pte_unmap_unlock(ptep, ptl); } EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused); |
