From 29e8751c1dd278262fb4cd234e8909287d4189d4 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 2 Jun 2026 16:23:47 +0200 Subject: KVM: s390: Fix _gmap_unmap_crste() In _gmap_unmap_crste(), the crste to be unmapped is zapped calling gmap_crstep_xchg_atomic() exactly once, and expecting it to succeed. This is a reasonable sanity check, since kvm->mmu_lock is being held in write mode, and thus no races should be possible. An upcoming patch will change the behaviour of gmap_crstep_xchg_atomic() to return false and clear the vsie_notif bit if the operation triggers an unshadow operation. With the new behaviour, an unmap operation that triggers an unshadow would cause the VM to be killed. Prepare for the change by checking if the vsie_notif bit was set in the old crste if gmap_crstep_xchg_atomic() fails the first time, and try a second time. The second time no failures are allowed. Fixes: b827ef02f409 ("KVM: s390: Remove non-atomic dat_crstep_xchg()") Fixes: a2c17f9270cc ("KVM: s390: New gmap code") Signed-off-by: Claudio Imbrenda Message-ID: <20260602142356.169458-2-imbrenda@linux.ibm.com> --- arch/s390/kvm/gmap.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index 957126ab991c..52d55ddea8d4 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -395,15 +395,28 @@ static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct struct gmap_unmap_priv *priv = walk->priv; struct folio *folio = NULL; union crste old = *crstep; + bool ok; if (!old.h.fc) return 0; if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) folio = phys_to_folio(crste_origin_large(old)); - /* No races should happen because kvm->mmu_lock is held in write mode */ - KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn), - priv->gmap->kvm); + /* + * No races should happen because kvm->mmu_lock is held in write mode, + * but the unmap operation could have triggered an unshadow, which + * causes gmap_crstep_xchg_atomic() to return false and clear the + * vsie_notif bit. Allow the operation to fail once, if the old crste + * had the vsie_notif bit set. A second failure is not allowed, for + * the reasons above. + */ + ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn); + if (!ok) { + KVM_BUG_ON(!old.s.fc1.vsie_notif, priv->gmap->kvm); + old.s.fc1.vsie_notif = 0; + ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn); + KVM_BUG_ON(!ok, priv->gmap->kvm); + } if (folio) uv_convert_from_secure_folio(folio); -- cgit v1.2.3 From d1adc098ce08893c92fce3db63f7bb750fbb4c30 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 2 Jun 2026 16:23:48 +0200 Subject: KVM: s390: Fix _gmap_crstep_xchg_atomic() The previous incorrect behaviour cleared the vsie_notif bit without returning false, which allowed shadow crstes to be installed without the vsie_notif bit. Return false and do not perform the operation if an unshadow event has been triggered, but still attempt to clear the vsie_notif bit from the existing crste. This will prevent the installation of shadow crstes without vsie_notif bit and will also prevent the caller from looping forever if it was not checking for the sg->invalidated flag. Fixes: b827ef02f409 ("KVM: s390: Remove non-atomic dat_crstep_xchg()") Fixes: a2c17f9270cc ("KVM: s390: New gmap code") Signed-off-by: Claudio Imbrenda Message-ID: <20260602142356.169458-3-imbrenda@linux.ibm.com> --- arch/s390/kvm/gmap.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h index 742e42a31744..5374f21aaf8d 100644 --- a/arch/s390/kvm/gmap.h +++ b/arch/s390/kvm/gmap.h @@ -273,11 +273,14 @@ static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, unio gmap_unmap_prefix(gmap, gfn, gfn + align); } if (crste_leaf(oldcrste) && crste_needs_unshadow(oldcrste, newcrste)) { + newcrste = oldcrste; newcrste.s.fc1.vsie_notif = 0; if (needs_lock) gmap_handle_vsie_unshadow_event(gmap, gfn); else _gmap_handle_vsie_unshadow_event(gmap, gfn); + dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce); + return false; } if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s) SetPageDirty(phys_to_page(crste_origin_large(newcrste))); -- cgit v1.2.3 From 89fa757931dc0bcd64ef22b28d1d5ad00c5d02f4 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 2 Jun 2026 16:23:49 +0200 Subject: KVM: s390: Avoid potentially sleeping while atomic when zapping pages Factor out try_get_locked_pte(), which behaves similarly to get_locked_pte(), but does not attempt to allocate missing tables and performs a spin_trylock() instead of blocking. The new function is also exported, since it will be used in other patches. If intermediate entries are missing, there can be no pte swap entry to free, so it's safe to ignore them. This avoids potentially sleeping while atomic. Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Signed-off-by: Claudio Imbrenda Message-ID: <20260602142356.169458-4-imbrenda@linux.ibm.com> --- arch/s390/include/asm/gmap_helpers.h | 1 + arch/s390/mm/gmap_helpers.c | 117 +++++++++++++++++++++-------------- 2 files changed, 73 insertions(+), 45 deletions(-) diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h index 2d3ae421077e..d2b616604a46 100644 --- a/arch/s390/include/asm/gmap_helpers.h +++ b/arch/s390/include/asm/gmap_helpers.h @@ -12,5 +12,6 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr); void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end); int gmap_helper_disable_cow_sharing(void); void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr); +pte_t *try_get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); #endif /* _ASM_S390_GMAP_HELPERS_H */ diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index f8789ffcc05c..396207163ca6 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -34,6 +34,70 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) swap_put_entries_direct(entry, 1); } +/** + * try_get_locked_pte() - like get_locked_pte(), but atomic and with trylock + * @mm: the mm + * @vmaddr: the userspace virtual address whose pte is to be found + * @ptl: will be set to the pointer to the lock used to lock the pte in case + * of success. + * + * This function returns the pointer to the pte corresponding to @addr in @mm, + * similarly to get_locked_pte(). Unlike get_locked_pte(), no attempt is made + * to allocate missing page tables. If a missing or large entry is found, the + * function will return NULL. If the ptl lock is contended, %-EAGAIN is + * returned. + * + * In case of success, *@ptl will point to the locked pte lock for the returned + * pte, like get_locked_pte() does. + * + * Context: mmap_lock or vma lock for read or for write needs to be held. + * Return: + * * %NULL if the pte cannot be reached. + * * %-EAGAIN if the pte can be reached, but cannot be locked. + * * the pointer to the pte corresponding to @addr in @mm, if it can be reached + * and locked. + */ +pte_t *try_get_locked_pte(struct mm_struct *mm, unsigned long vmaddr, spinlock_t **ptl) +{ + pmd_t *pmdp, pmd, pmdval; + pud_t *pudp, pud; + p4d_t *p4dp, p4d; + pgd_t *pgdp, pgd; + pte_t *ptep; + + pgdp = pgd_offset(mm, vmaddr); + pgd = pgdp_get(pgdp); + if (pgd_none(pgd) || !pgd_present(pgd)) + return NULL; + p4dp = p4d_offset(pgdp, vmaddr); + p4d = p4dp_get(p4dp); + if (p4d_none(p4d) || !p4d_present(p4d)) + return NULL; + pudp = pud_offset(p4dp, vmaddr); + pud = pudp_get(pudp); + if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud)) + return NULL; + pmdp = pmd_offset(pudp, vmaddr); + pmd = pmdp_get_lockless(pmdp); + if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd)) + return NULL; + ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, ptl); + if (!ptep) + return NULL; + + if (spin_trylock(*ptl)) { + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmdp)))) { + pte_unmap_unlock(ptep, *ptl); + return ERR_PTR(-EAGAIN); + } + return ptep; + } + + pte_unmap(ptep); + return ERR_PTR(-EAGAIN); +} +EXPORT_SYMBOL_GPL(try_get_locked_pte); + /** * gmap_helper_zap_one_page() - discard a page if it was swapped. * @mm: the mm @@ -46,7 +110,7 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) { struct vm_area_struct *vma; - spinlock_t *ptl; + spinlock_t *ptl; /* Lock for the host (userspace) page table */ pte_t *ptep; mmap_assert_locked(mm); @@ -57,8 +121,8 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) return; /* Get pointer to the page table entry */ - ptep = get_locked_pte(mm, vmaddr, &ptl); - if (unlikely(!ptep)) + ptep = try_get_locked_pte(mm, vmaddr, &ptl); + if (IS_ERR_OR_NULL(ptep)) return; if (pte_swap(*ptep)) { ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); @@ -113,37 +177,9 @@ EXPORT_SYMBOL_GPL(gmap_helper_discard); */ void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr) { - pmd_t *pmdp, pmd, pmdval; - pud_t *pudp, pud; - p4d_t *p4dp, p4d; - pgd_t *pgdp, pgd; spinlock_t *ptl; /* Lock for the host (userspace) page table */ pte_t *ptep; - pgdp = pgd_offset(mm, vmaddr); - pgd = pgdp_get(pgdp); - if (pgd_none(pgd) || !pgd_present(pgd)) - return; - - p4dp = p4d_offset(pgdp, vmaddr); - p4d = p4dp_get(p4dp); - if (p4d_none(p4d) || !p4d_present(p4d)) - return; - - pudp = pud_offset(p4dp, vmaddr); - pud = pudp_get(pudp); - if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud)) - return; - - pmdp = pmd_offset(pudp, vmaddr); - pmd = pmdp_get_lockless(pmdp); - if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd)) - return; - - ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl); - if (!ptep) - return; - /* * Several paths exists that takes the ptl lock and then call the * mmu_notifier, which takes the mmu_lock. The unmap path, instead, @@ -156,21 +192,12 @@ void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr) * If the lock is contended the bit is not set and the deadlock is * avoided. */ - if (spin_trylock(ptl)) { - /* - * Make sure the pte we are touching is still the correct - * one. In theory this check should not be needed, but - * better safe than sorry. - * Disabling interrupts or holding the mmap lock is enough to - * guarantee that no concurrent updates to the page tables - * are possible. - */ - if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp)))) - __atomic64_or(_PAGE_UNUSED, (long *)ptep); - spin_unlock(ptl); - } + ptep = try_get_locked_pte(mm, vmaddr, &ptl); + if (IS_ERR_OR_NULL(ptep)) + return; - pte_unmap(ptep); + __atomic64_or(_PAGE_UNUSED, (long *)ptep); + pte_unmap_unlock(ptep, ptl); } EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused); -- cgit v1.2.3 From ca42c16638d5570c44842ff68a772c62e6dd0124 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 2 Jun 2026 16:23:50 +0200 Subject: KVM: s390: Fix guest / virtual address confusion in _essa_clear_cbrl() Until now, gmap_helper_zap_one_page() was being called with the guest absolute address, but it expects a userspace virtual address. This meant that in the best case the requested pages were not being discarded, and in the worst case that the wrong pages were being discarded. Fix this by converting the guest absolute address to host virtual before passing it to gmap_helper_zap_one_page(). Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Signed-off-by: Claudio Imbrenda Message-ID: <20260602142356.169458-5-imbrenda@linux.ibm.com> --- arch/s390/kvm/priv.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index cc0553da14cb..447ec7ed423d 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1188,6 +1188,7 @@ static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len union crste *crstep; union pgste pgste; union pte *ptep; + hva_t hva; int i; lockdep_assert_held(&vcpu->kvm->mmu_lock); @@ -1199,8 +1200,11 @@ static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len if (!ptep || ptep->s.pr) continue; pgste = pgste_get_lock(ptep); - if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) - gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]); + if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) { + hva = gpa_to_hva(vcpu->kvm, cbrl[i]); + if (!kvm_is_error_hva(hva)) + gmap_helper_zap_one_page(vcpu->kvm->mm, hva); + } pgste_set_unlock(ptep, pgste); } } -- cgit v1.2.3 From 6af5563e827913d3e14dbf12eefd5af4aa592739 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 2 Jun 2026 16:23:51 +0200 Subject: KVM: s390: vsie: Fix rmap handling in _do_shadow_crste() Fix _do_shadow_crste() to also apply a mask on the reverse address, to prevent spurious entries from being created, like already done in gmap_protect_rmap(). Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Signed-off-by: Claudio Imbrenda Message-ID: <20260602142356.169458-6-imbrenda@linux.ibm.com> --- arch/s390/kvm/gaccess.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 4f8d5592c9a9..20e28b183c1a 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1466,15 +1466,17 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni struct guest_fault *f, bool p) { union crste newcrste, oldcrste; - gfn_t gfn; + unsigned long mask; + gfn_t r_gfn; int rc; lockdep_assert_held(&sg->kvm->mmu_lock); lockdep_assert_held(&sg->parent->children_lock); - gfn = f->gfn & (is_pmd(*table) ? _SEGMENT_FR_MASK : _REGION3_FR_MASK); + mask = is_pmd(*table) ? _SEGMENT_FR_MASK : _REGION3_FR_MASK; + r_gfn = gpa_to_gfn(raddr) & mask; scoped_guard(spinlock, &sg->host_to_rmap_lock) - rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt); + rc = gmap_insert_rmap(sg, f->gfn & mask, r_gfn, host->h.tt); if (rc) return rc; @@ -1497,8 +1499,7 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni return -EAGAIN; newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p); - gfn = gpa_to_gfn(raddr); - while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce)) + while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, r_gfn, sg->asce)) ; return 0; } -- cgit v1.2.3 From 9a1dfbbd3506c4f7159feab5ef27434e80256fa5 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 2 Jun 2026 16:23:52 +0200 Subject: KVM: s390: Fix fault-in code Fix the fault-in code so that it does not return success if a concurrent unmap event invalidated the fault-in process between the best-effort lockless check and the proper check with lock. The new behaviour is to retry, like the best-effort lockless check already did. This prevents the fault-in handler from returning success without having actually faulted in the requested page. Fixes: e907ae530133 ("KVM: s390: Add helper functions for fault handling") Reviewed-by: Steffen Eiden Signed-off-by: Claudio Imbrenda Message-ID: <20260602142356.169458-7-imbrenda@linux.ibm.com> --- arch/s390/kvm/faultin.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/s390/kvm/faultin.c b/arch/s390/kvm/faultin.c index ddf0ca71f374..cf542b0a7e8e 100644 --- a/arch/s390/kvm/faultin.c +++ b/arch/s390/kvm/faultin.c @@ -36,7 +36,8 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa struct kvm_s390_mmu_cache *mc = NULL; struct kvm_memory_slot *slot; unsigned long inv_seq; - int foll, rc = 0; + int rc = -EAGAIN; + int foll; foll = f->write_attempt ? FOLL_WRITE : 0; foll |= f->attempt_pfault ? FOLL_NOWAIT : 0; @@ -53,7 +54,7 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa return 0; } - while (1) { + while (rc == -EAGAIN) { f->valid = false; inv_seq = kvm->mmu_invalidate_seq; /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ @@ -110,20 +111,19 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa if (!mmu_invalidate_retry_gfn(kvm, inv_seq, f->gfn)) { f->valid = true; rc = gmap_link(mc, kvm->arch.gmap, f, slot); - kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt); - f->page = NULL; } + kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt); } - kvm_release_faultin_page(kvm, f->page, true, false); if (rc == -ENOMEM) { rc = kvm_s390_mmu_cache_topup(mc); if (rc) return rc; - } else if (rc != -EAGAIN) { - return rc; + rc = -EAGAIN; } } + + return rc; } int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w) -- cgit v1.2.3 From 42546fc642e929de07459ff839a9f43a653ffb4e Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 2 Jun 2026 16:23:53 +0200 Subject: KVM: s390: Lock pte when making page secure Make sure _kvm_s390_pv_make_secure() takes the pte lock for the given address when attempting to make the page secure. One of the steps in making the page secure is freezing the folio using folio_ref_freeze(), which temporarily sets the reference count to 0. Any attempt to get such a folio while frozen will fail and cause a warning to be printed. Other users of folio_ref_freeze() make sure that the page is not mapped while it's being frozen, thus preventing gup functions from being able to access it. For _kvm_s390_pv_make_secure(), this is not possible, because the page needs to be mapped in order for the import to succeed. By taking the pte lock, gup functions will be blocked until the import operation is done, thus avoiding the race. In theory this does not completely solve the issue: if a page is mapped through multiple mappings, locking one pte does not protect from calling gup on it through the other mapping. In practice this does not happen and it is a decent stopgap solution until a more correct solution is available. Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Signed-off-by: Claudio Imbrenda Message-ID: <20260602142356.169458-8-imbrenda@linux.ibm.com> --- arch/s390/kvm/pv.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index c2dafd812a3b..4b865e75351c 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "dat.h" #include "gaccess.h" @@ -73,6 +74,7 @@ static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_str struct pv_make_secure { void *uvcb; struct folio *folio; + struct kvm *kvm; int rc; bool needs_export; }; @@ -103,9 +105,21 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f) { struct pv_make_secure *priv = f->priv; struct folio *folio; + spinlock_t *ptl; /* pte lock from try_get_locked_pte() */ + pte_t *ptep; folio = pfn_folio(f->pfn); priv->rc = -EAGAIN; + + if (!mmap_read_trylock(priv->kvm->mm)) + return; + + ptep = try_get_locked_pte(priv->kvm->mm, gfn_to_hva(priv->kvm, f->gfn), &ptl); + if (IS_ERR_VALUE(ptep)) { + priv->rc = PTR_ERR(ptep); + goto out; + } + if (folio_trylock(folio)) { priv->rc = __kvm_s390_pv_make_secure(f, folio); if (priv->rc == -E2BIG || priv->rc == -EBUSY) { @@ -114,6 +128,11 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f) } folio_unlock(folio); } + + if (ptep) + pte_unmap_unlock(ptep, ptl); +out: + mmap_read_unlock(priv->kvm->mm); } /** @@ -127,7 +146,7 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f) */ int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb) { - struct pv_make_secure priv = { .uvcb = uvcb }; + struct pv_make_secure priv = { .uvcb = uvcb, .kvm = kvm, }; struct guest_fault f = { .write_attempt = true, .gfn = gpa_to_gfn(gaddr), -- cgit v1.2.3 From 7b53b4801884d48551f26a17f00cdc0ae4640d64 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 2 Jun 2026 16:23:54 +0200 Subject: KVM: s390: Prevent memslots outside the ASCE range With KVM_S390_VM_MEM_LIMIT_SIZE, userspace can set the highest address allowed for the VM. Creating a memslot that lies over the maximum address does not make sense and is only a potential source of bugs. Prevent creation of memslots over the maximum address, and prevent the maximum address from being reduced below the end of existing memslots. Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Signed-off-by: Claudio Imbrenda Message-ID: <20260602142356.169458-9-imbrenda@linux.ibm.com> --- arch/s390/kvm/kvm-s390.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e09960c2e6ed..ffb20a64d328 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -999,7 +999,10 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att break; } case KVM_S390_VM_MEM_LIMIT_SIZE: { + struct kvm_memslots *slots; + struct kvm_memory_slot *ms; unsigned long new_limit; + int bkt; if (kvm_is_ucontrol(kvm)) return -EINVAL; @@ -1007,6 +1010,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (get_user(new_limit, (u64 __user *)attr->addr)) return -EFAULT; + guard(mutex)(&kvm->lock); + + new_limit = ALIGN(new_limit, HPAGE_SIZE); if (kvm->arch.mem_limit != KVM_S390_NO_MEM_LIMIT && new_limit > kvm->arch.mem_limit) return -E2BIG; @@ -1014,12 +1020,27 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - ret = -EBUSY; - if (!kvm->created_vcpus) - ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); + if (kvm->created_vcpus) + return -EBUSY; + + ret = 0; + scoped_guard(mutex, &kvm->slots_lock) { + slots = kvm_memslots(kvm); + if (slots && !kvm_memslots_empty(slots)) { + kvm_for_each_memslot(ms, bkt, slots) { + if (gpa_to_gfn(new_limit) < ms->base_gfn + ms->npages) { + ret = -EBUSY; + break; + } + } + } + if (!ret) + ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); + } + if (ret) + break; VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit); - VM_EVENT(kvm, 3, "New guest asce: 0x%p", - (void *)kvm->arch.gmap->asce.val); + VM_EVENT(kvm, 3, "New guest asce: 0x%p", (void *)kvm->arch.gmap->asce.val); break; } default: @@ -5672,6 +5693,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return -EINVAL; if ((new->base_gfn + new->npages) * PAGE_SIZE > kvm->arch.mem_limit) return -EINVAL; + if (!asce_contains_gfn(kvm->arch.gmap->asce, new->base_gfn + new->npages - 1)) + return -EINVAL; } if (!kvm->arch.migration_mode) -- cgit v1.2.3 From 6ae67dcac742529210a23dac6c9a7de1acfb52a3 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 2 Jun 2026 16:23:55 +0200 Subject: KVM: s390: Fix possible reference leak in fault-in code If kvm_s390_new_mmu_cache() fails, kvm_s390_faultin_gfn() returns without releasing the faulted page. Fix this by moving the allocation of the memory cache outside of the loop. There is no reason to check at every iteration. Opportunistically fix a comment. Fixes: e907ae530133 ("KVM: s390: Add helper functions for fault handling") Signed-off-by: Claudio Imbrenda Message-ID: <20260602142356.169458-10-imbrenda@linux.ibm.com> --- arch/s390/kvm/faultin.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/s390/kvm/faultin.c b/arch/s390/kvm/faultin.c index cf542b0a7e8e..fee80047bd94 100644 --- a/arch/s390/kvm/faultin.c +++ b/arch/s390/kvm/faultin.c @@ -54,6 +54,13 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa return 0; } + if (!mc) { + local_mc = kvm_s390_new_mmu_cache(); + if (!local_mc) + return -ENOMEM; + mc = local_mc; + } + while (rc == -EAGAIN) { f->valid = false; inv_seq = kvm->mmu_invalidate_seq; @@ -94,14 +101,7 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa if (is_error_pfn(f->pfn)) return -EFAULT; - if (!mc) { - local_mc = kvm_s390_new_mmu_cache(); - if (!local_mc) - return -ENOMEM; - mc = local_mc; - } - - /* Loop, will automatically release the faulted page. */ + /* Loop, release the faulted page. */ if (mmu_invalidate_retry_gfn_unsafe(kvm, inv_seq, f->gfn)) { kvm_release_faultin_page(kvm, f->page, true, false); continue; -- cgit v1.2.3 From c1edda54a0f713412f5914f9c9080856694bddca Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 2 Jun 2026 16:23:56 +0200 Subject: KVM: s390: Remove ptep_zap_softleaf_entry() Migration entries do not need to be removed. The swap subsystem has been (and still is being) heavily reworked. The current implementation of ptep_zap_softleaf_entry() has been slowly modified and is now wrong, since it unconditionally calls swap_put_entries_direct() for both swap and migration entries. Remove ptep_zap_softleaf_entry() altogether, merge the path for proper swap entries directly in the only caller, and ignore migration entries. Fixes: 200197908dc4 ("KVM: s390: Refactor and split some gmap helpers") Signed-off-by: Claudio Imbrenda Message-ID: <20260602142356.169458-11-imbrenda@linux.ibm.com> --- arch/s390/mm/gmap_helpers.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index 396207163ca6..1cfe4724fbe2 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -16,24 +16,6 @@ #include #include -/** - * ptep_zap_softleaf_entry() - discard a software leaf entry. - * @mm: the mm - * @entry: the software leaf entry that needs to be zapped - * - * Discards the given software leaf entry. If the leaf entry was an actual - * swap entry (and not a migration entry, for example), the actual swapped - * page is also discarded from swap. - */ -static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) -{ - if (softleaf_is_swap(entry)) - dec_mm_counter(mm, MM_SWAPENTS); - else if (softleaf_is_migration(entry)) - dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); - swap_put_entries_direct(entry, 1); -} - /** * try_get_locked_pte() - like get_locked_pte(), but atomic and with trylock * @mm: the mm @@ -111,6 +93,7 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) { struct vm_area_struct *vma; spinlock_t *ptl; /* Lock for the host (userspace) page table */ + softleaf_t sl; pte_t *ptep; mmap_assert_locked(mm); @@ -124,8 +107,10 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) ptep = try_get_locked_pte(mm, vmaddr, &ptl); if (IS_ERR_OR_NULL(ptep)) return; - if (pte_swap(*ptep)) { - ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); + sl = softleaf_from_pte(*ptep); + if (pte_swap(*ptep) && softleaf_is_swap(sl)) { + dec_mm_counter(mm, MM_SWAPENTS); + swap_put_entries_direct(sl, 1); pte_clear(mm, vmaddr, ptep); } pte_unmap_unlock(ptep, ptl); -- cgit v1.2.3