summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Paolo Bonzini <pbonzini@redhat.com> 2026-06-03 16:46:31 +0200
committer Paolo Bonzini <pbonzini@redhat.com> 2026-06-03 16:46:31 +0200
commit bd2e19cf8f3028620428c698f6783de5306a6342 (patch)
tree 0ed8b56f44881ba19b306b77cea74b20a5dbc010
parent db38bcb3311053954f62b865cd2d86e164b04351 (diff)
parent c1edda54a0f713412f5914f9c9080856694bddca (diff)
Merge tag 'kvm-s390-master-7.1-3' of https://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into HEAD
KVM: s390: More gmap and vsie fixes
-rw-r--r-- arch/s390/include/asm/gmap_helpers.h | 1
-rw-r--r-- arch/s390/kvm/faultin.c | 30
-rw-r--r-- arch/s390/kvm/gaccess.c | 11
-rw-r--r-- arch/s390/kvm/gmap.c | 19
-rw-r--r-- arch/s390/kvm/gmap.h | 3
-rw-r--r-- arch/s390/kvm/kvm-s390.c | 33
-rw-r--r-- arch/s390/kvm/priv.c | 8
-rw-r--r-- arch/s390/kvm/pv.c | 21
-rw-r--r-- arch/s390/mm/gmap_helpers.c | 128
9 files changed, 165 insertions, 89 deletions
diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h
index 2d3ae421077e..d2b616604a46 100644
--- a/arch/s390/include/asm/gmap_helpers.h
+++ b/arch/s390/include/asm/gmap_helpers.h
@@ -12,5 +12,6 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr);
void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end);
int gmap_helper_disable_cow_sharing(void);
void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr);
+pte_t *try_get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl);
#endif /* _ASM_S390_GMAP_HELPERS_H */
diff --git a/arch/s390/kvm/faultin.c b/arch/s390/kvm/faultin.c
index ddf0ca71f374..fee80047bd94 100644
--- a/arch/s390/kvm/faultin.c
+++ b/arch/s390/kvm/faultin.c
@@ -36,7 +36,8 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa
struct kvm_s390_mmu_cache *mc = NULL;
struct kvm_memory_slot *slot;
unsigned long inv_seq;
- int foll, rc = 0;
+ int rc = -EAGAIN;
+ int foll;
foll = f->write_attempt ? FOLL_WRITE : 0;
foll |= f->attempt_pfault ? FOLL_NOWAIT : 0;
@@ -53,7 +54,14 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa
return 0;
}
- while (1) {
+ if (!mc) {
+ local_mc = kvm_s390_new_mmu_cache();
+ if (!local_mc)
+ return -ENOMEM;
+ mc = local_mc;
+ }
+
+ while (rc == -EAGAIN) {
f->valid = false;
inv_seq = kvm->mmu_invalidate_seq;
/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
@@ -93,14 +101,7 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa
if (is_error_pfn(f->pfn))
return -EFAULT;
- if (!mc) {
- local_mc = kvm_s390_new_mmu_cache();
- if (!local_mc)
- return -ENOMEM;
- mc = local_mc;
- }
-
- /* Loop, will automatically release the faulted page. */
+ /* Loop, release the faulted page. */
if (mmu_invalidate_retry_gfn_unsafe(kvm, inv_seq, f->gfn)) {
kvm_release_faultin_page(kvm, f->page, true, false);
continue;
@@ -110,20 +111,19 @@ int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fa
if (!mmu_invalidate_retry_gfn(kvm, inv_seq, f->gfn)) {
f->valid = true;
rc = gmap_link(mc, kvm->arch.gmap, f, slot);
- kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt);
- f->page = NULL;
}
+ kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt);
}
- kvm_release_faultin_page(kvm, f->page, true, false);
if (rc == -ENOMEM) {
rc = kvm_s390_mmu_cache_topup(mc);
if (rc)
return rc;
- } else if (rc != -EAGAIN) {
- return rc;
+ rc = -EAGAIN;
}
}
+
+ return rc;
}
int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 4f8d5592c9a9..20e28b183c1a 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1466,15 +1466,17 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni
struct guest_fault *f, bool p)
{
union crste newcrste, oldcrste;
- gfn_t gfn;
+ unsigned long mask;
+ gfn_t r_gfn;
int rc;
lockdep_assert_held(&sg->kvm->mmu_lock);
lockdep_assert_held(&sg->parent->children_lock);
- gfn = f->gfn & (is_pmd(*table) ? _SEGMENT_FR_MASK : _REGION3_FR_MASK);
+ mask = is_pmd(*table) ? _SEGMENT_FR_MASK : _REGION3_FR_MASK;
+ r_gfn = gpa_to_gfn(raddr) & mask;
scoped_guard(spinlock, &sg->host_to_rmap_lock)
- rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt);
+ rc = gmap_insert_rmap(sg, f->gfn & mask, r_gfn, host->h.tt);
if (rc)
return rc;
@@ -1497,8 +1499,7 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni
return -EAGAIN;
newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p);
- gfn = gpa_to_gfn(raddr);
- while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce))
+ while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, r_gfn, sg->asce))
;
return 0;
}
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
index 957126ab991c..52d55ddea8d4 100644
--- a/arch/s390/kvm/gmap.c
+++ b/arch/s390/kvm/gmap.c
@@ -395,15 +395,28 @@ static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct
struct gmap_unmap_priv *priv = walk->priv;
struct folio *folio = NULL;
union crste old = *crstep;
+ bool ok;
if (!old.h.fc)
return 0;
if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
folio = phys_to_folio(crste_origin_large(old));
- /* No races should happen because kvm->mmu_lock is held in write mode */
- KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn),
- priv->gmap->kvm);
+ /*
+ * No races should happen because kvm->mmu_lock is held in write mode,
+ * but the unmap operation could have triggered an unshadow, which
+ * causes gmap_crstep_xchg_atomic() to return false and clear the
+ * vsie_notif bit. Allow the operation to fail once, if the old crste
+ * had the vsie_notif bit set. A second failure is not allowed, for
+ * the reasons above.
+ */
+ ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn);
+ if (!ok) {
+ KVM_BUG_ON(!old.s.fc1.vsie_notif, priv->gmap->kvm);
+ old.s.fc1.vsie_notif = 0;
+ ok = gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn);
+ KVM_BUG_ON(!ok, priv->gmap->kvm);
+ }
if (folio)
uv_convert_from_secure_folio(folio);
diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h
index 742e42a31744..5374f21aaf8d 100644
--- a/arch/s390/kvm/gmap.h
+++ b/arch/s390/kvm/gmap.h
@@ -273,11 +273,14 @@ static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, unio
gmap_unmap_prefix(gmap, gfn, gfn + align);
}
if (crste_leaf(oldcrste) && crste_needs_unshadow(oldcrste, newcrste)) {
+ newcrste = oldcrste;
newcrste.s.fc1.vsie_notif = 0;
if (needs_lock)
gmap_handle_vsie_unshadow_event(gmap, gfn);
else
_gmap_handle_vsie_unshadow_event(gmap, gfn);
+ dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce);
+ return false;
}
if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !newcrste.s.fc1.s)
SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e09960c2e6ed..ffb20a64d328 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -999,7 +999,10 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
break;
}
case KVM_S390_VM_MEM_LIMIT_SIZE: {
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *ms;
unsigned long new_limit;
+ int bkt;
if (kvm_is_ucontrol(kvm))
return -EINVAL;
@@ -1007,6 +1010,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
if (get_user(new_limit, (u64 __user *)attr->addr))
return -EFAULT;
+ guard(mutex)(&kvm->lock);
+
+ new_limit = ALIGN(new_limit, HPAGE_SIZE);
if (kvm->arch.mem_limit != KVM_S390_NO_MEM_LIMIT &&
new_limit > kvm->arch.mem_limit)
return -E2BIG;
@@ -1014,12 +1020,27 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
if (!new_limit)
return -EINVAL;
- ret = -EBUSY;
- if (!kvm->created_vcpus)
- ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit));
+ if (kvm->created_vcpus)
+ return -EBUSY;
+
+ ret = 0;
+ scoped_guard(mutex, &kvm->slots_lock) {
+ slots = kvm_memslots(kvm);
+ if (slots && !kvm_memslots_empty(slots)) {
+ kvm_for_each_memslot(ms, bkt, slots) {
+ if (gpa_to_gfn(new_limit) < ms->base_gfn + ms->npages) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ }
+ if (!ret)
+ ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit));
+ }
+ if (ret)
+ break;
VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit);
- VM_EVENT(kvm, 3, "New guest asce: 0x%p",
- (void *)kvm->arch.gmap->asce.val);
+ VM_EVENT(kvm, 3, "New guest asce: 0x%p", (void *)kvm->arch.gmap->asce.val);
break;
}
default:
@@ -5672,6 +5693,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
return -EINVAL;
if ((new->base_gfn + new->npages) * PAGE_SIZE > kvm->arch.mem_limit)
return -EINVAL;
+ if (!asce_contains_gfn(kvm->arch.gmap->asce, new->base_gfn + new->npages - 1))
+ return -EINVAL;
}
if (!kvm->arch.migration_mode)
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index cc0553da14cb..447ec7ed423d 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -1188,6 +1188,7 @@ static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len
union crste *crstep;
union pgste pgste;
union pte *ptep;
+ hva_t hva;
int i;
lockdep_assert_held(&vcpu->kvm->mmu_lock);
@@ -1199,8 +1200,11 @@ static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len
if (!ptep || ptep->s.pr)
continue;
pgste = pgste_get_lock(ptep);
- if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero)
- gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]);
+ if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) {
+ hva = gpa_to_hva(vcpu->kvm, cbrl[i]);
+ if (!kvm_is_error_hva(hva))
+ gmap_helper_zap_one_page(vcpu->kvm->mm, hva);
+ }
pgste_set_unlock(ptep, pgste);
}
}
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index c2dafd812a3b..4b865e75351c 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -17,6 +17,7 @@
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/mmu_notifier.h>
+#include <asm/gmap_helpers.h>
#include "kvm-s390.h"
#include "dat.h"
#include "gaccess.h"
@@ -73,6 +74,7 @@ static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_str
struct pv_make_secure {
void *uvcb;
struct folio *folio;
+ struct kvm *kvm;
int rc;
bool needs_export;
};
@@ -103,9 +105,21 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f)
{
struct pv_make_secure *priv = f->priv;
struct folio *folio;
+ spinlock_t *ptl; /* pte lock from try_get_locked_pte() */
+ pte_t *ptep;
folio = pfn_folio(f->pfn);
priv->rc = -EAGAIN;
+
+ if (!mmap_read_trylock(priv->kvm->mm))
+ return;
+
+ ptep = try_get_locked_pte(priv->kvm->mm, gfn_to_hva(priv->kvm, f->gfn), &ptl);
+ if (IS_ERR_VALUE(ptep)) {
+ priv->rc = PTR_ERR(ptep);
+ goto out;
+ }
+
if (folio_trylock(folio)) {
priv->rc = __kvm_s390_pv_make_secure(f, folio);
if (priv->rc == -E2BIG || priv->rc == -EBUSY) {
@@ -114,6 +128,11 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f)
}
folio_unlock(folio);
}
+
+ if (ptep)
+ pte_unmap_unlock(ptep, ptl);
+out:
+ mmap_read_unlock(priv->kvm->mm);
}
/**
@@ -127,7 +146,7 @@ static void _kvm_s390_pv_make_secure(struct guest_fault *f)
*/
int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb)
{
- struct pv_make_secure priv = { .uvcb = uvcb };
+ struct pv_make_secure priv = { .uvcb = uvcb, .kvm = kvm, };
struct guest_fault f = {
.write_attempt = true,
.gfn = gpa_to_gfn(gaddr),
diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
index f8789ffcc05c..1cfe4724fbe2 100644
--- a/arch/s390/mm/gmap_helpers.c
+++ b/arch/s390/mm/gmap_helpers.c
@@ -17,22 +17,68 @@
#include <asm/gmap_helpers.h>
/**
- * ptep_zap_softleaf_entry() - discard a software leaf entry.
+ * try_get_locked_pte() - like get_locked_pte(), but atomic and with trylock
* @mm: the mm
- * @entry: the software leaf entry that needs to be zapped
+ * @vmaddr: the userspace virtual address whose pte is to be found
+ * @ptl: will be set to the pointer to the lock used to lock the pte in case
+ * of success.
*
- * Discards the given software leaf entry. If the leaf entry was an actual
- * swap entry (and not a migration entry, for example), the actual swapped
- * page is also discarded from swap.
+ * This function returns the pointer to the pte corresponding to @vmaddr in @mm,
+ * similarly to get_locked_pte(). Unlike get_locked_pte(), no attempt is made
+ * to allocate missing page tables. If a missing or large entry is found, the
+ * function will return NULL. If the ptl lock is contended, %-EAGAIN is
+ * returned.
+ *
+ * In case of success, *@ptl will point to the locked pte lock for the returned
+ * pte, like get_locked_pte() does.
+ *
+ * Context: mmap_lock or vma lock for read or for write needs to be held.
+ * Return:
+ * * %NULL if the pte cannot be reached.
+ * * %-EAGAIN if the pte can be reached, but cannot be locked.
+ * * the pointer to the pte corresponding to @vmaddr in @mm, if it can be reached
+ * and locked.
*/
-static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
+pte_t *try_get_locked_pte(struct mm_struct *mm, unsigned long vmaddr, spinlock_t **ptl)
{
- if (softleaf_is_swap(entry))
- dec_mm_counter(mm, MM_SWAPENTS);
- else if (softleaf_is_migration(entry))
- dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry)));
- swap_put_entries_direct(entry, 1);
+ pmd_t *pmdp, pmd, pmdval;
+ pud_t *pudp, pud;
+ p4d_t *p4dp, p4d;
+ pgd_t *pgdp, pgd;
+ pte_t *ptep;
+
+ pgdp = pgd_offset(mm, vmaddr);
+ pgd = pgdp_get(pgdp);
+ if (pgd_none(pgd) || !pgd_present(pgd))
+ return NULL;
+ p4dp = p4d_offset(pgdp, vmaddr);
+ p4d = p4dp_get(p4dp);
+ if (p4d_none(p4d) || !p4d_present(p4d))
+ return NULL;
+ pudp = pud_offset(p4dp, vmaddr);
+ pud = pudp_get(pudp);
+ if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud))
+ return NULL;
+ pmdp = pmd_offset(pudp, vmaddr);
+ pmd = pmdp_get_lockless(pmdp);
+ if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd))
+ return NULL;
+ ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, ptl);
+ if (!ptep)
+ return NULL;
+
+ if (spin_trylock(*ptl)) {
+ if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmdp)))) {
+ pte_unmap_unlock(ptep, *ptl);
+ return ERR_PTR(-EAGAIN);
+ }
+ return ptep;
+ }
+
+ pte_unmap(ptep);
+ return ERR_PTR(-EAGAIN);
}
+EXPORT_SYMBOL_GPL(try_get_locked_pte);
/**
* gmap_helper_zap_one_page() - discard a page if it was swapped.
@@ -46,7 +92,8 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
{
struct vm_area_struct *vma;
- spinlock_t *ptl;
+ spinlock_t *ptl; /* Lock for the host (userspace) page table */
+ softleaf_t sl;
pte_t *ptep;
mmap_assert_locked(mm);
@@ -57,11 +104,13 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
return;
/* Get pointer to the page table entry */
- ptep = get_locked_pte(mm, vmaddr, &ptl);
- if (unlikely(!ptep))
+ ptep = try_get_locked_pte(mm, vmaddr, &ptl);
+ if (IS_ERR_OR_NULL(ptep))
return;
- if (pte_swap(*ptep)) {
- ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
+ sl = softleaf_from_pte(*ptep);
+ if (pte_swap(*ptep) && softleaf_is_swap(sl)) {
+ dec_mm_counter(mm, MM_SWAPENTS);
+ swap_put_entries_direct(sl, 1);
pte_clear(mm, vmaddr, ptep);
}
pte_unmap_unlock(ptep, ptl);
@@ -113,37 +162,9 @@ EXPORT_SYMBOL_GPL(gmap_helper_discard);
*/
void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr)
{
- pmd_t *pmdp, pmd, pmdval;
- pud_t *pudp, pud;
- p4d_t *p4dp, p4d;
- pgd_t *pgdp, pgd;
spinlock_t *ptl; /* Lock for the host (userspace) page table */
pte_t *ptep;
- pgdp = pgd_offset(mm, vmaddr);
- pgd = pgdp_get(pgdp);
- if (pgd_none(pgd) || !pgd_present(pgd))
- return;
-
- p4dp = p4d_offset(pgdp, vmaddr);
- p4d = p4dp_get(p4dp);
- if (p4d_none(p4d) || !p4d_present(p4d))
- return;
-
- pudp = pud_offset(p4dp, vmaddr);
- pud = pudp_get(pudp);
- if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud))
- return;
-
- pmdp = pmd_offset(pudp, vmaddr);
- pmd = pmdp_get_lockless(pmdp);
- if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd))
- return;
-
- ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl);
- if (!ptep)
- return;
-
/*
* Several paths exist that take the ptl lock and then call the
* mmu_notifier, which takes the mmu_lock. The unmap path, instead,
@@ -156,21 +177,12 @@ void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr)
* If the lock is contended the bit is not set and the deadlock is
* avoided.
*/
- if (spin_trylock(ptl)) {
- /*
- * Make sure the pte we are touching is still the correct
- * one. In theory this check should not be needed, but
- * better safe than sorry.
- * Disabling interrupts or holding the mmap lock is enough to
- * guarantee that no concurrent updates to the page tables
- * are possible.
- */
- if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp))))
- __atomic64_or(_PAGE_UNUSED, (long *)ptep);
- spin_unlock(ptl);
- }
+ ptep = try_get_locked_pte(mm, vmaddr, &ptl);
+ if (IS_ERR_OR_NULL(ptep))
+ return;
- pte_unmap(ptep);
+ __atomic64_or(_PAGE_UNUSED, (long *)ptep);
+ pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused);