44 files changed, 5562 insertions, 5490 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 0e5fad5f06ca..961cbf023c1b 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -32,9 +32,6 @@ config GENERIC_BUG_RELATIVE_POINTERS
 config GENERIC_LOCKBREAK
 	def_bool y if PREEMPTION
 
-config PGSTE
-	def_bool y if KVM
-
 config AUDIT_ARCH
 	def_bool y
 
diff --git a/arch/s390/include/asm/dat-bits.h b/arch/s390/include/asm/dat-bits.h
index 8d65eec2f124..c40874e0e426 100644
--- a/arch/s390/include/asm/dat-bits.h
+++ b/arch/s390/include/asm/dat-bits.h
@@ -9,6 +9,32 @@
 #ifndef _S390_DAT_BITS_H
 #define _S390_DAT_BITS_H
 
+/*
+ * Union to decode a virtual address into its region-first index,
+ * region-second index, etc.; each xx01 field overlays the first 2 bits of xx.
+ */
+union vaddress {
+	unsigned long addr; /* the complete virtual address */
+	struct {
+		unsigned long rfx : 11; /* Region-First Index */
+		unsigned long rsx : 11; /* Region-Second Index */
+		unsigned long rtx : 11; /* Region-Third Index */
+		unsigned long sx  : 11; /* Segment Index */
+		unsigned long px  : 8;  /* Page Index */
+		unsigned long bx  : 12; /* Byte Index */
+	};
+	struct {
+		unsigned long rfx01 : 2; /* first 2 bits of rfx */
+		unsigned long	    : 9;
+		unsigned long rsx01 : 2; /* first 2 bits of rsx */
+		unsigned long	    : 9;
+		unsigned long rtx01 : 2; /* first 2 bits of rtx */
+		unsigned long	    : 9;
+		unsigned long sx01  : 2; /* first 2 bits of sx */
+		unsigned long	    : 29; /* rest of sx, then px and bx */
+	};
+};
+
 union asce {
 	unsigned long val;
 	struct {
@@ -98,7 +124,8 @@ union region3_table_entry {
 	struct {
 		unsigned long	: 53;
 		unsigned long fc: 1; /* Format-Control */
-		unsigned long	: 4;
+		unsigned long p : 1; /* DAT-Protection Bit */
+		unsigned long	: 3;
 		unsigned long i : 1; /* Region-Invalid Bit */
 		unsigned long cr: 1; /* Common-Region Bit */
 		unsigned long tt: 2; /* Table-Type Bits */
@@ -140,7 +167,8 @@ union segment_table_entry {
 	struct {
 		unsigned long	: 53;
 		unsigned long fc: 1; /* Format-Control */
-		unsigned long	: 4;
+		unsigned long p : 1; /* DAT-Protection Bit */
+		unsigned long	: 3;
 		unsigned long i : 1; /* Segment-Invalid Bit */
 		unsigned long cs: 1; /* Common-Segment Bit */
 		unsigned long tt: 2; /* Table-Type Bits */
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
deleted file mode 100644
index 66c5808fd011..000000000000
--- a/arch/s390/include/asm/gmap.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- *  KVM guest address space mapping code
- *
- *    Copyright IBM Corp. 2007, 2016
- *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- */
-
-#ifndef _ASM_S390_GMAP_H
-#define _ASM_S390_GMAP_H
-
-#include <linux/radix-tree.h>
-#include <linux/refcount.h>
-
-/* Generic bits for GMAP notification on DAT table entry changes. */
-#define GMAP_NOTIFY_SHADOW	0x2
-#define GMAP_NOTIFY_MPROT	0x1
-
-/* Status bits only for huge segment entries */
-#define _SEGMENT_ENTRY_GMAP_IN		0x0800	/* invalidation notify bit */
-#define _SEGMENT_ENTRY_GMAP_UC		0x0002	/* dirty (migration) */
-
-/**
- * struct gmap_struct - guest address space
- * @list: list head for the mm->context gmap list
- * @mm: pointer to the parent mm_struct
- * @guest_to_host: radix tree with guest to host address translation
- * @host_to_guest: radix tree with pointer to segment table entries
- * @guest_table_lock: spinlock to protect all entries in the guest page table
- * @ref_count: reference counter for the gmap structure
- * @table: pointer to the page directory
- * @asce: address space control element for gmap page table
- * @pfault_enabled: defines if pfaults are applicable for the guest
- * @guest_handle: protected virtual machine handle for the ultravisor
- * @host_to_rmap: radix tree with gmap_rmap lists
- * @children: list of shadow gmap structures
- * @shadow_lock: spinlock to protect the shadow gmap list
- * @parent: pointer to the parent gmap for shadow guest address spaces
- * @orig_asce: ASCE for which the shadow page table has been created
- * @edat_level: edat level to be used for the shadow translation
- * @removed: flag to indicate if a shadow guest address space has been removed
- * @initialized: flag to indicate if a shadow guest address space can be used
- */
-struct gmap {
-	struct list_head list;
-	struct mm_struct *mm;
-	struct radix_tree_root guest_to_host;
-	struct radix_tree_root host_to_guest;
-	spinlock_t guest_table_lock;
-	refcount_t ref_count;
-	unsigned long *table;
-	unsigned long asce;
-	unsigned long asce_end;
-	void *private;
-	bool pfault_enabled;
-	/* only set for protected virtual machines */
-	unsigned long guest_handle;
-	/* Additional data for shadow guest address spaces */
-	struct radix_tree_root host_to_rmap;
-	struct list_head children;
-	spinlock_t shadow_lock;
-	struct gmap *parent;
-	unsigned long orig_asce;
-	int edat_level;
-	bool removed;
-	bool initialized;
-};
-
-/**
- * struct gmap_rmap - reverse mapping for shadow page table entries
- * @next: pointer to next rmap in the list
- * @raddr: virtual rmap address in the shadow guest address space
- */
-struct gmap_rmap {
-	struct gmap_rmap *next;
-	unsigned long raddr;
-};
-
-#define gmap_for_each_rmap(pos, head) \
-	for (pos = (head); pos; pos = pos->next)
-
-#define gmap_for_each_rmap_safe(pos, n, head) \
-	for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
-
-/**
- * struct gmap_notifier - notify function block for page invalidation
- * @notifier_call: address of callback function
- */
-struct gmap_notifier {
-	struct list_head list;
-	struct rcu_head rcu;
-	void (*notifier_call)(struct gmap *gmap, unsigned long start,
-			      unsigned long end);
-};
-
-static inline int gmap_is_shadow(struct gmap *gmap)
-{
-	return !!gmap->parent;
-}
-
-struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
-void gmap_remove(struct gmap *gmap);
-struct gmap *gmap_get(struct gmap *gmap);
-void gmap_put(struct gmap *gmap);
-void gmap_free(struct gmap *gmap);
-struct gmap *gmap_alloc(unsigned long limit);
-
-int gmap_map_segment(struct gmap *gmap, unsigned long from,
-		     unsigned long to, unsigned long len);
-int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
-unsigned long __gmap_translate(struct gmap *, unsigned long gaddr);
-int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr);
-void __gmap_zap(struct gmap *, unsigned long gaddr);
-void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
-
-int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
-
-void gmap_unshadow(struct gmap *sg);
-int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
-		    int fake);
-int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
-		    int fake);
-int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
-		    int fake);
-int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
-		    int fake);
-int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
-
-void gmap_register_pte_notifier(struct gmap_notifier *);
-void gmap_unregister_pte_notifier(struct gmap_notifier *);
-
-int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits);
-
-void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
-			     unsigned long gaddr, unsigned long vmaddr);
-int s390_replace_asce(struct gmap *gmap);
-void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
-int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
-			    unsigned long end, bool interruptible);
-unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level);
-
-/**
- * s390_uv_destroy_range - Destroy a range of pages in the given mm.
- * @mm: the mm on which to operate on
- * @start: the start of the range
- * @end: the end of the range
- *
- * This function will call cond_sched, so it should not generate stalls, but
- * it will otherwise only return when it completed.
- */
-static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
-					 unsigned long end)
-{
-	(void)__s390_uv_destroy_range(mm, start, end, false);
-}
-
-/**
- * s390_uv_destroy_range_interruptible - Destroy a range of pages in the
- * given mm, but stop when a fatal signal is received.
- * @mm: the mm on which to operate on
- * @start: the start of the range
- * @end: the end of the range
- *
- * This function will call cond_sched, so it should not generate stalls. If
- * a fatal signal is received, it will return with -EINTR immediately,
- * without finishing destroying the whole range. Upon successful
- * completion, 0 is returned.
- */
-static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start,
-						      unsigned long end)
-{
-	return __s390_uv_destroy_range(mm, start, end, true);
-}
-#endif /* _ASM_S390_GMAP_H */
diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h
index 5356446a61c4..2d3ae421077e 100644
--- a/arch/s390/include/asm/gmap_helpers.h
+++ b/arch/s390/include/asm/gmap_helpers.h
@@ -11,5 +11,6 @@
 void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr);
 void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end);
 int gmap_helper_disable_cow_sharing(void);
+void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr);
 
 #endif /* _ASM_S390_GMAP_HELPERS_H */
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 69131736daaa..6983e52eaf81 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -37,12 +37,6 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 	return __huge_ptep_get_and_clear(mm, addr, ptep);
 }
 
-static inline void arch_clear_hugetlb_flags(struct folio *folio)
-{
-	clear_bit(PG_arch_1, &folio->flags.f);
-}
-#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
-
 #define __HAVE_ARCH_HUGE_PTE_CLEAR
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
 				  pte_t *ptep, unsigned long sz)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index ae1223264d3c..64a50f0862aa 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -27,6 +27,7 @@
 #include <asm/isc.h>
 #include <asm/guarded_storage.h>
 
+#define KVM_HAVE_MMU_RWLOCK
 #define KVM_MAX_VCPUS 255
 
 #define KVM_INTERNAL_MEM_SLOTS 1
@@ -441,6 +442,7 @@ struct kvm_vcpu_arch {
 	bool acrs_loaded;
 	struct kvm_s390_pv_vcpu pv;
 	union diag318_info diag318_info;
+	struct kvm_s390_mmu_cache *mc;
 };
 
 struct kvm_vm_stat {
@@ -630,8 +632,12 @@ struct kvm_s390_pv {
 	void *set_aside;
 	struct list_head need_cleanup;
 	struct mmu_notifier mmu_notifier;
+	/* Protects against concurrent import-like operations */
+	struct mutex import_lock;
 };
 
+struct kvm_s390_mmu_cache;
+
 struct kvm_arch {
 	struct esca_block *sca;
 	debug_info_t *dbf;
@@ -671,6 +677,7 @@ struct kvm_arch {
 	struct kvm_s390_pv pv;
 	struct list_head kzdev_list;
 	spinlock_t kzdev_list_lock;
+	struct kvm_s390_mmu_cache *mc;
 };
 
 #define KVM_HVA_ERR_BAD		(-1UL)
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index f07e49b419ab..d4fd7bf3692e 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -19,23 +19,10 @@ typedef struct {
 	/* The mmu context belongs to a secure guest. */
 	atomic_t protected_count;
 	/*
-	 * The following bitfields need a down_write on the mm
-	 * semaphore when they are written to. As they are only
-	 * written once, they can be read without a lock.
-	 */
-	/* The mmu context uses extended page tables. */
-	unsigned int has_pgste:1;
-	/* The mmu context uses storage keys. */
-	unsigned int uses_skeys:1;
-	/* The mmu context uses CMM. */
-	unsigned int uses_cmm:1;
-	/*
 	 * The mmu context allows COW-sharing of memory pages (KSM, zeropage).
 	 * Note that COW-sharing during fork() is currently always allowed.
 	 */
 	unsigned int allow_cow_sharing:1;
-	/* The gmaps associated with this context are allowed to use huge pages. */
-	unsigned int allow_gmap_hpage_1m:1;
 } mm_context_t;
 
 #define INIT_MM_CONTEXT(name)						   \
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index d9b8501bc93d..bd1ef5e2d2eb 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -29,12 +29,8 @@ static inline int init_new_context(struct task_struct *tsk,
 	atomic_set(&mm->context.protected_count, 0);
 	mm->context.gmap_asce = 0;
 	mm->context.flush_mm = 0;
-#ifdef CONFIG_PGSTE
-	mm->context.has_pgste = 0;
-	mm->context.uses_skeys = 0;
-	mm->context.uses_cmm = 0;
+#if IS_ENABLED(CONFIG_KVM)
 	mm->context.allow_cow_sharing = 1;
-	mm->context.allow_gmap_hpage_1m = 0;
 #endif
 	switch (mm->context.asce_limit) {
 	default:
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index c1d63b613bf9..6de2f4d25b63 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -78,7 +78,6 @@ static inline void copy_page(void *to, void *from)
 #ifdef STRICT_MM_TYPECHECKS
 
 typedef struct { unsigned long pgprot; } pgprot_t;
-typedef struct { unsigned long pgste; } pgste_t;
 typedef struct { unsigned long pte; } pte_t;
 typedef struct { unsigned long pmd; } pmd_t;
 typedef struct { unsigned long pud; } pud_t;
@@ -94,7 +93,6 @@ static __always_inline unsigned long name ## _val(name ## _t name)	\
 #else /* STRICT_MM_TYPECHECKS */
 
 typedef unsigned long pgprot_t;
-typedef unsigned long pgste_t;
 typedef unsigned long pte_t;
 typedef unsigned long pmd_t;
 typedef unsigned long pud_t;
@@ -110,7 +108,6 @@ static __always_inline unsigned long name ## _val(name ## _t name)	\
 #endif /* STRICT_MM_TYPECHECKS */
 
 DEFINE_PGVAL_FUNC(pgprot)
-DEFINE_PGVAL_FUNC(pgste)
 DEFINE_PGVAL_FUNC(pte)
 DEFINE_PGVAL_FUNC(pmd)
 DEFINE_PGVAL_FUNC(pud)
@@ -120,7 +117,6 @@ DEFINE_PGVAL_FUNC(pgd)
 typedef pte_t *pgtable_t;
 
 #define __pgprot(x)	((pgprot_t) { (x) } )
-#define __pgste(x)	((pgste_t) { (x) } )
 #define __pte(x)        ((pte_t) { (x) } )
 #define __pmd(x)        ((pmd_t) { (x) } )
 #define __pud(x)	((pud_t) { (x) } )
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index a16e65072371..a5de9e61ea9e 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -27,10 +27,6 @@ unsigned long *page_table_alloc_noprof(struct mm_struct *);
 #define page_table_alloc(...)	alloc_hooks(page_table_alloc_noprof(__VA_ARGS__))
 void page_table_free(struct mm_struct *, unsigned long *);
 
-struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm);
-#define page_table_alloc_pgste(...)	alloc_hooks(page_table_alloc_pgste_noprof(__VA_ARGS__))
-void page_table_free_pgste(struct ptdesc *ptdesc);
-
 static inline void crst_table_init(unsigned long *crst, unsigned long entry)
 {
 	memset64((u64 *)crst, entry, _CRST_ENTRIES);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index bca9b29778c3..1c3c3be93be9 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -413,28 +413,6 @@ void setup_protection_map(void);
  * SW-bits: y young, d dirty, r read, w write
  */
 
-/* Page status table bits for virtualization */
-#define PGSTE_ACC_BITS	0xf000000000000000UL
-#define PGSTE_FP_BIT	0x0800000000000000UL
-#define PGSTE_PCL_BIT	0x0080000000000000UL
-#define PGSTE_HR_BIT	0x0040000000000000UL
-#define PGSTE_HC_BIT	0x0020000000000000UL
-#define PGSTE_GR_BIT	0x0004000000000000UL
-#define PGSTE_GC_BIT	0x0002000000000000UL
-#define PGSTE_ST2_MASK	0x0000ffff00000000UL
-#define PGSTE_UC_BIT	0x0000000000008000UL	/* user dirty (migration) */
-#define PGSTE_IN_BIT	0x0000000000004000UL	/* IPTE notify bit */
-#define PGSTE_VSIE_BIT	0x0000000000002000UL	/* ref'd in a shadow table */
-
-/* Guest Page State used for virtualization */
-#define _PGSTE_GPS_ZERO			0x0000000080000000UL
-#define _PGSTE_GPS_NODAT		0x0000000040000000UL
-#define _PGSTE_GPS_USAGE_MASK		0x0000000003000000UL
-#define _PGSTE_GPS_USAGE_STABLE		0x0000000000000000UL
-#define _PGSTE_GPS_USAGE_UNUSED		0x0000000001000000UL
-#define _PGSTE_GPS_USAGE_POT_VOLATILE	0x0000000002000000UL
-#define _PGSTE_GPS_USAGE_VOLATILE	_PGSTE_GPS_USAGE_MASK
-
 /*
  * A user page table pointer has the space-switch-event bit, the
  * private-space-control bit and the storage-alteration-event-control
@@ -566,34 +544,15 @@ static inline bool mm_pmd_folded(struct mm_struct *mm)
 }
 #define mm_pmd_folded(mm) mm_pmd_folded(mm)
 
-static inline int mm_has_pgste(struct mm_struct *mm)
-{
-#ifdef CONFIG_PGSTE
-	if (unlikely(mm->context.has_pgste))
-		return 1;
-#endif
-	return 0;
-}
-
 static inline int mm_is_protected(struct mm_struct *mm)
 {
-#ifdef CONFIG_PGSTE
+#if IS_ENABLED(CONFIG_KVM)
 	if (unlikely(atomic_read(&mm->context.protected_count)))
 		return 1;
 #endif
 	return 0;
 }
 
-static inline pgste_t clear_pgste_bit(pgste_t pgste, unsigned long mask)
-{
-	return __pgste(pgste_val(pgste) & ~mask);
-}
-
-static inline pgste_t set_pgste_bit(pgste_t pgste, unsigned long mask)
-{
-	return __pgste(pgste_val(pgste) | mask);
-}
-
 static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot)
 {
 	return __pte(pte_val(pte) & ~pgprot_val(prot));
@@ -632,22 +591,13 @@ static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot)
 #define mm_forbids_zeropage mm_forbids_zeropage
 static inline int mm_forbids_zeropage(struct mm_struct *mm)
 {
-#ifdef CONFIG_PGSTE
+#if IS_ENABLED(CONFIG_KVM)
 	if (!mm->context.allow_cow_sharing)
 		return 1;
 #endif
 	return 0;
 }
 
-static inline int mm_uses_skeys(struct mm_struct *mm)
-{
-#ifdef CONFIG_PGSTE
-	if (mm->context.uses_skeys)
-		return 1;
-#endif
-	return 0;
-}
-
 /**
  * cspg() - Compare and Swap and Purge (CSPG)
  * @ptr: Pointer to the value to be exchanged
@@ -1136,6 +1086,13 @@ static inline pte_t pte_mkhuge(pte_t pte)
 }
 #endif
 
+static inline unsigned long sske_frame(unsigned long addr, unsigned char skey)
+{
+	asm volatile("sske %[skey],%[addr],1" /* M3=1: presumably multiple-block */
+		     : [addr] "+a" (addr) : [skey] "d" (skey)); /* addr: in/out */
+	return addr; /* as left by the instruction */
+}
+
 #define IPTE_GLOBAL	0
 #define	IPTE_LOCAL	1
 
@@ -1232,7 +1189,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
 	/* At this point the reference through the mapping is still present */
 	if (mm_is_protected(mm) && pte_present(res))
-		uv_convert_from_secure_pte(res);
+		WARN_ON_ONCE(uv_convert_from_secure_pte(res));
 	return res;
 }
 
@@ -1250,7 +1207,7 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
 	res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID));
 	/* At this point the reference through the mapping is still present */
 	if (mm_is_protected(vma->vm_mm) && pte_present(res))
-		uv_convert_from_secure_pte(res);
+		WARN_ON_ONCE(uv_convert_from_secure_pte(res));
 	return res;
 }
 
@@ -1287,9 +1244,10 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
 	/*
 	 * If something went wrong and the page could not be destroyed, or
 	 * if this is not a mm teardown, the slower export is used as
-	 * fallback instead.
+	 * fallback instead. If even that fails, print a warning and leak
+	 * the page, to avoid crashing the whole system.
 	 */
-	uv_convert_from_secure_pte(res);
+	WARN_ON_ONCE(uv_convert_from_secure_pte(res));
 	return res;
 }
 
@@ -1348,50 +1306,13 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
 {
 	if (pte_same(*ptep, entry))
 		return 0;
-	if (cpu_has_rdp() && !mm_has_pgste(vma->vm_mm) && pte_allow_rdp(*ptep, entry))
+	if (cpu_has_rdp() && pte_allow_rdp(*ptep, entry))
 		ptep_reset_dat_prot(vma->vm_mm, addr, ptep, entry);
 	else
 		ptep_xchg_direct(vma->vm_mm, addr, ptep, entry);
 	return 1;
 }
 
-/*
- * Additional functions to handle KVM guest page tables
- */
-void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, pte_t entry);
-void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void ptep_notify(struct mm_struct *mm, unsigned long addr,
-		 pte_t *ptep, unsigned long bits);
-int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
-		    pte_t *ptep, int prot, unsigned long bit);
-void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep , int reset);
-void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
-		    pte_t *sptep, pte_t *tptep, pte_t pte);
-void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
-
-bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long address,
-			    pte_t *ptep);
-int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
-			  unsigned char key, bool nq);
-int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
-			       unsigned char key, unsigned char *oldkey,
-			       bool nq, bool mr, bool mc);
-int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr);
-int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
-			  unsigned char *key);
-
-int set_pgste_bits(struct mm_struct *mm, unsigned long addr,
-				unsigned long bits, unsigned long value);
-int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep);
-int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
-			unsigned long *oldpte, unsigned long *oldpgste);
-void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr);
-void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr);
-void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr);
-
 #define pgprot_writecombine	pgprot_writecombine
 pgprot_t pgprot_writecombine(pgprot_t prot);
 
@@ -1406,23 +1327,12 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
 {
 	if (pte_present(entry))
 		entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED));
-	if (mm_has_pgste(mm)) {
-		for (;;) {
-			ptep_set_pte_at(mm, addr, ptep, entry);
-			if (--nr == 0)
-				break;
-			ptep++;
-			entry = __pte(pte_val(entry) + PAGE_SIZE);
-			addr += PAGE_SIZE;
-		}
-	} else {
-		for (;;) {
-			set_pte(ptep, entry);
-			if (--nr == 0)
-				break;
-			ptep++;
-			entry = __pte(pte_val(entry) + PAGE_SIZE);
-		}
+	for (;;) {
+		set_pte(ptep, entry);
+		if (--nr == 0)
+			break;
+		ptep++;
+		entry = __pte(pte_val(entry) + PAGE_SIZE);
 	}
 }
 #define set_ptes set_ptes
@@ -2015,9 +1925,6 @@ extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t p
 extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot);
 extern void vmem_unmap_4k_page(unsigned long addr);
 extern pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc);
-extern int s390_enable_sie(void);
-extern int s390_enable_skey(void);
-extern void s390_reset_cmma(struct mm_struct *mm);
 
 /* s390 has a private copy of get unmapped area to deal with cache synonyms */
 #define HAVE_ARCH_UNMAPPED_AREA
@@ -2026,40 +1933,4 @@ extern void s390_reset_cmma(struct mm_struct *mm);
 #define pmd_pgtable(pmd) \
 	((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE))
 
-static inline unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt)
-{
-	unsigned long *pgstes, res;
-
-	pgstes = pgt + _PAGE_ENTRIES;
-
-	res = (pgstes[0] & PGSTE_ST2_MASK) << 16;
-	res |= pgstes[1] & PGSTE_ST2_MASK;
-	res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16;
-	res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32;
-
-	return res;
-}
-
-static inline pgste_t pgste_get_lock(pte_t *ptep)
-{
-	unsigned long value = 0;
-#ifdef CONFIG_PGSTE
-	unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE);
-
-	do {
-		value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr);
-	} while (value & PGSTE_PCL_BIT);
-	value |= PGSTE_PCL_BIT;
-#endif
-	return __pgste(value);
-}
-
-static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
-{
-#ifdef CONFIG_PGSTE
-	barrier();
-	WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT);
-#endif
-}
-
 #endif /* _S390_PAGE_H */
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 1e50f6f1ad9d..7354b42ee994 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -36,7 +36,6 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
 
 #include <asm/tlbflush.h>
 #include <asm-generic/tlb.h>
-#include <asm/gmap.h>
 
 /*
  * Release the page cache reference for a pte removed by
@@ -85,8 +84,6 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 	tlb->mm->context.flush_mm = 1;
 	tlb->freed_tables = 1;
 	tlb->cleared_pmds = 1;
-	if (mm_has_pgste(tlb->mm))
-		gmap_unlink(tlb->mm, (unsigned long *)pte, address);
 	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pte));
 }
 
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index c5e02addcd67..dff035372601 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -471,65 +471,15 @@ do {									\
 #define arch_get_kernel_nofault __mvc_kernel_nofault
 #define arch_put_kernel_nofault __mvc_kernel_nofault
 
-void __cmpxchg_user_key_called_with_bad_pointer(void);
-
-int __cmpxchg_user_key1(unsigned long address, unsigned char *uval,
-			unsigned char old, unsigned char new, unsigned long key);
-int __cmpxchg_user_key2(unsigned long address, unsigned short *uval,
-			unsigned short old, unsigned short new, unsigned long key);
-int __cmpxchg_user_key4(unsigned long address, unsigned int *uval,
-			unsigned int old, unsigned int new, unsigned long key);
-int __cmpxchg_user_key8(unsigned long address, unsigned long *uval,
-			unsigned long old, unsigned long new, unsigned long key);
-int __cmpxchg_user_key16(unsigned long address, __uint128_t *uval,
-			 __uint128_t old, __uint128_t new, unsigned long key);
-
-static __always_inline int _cmpxchg_user_key(unsigned long address, void *uval,
-					     __uint128_t old, __uint128_t new,
-					     unsigned long key, int size)
-{
-	switch (size) {
-	case 1:  return __cmpxchg_user_key1(address, uval, old, new, key);
-	case 2:  return __cmpxchg_user_key2(address, uval, old, new, key);
-	case 4:  return __cmpxchg_user_key4(address, uval, old, new, key);
-	case 8:  return __cmpxchg_user_key8(address, uval, old, new, key);
-	case 16: return __cmpxchg_user_key16(address, uval, old, new, key);
-	default: __cmpxchg_user_key_called_with_bad_pointer();
-	}
-	return 0;
-}
-
-/**
- * cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys
- * @ptr: User space address of value to compare to @old and exchange with
- *	 @new. Must be aligned to sizeof(*@ptr).
- * @uval: Address where the old value of *@ptr is written to.
- * @old: Old value. Compared to the content pointed to by @ptr in order to
- *	 determine if the exchange occurs. The old value read from *@ptr is
- *	 written to *@uval.
- * @new: New value to place at *@ptr.
- * @key: Access key to use for checking storage key protection.
- *
- * Perform a cmpxchg on a user space target, honoring storage key protection.
- * @key alone determines how key checking is performed, neither
- * storage-protection-override nor fetch-protection-override apply.
- * The caller must compare *@uval and @old to determine if values have been
- * exchanged. In case of an exception *@uval is set to zero.
- *
- * Return:     0: cmpxchg executed
- *	       -EFAULT: an exception happened when trying to access *@ptr
- *	       -EAGAIN: maxed out number of retries (byte and short only)
- */
-#define cmpxchg_user_key(ptr, uval, old, new, key)			\
-({									\
-	__typeof__(ptr) __ptr = (ptr);					\
-	__typeof__(uval) __uval = (uval);				\
-									\
-	BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval)));		\
-	might_fault();							\
-	__chk_user_ptr(__ptr);						\
-	_cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval),	\
-			  (old), (new), (key), sizeof(*(__ptr)));	\
-})
+int __cmpxchg_key1(void *address, unsigned char *uval, unsigned char old,
+		   unsigned char new, unsigned long key);
+int __cmpxchg_key2(void *address, unsigned short *uval, unsigned short old,
+		   unsigned short new, unsigned long key);
+int __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old,
+		   unsigned int new, unsigned long key);
+int __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old,
+		   unsigned long new, unsigned long key);
+int __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old,
+		    __uint128_t new, unsigned long key);
 
 #endif /* __S390_UACCESS_H */
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index 8018549a1ad2..d919e69662f5 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -631,7 +631,8 @@ int uv_pin_shared(unsigned long paddr);
 int uv_destroy_folio(struct folio *folio);
 int uv_destroy_pte(pte_t pte);
 int uv_convert_from_secure_pte(pte_t pte);
-int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb);
+int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio);
+int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb);
 int uv_convert_from_secure(unsigned long paddr);
 int uv_convert_from_secure_folio(struct folio *folio);
 
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index ed46950be86f..a284f98d9716 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -134,14 +134,15 @@ static int uv_destroy(unsigned long paddr)
  */
 int uv_destroy_folio(struct folio *folio)
 {
+	unsigned long i;
 	int rc;
 
-	/* Large folios cannot be secure */
-	if (unlikely(folio_test_large(folio)))
-		return 0;
-
 	folio_get(folio);
-	rc = uv_destroy(folio_to_phys(folio));
+	for (i = 0; i < (1 << folio_order(folio)); i++) {
+		rc = uv_destroy(folio_to_phys(folio) + i * PAGE_SIZE);
+		if (rc)
+			break;
+	}
 	if (!rc)
 		clear_bit(PG_arch_1, &folio->flags.f);
 	folio_put(folio);
@@ -183,14 +184,15 @@ EXPORT_SYMBOL_GPL(uv_convert_from_secure);
  */
 int uv_convert_from_secure_folio(struct folio *folio)
 {
+	unsigned long i;
 	int rc;
 
-	/* Large folios cannot be secure */
-	if (unlikely(folio_test_large(folio)))
-		return 0;
-
 	folio_get(folio);
-	rc = uv_convert_from_secure(folio_to_phys(folio));
+	for (i = 0; i < (1 << folio_order(folio)); i++) {
+		rc = uv_convert_from_secure(folio_to_phys(folio) + i * PAGE_SIZE);
+		if (rc)
+			break;
+	}
 	if (!rc)
 		clear_bit(PG_arch_1, &folio->flags.f);
 	folio_put(folio);
@@ -207,39 +209,6 @@ int uv_convert_from_secure_pte(pte_t pte)
 	return uv_convert_from_secure_folio(pfn_folio(pte_pfn(pte)));
 }
 
-/**
- * should_export_before_import - Determine whether an export is needed
- * before an import-like operation
- * @uvcb: the Ultravisor control block of the UVC to be performed
- * @mm: the mm of the process
- *
- * Returns whether an export is needed before every import-like operation.
- * This is needed for shared pages, which don't trigger a secure storage
- * exception when accessed from a different guest.
- *
- * Although considered as one, the Unpin Page UVC is not an actual import,
- * so it is not affected.
- *
- * No export is needed also when there is only one protected VM, because the
- * page cannot belong to the wrong VM in that case (there is no "other VM"
- * it can belong to).
- *
- * Return: true if an export is needed before every import, otherwise false.
- */
-static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
-{
-	/*
-	 * The misc feature indicates, among other things, that importing a
-	 * shared page from a different protected VM will automatically also
-	 * transfer its ownership.
-	 */
-	if (uv_has_feature(BIT_UV_FEAT_MISC))
-		return false;
-	if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
-		return false;
-	return atomic_read(&mm->context.protected_count) > 1;
-}
-
 /*
  * Calculate the expected ref_count for a folio that would otherwise have no
  * further pins. This was cribbed from similar functions in other places in
@@ -279,7 +248,7 @@ static int expected_folio_refs(struct folio *folio)
  *          (it's the same logic as split_folio()), and the folio must be
  *          locked.
  */
-static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
+int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
 {
 	int expected, cc = 0;
 
@@ -309,20 +278,7 @@ static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
 		return -EAGAIN;
 	return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
 }
-
-static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct uv_cb_header *uvcb)
-{
-	int rc;
-
-	if (!folio_trylock(folio))
-		return -EAGAIN;
-	if (should_export_before_import(uvcb, mm))
-		uv_convert_from_secure(folio_to_phys(folio));
-	rc = __make_folio_secure(folio, uvcb);
-	folio_unlock(folio);
-
-	return rc;
-}
+EXPORT_SYMBOL(__make_folio_secure);
 
 /**
  * s390_wiggle_split_folio() - try to drain extra references to a folio and
@@ -337,7 +293,7 @@ static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct u
  *		   but another attempt can be made;
  *	   -EINVAL in case of other folio splitting errors. See split_folio().
  */
-static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio)
+int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio)
 {
 	int rc, tried_splits;
 
@@ -409,56 +365,7 @@ static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio)
 	}
 	return -EAGAIN;
 }
-
-int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb)
-{
-	struct vm_area_struct *vma;
-	struct folio_walk fw;
-	struct folio *folio;
-	int rc;
-
-	mmap_read_lock(mm);
-	vma = vma_lookup(mm, hva);
-	if (!vma) {
-		mmap_read_unlock(mm);
-		return -EFAULT;
-	}
-	folio = folio_walk_start(&fw, vma, hva, 0);
-	if (!folio) {
-		mmap_read_unlock(mm);
-		return -ENXIO;
-	}
-
-	folio_get(folio);
-	/*
-	 * Secure pages cannot be huge and userspace should not combine both.
-	 * In case userspace does it anyway this will result in an -EFAULT for
-	 * the unpack. The guest is thus never reaching secure mode.
-	 * If userspace plays dirty tricks and decides to map huge pages at a
-	 * later point in time, it will receive a segmentation fault or
-	 * KVM_RUN will return -EFAULT.
-	 */
-	if (folio_test_hugetlb(folio))
-		rc = -EFAULT;
-	else if (folio_test_large(folio))
-		rc = -E2BIG;
-	else if (!pte_write(fw.pte) || (pte_val(fw.pte) & _PAGE_INVALID))
-		rc = -ENXIO;
-	else
-		rc = make_folio_secure(mm, folio, uvcb);
-	folio_walk_end(&fw, vma);
-	mmap_read_unlock(mm);
-
-	if (rc == -E2BIG || rc == -EBUSY) {
-		rc = s390_wiggle_split_folio(mm, folio);
-		if (!rc)
-			rc = -EAGAIN;
-	}
-	folio_put(folio);
-
-	return rc;
-}
-EXPORT_SYMBOL_GPL(make_hva_secure);
+EXPORT_SYMBOL_GPL(s390_wiggle_split_folio);
 
 /*
  * To be called with the folio locked or with an extra reference! This will
@@ -470,21 +377,18 @@ int arch_make_folio_accessible(struct folio *folio)
 {
 	int rc = 0;
 
-	/* Large folios cannot be secure */
-	if (unlikely(folio_test_large(folio)))
-		return 0;
-
 	/*
-	 * PG_arch_1 is used in 2 places:
-	 * 1. for storage keys of hugetlb folios and KVM
-	 * 2. As an indication that this small folio might be secure. This can
-	 *    overindicate, e.g. we set the bit before calling
-	 *    convert_to_secure.
-	 * As secure pages are never large folios, both variants can co-exists.
+	 * PG_arch_1 is used as an indication that this small folio might be
+	 * secure. This can overindicate, e.g. we set the bit before calling
+	 * convert_to_secure.
 	 */
 	if (!test_bit(PG_arch_1, &folio->flags.f))
 		return 0;
 
+	/* Large folios cannot be secure. */
+	if (WARN_ON_ONCE(folio_test_large(folio)))
+		return -EFAULT;
+
 	rc = uv_pin_shared(folio_to_phys(folio));
 	if (!rc) {
 		clear_bit(PG_arch_1, &folio->flags.f);
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index f4ec8c1ce214..917ac740513e 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -30,6 +30,8 @@ config KVM
 	select KVM_VFIO
 	select MMU_NOTIFIER
 	select VIRT_XFER_TO_GUEST_WORK
+	select KVM_GENERIC_MMU_NOTIFIER
+	select KVM_MMU_LOCKLESS_AGING
 	help
 	  Support hosting paravirtualized guest machines using the SIE
 	  virtualization capability on the mainframe. This should work
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 9a723c48b05a..dac9d53b23d8 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -8,7 +8,8 @@ include $(srctree)/virt/kvm/Makefile.kvm
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
 kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o
+kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o
+kvm-y += dat.o gmap.o faultin.o
 
 kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
 obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
new file mode 100644
index 000000000000..129dc55a4a0d
--- /dev/null
+++ b/arch/s390/kvm/dat.c
@@ -0,0 +1,1391 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  KVM guest address space mapping code
+ *
+ *    Copyright IBM Corp. 2007, 2020, 2024
+ *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
+ *		 Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *		 David Hildenbrand <david@redhat.com>
+ *		 Janosch Frank <frankja@linux.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/pagewalk.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/swapops.h>
+#include <linux/ksm.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/pgtable.h>
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+#include <linux/pgalloc.h>
+
+#include <asm/page-states.h>
+#include <asm/tlb.h>
+#include "dat.h"
+
+int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc)
+{
+	void *o;
+
+	for ( ; mc->n_crsts < KVM_S390_MMU_CACHE_N_CRSTS; mc->n_crsts++) {
+		o = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER);
+		if (!o)
+			return -ENOMEM;
+		mc->crsts[mc->n_crsts] = o;
+	}
+	for ( ; mc->n_pts < KVM_S390_MMU_CACHE_N_PTS; mc->n_pts++) {
+		o = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+		if (!o)
+			return -ENOMEM;
+		mc->pts[mc->n_pts] = o;
+	}
+	for ( ; mc->n_rmaps < KVM_S390_MMU_CACHE_N_RMAPS; mc->n_rmaps++) {
+		o = kzalloc(sizeof(*mc->rmaps[0]), GFP_KERNEL_ACCOUNT);
+		if (!o)
+			return -ENOMEM;
+		mc->rmaps[mc->n_rmaps] = o;
+	}
+	return 0;
+}
+
+static inline struct page_table *dat_alloc_pt_noinit(struct kvm_s390_mmu_cache *mc)
+{
+	struct page_table *res;
+
+	res = kvm_s390_mmu_cache_alloc_pt(mc);
+	if (res)
+		__arch_set_page_dat(res, 1);
+	return res;
+}
+
+static inline struct crst_table *dat_alloc_crst_noinit(struct kvm_s390_mmu_cache *mc)
+{
+	struct crst_table *res;
+
+	res = kvm_s390_mmu_cache_alloc_crst(mc);
+	if (res)
+		__arch_set_page_dat(res, 1UL << CRST_ALLOC_ORDER);
+	return res;
+}
+
+struct crst_table *dat_alloc_crst_sleepable(unsigned long init)
+{
+	struct page *page;
+	void *virt;
+
+	page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER);
+	if (!page)
+		return NULL;
+	virt = page_to_virt(page);
+	__arch_set_page_dat(virt, 1UL << CRST_ALLOC_ORDER);
+	crst_table_init(virt, init);
+	return virt;
+}
+
+void dat_free_level(struct crst_table *table, bool owns_ptes)
+{
+	unsigned int i;
+
+	for (i = 0; i < _CRST_ENTRIES; i++) {
+		if (table->crstes[i].h.fc || table->crstes[i].h.i)
+			continue;
+		if (!is_pmd(table->crstes[i]))
+			dat_free_level(dereference_crste(table->crstes[i]), owns_ptes);
+		else if (owns_ptes)
+			dat_free_pt(dereference_pmd(table->crstes[i].pmd));
+	}
+	dat_free_crst(table);
+}
+
+int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype)
+{
+	struct crst_table *table;
+	union crste crste;
+
+	while (asce->dt > newtype) {
+		table = dereference_asce(*asce);
+		crste = table->crstes[0];
+		if (crste.h.fc)
+			return 0;
+		if (!crste.h.i) {
+			asce->rsto = crste.h.fc0.to;
+			dat_free_crst(table);
+		} else {
+			crste.h.tt--;
+			crst_table_init((void *)table, crste.val);
+		}
+		asce->dt--;
+	}
+	while (asce->dt < newtype) {
+		crste = _crste_fc0(asce->rsto, asce->dt + 1);
+		table = dat_alloc_crst_noinit(mc);
+		if (!table)
+			return -ENOMEM;
+		crst_table_init((void *)table, _CRSTE_HOLE(crste.h.tt).val);
+		table->crstes[0] = crste;
+		asce->rsto = __pa(table) >> PAGE_SHIFT;
+		asce->dt++;
+	}
+	return 0;
+}
+
+/**
+ * dat_crstep_xchg() - Exchange a gmap CRSTE with another.
+ * @crstep: Pointer to the CRST entry
+ * @new: Replacement entry.
+ * @gfn: The affected guest address.
+ * @asce: The ASCE of the address space.
+ *
+ * Context: This function is assumed to be called with kvm->mmu_lock held.
+ */
+void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce)
+{
+	if (crstep->h.i) {
+		WRITE_ONCE(*crstep, new);
+		return;
+	} else if (cpu_has_edat2()) {
+		crdte_crste(crstep, *crstep, new, gfn, asce);
+		return;
+	}
+
+	if (machine_has_tlb_guest())
+		idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL);
+	else
+		idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL);
+	WRITE_ONCE(*crstep, new);
+}
+
+/**
+ * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another.
+ * @crstep: Pointer to the CRST entry.
+ * @old: Expected old value.
+ * @new: Replacement entry.
+ * @gfn: The affected guest address.
+ * @asce: The asce of the address space.
+ *
+ * This function is needed to atomically exchange a CRSTE that potentially
+ * maps a prefix area, without having to invalidate it in between.
+ *
+ * Context: This function is assumed to be called with kvm->mmu_lock held.
+ *
+ * Return: %true if the exchange was successful.
+ */
+bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn,
+			    union asce asce)
+{
+	if (old.h.i)
+		return arch_try_cmpxchg((long *)crstep, &old.val, new.val);
+	if (cpu_has_edat2())
+		return crdte_crste(crstep, old, new, gfn, asce);
+	return cspg_crste(crstep, old, new);
+}
+
+static void dat_set_storage_key_from_pgste(union pte pte, union pgste pgste)
+{
+	union skey nkey = { .acc = pgste.acc, .fp = pgste.fp };
+
+	page_set_storage_key(pte_origin(pte), nkey.skey, 0);
+}
+
+static void dat_move_storage_key(union pte old, union pte new)
+{
+	page_set_storage_key(pte_origin(new), page_get_storage_key(pte_origin(old)), 1);
+}
+
+static union pgste dat_save_storage_key_into_pgste(union pte pte, union pgste pgste)
+{
+	union skey skey;
+
+	skey.skey = page_get_storage_key(pte_origin(pte));
+
+	pgste.acc = skey.acc;
+	pgste.fp = skey.fp;
+	pgste.gr |= skey.r;
+	pgste.gc |= skey.c;
+
+	return pgste;
+}
+
+union pgste __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, gfn_t gfn,
+			    union asce asce, bool uses_skeys)
+{
+	union pte old = READ_ONCE(*ptep);
+
+	/* Updating only the software bits while holding the pgste lock. */
+	if (!((old.val ^ new.val) & ~_PAGE_SW_BITS)) {
+		WRITE_ONCE(ptep->swbyte, new.swbyte);
+		return pgste;
+	}
+
+	if (!old.h.i) {
+		unsigned long opts = IPTE_GUEST_ASCE | (pgste.nodat ? IPTE_NODAT : 0);
+
+		if (machine_has_tlb_guest())
+			__ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, opts, asce.val, IPTE_GLOBAL);
+		else
+			__ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, 0, 0, IPTE_GLOBAL);
+	}
+
+	if (uses_skeys) {
+		if (old.h.i && !new.h.i)
+			/* Invalid to valid: restore storage keys from PGSTE. */
+			dat_set_storage_key_from_pgste(new, pgste);
+		else if (!old.h.i && new.h.i)
+			/* Valid to invalid: save storage keys to PGSTE. */
+			pgste = dat_save_storage_key_into_pgste(old, pgste);
+		else if (!old.h.i && !new.h.i)
+			/* Valid to valid: move storage keys. */
+			if (old.h.pfra != new.h.pfra)
+				dat_move_storage_key(old, new);
+		/* Invalid to invalid: nothing to do. */
+	}
+
+	WRITE_ONCE(*ptep, new);
+	return pgste;
+}
+
+/*
+ * dat_split_ste() - Split a segment table entry into page table entries.
+ *
+ * Context: This function is assumed to be called with kvm->mmu_lock held.
+ *
+ * Return: 0 in case of success, -ENOMEM if running out of memory.
+ */
+static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t gfn,
+			 union asce asce, bool uses_skeys)
+{
+	union pgste pgste_init;
+	struct page_table *pt;
+	union pmd new, old;
+	union pte init;
+	int i;
+
+	BUG_ON(!mc);
+	old = READ_ONCE(*pmdp);
+
+	/* Already split, nothing to do. */
+	if (!old.h.i && !old.h.fc)
+		return 0;
+
+	pt = dat_alloc_pt_noinit(mc);
+	if (!pt)
+		return -ENOMEM;
+	new.val = virt_to_phys(pt);
+
+	while (old.h.i || old.h.fc) {
+		init.val = pmd_origin_large(old);
+		init.h.p = old.h.p;
+		init.h.i = old.h.i;
+		init.s.d = old.s.fc1.d;
+		init.s.w = old.s.fc1.w;
+		init.s.y = old.s.fc1.y;
+		init.s.sd = old.s.fc1.sd;
+		init.s.pr = old.s.fc1.pr;
+		pgste_init.val = 0;
+		if (old.h.fc) {
+			for (i = 0; i < _PAGE_ENTRIES; i++)
+				pt->ptes[i].val = init.val | i * PAGE_SIZE;
+			/* No need to take locks as the page table is not installed yet. */
+			pgste_init.prefix_notif = old.s.fc1.prefix_notif;
+			pgste_init.pcl = uses_skeys && init.h.i;
+			dat_init_pgstes(pt, pgste_init.val);
+		} else {
+			dat_init_page_table(pt, init.val, 0);
+		}
+
+		if (dat_pmdp_xchg_atomic(pmdp, old, new, gfn, asce)) {
+			if (!pgste_init.pcl)
+				return 0;
+			for (i = 0; i < _PAGE_ENTRIES; i++) {
+				union pgste pgste = pt->pgstes[i];
+
+				pgste = dat_save_storage_key_into_pgste(pt->ptes[i], pgste);
+				pgste_set_unlock(pt->ptes + i, pgste);
+			}
+			return 0;
+		}
+		old = READ_ONCE(*pmdp);
+	}
+
+	dat_free_pt(pt);
+	return 0;
+}
+
+/*
+ * dat_split_crste() - Split a crste into smaller crstes.
+ *
+ * Context: This function is assumed to be called with kvm->mmu_lock held.
+ *
+ * Return: %0 in case of success, %-ENOMEM if running out of memory.
+ */
+static int dat_split_crste(struct kvm_s390_mmu_cache *mc, union crste *crstep,
+			   gfn_t gfn, union asce asce, bool uses_skeys)
+{
+	struct crst_table *table;
+	union crste old, new, init;
+	int i;
+
+	old = READ_ONCE(*crstep);
+	if (is_pmd(old))
+		return dat_split_ste(mc, &crstep->pmd, gfn, asce, uses_skeys);
+
+	BUG_ON(!mc);
+
+	/* Already split, nothing to do. */
+	if (!old.h.i && !old.h.fc)
+		return 0;
+
+	table = dat_alloc_crst_noinit(mc);
+	if (!table)
+		return -ENOMEM;
+
+	new.val = virt_to_phys(table);
+	new.h.tt = old.h.tt;
+	new.h.fc0.tl = _REGION_ENTRY_LENGTH;
+
+	while (old.h.i || old.h.fc) {
+		init = old;
+		init.h.tt--;
+		if (old.h.fc) {
+			for (i = 0; i < _CRST_ENTRIES; i++)
+				table->crstes[i].val = init.val | i * HPAGE_SIZE;
+		} else {
+			crst_table_init((void *)table, init.val);
+		}
+		if (dat_crstep_xchg_atomic(crstep, old, new, gfn, asce))
+			return 0;
+		old = READ_ONCE(*crstep);
+	}
+
+	dat_free_crst(table);
+	return 0;
+}
+
+/**
+ * dat_entry_walk() - Walk the gmap page tables.
+ * @mc: Cache to use to allocate dat tables, if needed; can be NULL if neither
+ *      %DAT_WALK_SPLIT nor %DAT_WALK_ALLOC is specified in @flags.
+ * @gfn: Guest frame.
+ * @asce: The ASCE of the address space.
+ * @flags: Flags from DAT_WALK_* macros.
+ * @walk_level: Level to walk to, from TABLE_TYPE_* macros.
+ * @last: Will be filled with the last visited non-pte DAT entry.
+ * @ptepp: Will be filled with the last visited pte entry, if any, otherwise NULL.
+ *
+ * Returns a table entry pointer for the given guest address and @walk_level.
+ *
+ * The @flags have the following meanings:
+ * * %DAT_WALK_IGN_HOLES: consider holes as normal table entries
+ * * %DAT_WALK_ALLOC: allocate new tables to reach the requested level, if needed
+ * * %DAT_WALK_SPLIT: split existing large pages to reach the requested level, if needed
+ * * %DAT_WALK_LEAF: return successfully whenever a large page is encountered
+ * * %DAT_WALK_ANY: return successfully even if the requested level could not be reached
+ * * %DAT_WALK_CONTINUE: walk to the requested level with the specified flags, and then try to
+ *                       continue walking to ptes with only DAT_WALK_ANY
+ * * %DAT_WALK_USES_SKEYS: storage keys are in use
+ *
+ * Context: called with kvm->mmu_lock held.
+ *
+ * Return:
+ * * %PGM_ADDRESSING if the requested address lies outside memory
+ * * a PIC number if the requested address lies in a memory hole of type _DAT_TOKEN_PIC
+ * * %-EFAULT if the requested address lies inside a memory hole of a different type
+ * * %-EINVAL if the given ASCE is not compatible with the requested level
+ * * %-EFBIG if the requested level could not be reached because a larger frame was found
+ * * %-ENOENT if the requested level could not be reached for other reasons
+ * * %-ENOMEM if running out of memory while allocating or splitting a table
+ */
+int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags,
+		   int walk_level, union crste **last, union pte **ptepp)
+{
+	union vaddress vaddr = { .addr = gfn_to_gpa(gfn) };
+	bool continue_anyway = flags & DAT_WALK_CONTINUE;
+	bool uses_skeys = flags & DAT_WALK_USES_SKEYS;
+	bool ign_holes = flags & DAT_WALK_IGN_HOLES;
+	bool allocate = flags & DAT_WALK_ALLOC;
+	bool split = flags & DAT_WALK_SPLIT;
+	bool leaf = flags & DAT_WALK_LEAF;
+	bool any = flags & DAT_WALK_ANY;
+	struct page_table *pgtable;
+	struct crst_table *table;
+	union crste entry;
+	int rc;
+
+	*last = NULL;
+	*ptepp = NULL;
+	if (WARN_ON_ONCE(unlikely(!asce.val)))
+		return -EINVAL;
+	if (WARN_ON_ONCE(unlikely(walk_level > asce.dt)))
+		return -EINVAL;
+	if (!asce_contains_gfn(asce, gfn))
+		return PGM_ADDRESSING;
+
+	table = dereference_asce(asce);
+	if (asce.dt >= ASCE_TYPE_REGION1) {
+		*last = table->crstes + vaddr.rfx;
+		entry = READ_ONCE(**last);
+		if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION1))
+			return -EINVAL;
+		if (crste_hole(entry) && !ign_holes)
+			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+		if (walk_level == TABLE_TYPE_REGION1)
+			return 0;
+		if (entry.pgd.h.i) {
+			if (!allocate)
+				return any ? 0 : -ENOENT;
+			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		table = dereference_crste(entry.pgd);
+	}
+
+	if (asce.dt >= ASCE_TYPE_REGION2) {
+		*last = table->crstes + vaddr.rsx;
+		entry = READ_ONCE(**last);
+		if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION2))
+			return -EINVAL;
+		if (crste_hole(entry) && !ign_holes)
+			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+		if (walk_level == TABLE_TYPE_REGION2)
+			return 0;
+		if (entry.p4d.h.i) {
+			if (!allocate)
+				return any ? 0 : -ENOENT;
+			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		table = dereference_crste(entry.p4d);
+	}
+
+	if (asce.dt >= ASCE_TYPE_REGION3) {
+		*last = table->crstes + vaddr.rtx;
+		entry = READ_ONCE(**last);
+		if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION3))
+			return -EINVAL;
+		if (crste_hole(entry) && !ign_holes)
+			return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+		if (walk_level == TABLE_TYPE_REGION3 &&
+		    continue_anyway && !entry.pud.h.fc && !entry.h.i) {
+			walk_level = TABLE_TYPE_PAGE_TABLE;
+			allocate = false;
+		}
+		if (walk_level == TABLE_TYPE_REGION3 || ((leaf || any) && entry.pud.h.fc))
+			return 0;
+		if (entry.pud.h.i && !entry.pud.h.fc) {
+			if (!allocate)
+				return any ? 0 : -ENOENT;
+			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		if (walk_level <= TABLE_TYPE_SEGMENT && entry.pud.h.fc) {
+			if (!split)
+				return -EFBIG;
+			rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys);
+			if (rc)
+				return rc;
+			entry = READ_ONCE(**last);
+		}
+		table = dereference_crste(entry.pud);
+	}
+
+	*last = table->crstes + vaddr.sx;
+	entry = READ_ONCE(**last);
+	if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_SEGMENT))
+		return -EINVAL;
+	if (crste_hole(entry) && !ign_holes)
+		return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT;
+	if (continue_anyway && !entry.pmd.h.fc && !entry.h.i) {
+		walk_level = TABLE_TYPE_PAGE_TABLE;
+		allocate = false;
+	}
+	if (walk_level == TABLE_TYPE_SEGMENT || ((leaf || any) && entry.pmd.h.fc))
+		return 0;
+
+	if (entry.pmd.h.i && !entry.pmd.h.fc) {
+		if (!allocate)
+			return any ? 0 : -ENOENT;
+		rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys);
+		if (rc)
+			return rc;
+		entry = READ_ONCE(**last);
+	}
+	if (walk_level <= TABLE_TYPE_PAGE_TABLE && entry.pmd.h.fc) {
+		if (!split)
+			return -EFBIG;
+		rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys);
+		if (rc)
+			return rc;
+		entry = READ_ONCE(**last);
+	}
+	pgtable = dereference_pmd(entry.pmd);
+	*ptepp = pgtable->ptes + vaddr.px;
+	if (pte_hole(**ptepp) && !ign_holes)
+		return (*ptepp)->tok.type == _DAT_TOKEN_PIC ? (*ptepp)->tok.par : -EFAULT;
+	return 0;
+}
+
+static long dat_pte_walk_range(gfn_t gfn, gfn_t end, struct page_table *table, struct dat_walk *w)
+{
+	unsigned int idx = gfn & (_PAGE_ENTRIES - 1);
+	long rc = 0;
+
+	for ( ; gfn < end; idx++, gfn++) {
+		if (pte_hole(READ_ONCE(table->ptes[idx]))) {
+			if (!(w->flags & DAT_WALK_IGN_HOLES))
+				return -EFAULT;
+			if (!(w->flags & DAT_WALK_ANY))
+				continue;
+		}
+
+		rc = w->ops->pte_entry(table->ptes + idx, gfn, gfn + 1, w);
+		if (rc)
+			break;
+	}
+	return rc;
+}
+
+static long dat_crste_walk_range(gfn_t start, gfn_t end, struct crst_table *table,
+				 struct dat_walk *walk)
+{
+	unsigned long idx, cur_shift, cur_size;
+	dat_walk_op the_op;
+	union crste crste;
+	gfn_t cur, next;
+	long rc = 0;
+
+	cur_shift = 8 + table->crstes[0].h.tt * 11;
+	idx = (start >> cur_shift) & (_CRST_ENTRIES - 1);
+	cur_size = 1UL << cur_shift;
+	/* Stop as soon as a callback or a lower-level walk returns non-zero. */
+	for (cur = ALIGN_DOWN(start, cur_size); !rc && cur < end; idx++, cur = next) {
+		next = cur + cur_size;
+		walk->last = table->crstes + idx;
+		crste = READ_ONCE(*walk->last);
+
+		if (crste_hole(crste)) {
+			if (!(walk->flags & DAT_WALK_IGN_HOLES))
+				return -EFAULT;
+			if (!(walk->flags & DAT_WALK_ANY))
+				continue;
+		}
+
+		the_op = walk->ops->crste_ops[crste.h.tt];
+		if (the_op) {
+			rc = the_op(walk->last, cur, next, walk);
+			crste = READ_ONCE(*walk->last);
+		}
+		if (rc)
+			break;
+		if (!crste.h.i && !crste.h.fc) {
+			if (!is_pmd(crste))
+				rc = dat_crste_walk_range(max(start, cur), min(end, next),
+							  _dereference_crste(crste), walk);
+			else if (walk->ops->pte_entry)
+				rc = dat_pte_walk_range(max(start, cur), min(end, next),
+							dereference_pmd(crste.pmd), walk);
+		}
+	}
+	return rc;
+}
+
+/**
+ * _dat_walk_gfn_range() - Walk DAT tables.
+ * @start: The first guest page frame to walk.
+ * @end: The guest page frame immediately after the last one to walk.
+ * @asce: The ASCE of the guest mapping.
+ * @ops: The dat_walk_ops that will be used to perform the walk.
+ * @flags: Flags from DAT_WALK_* (only DAT_WALK_IGN_HOLES and DAT_WALK_ANY are used).
+ * @priv: Will be passed as-is to the callbacks.
+ *
+ * Any callback returning non-zero causes the walk to stop immediately.
+ *
+ * Return: %-EINVAL in case of error, %-EFAULT if @start is too high for the
+ *         given ASCE unless the DAT_WALK_IGN_HOLES flag is specified,
+ *         otherwise it returns whatever the callbacks return.
+ */
+long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce,
+			 const struct dat_walk_ops *ops, int flags, void *priv)
+{
+	struct crst_table *table = dereference_asce(asce);
+	struct dat_walk walk = {
+		.ops	= ops,
+		.asce	= asce,
+		.priv	= priv,
+		.flags	= flags,
+		.start	= start,
+		.end	= end,
+	};
+
+	if (WARN_ON_ONCE(unlikely(!asce.val)))
+		return -EINVAL;
+	if (!asce_contains_gfn(asce, start))
+		return (flags & DAT_WALK_IGN_HOLES) ? 0 : -EFAULT;
+
+	return dat_crste_walk_range(start, min(end, asce_end(asce)), table, &walk);
+}
+
+int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey)
+{
+	union crste *crstep;
+	union pgste pgste;
+	union pte *ptep;
+	int rc;
+
+	skey->skey = 0;
+	rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
+	if (rc)
+		return rc;
+
+	if (!ptep) {
+		union crste crste;
+
+		crste = READ_ONCE(*crstep);
+		if (!crste.h.fc || !crste.s.fc1.pr)
+			return 0;
+		skey->skey = page_get_storage_key(large_crste_to_phys(crste, gfn));
+		return 0;
+	}
+	pgste = pgste_get_lock(ptep);
+	if (ptep->h.i) {
+		skey->acc = pgste.acc;
+		skey->fp = pgste.fp;
+	} else {
+		skey->skey = page_get_storage_key(pte_origin(*ptep));
+	}
+	skey->r |= pgste.gr;
+	skey->c |= pgste.gc;
+	pgste_set_unlock(ptep, pgste);
+	return 0;
+}
+
+static void dat_update_ptep_sd(union pgste old, union pgste pgste, union pte *ptep)
+{
+	if (pgste.acc != old.acc || pgste.fp != old.fp || pgste.gr != old.gr || pgste.gc != old.gc)
+		__atomic64_or(_PAGE_SD, &ptep->val);
+}
+
+int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
+			union skey skey, bool nq)
+{
+	union pgste pgste, old;
+	union crste *crstep;
+	union pte *ptep;
+	int rc;
+
+	rc = dat_entry_walk(mc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE,
+			    &crstep, &ptep);
+	if (rc)
+		return rc;
+
+	if (!ptep) {
+		page_set_storage_key(large_crste_to_phys(*crstep, gfn), skey.skey, !nq);
+		return 0;
+	}
+
+	old = pgste_get_lock(ptep);
+	pgste = old;
+
+	pgste.acc = skey.acc;
+	pgste.fp = skey.fp;
+	pgste.gc = skey.c;
+	pgste.gr = skey.r;
+
+	if (!ptep->h.i) {
+		union skey old_skey;
+
+		old_skey.skey = page_get_storage_key(pte_origin(*ptep));
+		pgste.hc |= old_skey.c;
+		pgste.hr |= old_skey.r;
+
+		/* Guest R/C are kept in the PGSTE; clear them in the real key. */
+		skey.r = 0;
+		skey.c = 0;
+		page_set_storage_key(pte_origin(*ptep), skey.skey, !nq);
+	}
+
+	dat_update_ptep_sd(old, pgste, ptep);
+	pgste_set_unlock(ptep, pgste);
+	return 0;
+}
+
+static bool page_cond_set_storage_key(phys_addr_t paddr, union skey skey, union skey *oldkey,
+				      bool nq, bool mr, bool mc)
+{
+	oldkey->skey = page_get_storage_key(paddr);
+	if (oldkey->acc == skey.acc && oldkey->fp == skey.fp &&
+	    (oldkey->r == skey.r || mr) && (oldkey->c == skey.c || mc))
+		return false;
+	page_set_storage_key(paddr, skey.skey, !nq);
+	return true;
+}
+
+int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn,
+			     union skey skey, union skey *oldkey, bool nq, bool mr, bool mc)
+{
+	union pgste pgste, old;
+	union crste *crstep;
+	union skey prev = {};
+	union pte *ptep;
+	int rc;
+
+	rc = dat_entry_walk(mmc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE,
+			    &crstep, &ptep);
+	if (rc)
+		return rc;
+	/* @oldkey is optional, but the helper always stores the previous key. */
+	if (!ptep)
+		return page_cond_set_storage_key(large_crste_to_phys(*crstep, gfn), skey,
+						 oldkey ?: &prev, nq, mr, mc);
+
+	old = pgste_get_lock(ptep);
+	pgste = old;
+
+	rc = 1;
+	pgste.acc = skey.acc;
+	pgste.fp = skey.fp;
+	pgste.gc = skey.c;
+	pgste.gr = skey.r;
+
+	if (!ptep->h.i) {
+		rc = page_cond_set_storage_key(pte_origin(*ptep), skey, &prev, nq, mr, mc);
+		pgste.hc |= prev.c;
+		pgste.hr |= prev.r;
+		prev.c |= old.gc;
+		prev.r |= old.gr;
+	} else {
+		prev.acc = old.acc;
+		prev.fp = old.fp;
+		prev.c = old.gc;
+		prev.r = old.gr;
+	}
+	if (oldkey)
+		*oldkey = prev;
+
+	dat_update_ptep_sd(old, pgste, ptep);
+	pgste_set_unlock(ptep, pgste);
+	return rc;
+}
+
+int dat_reset_reference_bit(union asce asce, gfn_t gfn)
+{
+	union pgste pgste, old;
+	union crste *crstep;
+	union pte *ptep;
+	int rc;
+
+	rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
+	if (rc)
+		return rc;
+
+	if (!ptep) {
+		union crste crste = READ_ONCE(*crstep);
+
+		if (!crste.h.fc || !crste.s.fc1.pr)
+			return 0;
+		return page_reset_referenced(large_crste_to_phys(crste, gfn));
+	}
+	old = pgste_get_lock(ptep);
+	pgste = old;
+	/* Accumulate the real reference bit into hr; do not drop an earlier one. */
+	if (!ptep->h.i) {
+		rc = page_reset_referenced(pte_origin(*ptep));
+		pgste.hr |= rc >> 1;
+	}
+	rc |= (pgste.gr << 1) | pgste.gc;
+	pgste.gr = 0;
+
+	dat_update_ptep_sd(old, pgste, ptep);
+	pgste_set_unlock(ptep, pgste);
+	return rc;
+}
+
+static long dat_reset_skeys_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	union pgste pgste;
+
+	pgste = pgste_get_lock(ptep);
+	pgste.acc = 0;
+	pgste.fp = 0;
+	pgste.gr = 0;
+	pgste.gc = 0;
+	if (ptep->s.pr)
+		page_set_storage_key(pte_origin(*ptep), PAGE_DEFAULT_KEY, 1);
+	pgste_set_unlock(ptep, pgste);
+
+	if (need_resched())
+		return next;
+	return 0;
+}
+
+static long dat_reset_skeys_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	phys_addr_t addr, end, origin = crste_origin_large(*crstep);
+
+	if (!crstep->h.fc || !crstep->s.fc1.pr)
+		return 0;
+
+	addr = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin;
+	end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin;
+	while (ALIGN(addr + 1, _SEGMENT_SIZE) <= end)
+		addr = sske_frame(addr, PAGE_DEFAULT_KEY);
+	for ( ; addr < end; addr += PAGE_SIZE)
+		page_set_storage_key(addr, PAGE_DEFAULT_KEY, 1);
+
+	if (need_resched())
+		return next;
+	return 0;
+}
+
+long dat_reset_skeys(union asce asce, gfn_t start)
+{
+	const struct dat_walk_ops ops = {
+		.pte_entry = dat_reset_skeys_pte,
+		.pmd_entry = dat_reset_skeys_crste,
+		.pud_entry = dat_reset_skeys_crste,
+	};
+
+	return _dat_walk_gfn_range(start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, NULL);
+}
+
+struct slot_priv {
+	unsigned long token;
+	struct kvm_s390_mmu_cache *mc;
+};
+
+static long _dat_slot_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct slot_priv *p = walk->priv;
+	union crste dummy = { .val = p->token };
+	union pte new_pte, pte = READ_ONCE(*ptep);
+
+	new_pte = _PTE_TOK(dummy.tok.type, dummy.tok.par);
+
+	/* Table entry already in the desired state. */
+	if (pte.val == new_pte.val)
+		return 0;
+
+	dat_ptep_xchg(ptep, new_pte, gfn, walk->asce, false);
+	return 0;
+}
+
+static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	union crste new_crste, crste = READ_ONCE(*crstep);
+	struct slot_priv *p = walk->priv;
+
+	new_crste.val = p->token;
+	new_crste.h.tt = crste.h.tt;
+
+	/* Table entry already in the desired state. */
+	if (crste.val == new_crste.val)
+		return 0;
+
+	/* The walked range covers this whole entry: replace it with the token. */
+	if (walk->start <= gfn && walk->end >= next) {
+		dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce);
+		/* A lower level table was present, needs to be freed. */
+		if (!crste.h.fc && !crste.h.i) {
+			if (is_pmd(crste))
+				dat_free_pt(dereference_pmd(crste.pmd));
+			else
+				dat_free_level(dereference_crste(crste), true);
+		}
+		return 0;
+	}
+
+	/* A lower level table is present, things will be handled there. */
+	if (!crste.h.fc && !crste.h.i)
+		return 0;
+	/* Split (install a lower level table), and handle things there. */
+	return dat_split_crste(p->mc, crstep, gfn, walk->asce, false);
+}
+
+static const struct dat_walk_ops dat_slot_ops = {
+	.pte_entry = _dat_slot_pte,
+	.crste_ops = { _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, },
+};
+
+int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end,
+		 u16 type, u16 param)
+{
+	struct slot_priv priv = {
+		.token = _CRSTE_TOK(0, type, param).val,
+		.mc = mc,
+	};
+
+	return _dat_walk_gfn_range(start, end, asce, &dat_slot_ops,
+				   DAT_WALK_IGN_HOLES | DAT_WALK_ANY, &priv);
+}
+
+static void pgste_set_unlock_multiple(union pte *first, int n, union pgste *pgstes)
+{
+	int i;
+
+	for (i = 0; i < n; i++) {
+		if (!pgstes[i].pcl)
+			break;
+		pgste_set_unlock(first + i, pgstes[i]);
+	}
+}
+
+static bool pgste_get_trylock_multiple(union pte *first, int n, union pgste *pgstes)
+{
+	int i;
+
+	for (i = 0; i < n; i++) {
+		if (pgste_get_trylock(first + i, pgstes + i))
+			continue;
+		/* Release only the i entries locked by this attempt. */
+		pgste_set_unlock_multiple(first, i, pgstes);
+		return false;
+	}
+	return true;
+}
+
+unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param)
+{
+	union pgste pgstes[4] = {};
+	unsigned long res = 0;
+	int i, n;
+
+	n = param.len + 1;
+
+	while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes))
+		cpu_relax();
+
+	for (i = 0; i < n; i++)
+		res = res << 16 | pgstes[i].val16;
+
+	pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes);
+	return res;
+}
+
+void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val)
+{
+	union pgste pgstes[4] = {};
+	int i, n;
+
+	n = param.len + 1;
+
+	while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes))
+		cpu_relax();
+
+	for (i = param.len; i >= 0; i--) {
+		pgstes[i].val16 = val;
+		val = val >> 16;
+	}
+
+	pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes);
+}
+
+static long _dat_test_young_pte(union pte *ptep, gfn_t start, gfn_t end, struct dat_walk *walk)
+{
+	return ptep->s.y;
+}
+
+static long _dat_test_young_crste(union crste *crstep, gfn_t start, gfn_t end,
+				  struct dat_walk *walk)
+{
+	return crstep->h.fc && crstep->s.fc1.y;
+}
+
+static const struct dat_walk_ops test_age_ops = {
+	.pte_entry = _dat_test_young_pte,
+	.pmd_entry = _dat_test_young_crste,
+	.pud_entry = _dat_test_young_crste,
+};
+
+/**
+ * dat_test_age_gfn() - Test young.
+ * @asce: The ASCE whose address range is to be tested.
+ * @start: The first guest frame of the range to check.
+ * @end: The guest frame after the last in the range.
+ *
+ * Context: called by KVM common code with the kvm mmu write lock held.
+ *
+ * Return: %true if any page in the given range is young, otherwise %false.
+ */
+bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end)
+{
+	return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0;
+}
+
+int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
+	     bool uses_skeys, struct guest_fault *f)
+{
+	union crste oldval, newval;
+	union pte newpte, oldpte;
+	union pgste pgste;
+	int rc = 0;
+
+	rc = dat_entry_walk(mc, f->gfn, asce, DAT_WALK_ALLOC_CONTINUE, level, &f->crstep, &f->ptep);
+	if (rc == -EINVAL || rc == -ENOMEM)
+		return rc;
+	if (rc)
+		return -EAGAIN;
+
+	if (WARN_ON_ONCE(unlikely(get_level(f->crstep, f->ptep) > level)))
+		return -EINVAL;
+
+	if (f->ptep) {
+		pgste = pgste_get_lock(f->ptep);
+		oldpte = *f->ptep;
+		newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
+		newpte.s.sd = oldpte.s.sd;
+		oldpte.s.sd = 0;
+		if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
+			pgste = __dat_ptep_xchg(f->ptep, pgste, newpte, f->gfn, asce, uses_skeys);
+			if (f->callback)
+				f->callback(f);
+		} else {
+			rc = -EAGAIN;
+		}
+		pgste_set_unlock(f->ptep, pgste);
+	} else {
+		oldval = READ_ONCE(*f->crstep);
+		newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
+				    f->write_attempt | oldval.s.fc1.d);
+		newval.s.fc1.sd = oldval.s.fc1.sd;
+		if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
+		    crste_origin_large(oldval) != crste_origin_large(newval))
+			return -EAGAIN;
+		if (!dat_crstep_xchg_atomic(f->crstep, oldval, newval, f->gfn, asce))
+			return -EAGAIN;
+		if (f->callback)
+			f->callback(f);
+	}
+
+	return rc;
+}
+
+static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	union crste crste = READ_ONCE(*crstep);
+	int *n = walk->priv;
+
+	if (!crste.h.fc || crste.h.i || crste.h.p)
+		return 0;
+
+	*n = 2;
+	if (crste.s.fc1.prefix_notif)
+		return 0;
+	crste.s.fc1.prefix_notif = 1;
+	dat_crstep_xchg(crstep, crste, gfn, walk->asce);
+	return 0;
+}
+
+static long dat_set_pn_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	int *n = walk->priv;
+	union pgste pgste;
+
+	pgste = pgste_get_lock(ptep);
+	if (!ptep->h.i && !ptep->h.p) {
+		pgste.prefix_notif = 1;
+		*n += 1;
+	}
+	pgste_set_unlock(ptep, pgste);
+	return 0;
+}
+
+int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn)
+{
+	static const struct dat_walk_ops ops = {
+		.pte_entry = dat_set_pn_pte,
+		.pmd_entry = dat_set_pn_crste,
+		.pud_entry = dat_set_pn_crste,
+	};
+
+	int n = 0;
+
+	_dat_walk_gfn_range(gfn, gfn + 2, asce, &ops, DAT_WALK_IGN_HOLES, &n);
+	if (n != 2)
+		return -EAGAIN;
+	return 0;
+}
+
+/**
+ * dat_perform_essa() - Perform ESSA actions on the PGSTE.
+ * @asce: The asce to operate on.
+ * @gfn: The guest page frame to operate on.
+ * @orc: The specific action to perform, see the ESSA_SET_* macros.
+ * @state: The storage attributes to be returned to the guest.
+ * @dirty: Returns whether the function dirtied a previously clean entry.
+ *
+ * Context: Called with kvm->mmu_lock held.
+ *
+ * Return:
+ * * %1 if the page state has been altered and the page is to be added to the CBRL
+ * * %0 if the page state has been altered, but the page is not to be added to the CBRL
+ * * %-1 if the page state has not been altered and the page is not to be added to the CBRL
+ */
+int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty)
+{
+	union crste *crstep;
+	union pgste pgste;
+	union pte *ptep;
+	int res = 0;
+
+	if (dat_entry_walk(NULL, gfn, asce, 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) {
+		*state = (union essa_state) { .exception = 1 };
+		return -1;
+	}
+
+	pgste = pgste_get_lock(ptep);
+
+	*state = (union essa_state) {
+		.content = (ptep->h.i << 1) + (ptep->h.i && pgste.zero),
+		.nodat = pgste.nodat,
+		.usage = pgste.usage,
+		};
+
+	switch (orc) {
+	case ESSA_GET_STATE:
+		res = -1;
+		break;
+	case ESSA_SET_STABLE:
+		pgste.usage = PGSTE_GPS_USAGE_STABLE;
+		pgste.nodat = 0;
+		break;
+	case ESSA_SET_UNUSED:
+		pgste.usage = PGSTE_GPS_USAGE_UNUSED;
+		if (ptep->h.i)
+			res = 1;
+		break;
+	case ESSA_SET_VOLATILE:
+		pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
+		if (ptep->h.i)
+			res = 1;
+		break;
+	case ESSA_SET_POT_VOLATILE:
+		if (!ptep->h.i) {
+			pgste.usage = PGSTE_GPS_USAGE_POT_VOLATILE;
+		} else if (pgste.zero) {
+			pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
+		} else if (!pgste.gc) {
+			pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
+			res = 1;
+		}
+		break;
+	case ESSA_SET_STABLE_RESIDENT:
+		pgste.usage = PGSTE_GPS_USAGE_STABLE;
+		/*
+		 * Since the resident state can go away any time after this
+		 * call, we will not make this page resident. We can revisit
+		 * this decision if a guest will ever start using this.
+		 */
+		break;
+	case ESSA_SET_STABLE_IF_RESIDENT:
+		if (!ptep->h.i)
+			pgste.usage = PGSTE_GPS_USAGE_STABLE;
+		break;
+	case ESSA_SET_STABLE_NODAT:
+		pgste.usage = PGSTE_GPS_USAGE_STABLE;
+		pgste.nodat = 1;
+		break;
+	default:
+		WARN_ONCE(1, "Invalid ORC!");
+		res = -1;
+		break;
+	}
+	/* If we are discarding a page, set it to logical zero. */
+	pgste.zero = res == 1;
+	if (orc > 0) {
+		*dirty = !pgste.cmma_d;
+		pgste.cmma_d = 1;
+	}
+
+	pgste_set_unlock(ptep, pgste);
+
+	return res;
+}
+
+static long dat_reset_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	union pgste pgste;
+
+	pgste = pgste_get_lock(ptep);
+	pgste.usage = 0;
+	pgste.nodat = 0;
+	pgste.cmma_d = 0;
+	pgste_set_unlock(ptep, pgste);
+	if (need_resched())
+		return next;
+	return 0;
+}
+
+long dat_reset_cmma(union asce asce, gfn_t start)
+{
+	const struct dat_walk_ops dat_reset_cmma_ops = {
+		.pte_entry = dat_reset_cmma_pte,
+	};
+
+	return _dat_walk_gfn_range(start, asce_end(asce), asce, &dat_reset_cmma_ops,
+				   DAT_WALK_IGN_HOLES, NULL);
+}
+
+struct dat_get_cmma_state {
+	gfn_t start;
+	gfn_t end;
+	unsigned int count;
+	u8 *values;
+	atomic64_t *remaining;
+};
+
+static long __dat_peek_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct dat_get_cmma_state *state = walk->priv;
+	union pgste pgste;
+
+	pgste = pgste_get_lock(ptep);
+	state->values[gfn - walk->start] = pgste.usage | (pgste.nodat << 6);
+	pgste_set_unlock(ptep, pgste);
+	state->end = next;
+
+	return 0;
+}
+
+static long __dat_peek_cmma_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct dat_get_cmma_state *state = walk->priv;
+
+	if (crstep->h.i)
+		state->end = min(walk->end, next);
+	return 0;
+}
+
+int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values)
+{
+	const struct dat_walk_ops ops = {
+		.pte_entry = __dat_peek_cmma_pte,
+		.pmd_entry = __dat_peek_cmma_crste,
+		.pud_entry = __dat_peek_cmma_crste,
+		.p4d_entry = __dat_peek_cmma_crste,
+		.pgd_entry = __dat_peek_cmma_crste,
+	};
+	struct dat_get_cmma_state state = { .values = values, .end = start, };
+	int rc;
+
+	rc = _dat_walk_gfn_range(start, start + *count, asce, &ops, DAT_WALK_DEFAULT, &state);
+	*count = state.end - start;	/* stays 0 if the walk recorded nothing */
+	/* Return success if at least one value was saved, otherwise an error. */
+	return (rc == -EFAULT && *count > 0) ? 0 : rc;
+}
+
+static long __dat_get_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct dat_get_cmma_state *state = walk->priv;
+	union pgste pgste;
+
+	if (state->start != -1) {
+		if ((gfn - state->end) > KVM_S390_MAX_BIT_DISTANCE)
+			return 1;
+		if (gfn - state->start >= state->count)
+			return 1;
+	}
+
+	if (!READ_ONCE(*pgste_of(ptep)).cmma_d)
+		return 0;
+
+	pgste = pgste_get_lock(ptep);
+	if (pgste.cmma_d) {
+		if (state->start == -1)
+			state->start = gfn;
+		pgste.cmma_d = 0;
+		atomic64_dec(state->remaining);
+		state->values[gfn - state->start] = pgste.usage | pgste.nodat << 6;
+		state->end = next;
+	}
+	pgste_set_unlock(ptep, pgste);
+	return 0;
+}
+
+int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem)
+{
+	const struct dat_walk_ops ops = { .pte_entry = __dat_get_cmma_pte, };
+	struct dat_get_cmma_state state = {
+		.remaining = rem,
+		.values = values,
+		.count = *count,
+		.start = -1,
+	};
+
+	_dat_walk_gfn_range(*start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, &state);
+
+	if (state.start == -1) {
+		*count = 0;
+	} else {
+		*count = state.end - state.start;
+		*start = state.start;
+	}
+
+	return 0;
+}
+
+struct dat_set_cmma_state {
+	unsigned long mask;
+	const u8 *bits;
+};
+
+static long __dat_set_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct dat_set_cmma_state *state = walk->priv;
+	union pgste pgste, tmp;
+
+	tmp.val = (state->bits[gfn - walk->start] << 24) & state->mask;
+
+	pgste = pgste_get_lock(ptep);
+	pgste.usage = tmp.usage;
+	pgste.nodat = tmp.nodat;
+	pgste_set_unlock(ptep, pgste);
+
+	return 0;
+}
+
+/**
+ * dat_set_cmma_bits() - Set CMMA bits for a range of guest pages.
+ * @mc: Cache used for allocations.
+ * @asce: The ASCE of the guest.
+ * @gfn: The guest frame of the first page whose CMMA bits are to be set.
+ * @count: How many pages need to be processed.
+ * @mask: Which PGSTE bits should be set.
+ * @bits: Points to an array with the CMMA attributes.
+ *
+ * This function sets the CMMA attributes for the given pages. If the input
+ * buffer has zero length, no attributes are changed; otherwise page tables
+ * covering the range are allocated as needed and the attributes are set.
+ *
+ * Each byte in @bits contains new values for bits 32-39 of the PGSTE.
+ * Currently, only the fields NT and US are applied.
+ *
+ * Return: %0 in case of success, a negative error value otherwise.
+ */
+int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
+		      unsigned long count, unsigned long mask, const uint8_t *bits)
+{
+	const struct dat_walk_ops ops = { .pte_entry = __dat_set_cmma_pte, };
+	struct dat_set_cmma_state state = { .mask = mask, .bits = bits, };
+	union crste *crstep;
+	union pte *ptep;
+	gfn_t cur;
+	int rc;
+
+	for (cur = ALIGN_DOWN(gfn, _PAGE_ENTRIES); cur < gfn + count; cur += _PAGE_ENTRIES) {
+		rc = dat_entry_walk(mc, cur, asce, DAT_WALK_ALLOC, TABLE_TYPE_PAGE_TABLE,
+				    &crstep, &ptep);
+		if (rc)
+			return rc;
+	}
+	return _dat_walk_gfn_range(gfn, gfn + count, asce, &ops, DAT_WALK_IGN_HOLES, &state);
+}
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
new file mode 100644
index 000000000000..8c7ae07dcc28
--- /dev/null
+++ b/arch/s390/kvm/dat.h
@@ -0,0 +1,970 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  KVM guest address space mapping code
+ *
+ *    Copyright IBM Corp. 2024, 2025
+ *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
+ */
+
+#ifndef __KVM_S390_DAT_H
+#define __KVM_S390_DAT_H
+
+#include <linux/radix-tree.h>
+#include <linux/refcount.h>
+#include <linux/io.h>
+#include <linux/kvm_types.h>
+#include <linux/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/dat-bits.h>
+
+/*
+ * Base address and length must be sent at the start of each block, therefore
+ * it's cheaper to send some clean data, as long as it's less than the size of
+ * two longs.
+ */
+#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *))
+/* For consistency */
+#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
+
+#define _ASCE(x) ((union asce) { .val = (x), })
+#define NULL_ASCE _ASCE(0)
+
+enum {
+	_DAT_TOKEN_NONE = 0,
+	_DAT_TOKEN_PIC,
+};
+
+#define _CRSTE_TOK(l, t, p) ((union crste) {	\
+		.tok.i = 1,			\
+		.tok.tt = (l),			\
+		.tok.type = (t),		\
+		.tok.par = (p)			\
+	})
+#define _CRSTE_PIC(l, p) _CRSTE_TOK(l, _DAT_TOKEN_PIC, p)
+
+#define _CRSTE_HOLE(l) _CRSTE_PIC(l, PGM_ADDRESSING)
+#define _CRSTE_EMPTY(l) _CRSTE_TOK(l, _DAT_TOKEN_NONE, 0)
+
+#define _PMD_EMPTY _CRSTE_EMPTY(TABLE_TYPE_SEGMENT)
+
+#define _PTE_TOK(t, p) ((union pte) { .tok.i = 1, .tok.type = (t), .tok.par = (p) })
+#define _PTE_EMPTY _PTE_TOK(_DAT_TOKEN_NONE, 0)
+
+/* This fake table type is used for page table walks (both for normal page tables and vSIE) */
+#define TABLE_TYPE_PAGE_TABLE -1
+
+enum dat_walk_flags {
+	DAT_WALK_USES_SKEYS	= 0x40,
+	DAT_WALK_CONTINUE	= 0x20,
+	DAT_WALK_IGN_HOLES	= 0x10,
+	DAT_WALK_SPLIT		= 0x08,
+	DAT_WALK_ALLOC		= 0x04,
+	DAT_WALK_ANY		= 0x02,
+	DAT_WALK_LEAF		= 0x01,
+	DAT_WALK_DEFAULT	= 0
+};
+
+#define DAT_WALK_SPLIT_ALLOC (DAT_WALK_SPLIT | DAT_WALK_ALLOC)
+#define DAT_WALK_ALLOC_CONTINUE (DAT_WALK_CONTINUE | DAT_WALK_ALLOC)
+#define DAT_WALK_LEAF_ALLOC (DAT_WALK_LEAF | DAT_WALK_ALLOC)
+
+union pte {
+	unsigned long val;
+	union page_table_entry h;
+	struct {
+		unsigned long   :56; /* Hardware bits */
+		unsigned long u : 1; /* Page unused */
+		unsigned long s : 1; /* Special */
+		unsigned long w : 1; /* Writable */
+		unsigned long r : 1; /* Readable */
+		unsigned long d : 1; /* Dirty */
+		unsigned long y : 1; /* Young */
+		unsigned long sd: 1; /* Soft dirty */
+		unsigned long pr: 1; /* Present */
+	} s;
+	struct {
+		unsigned char hwbytes[7];
+		unsigned char swbyte;
+	};
+	union {
+		struct {
+			unsigned long type :16; /* Token type */
+			unsigned long par  :16; /* Token parameter */
+			unsigned long      :20;
+			unsigned long      : 1; /* Must be 0 */
+			unsigned long i    : 1; /* Must be 1 */
+			unsigned long      : 2;
+			unsigned long      : 7;
+			unsigned long pr   : 1; /* Must be 0 */
+		};
+		struct {
+			unsigned long token:32; /* Token and parameter */
+			unsigned long      :32;
+		};
+	} tok;
+};
+
+/* Soft dirty, needed as macro for atomic operations on ptes */
+#define _PAGE_SD 0x002
+
+/* Needed as macro to perform atomic operations */
+#define PGSTE_PCL_BIT		0x0080000000000000UL	/* PCL lock, HW bit */
+#define PGSTE_CMMA_D_BIT	0x0000000000008000UL	/* CMMA dirty soft-bit */
+
+enum pgste_gps_usage {
+	PGSTE_GPS_USAGE_STABLE = 0,
+	PGSTE_GPS_USAGE_UNUSED,
+	PGSTE_GPS_USAGE_POT_VOLATILE,
+	PGSTE_GPS_USAGE_VOLATILE,
+};
+
+union pgste {
+	unsigned long val;
+	struct {
+		unsigned long acc          : 4;
+		unsigned long fp           : 1;
+		unsigned long              : 3;
+		unsigned long pcl          : 1;
+		unsigned long hr           : 1;
+		unsigned long hc           : 1;
+		unsigned long              : 2;
+		unsigned long gr           : 1;
+		unsigned long gc           : 1;
+		unsigned long              : 1;
+		unsigned long              :16; /* val16 */
+		unsigned long zero         : 1;
+		unsigned long nodat        : 1;
+		unsigned long              : 4;
+		unsigned long usage        : 2;
+		unsigned long              : 8;
+		unsigned long cmma_d       : 1; /* Dirty flag for CMMA bits */
+		unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
+		unsigned long vsie_notif   : 1; /* Referenced in a shadow table */
+		unsigned long              : 5;
+		unsigned long              : 8;
+	};
+	struct {
+		unsigned short hwbytes0;
+		unsigned short val16;	/* Used to store chunked values, see dat_{s,g}et_ptval() */
+		unsigned short hwbytes4;
+		unsigned char flags;	/* Maps to the software bits */
+		unsigned char hwbyte7;
+	} __packed;
+};
+
+union pmd {
+	unsigned long val;
+	union segment_table_entry h;
+	struct {
+		struct {
+			unsigned long              :44; /* HW */
+			unsigned long              : 3; /* Unused */
+			unsigned long              : 1; /* HW */
+			unsigned long w            : 1; /* Writable soft-bit */
+			unsigned long r            : 1; /* Readable soft-bit */
+			unsigned long d            : 1; /* Dirty */
+			unsigned long y            : 1; /* Young */
+			unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
+			unsigned long              : 3; /* HW */
+			unsigned long vsie_notif   : 1; /* Referenced in a shadow table */
+			unsigned long              : 1; /* Unused */
+			unsigned long              : 4; /* HW */
+			unsigned long sd           : 1; /* Soft-Dirty */
+			unsigned long pr           : 1; /* Present */
+		} fc1;
+	} s;
+};
+
+union pud {
+	unsigned long val;
+	union region3_table_entry h;
+	struct {
+		struct {
+			unsigned long              :33; /* HW */
+			unsigned long              :14; /* Unused */
+			unsigned long              : 1; /* HW */
+			unsigned long w            : 1; /* Writable soft-bit */
+			unsigned long r            : 1; /* Readable soft-bit */
+			unsigned long d            : 1; /* Dirty */
+			unsigned long y            : 1; /* Young */
+			unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
+			unsigned long              : 3; /* HW */
+			unsigned long vsie_notif   : 1; /* Referenced in a shadow table */
+			unsigned long              : 1; /* Unused */
+			unsigned long              : 4; /* HW */
+			unsigned long sd           : 1; /* Soft-Dirty */
+			unsigned long pr           : 1; /* Present */
+		} fc1;
+	} s;
+};
+
+union p4d {
+	unsigned long val;
+	union region2_table_entry h;
+};
+
+union pgd {
+	unsigned long val;
+	union region1_table_entry h;
+};
+
+union crste {
+	unsigned long val;
+	union {
+		struct {
+			unsigned long   :52;
+			unsigned long   : 1;
+			unsigned long fc: 1;
+			unsigned long p : 1;
+			unsigned long   : 1;
+			unsigned long   : 2;
+			unsigned long i : 1;
+			unsigned long   : 1;
+			unsigned long tt: 2;
+			unsigned long   : 2;
+		};
+		struct {
+			unsigned long to:52;
+			unsigned long   : 1;
+			unsigned long fc: 1;
+			unsigned long p : 1;
+			unsigned long   : 1;
+			unsigned long tf: 2;
+			unsigned long i : 1;
+			unsigned long   : 1;
+			unsigned long tt: 2;
+			unsigned long tl: 2;
+		} fc0;
+		struct {
+			unsigned long    :47;
+			unsigned long av : 1; /* ACCF-Validity Control */
+			unsigned long acc: 4; /* Access-Control Bits */
+			unsigned long f  : 1; /* Fetch-Protection Bit */
+			unsigned long fc : 1; /* Format-Control */
+			unsigned long p  : 1; /* DAT-Protection Bit */
+			unsigned long iep: 1; /* Instruction-Execution-Protection */
+			unsigned long    : 2;
+			unsigned long i  : 1; /* Segment-Invalid Bit */
+			unsigned long cs : 1; /* Common-Segment Bit */
+			unsigned long tt : 2; /* Table-Type Bits */
+			unsigned long    : 2;
+		} fc1;
+	} h;
+	struct {
+		struct {
+			unsigned long              :47;
+			unsigned long              : 1; /* HW (should be 0) */
+			unsigned long w            : 1; /* Writable */
+			unsigned long r            : 1; /* Readable */
+			unsigned long d            : 1; /* Dirty */
+			unsigned long y            : 1; /* Young */
+			unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
+			unsigned long              : 3; /* HW */
+			unsigned long vsie_notif   : 1; /* Referenced in a shadow table */
+			unsigned long              : 1;
+			unsigned long              : 4; /* HW */
+			unsigned long sd           : 1; /* Soft-Dirty */
+			unsigned long pr           : 1; /* Present */
+		} fc1;
+	} s;
+	union {
+		struct {
+			unsigned long type :16; /* Token type */
+			unsigned long par  :16; /* Token parameter */
+			unsigned long      :26;
+			unsigned long i    : 1; /* Must be 1 */
+			unsigned long      : 1;
+			unsigned long tt   : 2;
+			unsigned long      : 1;
+			unsigned long pr   : 1; /* Must be 0 */
+		};
+		struct {
+			unsigned long token:32; /* Token and parameter */
+			unsigned long      :32;
+		};
+	} tok;
+	union pmd pmd;
+	union pud pud;
+	union p4d p4d;
+	union pgd pgd;
+};
+
+union skey {
+	unsigned char skey;
+	struct {
+		unsigned char acc :4;
+		unsigned char fp  :1;
+		unsigned char r   :1;
+		unsigned char c   :1;
+		unsigned char zero:1;
+	};
+};
+
+static_assert(sizeof(union pgste) == sizeof(unsigned long));
+static_assert(sizeof(union pte) == sizeof(unsigned long));
+static_assert(sizeof(union pmd) == sizeof(unsigned long));
+static_assert(sizeof(union pud) == sizeof(unsigned long));
+static_assert(sizeof(union p4d) == sizeof(unsigned long));
+static_assert(sizeof(union pgd) == sizeof(unsigned long));
+static_assert(sizeof(union crste) == sizeof(unsigned long));
+static_assert(sizeof(union skey) == sizeof(char));
+
+struct segment_table {
+	union pmd pmds[_CRST_ENTRIES];
+};
+
+struct region3_table {
+	union pud puds[_CRST_ENTRIES];
+};
+
+struct region2_table {
+	union p4d p4ds[_CRST_ENTRIES];
+};
+
+struct region1_table {
+	union pgd pgds[_CRST_ENTRIES];
+};
+
+struct crst_table {
+	union {
+		union crste crstes[_CRST_ENTRIES];
+		struct segment_table segment;
+		struct region3_table region3;
+		struct region2_table region2;
+		struct region1_table region1;
+	};
+};
+
+struct page_table {
+	union pte ptes[_PAGE_ENTRIES];
+	union pgste pgstes[_PAGE_ENTRIES];
+};
+
+static_assert(sizeof(struct crst_table) == _CRST_TABLE_SIZE);
+static_assert(sizeof(struct page_table) == PAGE_SIZE);
+
+struct dat_walk;
+
+typedef long (*dat_walk_op)(union crste *crste, gfn_t gfn, gfn_t next, struct dat_walk *w);
+
+struct dat_walk_ops {
+	union {
+		dat_walk_op crste_ops[4];
+		struct {
+			dat_walk_op pmd_entry;
+			dat_walk_op pud_entry;
+			dat_walk_op p4d_entry;
+			dat_walk_op pgd_entry;
+		};
+	};
+	long (*pte_entry)(union pte *pte, gfn_t gfn, gfn_t next, struct dat_walk *w);
+};
+
+struct dat_walk {
+	const struct dat_walk_ops *ops;
+	union crste *last;
+	union pte *last_pte;
+	union asce asce;
+	gfn_t start;
+	gfn_t end;
+	int flags;
+	void *priv;
+};
+
+struct ptval_param {
+	unsigned char offset : 6;
+	unsigned char len : 2;
+};
+
+/**
+ * _pte() - Useful constructor for union pte
+ * @pfn: the pfn this pte should point to.
+ * @writable: whether the pte should be writable.
+ * @dirty: whether the pte should be dirty.
+ * @special: whether the pte should be marked as special
+ *
+ * The pte is also marked as young and present. If the pte is marked as dirty,
+ * it gets marked as soft-dirty too. If the pte is not dirty, the hardware
+ * protect bit is set (independently of the write softbit); this way proper
+ * dirty tracking can be performed.
+ *
+ * Return: a union pte value.
+ */
+static inline union pte _pte(kvm_pfn_t pfn, bool writable, bool dirty, bool special)
+{
+	union pte res = { .val = PFN_PHYS(pfn) };
+
+	res.h.p = !dirty;
+	res.s.y = 1;
+	res.s.pr = 1;
+	res.s.w = writable;
+	res.s.d = dirty;
+	res.s.sd = dirty;
+	res.s.s = special;
+	return res;
+}
+
+static inline union crste _crste_fc0(kvm_pfn_t pfn, int tt)
+{
+	union crste res = { .val = PFN_PHYS(pfn) };
+
+	res.h.tt = tt;
+	res.h.fc0.tl = _REGION_ENTRY_LENGTH;
+	res.h.fc0.tf = 0;
+	return res;
+}
+
+/**
+ * _crste_fc1() - Useful constructor for union crste with FC=1
+ * @pfn: the pfn this crste should point to.
+ * @tt: the table type
+ * @writable: whether the crste should be writable.
+ * @dirty: whether the crste should be dirty.
+ *
+ * The crste is also marked as young and present. If the crste is marked as
+ * dirty, it gets marked as soft-dirty too. If the crste is not dirty, the
+ * hardware protect bit is set (independently of the write softbit); this way
+ * proper dirty tracking can be performed.
+ *
+ * Return: a union crste value.
+ */
+static inline union crste _crste_fc1(kvm_pfn_t pfn, int tt, bool writable, bool dirty)
+{
+	union crste res = { .val = PFN_PHYS(pfn) & _SEGMENT_MASK };
+
+	res.h.tt = tt;
+	res.h.p = !dirty;
+	res.h.fc = 1;
+	res.s.fc1.y = 1;
+	res.s.fc1.pr = 1;
+	res.s.fc1.w = writable;
+	res.s.fc1.d = dirty;
+	res.s.fc1.sd = dirty;
+	return res;
+}
+
+union essa_state {
+	unsigned char val;
+	struct {
+		unsigned char		: 2;
+		unsigned char nodat	: 1;
+		unsigned char exception	: 1;
+		unsigned char usage	: 2;
+		unsigned char content	: 2;
+	};
+};
+
+/**
+ * struct vsie_rmap - reverse mapping for shadow page table entries
+ * @next: pointer to next rmap in the list
+ * @r_gfn: virtual rmap address in the shadow guest address space
+ */
+struct vsie_rmap {
+	struct vsie_rmap *next;
+	union {
+		unsigned long val;
+		struct {
+			long          level: 8;
+			unsigned long      : 4;
+			unsigned long r_gfn:52;
+		};
+	};
+};
+
+static_assert(sizeof(struct vsie_rmap) == 2 * sizeof(long));
+
+#define KVM_S390_MMU_CACHE_N_CRSTS	6
+#define KVM_S390_MMU_CACHE_N_PTS	2
+#define KVM_S390_MMU_CACHE_N_RMAPS	16
+struct kvm_s390_mmu_cache {
+	void *crsts[KVM_S390_MMU_CACHE_N_CRSTS];
+	void *pts[KVM_S390_MMU_CACHE_N_PTS];
+	void *rmaps[KVM_S390_MMU_CACHE_N_RMAPS];
+	short int n_crsts;
+	short int n_pts;
+	short int n_rmaps;
+};
+
+struct guest_fault {
+	gfn_t gfn;		/* Guest frame */
+	kvm_pfn_t pfn;		/* Host PFN */
+	struct page *page;	/* Host page */
+	union pte *ptep;	/* Used to resolve the fault, or NULL */
+	union crste *crstep;	/* Used to resolve the fault, or NULL */
+	bool writable;		/* Mapping is writable */
+	bool write_attempt;	/* Write access attempted */
+	bool attempt_pfault;	/* Attempt a pfault first */
+	bool valid;		/* This entry contains valid data */
+	void (*callback)(struct guest_fault *f);
+	void *priv;
+};
+
+/*
+ *	0	1	2	3	4	5	6	7
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ *  0	|				|	    PGT_ADDR		|
+ *  8	|	 VMADDR		|					|
+ * 16	|								|
+ * 24	|								|
+ */
+#define MKPTVAL(o, l) ((struct ptval_param) { .offset = (o), .len = ((l) + 1) / 2 - 1})
+#define PTVAL_PGT_ADDR	MKPTVAL(4, 8)
+#define PTVAL_VMADDR	MKPTVAL(8, 6)
+
+union pgste __must_check __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new,
+					 gfn_t gfn, union asce asce, bool uses_skeys);
+bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn,
+			    union asce asce);
+void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce);
+
+long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce,
+			 const struct dat_walk_ops *ops, int flags, void *priv);
+
+int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags,
+		   int walk_level, union crste **last, union pte **ptepp);
+void dat_free_level(struct crst_table *table, bool owns_ptes);
+struct crst_table *dat_alloc_crst_sleepable(unsigned long init);
+int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype);
+int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey);
+int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
+			union skey skey, bool nq);
+int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn,
+			     union skey skey, union skey *oldkey, bool nq, bool mr, bool mc);
+int dat_reset_reference_bit(union asce asce, gfn_t gfn);
+long dat_reset_skeys(union asce asce, gfn_t start);
+
+unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param);
+void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val);
+
+int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end,
+		 u16 type, u16 param);
+int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn);
+bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end);
+int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
+	     bool uses_skeys, struct guest_fault *f);
+
+int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty);
+long dat_reset_cmma(union asce asce, gfn_t start_gfn);
+int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values);
+int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem);
+int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
+		      unsigned long count, unsigned long mask, const uint8_t *bits);
+
+int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc);
+
+#define GFP_KVM_S390_MMU_CACHE (GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN)
+
+static inline struct page_table *kvm_s390_mmu_cache_alloc_pt(struct kvm_s390_mmu_cache *mc)
+{
+	if (mc->n_pts)
+		return mc->pts[--mc->n_pts];
+	return (void *)__get_free_page(GFP_KVM_S390_MMU_CACHE);
+}
+
+static inline struct crst_table *kvm_s390_mmu_cache_alloc_crst(struct kvm_s390_mmu_cache *mc)
+{
+	if (mc->n_crsts)
+		return mc->crsts[--mc->n_crsts];
+	return (void *)__get_free_pages(GFP_KVM_S390_MMU_CACHE | __GFP_COMP, CRST_ALLOC_ORDER);
+}
+
+static inline struct vsie_rmap *kvm_s390_mmu_cache_alloc_rmap(struct kvm_s390_mmu_cache *mc)
+{
+	if (mc->n_rmaps)
+		return mc->rmaps[--mc->n_rmaps];
+	return kzalloc(sizeof(struct vsie_rmap), GFP_KVM_S390_MMU_CACHE);
+}
+
+static inline struct crst_table *crste_table_start(union crste *crstep)
+{
+	return (struct crst_table *)ALIGN_DOWN((unsigned long)crstep, _CRST_TABLE_SIZE);
+}
+
+static inline struct page_table *pte_table_start(union pte *ptep)
+{
+	return (struct page_table *)ALIGN_DOWN((unsigned long)ptep, _PAGE_TABLE_SIZE);
+}
+
+static inline bool crdte_crste(union crste *crstep, union crste old, union crste new, gfn_t gfn,
+			       union asce asce)
+{
+	unsigned long dtt = 0x10 | new.h.tt << 2;
+	void *table = crste_table_start(crstep);
+
+	return crdte(old.val, new.val, table, dtt, gfn_to_gpa(gfn), asce.val);
+}
+
+/**
+ * idte_crste() - invalidate a crste entry using idte
+ * @crstep: pointer to the crste to be invalidated
+ * @gfn: a gfn mapped by the crste
+ * @opt: options for the idte instruction
+ * @asce: the asce
+ * @local: whether the operation is cpu-local
+ */
+static __always_inline void idte_crste(union crste *crstep, gfn_t gfn, unsigned long opt,
+				       union asce asce, int local)
+{
+	unsigned long table_origin = __pa(crste_table_start(crstep));
+	unsigned long gaddr = gfn_to_gpa(gfn) & HPAGE_MASK;
+
+	if (__builtin_constant_p(opt) && opt == 0) {
+		/* flush without guest asce */
+		asm volatile("idte	%[table_origin],0,%[gaddr],%[local]"
+			: "+m" (*crstep)
+			: [table_origin] "a" (table_origin), [gaddr] "a" (gaddr),
+			  [local] "i" (local)
+			: "cc");
+	} else {
+		/* flush with guest asce */
+		asm volatile("idte %[table_origin],%[asce],%[gaddr_opt],%[local]"
+			: "+m" (*crstep)
+			: [table_origin] "a" (table_origin), [gaddr_opt] "a" (gaddr | opt),
+			  [asce] "a" (asce.val), [local] "i" (local)
+			: "cc");
+	}
+}
+
+static inline void dat_init_pgstes(struct page_table *pt, unsigned long val)
+{
+	memset64((void *)pt->pgstes, val, PTRS_PER_PTE);
+}
+
+static inline void dat_init_page_table(struct page_table *pt, unsigned long ptes,
+				       unsigned long pgstes)
+{
+	memset64((void *)pt->ptes, ptes, PTRS_PER_PTE);
+	dat_init_pgstes(pt, pgstes);
+}
+
+static inline gfn_t asce_end(union asce asce)
+{
+	return 1ULL << ((asce.dt + 1) * 11 + _SEGMENT_SHIFT - PAGE_SHIFT);
+}
+
+#define _CRSTE(x) ((union crste) { .val = _Generic((x),	\
+			union pgd : (x).val,		\
+			union p4d : (x).val,		\
+			union pud : (x).val,		\
+			union pmd : (x).val,		\
+			union crste : (x).val)})
+
+#define _CRSTEP(x) ((union crste *)_Generic((*(x)),	\
+				union pgd : (x),	\
+				union p4d : (x),	\
+				union pud : (x),	\
+				union pmd : (x),	\
+				union crste : (x)))
+
+#define _CRSTP(x) ((struct crst_table *)_Generic((*(x)),	\
+		struct crst_table : (x),			\
+		struct segment_table : (x),			\
+		struct region3_table : (x),			\
+		struct region2_table : (x),			\
+		struct region1_table : (x)))
+
+static inline bool asce_contains_gfn(union asce asce, gfn_t gfn)
+{
+	return gfn < asce_end(asce);
+}
+
+static inline bool is_pmd(union crste crste)
+{
+	return crste.h.tt == TABLE_TYPE_SEGMENT;
+}
+
+static inline bool is_pud(union crste crste)
+{
+	return crste.h.tt == TABLE_TYPE_REGION3;
+}
+
+static inline bool is_p4d(union crste crste)
+{
+	return crste.h.tt == TABLE_TYPE_REGION2;
+}
+
+static inline bool is_pgd(union crste crste)
+{
+	return crste.h.tt == TABLE_TYPE_REGION1;
+}
+
+static inline phys_addr_t pmd_origin_large(union pmd pmd)
+{
+	return pmd.val & _SEGMENT_ENTRY_ORIGIN_LARGE;
+}
+
+static inline phys_addr_t pud_origin_large(union pud pud)
+{
+	return pud.val & _REGION3_ENTRY_ORIGIN_LARGE;
+}
+
+/**
+ * crste_origin_large() - Return the large frame origin of a large crste
+ * @crste: The crste whose origin is to be returned. Should be either a
+ *         region-3 table entry or a segment table entry, in both cases with
+ *         FC set to 1 (large pages).
+ *
+ * Return: The origin of the large frame pointed to by @crste, or -1 if the
+ *         crste was not large (wrong table type, or FC==0)
+ */
+static inline phys_addr_t crste_origin_large(union crste crste)
+{
+	const bool large = crste.h.fc && crste.h.tt <= TABLE_TYPE_REGION3;
+
+	if (unlikely(!large))
+		return -1;
+	return is_pmd(crste) ? pmd_origin_large(crste.pmd) : pud_origin_large(crste.pud);
+}
+
+#define crste_origin(x) (_Generic((x),				\
+		union pmd : (x).val & _SEGMENT_ENTRY_ORIGIN,	\
+		union pud : (x).val & _REGION_ENTRY_ORIGIN,	\
+		union p4d : (x).val & _REGION_ENTRY_ORIGIN,	\
+		union pgd : (x).val & _REGION_ENTRY_ORIGIN))
+
+static inline unsigned long pte_origin(union pte pte)
+{
+	return pte.val & PAGE_MASK;
+}
+
+static inline bool pmd_prefix(union pmd pmd)
+{
+	return pmd.h.fc && pmd.s.fc1.prefix_notif;
+}
+
+static inline bool pud_prefix(union pud pud)
+{
+	return pud.h.fc && pud.s.fc1.prefix_notif;
+}
+
+static inline bool crste_leaf(union crste crste)
+{
+	return (crste.h.tt <= TABLE_TYPE_REGION3) && crste.h.fc;
+}
+
+static inline bool crste_prefix(union crste crste)
+{
+	return crste_leaf(crste) && crste.s.fc1.prefix_notif;
+}
+
+static inline bool crste_dirty(union crste crste)
+{
+	return crste_leaf(crste) && crste.s.fc1.d;
+}
+
+static inline union pgste *pgste_of(union pte *pte)
+{
+	return (union pgste *)(pte + _PAGE_ENTRIES);
+}
+
+static inline bool pte_hole(union pte pte)
+{
+	return pte.h.i && !pte.tok.pr && pte.tok.type != _DAT_TOKEN_NONE;
+}
+
+static inline bool _crste_hole(union crste crste)
+{
+	return crste.h.i && !crste.tok.pr && crste.tok.type != _DAT_TOKEN_NONE;
+}
+
+#define crste_hole(x) _crste_hole(_CRSTE(x))
+
+static inline bool _crste_none(union crste crste)
+{
+	return crste.h.i && !crste.tok.pr && crste.tok.type == _DAT_TOKEN_NONE;
+}
+
+#define crste_none(x) _crste_none(_CRSTE(x))
+
+static inline phys_addr_t large_pud_to_phys(union pud pud, gfn_t gfn)
+{
+	return pud_origin_large(pud) | (gfn_to_gpa(gfn) & ~_REGION3_MASK);
+}
+
+static inline phys_addr_t large_pmd_to_phys(union pmd pmd, gfn_t gfn)
+{
+	return pmd_origin_large(pmd) | (gfn_to_gpa(gfn) & ~_SEGMENT_MASK);
+}
+
+static inline phys_addr_t large_crste_to_phys(union crste crste, gfn_t gfn)
+{
+	if (unlikely(!crste_leaf(crste)))	/* FC clear, or table type above region-3 */
+		return -1;
+	if (!is_pmd(crste))
+		return large_pud_to_phys(crste.pud, gfn);
+	return large_pmd_to_phys(crste.pmd, gfn);
+}
+
+static inline bool cspg_crste(union crste *crstep, union crste old, union crste new)
+{
+	return cspg(&crstep->val, old.val, new.val);
+}
+
+static inline struct page_table *dereference_pmd(union pmd pmd)
+{
+	return phys_to_virt(crste_origin(pmd));
+}
+
+static inline struct segment_table *dereference_pud(union pud pud)
+{
+	return phys_to_virt(crste_origin(pud));
+}
+
+static inline struct region3_table *dereference_p4d(union p4d p4d)
+{
+	return phys_to_virt(crste_origin(p4d));
+}
+
+static inline struct region2_table *dereference_pgd(union pgd pgd)
+{
+	return phys_to_virt(crste_origin(pgd));
+}
+
+static inline struct crst_table *_dereference_crste(union crste crste)
+{
+	if (likely(!is_pmd(crste)))
+		return phys_to_virt(crste_origin(crste.pud));
+	return NULL;	/* segment entries are rejected: no crst table to return */
+}
+
+#define dereference_crste(x) (_Generic((x),			\
+		union pud : _dereference_crste(_CRSTE(x)),	\
+		union p4d : _dereference_crste(_CRSTE(x)),	\
+		union pgd : _dereference_crste(_CRSTE(x)),	\
+		union crste : _dereference_crste(_CRSTE(x))))
+
+static inline struct crst_table *dereference_asce(union asce asce)
+{
+	return phys_to_virt(asce.val & _ASCE_ORIGIN);
+}
+
+static inline void asce_flush_tlb(union asce asce)
+{
+	__tlb_flush_idte(asce.val);
+}
+
+static inline bool pgste_get_trylock(union pte *ptep, union pgste *res)
+{
+	union pgste *pgstep = pgste_of(ptep);
+	union pgste old_pgste;
+
+	if (READ_ONCE(pgstep->val) & PGSTE_PCL_BIT)	/* plain read first: already locked */
+		return false;
+	old_pgste.val = __atomic64_or_barrier(PGSTE_PCL_BIT, &pgstep->val); /* set the PCL bit */
+	if (old_pgste.pcl)	/* PCL was already set in the fetched value: not our lock */
+		return false;
+	old_pgste.pcl = 1;	/* the copy handed back reflects the lock now being held */
+	*res = old_pgste;	/* @res is only written on success */
+	return true;
+}
+
+static inline union pgste pgste_get_lock(union pte *ptep)
+{
+	union pgste res;
+
+	while (!pgste_get_trylock(ptep, &res))
+		cpu_relax();	/* spin: retried until the trylock succeeds */
+	return res;	/* as filled in by the successful pgste_get_trylock() */
+}
+
+static inline void pgste_set_unlock(union pte *ptep, union pgste pgste)
+{
+	pgste.pcl = 0;	/* the value stored below must not carry the lock bit */
+	barrier();	/* compiler barrier ahead of the store that drops the lock */
+	WRITE_ONCE(*pgste_of(ptep), pgste);	/* one store: new pgste contents and unlock */
+}
+
+static inline void dat_ptep_xchg(union pte *ptep, union pte new, gfn_t gfn, union asce asce,
+				 bool has_skeys)
+{
+	union pgste locked = pgste_get_lock(ptep);
+
+	/* The exchange runs with the pgste lock held; its result is what unlock stores. */
+	locked = __dat_ptep_xchg(ptep, locked, new, gfn, asce, has_skeys);
+	pgste_set_unlock(ptep, locked);
+}
+
+static inline void dat_ptep_clear(union pte *ptep, gfn_t gfn, union asce asce, bool has_skeys)
+{
+	dat_ptep_xchg(ptep, _PTE_EMPTY, gfn, asce, has_skeys);
+}
+
+static inline void dat_free_pt(struct page_table *pt)
+{
+	free_page((unsigned long)pt);
+}
+
+static inline void _dat_free_crst(struct crst_table *table)
+{
+	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+}
+
+#define dat_free_crst(x) _dat_free_crst(_CRSTP(x))
+
+static inline void kvm_s390_free_mmu_cache(struct kvm_s390_mmu_cache *mc)
+{
+	if (!mc)
+		return;
+	for (; mc->n_pts; mc->n_pts--)
+		dat_free_pt(mc->pts[mc->n_pts - 1]);
+	for (; mc->n_crsts; mc->n_crsts--)
+		_dat_free_crst(mc->crsts[mc->n_crsts - 1]);
+	for (; mc->n_rmaps; mc->n_rmaps--)
+		kfree(mc->rmaps[mc->n_rmaps - 1]);
+	kfree(mc);
+}
+
+DEFINE_FREE(kvm_s390_mmu_cache, struct kvm_s390_mmu_cache *, if (_T) kvm_s390_free_mmu_cache(_T))
+
+static inline struct kvm_s390_mmu_cache *kvm_s390_new_mmu_cache(void)
+{
+	struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL;
+
+	mc = kzalloc(sizeof(*mc), GFP_KERNEL_ACCOUNT);
+	if (mc && !kvm_s390_mmu_cache_topup(mc))	/* a topup result of 0 counts as success */
+		return_ptr(mc);				/* caller takes ownership of the cache */
+	return NULL;	/* an allocated mc is released by the __free() cleanup on this path */
+}
+
+static inline bool dat_pmdp_xchg_atomic(union pmd *pmdp, union pmd old, union pmd new,
+					gfn_t gfn, union asce asce)
+{
+	return dat_crstep_xchg_atomic(_CRSTEP(pmdp), _CRSTE(old), _CRSTE(new), gfn, asce);
+}
+
+static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pud new,
+					gfn_t gfn, union asce asce)
+{
+	return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce);
+}
+
+static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce asce)
+{
+	union crste newcrste = _CRSTE_EMPTY(crstep->h.tt);
+
+	dat_crstep_xchg(crstep, newcrste, gfn, asce);
+}
+
+static inline int get_level(union crste *crstep, union pte *ptep)
+{
+	return ptep ? TABLE_TYPE_PAGE_TABLE : crstep->h.tt;
+}
+
+static inline int dat_delete_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start,
+				  unsigned long npages)
+{
+	return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_PIC, PGM_ADDRESSING);
+}
+
+static inline int dat_create_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start,
+				  unsigned long npages)
+{
+	return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_NONE, 0);
+}
+
+static inline bool crste_is_ucas(union crste crste)
+{
+	return is_pmd(crste) && !crste.h.fc && crste.h.i && crste.h.fc0.tl == 1;
+}
+
+#endif /* __KVM_S390_DAT_H */
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 53233dec8cad..d89d1c381522 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -10,13 +10,13 @@
 
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
-#include <asm/gmap.h>
 #include <asm/gmap_helpers.h>
 #include <asm/virtio-ccw.h>
 #include "kvm-s390.h"
 #include "trace.h"
 #include "trace-s390.h"
 #include "gaccess.h"
+#include "gmap.h"
 
 static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end)
 {
diff --git a/arch/s390/kvm/faultin.c b/arch/s390/kvm/faultin.c
new file mode 100644
index 000000000000..e37cd18200f5
--- /dev/null
+++ b/arch/s390/kvm/faultin.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  KVM guest fault handling.
+ *
+ *    Copyright IBM Corp. 2025
+ *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
+ */
+#include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
+
+#include "gmap.h"
+#include "trace.h"
+#include "faultin.h"
+
+bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu);
+
+/**
+ * kvm_s390_faultin_gfn() - handle a dat fault.
+ * @vcpu: The vCPU whose gmap is to be fixed up, or NULL if operating on the VM.
+ * @kvm: The VM whose gmap is to be fixed up, or NULL if operating on a vCPU.
+ * @f: The guest fault that needs to be resolved.
+ *
+ * Return:
+ * * 0 on success
+ * * < 0 in case of error
+ * * > 0 in case of guest exceptions
+ *
+ * Context:
+ * * The mm lock must not be held before calling
+ * * kvm->srcu must be held
+ * * may sleep
+ */
+int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f)
+{
+	struct kvm_s390_mmu_cache *local_mc __free(kvm_s390_mmu_cache) = NULL;
+	struct kvm_s390_mmu_cache *mc = NULL;
+	struct kvm_memory_slot *slot;
+	unsigned long inv_seq;
+	int foll, rc = 0;	/* rc keeps its previous value whenever gmap_link() is skipped */
+
+	foll = f->write_attempt ? FOLL_WRITE : 0;
+	foll |= f->attempt_pfault ? FOLL_NOWAIT : 0;
+
+	if (vcpu) {
+		kvm = vcpu->kvm;	/* a non-NULL @vcpu overrides whatever @kvm was passed */
+		mc = vcpu->arch.mc;	/* use the per-vCPU cache instead of allocating one */
+	}
+
+	lockdep_assert_held(&kvm->srcu);
+
+	scoped_guard(read_lock, &kvm->mmu_lock) {
+		if (gmap_try_fixup_minor(kvm->arch.gmap, f) == 0)
+			return 0;	/* resolved without faulting in a page */
+	}
+
+	while (1) {
+		f->valid = false;
+		inv_seq = kvm->mmu_invalidate_seq;
+		/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
+		smp_rmb();
+
+		if (vcpu)
+			slot = kvm_vcpu_gfn_to_memslot(vcpu, f->gfn);
+		else
+			slot = gfn_to_memslot(kvm, f->gfn);
+		f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page);
+
+		/* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT). */
+		if (f->pfn == KVM_PFN_ERR_NEEDS_IO) {
+			if (unlikely(!f->attempt_pfault))
+				return -EAGAIN;
+			if (unlikely(!vcpu))	/* async pfault setup below needs a vCPU */
+				return -EINVAL;
+			trace_kvm_s390_major_guest_pfault(vcpu);
+			if (kvm_arch_setup_async_pf(vcpu))
+				return 0;
+			vcpu->stat.pfault_sync++;
+			/* Could not setup async pfault, try again synchronously. */
+			foll &= ~FOLL_NOWAIT;	/* stays cleared for all later loop iterations */
+			f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page);
+		}
+
+		/* Access outside memory, addressing exception. */
+		if (is_noslot_pfn(f->pfn))
+			return PGM_ADDRESSING;
+		/* Signal pending: try again. */
+		if (f->pfn == KVM_PFN_ERR_SIGPENDING)
+			return -EAGAIN;
+		/* Check if it's read-only memory; don't try to actually handle that case. */
+		if (f->pfn == KVM_PFN_ERR_RO_FAULT)
+			return -EOPNOTSUPP;
+		/* Any other error. */
+		if (is_error_pfn(f->pfn))
+			return -EFAULT;
+
+		if (!mc) {	/* no vCPU cache: allocate one, freed on return through __free() */
+			local_mc = kvm_s390_new_mmu_cache();
+			if (!local_mc)
+				return -ENOMEM;
+			mc = local_mc;
+		}
+
+		/* Retry check without mmu_lock fired: drop the faulted page and start over. */
+		if (mmu_invalidate_retry_gfn_unsafe(kvm, inv_seq, f->gfn)) {
+			kvm_release_faultin_page(kvm, f->page, true, false);
+			continue;
+		}
+
+		scoped_guard(read_lock, &kvm->mmu_lock) {
+			if (!mmu_invalidate_retry_gfn(kvm, inv_seq, f->gfn)) {
+				f->valid = true;
+				rc = gmap_link(mc, kvm->arch.gmap, f);
+				kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt);
+				f->page = NULL;	/* already released just above */
+			}
+		}
+		kvm_release_faultin_page(kvm, f->page, true, false); /* NULL if gmap_link() ran */
+
+		if (rc == -ENOMEM) {
+			rc = kvm_s390_mmu_cache_topup(mc);
+			if (rc)
+				return rc;
+		} else if (rc != -EAGAIN) {	/* only -EAGAIN (or a successful topup) loops again */
+			return rc;
+		}
+	}
+}
+
+int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+	int foll = w ? FOLL_WRITE : 0;	/* FOLL_WRITE is the only flag ever requested here */
+
+	f->write_attempt = w;
+	f->gfn = gfn;
+	f->pfn = __kvm_faultin_pfn(slot, gfn, foll, &f->writable, &f->page);
+	if (is_noslot_pfn(f->pfn))
+		return PGM_ADDRESSING;	/* positive value, unlike the -errno returns below */
+	if (is_sigpending_pfn(f->pfn))
+		return -EINTR;
+	if (f->pfn == KVM_PFN_ERR_NEEDS_IO)
+		return -EAGAIN;
+	if (is_error_pfn(f->pfn))	/* any remaining error pfn */
+		return -EFAULT;
+
+	f->valid = true;	/* set only on success; the error returns leave it untouched */
+	return 0;
+}
diff --git a/arch/s390/kvm/faultin.h b/arch/s390/kvm/faultin.h
new file mode 100644
index 000000000000..f86176d2769c
--- /dev/null
+++ b/arch/s390/kvm/faultin.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  KVM guest fault handling.
+ *
+ *    Copyright IBM Corp. 2025
+ *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
+ */
+
+#ifndef __KVM_S390_FAULTIN_H
+#define __KVM_S390_FAULTIN_H
+
+#include <linux/kvm_host.h>
+
+#include "dat.h"
+
+int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f);
+int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w);
+
+static inline int kvm_s390_faultin_gfn_simple(struct kvm_vcpu *vcpu, struct kvm *kvm,
+					      gfn_t gfn, bool wr)
+{
+	struct guest_fault fault = { .write_attempt = wr, .gfn = gfn, };
+
+	return kvm_s390_faultin_gfn(vcpu, kvm, &fault);
+}
+
+static inline int kvm_s390_get_guest_page_and_read_gpa(struct kvm *kvm, struct guest_fault *f,
+						       gpa_t gaddr, unsigned long *val)
+{
+	const unsigned long *src;
+	int rc;
+
+	rc = kvm_s390_get_guest_page(kvm, f, gpa_to_gfn(gaddr), false);
+	if (!rc) {
+		src = phys_to_virt(pfn_to_phys(f->pfn) | offset_in_page(gaddr));
+		*val = *src;
+	}
+	return rc;
+}
+
+static inline void kvm_s390_release_multiple(struct kvm *kvm, struct guest_fault *guest_faults,
+					     int n, bool ignore)
+{
+	struct guest_fault *f = guest_faults;
+	int i;
+
+	for (i = 0; i < n; i++, f++) {
+		kvm_release_faultin_page(kvm, f->page, ignore, f->write_attempt);
+		f->page = NULL;
+	}
+}
+
+static inline bool kvm_s390_multiple_faults_need_retry(struct kvm *kvm, unsigned long seq,
+						       struct guest_fault *guest_faults, int n,
+						       bool unsafe)
+{
+	const struct guest_fault *f = guest_faults;
+	int i;
+
+	for (i = 0; i < n; i++, f++) {
+		if (!f->valid)
+			continue;
+		if (unsafe ? mmu_invalidate_retry_gfn_unsafe(kvm, seq, f->gfn) :
+			     mmu_invalidate_retry_gfn(kvm, seq, f->gfn))
+			return true;
+	}
+	return false;
+}
+
+static inline int kvm_s390_get_guest_pages(struct kvm *kvm, struct guest_fault *guest_faults,
+					   gfn_t start, int n_pages, bool write_attempt)
+{
+	int i, rc = 0;	/* rc must be defined even if the loop never runs (n_pages <= 0) */
+
+	for (i = 0; i < n_pages; i++) {
+		rc = kvm_s390_get_guest_page(kvm, guest_faults + i, start + i, write_attempt);
+		if (rc)
+			break;	/* stop at the first failure; earlier entries stay filled in */
+	}
+	return rc;	/* 0 if every page was obtained, else the first non-zero result */
+}
+
+#define kvm_s390_release_faultin_array(kvm, array, ignore) \
+	kvm_s390_release_multiple(kvm, array, ARRAY_SIZE(array), ignore)
+
+#define kvm_s390_array_needs_retry_unsafe(kvm, seq, array) \
+	kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), true)
+
+#define kvm_s390_array_needs_retry_safe(kvm, seq, array) \
+	kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), false)
+
+#endif /* __KVM_S390_FAULTIN_H */
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 41ca6b0ee7a9..4630b2a067ea 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -11,41 +11,43 @@
 #include <linux/err.h>
 #include <linux/pgtable.h>
 #include <linux/bitfield.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_types.h>
+#include <asm/diag.h>
 #include <asm/access-regs.h>
 #include <asm/fault.h>
-#include <asm/gmap.h>
 #include <asm/dat-bits.h>
 #include "kvm-s390.h"
+#include "dat.h"
+#include "gmap.h"
 #include "gaccess.h"
+#include "faultin.h"
 
 #define GMAP_SHADOW_FAKE_TABLE 1ULL
 
-/*
- * vaddress union in order to easily decode a virtual address into its
- * region first index, region second index etc. parts.
- */
-union vaddress {
-	unsigned long addr;
-	struct {
-		unsigned long rfx : 11;
-		unsigned long rsx : 11;
-		unsigned long rtx : 11;
-		unsigned long sx  : 11;
-		unsigned long px  : 8;
-		unsigned long bx  : 12;
-	};
-	struct {
-		unsigned long rfx01 : 2;
-		unsigned long	    : 9;
-		unsigned long rsx01 : 2;
-		unsigned long	    : 9;
-		unsigned long rtx01 : 2;
-		unsigned long	    : 9;
-		unsigned long sx01  : 2;
-		unsigned long	    : 29;
-	};
+union dat_table_entry {
+	unsigned long val;
+	union region1_table_entry pgd;
+	union region2_table_entry p4d;
+	union region3_table_entry pud;
+	union segment_table_entry pmd;
+	union page_table_entry pte;
+};
+
+#define WALK_N_ENTRIES 7
+#define LEVEL_MEM -2
+struct pgtwalk {
+	struct guest_fault raw_entries[WALK_N_ENTRIES];
+	gpa_t last_addr;
+	int level;
+	bool p;
 };
 
+static inline struct guest_fault *get_entries(struct pgtwalk *w)
+{
+	return w->raw_entries - LEVEL_MEM;
+}
+
 /*
  * raddress union which will contain the result (real or absolute address)
  * after a page table walk. The rfaa, sfaa and pfra members are used to
@@ -107,6 +109,28 @@ struct aste {
 	/* .. more fields there */
 };
 
+union oac {
+	unsigned int val;
+	struct {
+		struct {
+			unsigned short key : 4;
+			unsigned short     : 4;
+			unsigned short as  : 2;
+			unsigned short     : 4;
+			unsigned short k   : 1;
+			unsigned short a   : 1;
+		} oac1;
+		struct {
+			unsigned short key : 4;
+			unsigned short     : 4;
+			unsigned short as  : 2;
+			unsigned short     : 4;
+			unsigned short k   : 1;
+			unsigned short a   : 1;
+		} oac2;
+	};
+};
+
 int ipte_lock_held(struct kvm *kvm)
 {
 	if (sclp.has_siif)
@@ -423,7 +447,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
 }
 
 /**
- * guest_translate - translate a guest virtual into a guest absolute address
+ * guest_translate_gva() - translate a guest virtual into a guest absolute address
  * @vcpu: virtual cpu
  * @gva: guest virtual address
  * @gpa: points to where guest physical (absolute) address should be stored
@@ -443,9 +467,9 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
  *	      the returned value is the program interruption code as defined
  *	      by the architecture
  */
-static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
-				     unsigned long *gpa, const union asce asce,
-				     enum gacc_mode mode, enum prot_type *prot)
+static unsigned long guest_translate_gva(struct kvm_vcpu *vcpu, unsigned long gva,
+					 unsigned long *gpa, const union asce asce,
+					 enum gacc_mode mode, enum prot_type *prot)
 {
 	union vaddress vaddr = {.addr = gva};
 	union raddress raddr = {.addr = gva};
@@ -626,31 +650,19 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
 	return 1;
 }
 
-static int vm_check_access_key(struct kvm *kvm, u8 access_key,
-			       enum gacc_mode mode, gpa_t gpa)
+static int vm_check_access_key_gpa(struct kvm *kvm, u8 access_key,
+				   enum gacc_mode mode, gpa_t gpa)
 {
-	u8 storage_key, access_control;
-	bool fetch_protected;
-	unsigned long hva;
+	union skey storage_key;
 	int r;
 
-	if (access_key == 0)
-		return 0;
-
-	hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
-	if (kvm_is_error_hva(hva))
-		return PGM_ADDRESSING;
-
-	mmap_read_lock(current->mm);
-	r = get_guest_storage_key(current->mm, hva, &storage_key);
-	mmap_read_unlock(current->mm);
+	scoped_guard(read_lock, &kvm->mmu_lock)
+		r = dat_get_storage_key(kvm->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key);
 	if (r)
 		return r;
-	access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key);
-	if (access_control == access_key)
+	if (access_key == 0 || storage_key.acc == access_key)
 		return 0;
-	fetch_protected = storage_key & _PAGE_FP_BIT;
-	if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected)
+	if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !storage_key.fp)
 		return 0;
 	return PGM_PROTECTION;
 }
@@ -689,12 +701,11 @@ static bool storage_prot_override_applies(u8 access_control)
 	return access_control == PAGE_SPO_ACC;
 }
 
-static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key,
-				 enum gacc_mode mode, union asce asce, gpa_t gpa,
-				 unsigned long ga, unsigned int len)
+static int vcpu_check_access_key_gpa(struct kvm_vcpu *vcpu, u8 access_key,
+				     enum gacc_mode mode, union asce asce, gpa_t gpa,
+				     unsigned long ga, unsigned int len)
 {
-	u8 storage_key, access_control;
-	unsigned long hva;
+	union skey storage_key;
 	int r;
 
 	/* access key 0 matches any storage key -> allow */
@@ -704,26 +715,23 @@ static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key,
 	 * caller needs to ensure that gfn is accessible, so we can
 	 * assume that this cannot fail
 	 */
-	hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa));
-	mmap_read_lock(current->mm);
-	r = get_guest_storage_key(current->mm, hva, &storage_key);
-	mmap_read_unlock(current->mm);
+	scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+		r = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key);
 	if (r)
 		return r;
-	access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key);
 	/* access key matches storage key -> allow */
-	if (access_control == access_key)
+	if (storage_key.acc == access_key)
 		return 0;
 	if (mode == GACC_FETCH || mode == GACC_IFETCH) {
 		/* it is a fetch and fetch protection is off -> allow */
-		if (!(storage_key & _PAGE_FP_BIT))
+		if (!storage_key.fp)
 			return 0;
 		if (fetch_prot_override_applicable(vcpu, mode, asce) &&
 		    fetch_prot_override_applies(ga, len))
 			return 0;
 	}
 	if (storage_prot_override_applicable(vcpu) &&
-	    storage_prot_override_applies(access_control))
+	    storage_prot_override_applies(storage_key.acc))
 		return 0;
 	return PGM_PROTECTION;
 }
@@ -783,7 +791,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 			return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode,
 					 PROT_TYPE_LA);
 		if (psw_bits(*psw).dat) {
-			rc = guest_translate(vcpu, ga, &gpa, asce, mode, &prot);
+			rc = guest_translate_gva(vcpu, ga, &gpa, asce, mode, &prot);
 			if (rc < 0)
 				return rc;
 		} else {
@@ -795,8 +803,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 		}
 		if (rc)
 			return trans_exc(vcpu, rc, ga, ar, mode, prot);
-		rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga,
-					   fragment_len);
+		rc = vcpu_check_access_key_gpa(vcpu, access_key, mode, asce, gpa, ga, fragment_len);
 		if (rc)
 			return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC);
 		if (gpas)
@@ -808,8 +815,8 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 	return 0;
 }
 
-static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
-			     void *data, unsigned int len)
+static int access_guest_page_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
+				 void *data, unsigned int len)
 {
 	const unsigned int offset = offset_in_page(gpa);
 	const gfn_t gfn = gpa_to_gfn(gpa);
@@ -824,38 +831,79 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
 	return rc;
 }
 
-static int
-access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
-			   void *data, unsigned int len, u8 access_key)
+static int mvcos_key(void *to, const void *from, unsigned long size, u8 dst_key, u8 src_key)
 {
-	struct kvm_memory_slot *slot;
-	bool writable;
-	gfn_t gfn;
-	hva_t hva;
-	int rc;
+	union oac spec = {
+		.oac1.key = dst_key,
+		.oac1.k = !!dst_key,
+		.oac2.key = src_key,
+		.oac2.k = !!src_key,
+	};
+	int exception = PGM_PROTECTION;
+
+	asm_inline volatile(
+		"       lr      %%r0,%[spec]\n"
+		"0:     mvcos   %[to],%[from],%[size]\n"
+		"1:     lhi     %[exc],0\n"
+		"2:\n"
+		EX_TABLE(0b, 2b)
+		EX_TABLE(1b, 2b)
+		: [size] "+d" (size), [to] "=Q" (*(char *)to), [exc] "+d" (exception)
+		: [spec] "d" (spec.val), [from] "Q" (*(const char *)from)
+		: "memory", "cc", "0");
+	return exception;
+}
 
-	gfn = gpa >> PAGE_SHIFT;
-	slot = gfn_to_memslot(kvm, gfn);
-	hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+struct acc_page_key_context {
+	void *data;
+	int exception;
+	unsigned short offset;
+	unsigned short len;
+	bool store;
+	u8 access_key;
+};
 
-	if (kvm_is_error_hva(hva))
-		return PGM_ADDRESSING;
-	/*
-	 * Check if it's a ro memslot, even tho that can't occur (they're unsupported).
-	 * Don't try to actually handle that case.
-	 */
-	if (!writable && mode == GACC_STORE)
-		return -EOPNOTSUPP;
-	hva += offset_in_page(gpa);
-	if (mode == GACC_STORE)
-		rc = copy_to_user_key((void __user *)hva, data, len, access_key);
+static void _access_guest_page_with_key_gpa(struct guest_fault *f)
+{
+	struct acc_page_key_context *context = f->priv;
+	void *ptr;
+	int r;
+
+	ptr = __va(PFN_PHYS(f->pfn) | context->offset);
+
+	if (context->store)
+		r = mvcos_key(ptr, context->data, context->len, context->access_key, 0);
 	else
-		rc = copy_from_user_key(data, (void __user *)hva, len, access_key);
+		r = mvcos_key(context->data, ptr, context->len, 0, context->access_key);
+
+	context->exception = r;
+}
+
+static int access_guest_page_with_key_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
+					  void *data, unsigned int len, u8 acc)
+{
+	struct acc_page_key_context context = {
+		.offset = offset_in_page(gpa),
+		.len = len,
+		.data = data,
+		.access_key = acc,
+		.store = mode == GACC_STORE,
+	};
+	struct guest_fault fault = {
+		.gfn = gpa_to_gfn(gpa),
+		.priv = &context,
+		.write_attempt = mode == GACC_STORE,
+		.callback = _access_guest_page_with_key_gpa,
+	};
+	int rc;
+
+	if (KVM_BUG_ON((len + context.offset) > PAGE_SIZE, kvm))
+		return -EINVAL;
+
+	rc = kvm_s390_faultin_gfn(NULL, kvm, &fault);
 	if (rc)
-		return PGM_PROTECTION;
-	if (mode == GACC_STORE)
-		mark_page_dirty_in_slot(kvm, slot, gfn);
-	return 0;
+		return rc;
+	return context.exception;
 }
 
 int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data,
@@ -867,7 +915,7 @@ int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data,
 
 	while (min(PAGE_SIZE - offset, len) > 0) {
 		fragment_len = min(PAGE_SIZE - offset, len);
-		rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key);
+		rc = access_guest_page_with_key_gpa(kvm, mode, gpa, data, fragment_len, access_key);
 		if (rc)
 			return rc;
 		offset = 0;
@@ -927,15 +975,14 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 	for (idx = 0; idx < nr_pages; idx++) {
 		fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len);
 		if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) {
-			rc = access_guest_page(vcpu->kvm, mode, gpas[idx],
-					       data, fragment_len);
+			rc = access_guest_page_gpa(vcpu->kvm, mode, gpas[idx], data, fragment_len);
 		} else {
-			rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx],
-							data, fragment_len, access_key);
+			rc = access_guest_page_with_key_gpa(vcpu->kvm, mode, gpas[idx],
+							    data, fragment_len, access_key);
 		}
 		if (rc == PGM_PROTECTION && try_storage_prot_override)
-			rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx],
-							data, fragment_len, PAGE_SPO_ACC);
+			rc = access_guest_page_with_key_gpa(vcpu->kvm, mode, gpas[idx],
+							    data, fragment_len, PAGE_SPO_ACC);
 		if (rc)
 			break;
 		len -= fragment_len;
@@ -969,7 +1016,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
 	while (len && !rc) {
 		gpa = kvm_s390_real_to_abs(vcpu, gra);
 		fragment_len = min(PAGE_SIZE - offset_in_page(gpa), len);
-		rc = access_guest_page(vcpu->kvm, mode, gpa, data, fragment_len);
+		rc = access_guest_page_gpa(vcpu->kvm, mode, gpa, data, fragment_len);
 		len -= fragment_len;
 		gra += fragment_len;
 		data += fragment_len;
@@ -980,17 +1027,101 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
 }
 
 /**
+ * __cmpxchg_with_key() - Perform cmpxchg, honoring storage keys.
+ * @ptr: Address of value to compare to *@old and exchange with
+ *       @new. Must be aligned to @size.
+ * @old: Old value. Compared to the content pointed to by @ptr in order to
+ *       determine if the exchange occurs. The old value read from *@ptr is
+ *       written here.
+ * @new: New value to place at *@ptr.
+ * @size: Size of the operation in bytes, may only be a power of two up to 16.
+ * @access_key: Access key to use for checking storage key protection.
+ *
+ * Perform a cmpxchg on guest memory, honoring storage key protection.
+ * @access_key alone determines how key checking is performed, neither
+ * storage-protection-override nor fetch-protection-override apply.
+ * In case of an exception *@old is set to zero.
+ *
+ * Return:
+ * * %0: cmpxchg executed successfully
+ * * %1: cmpxchg executed unsuccessfully
+ * * %PGM_PROTECTION: an exception happened when trying to access *@ptr
+ * * %-EAGAIN: maxed out number of retries (byte and short only)
+ * * %-EINVAL: invalid value for @size
+ */
+static int __cmpxchg_with_key(union kvm_s390_quad *ptr, union kvm_s390_quad *old,
+			      union kvm_s390_quad new, int size, u8 access_key)
+{
+	union kvm_s390_quad tmp = { .sixteen = 0 };
+	int rc;
+
+	/*
+	 * The cmpxchg_key macro depends on the type of "old", so we need
+	 * a case for each valid length and get some code duplication as long
+	 * as we don't introduce a new macro.
+	 */
+	switch (size) {
+	case 1:
+		rc = __cmpxchg_key1(&ptr->one, &tmp.one, old->one, new.one, access_key);
+		break;
+	case 2:
+		rc = __cmpxchg_key2(&ptr->two, &tmp.two, old->two, new.two, access_key);
+		break;
+	case 4:
+		rc = __cmpxchg_key4(&ptr->four, &tmp.four, old->four, new.four, access_key);
+		break;
+	case 8:
+		rc = __cmpxchg_key8(&ptr->eight, &tmp.eight, old->eight, new.eight, access_key);
+		break;
+	case 16:
+		rc = __cmpxchg_key16(&ptr->sixteen, &tmp.sixteen, old->sixteen, new.sixteen,
+				     access_key);
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (!rc && memcmp(&tmp, old, size))
+		rc = 1;
+	*old = tmp;
+	/*
+	 * Assume that the fault is caused by protection, either key protection
+	 * or user page write protection.
+	 */
+	if (rc == -EFAULT)
+		rc = PGM_PROTECTION;
+	return rc;
+}
+
+struct cmpxchg_key_context {
+	union kvm_s390_quad new;
+	union kvm_s390_quad *old;
+	int exception;
+	unsigned short offset;
+	u8 access_key;
+	u8 len;
+};
+
+static void _cmpxchg_guest_abs_with_key(struct guest_fault *f)
+{
+	struct cmpxchg_key_context *context = f->priv;
+
+	context->exception = __cmpxchg_with_key(__va(PFN_PHYS(f->pfn) | context->offset),
+						context->old, context->new, context->len,
+						context->access_key);
+}
+
+/**
  * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address.
  * @kvm: Virtual machine instance.
  * @gpa: Absolute guest address of the location to be changed.
  * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a
  *       non power of two will result in failure.
- * @old_addr: Pointer to old value. If the location at @gpa contains this value,
- *            the exchange will succeed. After calling cmpxchg_guest_abs_with_key()
- *            *@old_addr contains the value at @gpa before the attempt to
- *            exchange the value.
+ * @old: Pointer to old value. If the location at @gpa contains this value,
+ *       the exchange will succeed. After calling cmpxchg_guest_abs_with_key()
+ *       *@old contains the value at @gpa before the attempt to
+ *       exchange the value.
  * @new: The value to place at @gpa.
- * @access_key: The access key to use for the guest access.
+ * @acc: The access key to use for the guest access.
  * @success: output value indicating if an exchange occurred.
  *
  * Atomically exchange the value at @gpa by @new, if it contains *@old.
@@ -1003,89 +1134,36 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
  *         * -EAGAIN: transient failure (len 1 or 2)
  *         * -EOPNOTSUPP: read-only memslot (should never occur)
  */
-int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
-			       __uint128_t *old_addr, __uint128_t new,
-			       u8 access_key, bool *success)
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old,
+			       union kvm_s390_quad new, u8 acc, bool *success)
 {
-	gfn_t gfn = gpa_to_gfn(gpa);
-	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
-	bool writable;
-	hva_t hva;
-	int ret;
-
-	if (!IS_ALIGNED(gpa, len))
-		return -EINVAL;
-
-	hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
-	if (kvm_is_error_hva(hva))
-		return PGM_ADDRESSING;
-	/*
-	 * Check if it's a read-only memslot, even though that cannot occur
-	 * since those are unsupported.
-	 * Don't try to actually handle that case.
-	 */
-	if (!writable)
-		return -EOPNOTSUPP;
-
-	hva += offset_in_page(gpa);
-	/*
-	 * The cmpxchg_user_key macro depends on the type of "old", so we need
-	 * a case for each valid length and get some code duplication as long
-	 * as we don't introduce a new macro.
-	 */
-	switch (len) {
-	case 1: {
-		u8 old;
-
-		ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key);
-		*success = !ret && old == *old_addr;
-		*old_addr = old;
-		break;
-	}
-	case 2: {
-		u16 old;
-
-		ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key);
-		*success = !ret && old == *old_addr;
-		*old_addr = old;
-		break;
-	}
-	case 4: {
-		u32 old;
-
-		ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key);
-		*success = !ret && old == *old_addr;
-		*old_addr = old;
-		break;
-	}
-	case 8: {
-		u64 old;
+	struct cmpxchg_key_context context = {
+		.old = old,
+		.new = new,
+		.offset = offset_in_page(gpa),
+		.len = len,
+		.access_key = acc,
+	};
+	struct guest_fault fault = {
+		.gfn = gpa_to_gfn(gpa),
+		.priv = &context,
+		.write_attempt = true,
+		.callback = _cmpxchg_guest_abs_with_key,
+	};
+	int rc;
 
-		ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key);
-		*success = !ret && old == *old_addr;
-		*old_addr = old;
-		break;
-	}
-	case 16: {
-		__uint128_t old;
+	lockdep_assert_held(&kvm->srcu);
 
-		ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key);
-		*success = !ret && old == *old_addr;
-		*old_addr = old;
-		break;
-	}
-	default:
+	if (len > 16 || !IS_ALIGNED(gpa, len))
 		return -EINVAL;
-	}
-	if (*success)
-		mark_page_dirty_in_slot(kvm, slot, gfn);
-	/*
-	 * Assume that the fault is caused by protection, either key protection
-	 * or user page write protection.
-	 */
-	if (ret == -EFAULT)
-		ret = PGM_PROTECTION;
-	return ret;
+
+	rc = kvm_s390_faultin_gfn(NULL, kvm, &fault);
+	if (rc)
+		return rc;
+	*success = !context.exception;
+	if (context.exception == 1)
+		return 0;
+	return context.exception;
 }
 
 /**
@@ -1160,7 +1238,7 @@ int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length,
 
 	while (length && !rc) {
 		fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length);
-		rc = vm_check_access_key(kvm, access_key, mode, gpa);
+		rc = vm_check_access_key_gpa(kvm, access_key, mode, gpa);
 		length -= fragment_len;
 		gpa += fragment_len;
 	}
@@ -1187,304 +1265,375 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
 }
 
 /**
- * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
- * @sg: pointer to the shadow guest address space structure
- * @saddr: faulting address in the shadow gmap
- * @pgt: pointer to the beginning of the page table for the given address if
- *	 successful (return value 0), or to the first invalid DAT entry in
- *	 case of exceptions (return value > 0)
- * @dat_protection: referenced memory is write protected
- * @fake: pgt references contiguous guest memory block, not a pgtable
+ * walk_guest_tables() - Walk the guest page table and pin the dat tables.
+ * @sg: Pointer to the shadow guest address space structure.
+ * @saddr: Faulting address in the shadow gmap.
+ * @w: Will be filled with information on the pinned pages.
+ * @wr: Indicates a write access if true.
+ *
+ * Return:
+ * * %0 in case of success,
+ * * a PIC code > 0 in case the address translation fails
+ * * an error code < 0 if other errors happen in the host
  */
-static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
-				  unsigned long *pgt, int *dat_protection,
-				  int *fake)
+static int walk_guest_tables(struct gmap *sg, unsigned long saddr, struct pgtwalk *w, bool wr)
 {
-	struct kvm *kvm;
-	struct gmap *parent;
-	union asce asce;
+	struct gmap *parent = sg->parent;
+	struct guest_fault *entries;
+	union dat_table_entry table;
 	union vaddress vaddr;
 	unsigned long ptr;
+	struct kvm *kvm;
+	union asce asce;
 	int rc;
 
-	*fake = 0;
-	*dat_protection = 0;
-	kvm = sg->private;
-	parent = sg->parent;
+	if (!parent)
+		return -EAGAIN;
+	kvm = parent->kvm;
+	WARN_ON(!kvm);
+	asce = sg->guest_asce;
+	entries = get_entries(w);
+
+	w->level = LEVEL_MEM;
+	w->last_addr = saddr;
+	if (asce.r)
+		return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, gpa_to_gfn(saddr), false);
+
 	vaddr.addr = saddr;
-	asce.val = sg->orig_asce;
 	ptr = asce.rsto * PAGE_SIZE;
-	if (asce.r) {
-		*fake = 1;
-		ptr = 0;
-		asce.dt = ASCE_TYPE_REGION1;
-	}
+
+	if (!asce_contains_gfn(asce, gpa_to_gfn(saddr)))
+		return PGM_ASCE_TYPE;
 	switch (asce.dt) {
 	case ASCE_TYPE_REGION1:
-		if (vaddr.rfx01 > asce.tl && !*fake)
+		if (vaddr.rfx01 > asce.tl)
 			return PGM_REGION_FIRST_TRANS;
 		break;
 	case ASCE_TYPE_REGION2:
-		if (vaddr.rfx)
-			return PGM_ASCE_TYPE;
 		if (vaddr.rsx01 > asce.tl)
 			return PGM_REGION_SECOND_TRANS;
 		break;
 	case ASCE_TYPE_REGION3:
-		if (vaddr.rfx || vaddr.rsx)
-			return PGM_ASCE_TYPE;
 		if (vaddr.rtx01 > asce.tl)
 			return PGM_REGION_THIRD_TRANS;
 		break;
 	case ASCE_TYPE_SEGMENT:
-		if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
-			return PGM_ASCE_TYPE;
 		if (vaddr.sx01 > asce.tl)
 			return PGM_SEGMENT_TRANSLATION;
 		break;
 	}
 
+	w->level = asce.dt;
 	switch (asce.dt) {
-	case ASCE_TYPE_REGION1: {
-		union region1_table_entry rfte;
-
-		if (*fake) {
-			ptr += vaddr.rfx * _REGION1_SIZE;
-			rfte.val = ptr;
-			goto shadow_r2t;
-		}
-		*pgt = ptr + vaddr.rfx * 8;
-		rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
+	case ASCE_TYPE_REGION1:
+		w->last_addr = ptr + vaddr.rfx * 8;
+		rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+							  w->last_addr, &table.val);
 		if (rc)
 			return rc;
-		if (rfte.i)
+		if (table.pgd.i)
 			return PGM_REGION_FIRST_TRANS;
-		if (rfte.tt != TABLE_TYPE_REGION1)
+		if (table.pgd.tt != TABLE_TYPE_REGION1)
 			return PGM_TRANSLATION_SPEC;
-		if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+		if (vaddr.rsx01 < table.pgd.tf || vaddr.rsx01 > table.pgd.tl)
 			return PGM_REGION_SECOND_TRANS;
 		if (sg->edat_level >= 1)
-			*dat_protection |= rfte.p;
-		ptr = rfte.rto * PAGE_SIZE;
-shadow_r2t:
-		rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
-		if (rc)
-			return rc;
-		kvm->stat.gmap_shadow_r1_entry++;
-	}
+			w->p |= table.pgd.p;
+		ptr = table.pgd.rto * PAGE_SIZE;
+		w->level--;
 		fallthrough;
-	case ASCE_TYPE_REGION2: {
-		union region2_table_entry rste;
-
-		if (*fake) {
-			ptr += vaddr.rsx * _REGION2_SIZE;
-			rste.val = ptr;
-			goto shadow_r3t;
-		}
-		*pgt = ptr + vaddr.rsx * 8;
-		rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
+	case ASCE_TYPE_REGION2:
+		w->last_addr = ptr + vaddr.rsx * 8;
+		rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+							  w->last_addr, &table.val);
 		if (rc)
 			return rc;
-		if (rste.i)
+		if (table.p4d.i)
 			return PGM_REGION_SECOND_TRANS;
-		if (rste.tt != TABLE_TYPE_REGION2)
+		if (table.p4d.tt != TABLE_TYPE_REGION2)
 			return PGM_TRANSLATION_SPEC;
-		if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+		if (vaddr.rtx01 < table.p4d.tf || vaddr.rtx01 > table.p4d.tl)
 			return PGM_REGION_THIRD_TRANS;
 		if (sg->edat_level >= 1)
-			*dat_protection |= rste.p;
-		ptr = rste.rto * PAGE_SIZE;
-shadow_r3t:
-		rste.p |= *dat_protection;
-		rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
-		if (rc)
-			return rc;
-		kvm->stat.gmap_shadow_r2_entry++;
-	}
+			w->p |= table.p4d.p;
+		ptr = table.p4d.rto * PAGE_SIZE;
+		w->level--;
 		fallthrough;
-	case ASCE_TYPE_REGION3: {
-		union region3_table_entry rtte;
-
-		if (*fake) {
-			ptr += vaddr.rtx * _REGION3_SIZE;
-			rtte.val = ptr;
-			goto shadow_sgt;
-		}
-		*pgt = ptr + vaddr.rtx * 8;
-		rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
+	case ASCE_TYPE_REGION3:
+		w->last_addr = ptr + vaddr.rtx * 8;
+		rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+							  w->last_addr, &table.val);
 		if (rc)
 			return rc;
-		if (rtte.i)
+		if (table.pud.i)
 			return PGM_REGION_THIRD_TRANS;
-		if (rtte.tt != TABLE_TYPE_REGION3)
+		if (table.pud.tt != TABLE_TYPE_REGION3)
 			return PGM_TRANSLATION_SPEC;
-		if (rtte.cr && asce.p && sg->edat_level >= 2)
+		if (table.pud.cr && asce.p && sg->edat_level >= 2)
 			return PGM_TRANSLATION_SPEC;
-		if (rtte.fc && sg->edat_level >= 2) {
-			*dat_protection |= rtte.fc0.p;
-			*fake = 1;
-			ptr = rtte.fc1.rfaa * _REGION3_SIZE;
-			rtte.val = ptr;
-			goto shadow_sgt;
+		if (sg->edat_level >= 1)
+			w->p |= table.pud.p;
+		if (table.pud.fc && sg->edat_level >= 2) {
+			table.val = u64_replace_bits(table.val, saddr, ~_REGION3_MASK);
+			goto edat_applies;
 		}
-		if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
+		if (vaddr.sx01 < table.pud.fc0.tf || vaddr.sx01 > table.pud.fc0.tl)
 			return PGM_SEGMENT_TRANSLATION;
-		if (sg->edat_level >= 1)
-			*dat_protection |= rtte.fc0.p;
-		ptr = rtte.fc0.sto * PAGE_SIZE;
-shadow_sgt:
-		rtte.fc0.p |= *dat_protection;
-		rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
-		if (rc)
-			return rc;
-		kvm->stat.gmap_shadow_r3_entry++;
-	}
+		ptr = table.pud.fc0.sto * PAGE_SIZE;
+		w->level--;
 		fallthrough;
-	case ASCE_TYPE_SEGMENT: {
-		union segment_table_entry ste;
-
-		if (*fake) {
-			ptr += vaddr.sx * _SEGMENT_SIZE;
-			ste.val = ptr;
-			goto shadow_pgt;
-		}
-		*pgt = ptr + vaddr.sx * 8;
-		rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
+	case ASCE_TYPE_SEGMENT:
+		w->last_addr = ptr + vaddr.sx * 8;
+		rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+							  w->last_addr, &table.val);
 		if (rc)
 			return rc;
-		if (ste.i)
+		if (table.pmd.i)
 			return PGM_SEGMENT_TRANSLATION;
-		if (ste.tt != TABLE_TYPE_SEGMENT)
+		if (table.pmd.tt != TABLE_TYPE_SEGMENT)
 			return PGM_TRANSLATION_SPEC;
-		if (ste.cs && asce.p)
+		if (table.pmd.cs && asce.p)
 			return PGM_TRANSLATION_SPEC;
-		*dat_protection |= ste.fc0.p;
-		if (ste.fc && sg->edat_level >= 1) {
-			*fake = 1;
-			ptr = ste.fc1.sfaa * _SEGMENT_SIZE;
-			ste.val = ptr;
-			goto shadow_pgt;
+		w->p |= table.pmd.p;
+		if (table.pmd.fc && sg->edat_level >= 1) {
+			table.val = u64_replace_bits(table.val, saddr, ~_SEGMENT_MASK);
+			goto edat_applies;
 		}
-		ptr = ste.fc0.pto * (PAGE_SIZE / 2);
-shadow_pgt:
-		ste.fc0.p |= *dat_protection;
-		rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
+		ptr = table.pmd.fc0.pto * (PAGE_SIZE / 2);
+		w->level--;
+	}
+	w->last_addr = ptr + vaddr.px * 8;
+	rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+						  w->last_addr, &table.val);
+	if (rc)
+		return rc;
+	if (table.pte.i)
+		return PGM_PAGE_TRANSLATION;
+	if (table.pte.z)
+		return PGM_TRANSLATION_SPEC;
+	w->p |= table.pte.p;
+edat_applies:
+	if (wr && w->p)
+		return PGM_PROTECTION;
+
+	return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, table.pte.pfra, wr);
+}
+
+static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union pte *ptep,
+			  struct guest_fault *f, bool p)
+{
+	union pgste pgste;
+	union pte newpte;
+	int rc;
+
+	lockdep_assert_held(&sg->kvm->mmu_lock);
+	lockdep_assert_held(&sg->parent->children_lock);
+
+	scoped_guard(spinlock, &sg->host_to_rmap_lock)
+		rc = gmap_insert_rmap(sg, f->gfn, gpa_to_gfn(raddr), TABLE_TYPE_PAGE_TABLE);
+	if (rc)
+		return rc;
+	/* Carry over d/sd/p from the locked parent PTE (@ptep_h), not from the shadow PTE. */
+	pgste = pgste_get_lock(ptep_h);
+	newpte = _pte(f->pfn, f->writable, !p, 0);
+	newpte.s.d |= ptep_h->s.d;
+	newpte.s.sd |= ptep_h->s.sd;
+	newpte.h.p &= ptep_h->h.p;
+	pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false);
+	pgste.vsie_notif = 1;
+	pgste_set_unlock(ptep_h, pgste);
+	/* Now install the matching entry in the shadow page table (@ptep). */
+	newpte = _pte(f->pfn, 0, !p, 0);
+	pgste = pgste_get_lock(ptep);
+	pgste = __dat_ptep_xchg(ptep, pgste, newpte, gpa_to_gfn(raddr), sg->asce, uses_skeys(sg));
+	pgste_set_unlock(ptep, pgste);
+
+	return 0;
+}
+
+static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table,
+			    struct guest_fault *f, bool p)
+{
+	union crste newcrste;
+	gfn_t gfn;
+	int rc;
+
+	lockdep_assert_held(&sg->kvm->mmu_lock);
+	lockdep_assert_held(&sg->parent->children_lock);
+	/* Mask the gfn to segment or region-3 granularity, depending on the level of @table. */
+	gfn = f->gfn & gpa_to_gfn(is_pmd(*table) ? _SEGMENT_MASK : _REGION3_MASK);
+	scoped_guard(spinlock, &sg->host_to_rmap_lock)
+		rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt);
+	if (rc)
+		return rc;
+	/* Rewrite the parent's entry (@host), carrying over its d, sd and prefix_notif bits. */
+	newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p);
+	newcrste.s.fc1.d |= host->s.fc1.d;
+	newcrste.s.fc1.sd |= host->s.fc1.sd;
+	newcrste.h.p &= host->h.p;
+	newcrste.s.fc1.vsie_notif = 1;
+	newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif;
+	_gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false);
+	/* Then install the large-frame entry in the shadow table (@table). */
+	newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p);
+	dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce);
+	return 0;
+}
+
+static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
+			      unsigned long saddr, struct pgtwalk *w)
+{
+	struct guest_fault *entries;
+	int flags, i, hl, gl, l, rc;
+	union crste *table, *host;
+	union pte *ptep, *ptep_h;
+
+	lockdep_assert_held(&sg->kvm->mmu_lock);
+	lockdep_assert_held(&sg->parent->children_lock);
+
+	entries = get_entries(w);
+	ptep_h = NULL;
+	ptep = NULL;
+
+	rc = dat_entry_walk(NULL, gpa_to_gfn(saddr), sg->asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE,
+			    &table, &ptep);
+	if (rc)
+		return rc;
+
+	/* A race occurred. The shadow mapping is already valid, nothing to do */
+	if ((ptep && !ptep->h.i) || (!ptep && crste_leaf(*table)))
+		return 0;
+
+	gl = get_level(table, ptep);
+
+	/*
+	 * Skip levels that are already protected. For each level, protect
+	 * only the page containing the entry, not the whole table.
+	 */
+	for (i = gl ; i >= w->level; i--) {
+		rc = gmap_protect_rmap(mc, sg, entries[i - 1].gfn, gpa_to_gfn(saddr),
+				       entries[i - 1].pfn, i, entries[i - 1].writable);
 		if (rc)
 			return rc;
-		kvm->stat.gmap_shadow_sg_entry++;
 	}
+
+	rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF,
+			    TABLE_TYPE_PAGE_TABLE, &host, &ptep_h);
+	if (rc)
+		return rc;
+
+	hl = get_level(host, ptep_h);
+	/* Get the smallest granularity */
+	l = min3(gl, hl, w->level);
+
+	flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
+	/* If necessary, create the shadow mapping */
+	if (l < gl) {
+		rc = dat_entry_walk(mc, gpa_to_gfn(saddr), sg->asce, flags, l, &table, &ptep);
+		if (rc)
+			return rc;
 	}
-	/* Return the parent address of the page table */
-	*pgt = ptr;
-	return 0;
+	if (l < hl) {
+		rc = dat_entry_walk(mc, entries[LEVEL_MEM].gfn, sg->parent->asce,
+				    flags, l, &host, &ptep_h);
+		if (rc)
+			return rc;
+	}
+
+	if (KVM_BUG_ON(l > TABLE_TYPE_REGION3, sg->kvm))
+		return -EFAULT;
+	if (l == TABLE_TYPE_PAGE_TABLE)
+		return _do_shadow_pte(sg, saddr, ptep_h, ptep, entries + LEVEL_MEM, w->p);
+	return _do_shadow_crste(sg, saddr, host, table, entries + LEVEL_MEM, w->p);
 }
 
-/**
- * shadow_pgt_lookup() - find a shadow page table
- * @sg: pointer to the shadow guest address space structure
- * @saddr: the address in the shadow aguest address space
- * @pgt: parent gmap address of the page table to get shadowed
- * @dat_protection: if the pgtable is marked as protected by dat
- * @fake: pgt references contiguous guest memory block, not a pgtable
- *
- * Returns 0 if the shadow page table was found and -EAGAIN if the page
- * table was not found.
- *
- * Called with sg->mm->mmap_lock in read.
- */
-static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt,
-			     int *dat_protection, int *fake)
+static inline int _gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
+					unsigned long seq, struct pgtwalk *walk)
 {
-	unsigned long pt_index;
-	unsigned long *table;
-	struct page *page;
+	struct gmap *parent;
 	int rc;
 
-	spin_lock(&sg->guest_table_lock);
-	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
-	if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
-		/* Shadow page tables are full pages (pte+pgste) */
-		page = pfn_to_page(*table >> PAGE_SHIFT);
-		pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page));
-		*pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE;
-		*dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
-		*fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE);
-		rc = 0;
-	} else  {
-		rc = -EAGAIN;
+	if (kvm_s390_array_needs_retry_unsafe(vcpu->kvm, seq, walk->raw_entries))
+		return -EAGAIN;
+again:
+	rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc);
+	if (rc)
+		return rc;
+	scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
+		if (kvm_s390_array_needs_retry_safe(vcpu->kvm, seq, walk->raw_entries))
+			return -EAGAIN;
+		parent = READ_ONCE(sg->parent);
+		if (!parent)
+			return -EAGAIN;
+		scoped_guard(spinlock, &parent->children_lock) {
+			if (READ_ONCE(sg->parent) != parent)
+				return -EAGAIN;
+			rc = _gaccess_do_shadow(vcpu->arch.mc, sg, saddr, walk);
+		}
+		if (rc == -ENOMEM)
+			goto again;
+		if (!rc)
+			kvm_s390_release_faultin_array(vcpu->kvm, walk->raw_entries, false);
 	}
-	spin_unlock(&sg->guest_table_lock);
 	return rc;
 }
 
 /**
- * kvm_s390_shadow_fault - handle fault on a shadow page table
- * @vcpu: virtual cpu
- * @sg: pointer to the shadow guest address space structure
- * @saddr: faulting address in the shadow gmap
- * @datptr: will contain the address of the faulting DAT table entry, or of
- *	    the valid leaf, plus some flags
+ * __gaccess_shadow_fault() - Handle fault on a shadow page table.
+ * @vcpu: Virtual cpu that triggered the action.
+ * @sg: The shadow guest address space structure.
+ * @saddr: Faulting address in the shadow gmap.
+ * @datptr: Will contain the address of the faulting DAT table entry, or of
+ *	    the valid leaf, plus some flags.
+ * @wr: Whether this is a write access.
  *
- * Returns: - 0 if the shadow fault was successfully resolved
- *	    - > 0 (pgm exception code) on exceptions while faulting
- *	    - -EAGAIN if the caller can retry immediately
- *	    - -EFAULT when accessing invalid guest addresses
- *	    - -ENOMEM if out of memory
+ * Return:
+ * * %0 if the shadow fault was successfully resolved
+ * * > 0 (pgm exception code) on exceptions while faulting
+ * * %-EAGAIN if the caller can retry immediately
+ * * %-EFAULT when accessing invalid guest addresses
+ * * %-ENOMEM if out of memory
  */
-int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
-			  unsigned long saddr, unsigned long *datptr)
+static int __gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
+				  union mvpg_pei *datptr, bool wr)
 {
-	union vaddress vaddr;
-	union page_table_entry pte;
-	unsigned long pgt = 0;
-	int dat_protection, fake;
+	struct pgtwalk walk = {	.p = false, };
+	unsigned long seq;
 	int rc;
 
-	if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm))
-		return -EFAULT;
+	seq = vcpu->kvm->mmu_invalidate_seq;
+	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
+	smp_rmb();
 
-	mmap_read_lock(sg->mm);
-	/*
-	 * We don't want any guest-2 tables to change - so the parent
-	 * tables/pointers we read stay valid - unshadowing is however
-	 * always possible - only guest_table_lock protects us.
-	 */
-	ipte_lock(vcpu->kvm);
-
-	rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+	rc = walk_guest_tables(sg, saddr, &walk, wr);
+	if (datptr) {
+		datptr->val = walk.last_addr;
+		datptr->dat_prot = wr && walk.p;
+		datptr->not_pte = walk.level > TABLE_TYPE_PAGE_TABLE;
+		datptr->real = sg->guest_asce.r;
+	}
+	if (!rc)
+		rc = _gaccess_shadow_fault(vcpu, sg, saddr, seq, &walk);
 	if (rc)
-		rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
-					    &fake);
+		kvm_s390_release_faultin_array(vcpu->kvm, walk.raw_entries, true);
+	return rc;
+}
 
-	vaddr.addr = saddr;
-	if (fake) {
-		pte.val = pgt + vaddr.px * PAGE_SIZE;
-		goto shadow_page;
-	}
+int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
+			 union mvpg_pei *datptr, bool wr)
+{
+	int rc;
 
-	switch (rc) {
-	case PGM_SEGMENT_TRANSLATION:
-	case PGM_REGION_THIRD_TRANS:
-	case PGM_REGION_SECOND_TRANS:
-	case PGM_REGION_FIRST_TRANS:
-		pgt |= PEI_NOT_PTE;
-		break;
-	case 0:
-		pgt += vaddr.px * 8;
-		rc = gmap_read_table(sg->parent, pgt, &pte.val);
-	}
-	if (datptr)
-		*datptr = pgt | dat_protection * PEI_DAT_PROT;
-	if (!rc && pte.i)
-		rc = PGM_PAGE_TRANSLATION;
-	if (!rc && pte.z)
-		rc = PGM_TRANSLATION_SPEC;
-shadow_page:
-	pte.p |= dat_protection;
-	if (!rc)
-		rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
-	vcpu->kvm->stat.gmap_shadow_pg_entry++;
+	if (KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &sg->flags), vcpu->kvm))
+		return -EFAULT;
+
+	rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc);
+	if (rc)
+		return rc;
+
+	ipte_lock(vcpu->kvm);
+	rc = __gaccess_shadow_fault(vcpu, sg, saddr, datptr, wr || sg->guest_asce.r);
 	ipte_unlock(vcpu->kvm);
-	mmap_read_unlock(sg->mm);
+
 	return rc;
 }
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 3fde45a151f2..b5385cec60f4 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -206,8 +206,8 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
 		      void *data, unsigned long len, enum gacc_mode mode);
 
-int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old,
-			       __uint128_t new, u8 access_key, bool *success);
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old,
+			       union kvm_s390_quad new, u8 access_key, bool *success);
 
 /**
  * write_guest_with_key - copy data from kernel space to guest space
@@ -450,11 +450,17 @@ void ipte_unlock(struct kvm *kvm);
 int ipte_lock_held(struct kvm *kvm);
 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
 
-/* MVPG PEI indication bits */
-#define PEI_DAT_PROT 2
-#define PEI_NOT_PTE 4
+union mvpg_pei {
+	unsigned long val;
+	struct {
+		unsigned long addr    : 61;
+		unsigned long not_pte :  1;
+		unsigned long dat_prot:  1;
+		unsigned long real    :  1;
+	};
+};
 
-int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
-			  unsigned long saddr, unsigned long *datptr);
+int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
+			 union mvpg_pei *datptr, bool wr);
 
 #endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c
deleted file mode 100644
index 56ef153eb8fe..000000000000
--- a/arch/s390/kvm/gmap-vsie.c
+++ /dev/null
@@ -1,141 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Guest memory management for KVM/s390 nested VMs.
- *
- * Copyright IBM Corp. 2008, 2020, 2024
- *
- *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
- *               Martin Schwidefsky <schwidefsky@de.ibm.com>
- *               David Hildenbrand <david@redhat.com>
- *               Janosch Frank <frankja@linux.vnet.ibm.com>
- */
-
-#include <linux/compiler.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/pgtable.h>
-#include <linux/pagemap.h>
-#include <linux/mman.h>
-
-#include <asm/lowcore.h>
-#include <asm/gmap.h>
-#include <asm/uv.h>
-
-#include "kvm-s390.h"
-
-/**
- * gmap_find_shadow - find a specific asce in the list of shadow tables
- * @parent: pointer to the parent gmap
- * @asce: ASCE for which the shadow table is created
- * @edat_level: edat level to be used for the shadow translation
- *
- * Returns the pointer to a gmap if a shadow table with the given asce is
- * already available, ERR_PTR(-EAGAIN) if another one is just being created,
- * otherwise NULL
- *
- * Context: Called with parent->shadow_lock held
- */
-static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level)
-{
-	struct gmap *sg;
-
-	lockdep_assert_held(&parent->shadow_lock);
-	list_for_each_entry(sg, &parent->children, list) {
-		if (!gmap_shadow_valid(sg, asce, edat_level))
-			continue;
-		if (!sg->initialized)
-			return ERR_PTR(-EAGAIN);
-		refcount_inc(&sg->ref_count);
-		return sg;
-	}
-	return NULL;
-}
-
-/**
- * gmap_shadow - create/find a shadow guest address space
- * @parent: pointer to the parent gmap
- * @asce: ASCE for which the shadow table is created
- * @edat_level: edat level to be used for the shadow translation
- *
- * The pages of the top level page table referred by the asce parameter
- * will be set to read-only and marked in the PGSTEs of the kvm process.
- * The shadow table will be removed automatically on any change to the
- * PTE mapping for the source table.
- *
- * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
- * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
- * parent gmap table could not be protected.
- */
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level)
-{
-	struct gmap *sg, *new;
-	unsigned long limit;
-	int rc;
-
-	if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) ||
-	    KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private))
-		return ERR_PTR(-EFAULT);
-	spin_lock(&parent->shadow_lock);
-	sg = gmap_find_shadow(parent, asce, edat_level);
-	spin_unlock(&parent->shadow_lock);
-	if (sg)
-		return sg;
-	/* Create a new shadow gmap */
-	limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
-	if (asce & _ASCE_REAL_SPACE)
-		limit = -1UL;
-	new = gmap_alloc(limit);
-	if (!new)
-		return ERR_PTR(-ENOMEM);
-	new->mm = parent->mm;
-	new->parent = gmap_get(parent);
-	new->private = parent->private;
-	new->orig_asce = asce;
-	new->edat_level = edat_level;
-	new->initialized = false;
-	spin_lock(&parent->shadow_lock);
-	/* Recheck if another CPU created the same shadow */
-	sg = gmap_find_shadow(parent, asce, edat_level);
-	if (sg) {
-		spin_unlock(&parent->shadow_lock);
-		gmap_free(new);
-		return sg;
-	}
-	if (asce & _ASCE_REAL_SPACE) {
-		/* only allow one real-space gmap shadow */
-		list_for_each_entry(sg, &parent->children, list) {
-			if (sg->orig_asce & _ASCE_REAL_SPACE) {
-				spin_lock(&sg->guest_table_lock);
-				gmap_unshadow(sg);
-				spin_unlock(&sg->guest_table_lock);
-				list_del(&sg->list);
-				gmap_put(sg);
-				break;
-			}
-		}
-	}
-	refcount_set(&new->ref_count, 2);
-	list_add(&new->list, &parent->children);
-	if (asce & _ASCE_REAL_SPACE) {
-		/* nothing to protect, return right away */
-		new->initialized = true;
-		spin_unlock(&parent->shadow_lock);
-		return new;
-	}
-	spin_unlock(&parent->shadow_lock);
-	/* protect after insertion, so it will get properly invalidated */
-	mmap_read_lock(parent->mm);
-	rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN,
-				      ((asce & _ASCE_TABLE_LENGTH) + 1),
-				      PROT_READ, GMAP_NOTIFY_SHADOW);
-	mmap_read_unlock(parent->mm);
-	spin_lock(&parent->shadow_lock);
-	new->initialized = true;
-	if (rc) {
-		list_del(&new->list);
-		gmap_free(new);
-		new = ERR_PTR(rc);
-	}
-	spin_unlock(&parent->shadow_lock);
-	return new;
-}
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
new file mode 100644
index 000000000000..26cd2b208b6f
--- /dev/null
+++ b/arch/s390/kvm/gmap.c
@@ -0,0 +1,1244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Guest memory management for KVM/s390
+ *
+ * Copyright IBM Corp. 2008, 2020, 2024
+ *
+ *    Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
+ *               Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *               David Hildenbrand <david@redhat.com>
+ *               Janosch Frank <frankja@linux.ibm.com>
+ */
+
+#include <linux/compiler.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/pgtable.h>
+#include <linux/pagemap.h>
+#include <asm/lowcore.h>
+#include <asm/uv.h>
+#include <asm/gmap_helpers.h>
+
+#include "dat.h"
+#include "gmap.h"
+#include "kvm-s390.h"
+#include "faultin.h"
+
+static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.sie_block->prog0c & PROG_IN_SIE; /* is PROG_IN_SIE set in the SIE block? */
+}
+
+static int gmap_limit_to_type(gfn_t limit)
+{
+	if (!limit) /* a limit of 0 selects the largest table type */
+		return TABLE_TYPE_REGION1;
+	if (limit <= _REGION3_SIZE >> PAGE_SHIFT) /* @limit is a gfn, so sizes are shifted to pages */
+		return TABLE_TYPE_SEGMENT;
+	if (limit <= _REGION2_SIZE >> PAGE_SHIFT)
+		return TABLE_TYPE_REGION3;
+	if (limit <= _REGION1_SIZE >> PAGE_SHIFT)
+		return TABLE_TYPE_REGION2;
+	return TABLE_TYPE_REGION1; /* anything larger also gets a region-1 table */
+}
+
+/**
+ * gmap_new() - Allocate and initialize a guest address space.
+ * @kvm: The kvm owning the guest.
+ * @limit: Limit of the gmap address space as a gfn; selects the top-level table type.
+ *
+ * Return: A guest address space structure with refcount 1, or NULL if an allocation failed.
+ */
+struct gmap *gmap_new(struct kvm *kvm, gfn_t limit)
+{
+	struct crst_table *table;
+	struct gmap *gmap;
+	int type;
+
+	type = gmap_limit_to_type(limit);
+
+	gmap = kzalloc(sizeof(*gmap), GFP_KERNEL_ACCOUNT);
+	if (!gmap)
+		return NULL;
+	INIT_LIST_HEAD(&gmap->children);
+	INIT_LIST_HEAD(&gmap->list);
+	INIT_LIST_HEAD(&gmap->scb_users);
+	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE);
+	spin_lock_init(&gmap->children_lock);
+	spin_lock_init(&gmap->host_to_rmap_lock);
+	refcount_set(&gmap->refcount, 1);
+
+	table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val);
+	if (!table) {
+		kfree(gmap); /* nothing else was allocated yet */
+		return NULL;
+	}
+
+	gmap->asce.val = __pa(table); /* start from the physical address of the new table */
+	gmap->asce.dt = type;
+	gmap->asce.tl = _ASCE_TABLE_LENGTH;
+	gmap->asce.x = 1;
+	gmap->asce.p = 1;
+	gmap->asce.s = 1;
+	gmap->kvm = kvm;
+	set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);
+
+	return gmap;
+}
+
+static void gmap_add_child(struct gmap *parent, struct gmap *child)
+{
+	KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm);
+	KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm);
+	KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm);
+	lockdep_assert_held(&parent->children_lock);
+
+	child->parent = parent;
+	/* The child mirrors the ucontrol and 1M hugepage flags of the parent. */
+	if (is_ucontrol(parent))
+		set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
+	else
+		clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags);
+
+	if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags))
+		set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
+	else
+		clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags);
+
+	if (kvm_is_ucontrol(parent->kvm)) /* such children do not own the page tables */
+		clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags);
+	list_add(&child->list, &parent->children);
+}
+
+struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit)
+{
+	struct gmap *res;
+
+	lockdep_assert_not_held(&parent->children_lock);
+	res = gmap_new(parent->kvm, limit);
+	if (res) { /* link the new gmap under the children lock; NULL is passed through */
+		scoped_guard(spinlock, &parent->children_lock)
+			gmap_add_child(parent, res);
+	}
+	return res;
+}
+
+int gmap_set_limit(struct gmap *gmap, gfn_t limit)
+{
+	struct kvm_s390_mmu_cache *mc;
+	int rc, type;
+
+	type = gmap_limit_to_type(limit);
+	/* Resize the gmap's top-level table to fit @limit; returns 0 or a negative errno. */
+	mc = kvm_s390_new_mmu_cache();
+	if (!mc)
+		return -ENOMEM;
+	/* Refill the cache outside of the lock, retry as long as the resize hits -ENOMEM. */
+	do {
+		rc = kvm_s390_mmu_cache_topup(mc);
+		if (rc)
+			break; /* leave through the common exit so that mc is freed */
+		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
+			rc = dat_set_asce_limit(mc, &gmap->asce, type);
+	} while (rc == -ENOMEM);
+
+	kvm_s390_free_mmu_cache(mc);
+	return rc; /* topup failure or the final dat_set_asce_limit() result */
+}
+
+static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+{
+	struct vsie_rmap *rmap, *rnext, *head;
+	struct radix_tree_iter iter;
+	unsigned long indices[16];
+	unsigned long index;
+	void __rcu **slot;
+	int i, nr;
+
+	/* A radix tree is freed by deleting all of its entries; done in batches of up to 16. */
+	index = 0;
+	do {
+		nr = 0;
+		radix_tree_for_each_slot(slot, root, &iter, index) { /* collect the next batch */
+			indices[nr] = iter.index;
+			if (++nr == 16)
+				break;
+		}
+		for (i = 0; i < nr; i++) {
+			index = indices[i];
+			head = radix_tree_delete(root, index);
+			gmap_for_each_rmap_safe(rmap, rnext, head) /* free every rmap of the chain */
+				kfree(rmap);
+		}
+	} while (nr > 0); /* stop once a pass finds no more slots */
+}
+
+void gmap_remove_child(struct gmap *child)
+{
+	if (KVM_BUG_ON(!child->parent, child->kvm))
+		return;
+	lockdep_assert_held(&child->parent->children_lock);
+	/* Unlink from the parent's list of children and forget the parent. */
+	list_del(&child->list);
+	child->parent = NULL;
+}
+
+/**
+ * gmap_dispose() - Free a guest address space; it must be unlinked and unreferenced.
+ * @gmap: Pointer to the guest address space structure.
+ */
+void gmap_dispose(struct gmap *gmap)
+{
+	/* The gmap must have been removed from the parent beforehand */
+	KVM_BUG_ON(gmap->parent, gmap->kvm);
+	/* All children of this gmap must have been removed beforehand */
+	KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm);
+	/* No VSIE shadow block is allowed to use this gmap */
+	KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm);
+	/* The ASCE must be valid */
+	KVM_BUG_ON(!gmap->asce.val, gmap->kvm);
+	/* The refcount must be 0 */
+	KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm);
+
+	/* Flush tlb of all gmaps */
+	asce_flush_tlb(gmap->asce);
+
+	/* Free all DAT tables. */
+	dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap));
+
+	/* Free additional data for a shadow gmap */
+	if (is_shadow(gmap))
+		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+
+	kfree(gmap);
+}
+
+/**
+ * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy.
+ * @gmap: The gmap whose ASCE needs to be replaced.
+ *
+ * If the ASCE is a SEGMENT type then this function will return -EINVAL,
+ * otherwise the pointers in the host_to_guest radix tree will keep pointing
+ * to the wrong pages, causing use-after-free and memory corruption.
+ * If the allocation of the new top level page table fails, the ASCE is not
+ * replaced.
+ * This function does not free the old top level table.
+ * Therefore the caller has to make sure to save a pointer to it
+ * beforehand, unless a leak is actually intended.
+ *
+ * Return: 0 in case of success, -EINVAL if the ASCE is segment type ASCE,
+ *         -ENOMEM if running out of memory.
+ */
+int s390_replace_asce(struct gmap *gmap)
+{
+	struct crst_table *table;
+	union asce asce;
+
+	/* Replacing segment type ASCEs would cause serious issues */
+	if (gmap->asce.dt == ASCE_TYPE_SEGMENT)
+		return -EINVAL;
+
+	table = dat_alloc_crst_sleepable(0);
+	if (!table)
+		return -ENOMEM;
+	memcpy(table, dereference_asce(gmap->asce), sizeof(*table)); /* exact copy of the old table */
+
+	/* Set new table origin while preserving existing ASCE control bits */
+	asce = gmap->asce;
+	asce.rsto = virt_to_pfn(table);
+	WRITE_ONCE(gmap->asce, asce);
+
+	return 0;
+}
+
+bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint)
+{
+	struct kvm *kvm = gmap->kvm;
+	struct kvm_vcpu *vcpu;
+	gfn_t prefix_gfn;
+	unsigned long i;
+
+	if (is_shadow(gmap)) /* shadow gmaps are not checked for prefix pages */
+		return false;
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		/* Match against both prefix pages */
+		prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu));
+		if (prefix_gfn < end && gfn <= prefix_gfn + 1) {
+			if (hint && kvm_s390_is_in_sie(vcpu)) /* with @hint, give up if in SIE */
+				return false;
+			VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx",
+				   gfn_to_gpa(gfn), gfn_to_gpa(end));
+			kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
+		}
+	}
+	return true; /* every vCPU with a prefix in the range got a refresh request */
+}
+
+struct clear_young_pte_priv {
+	struct gmap *gmap; /* gmap the walk operates on */
+	bool young; /* set by the walk callbacks, returned by gmap_age_gfn() */
+};
+
+static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
+{
+	struct clear_young_pte_priv *p = walk->priv;
+	union pgste pgste;
+	union pte pte, new;
+
+	pte = READ_ONCE(*ptep);
+	/* Skip entries without pr set, and entries that are already invalid and not young. */
+	if (!pte.s.pr || (!pte.s.y && pte.h.i))
+		return 0;
+
+	pgste = pgste_get_lock(ptep);
+	if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) {
+		new = pte;
+		new.h.i = 1;
+		new.s.y = 0;
+		if ((new.s.d || !new.h.p) && !new.s.s) /* hand the dirty state over to the folio */
+			folio_set_dirty(pfn_folio(pte.h.pfra));
+		new.s.d = 0;
+		new.h.p = 1;
+		/* The entry is written back with the prefix notification bit cleared. */
+		pgste.prefix_notif = 0;
+		pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap));
+	}
+	p->young = 1; /* reported as young even if the entry was left untouched above */
+	pgste_set_unlock(ptep, pgste);
+	return 0;
+}
+
+static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk)
+{
+	struct clear_young_pte_priv *priv = walk->priv;
+	union crste crste, new;
+
+	crste = READ_ONCE(*crstep);
+	/* Only entries with fc set that are young or not invalid need work. */
+	if (!crste.h.fc)
+		return 0;
+	if (!crste.s.fc1.y && crste.h.i)
+		return 0;
+	if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) {
+		new = crste;
+		new.h.i = 1;
+		new.s.fc1.y = 0;
+		new.s.fc1.prefix_notif = 0;
+		if (new.s.fc1.d || !new.h.p) /* hand the dirty state over to the folio */
+			folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
+		new.s.fc1.d = 0;
+		new.h.p = 1;
+		dat_crstep_xchg(crstep, new, gfn, walk->asce);
+	}
+	priv->young = 1; /* reported as young even if the entry was left untouched above */
+	return 0;
+}
+
+/**
+ * gmap_age_gfn() - Clear the young state of a range of guest pages.
+ * @gmap: The guest gmap.
+ * @start: The first gfn to test.
+ * @end: The gfn after the last one to test.
+ *
+ * Context: Called with the kvm mmu write lock held.
+ * Return: true if any entry in the given range was found young, otherwise false.
+ */
+bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end)
+{
+	const struct dat_walk_ops ops = {
+		.pte_entry = gmap_clear_young_pte,
+		.pmd_entry = gmap_clear_young_crste,
+		.pud_entry = gmap_clear_young_crste,
+	};
+	struct clear_young_pte_priv priv = {
+		.gmap = gmap,
+		.young = false,
+	};
+
+	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv);
+
+	return priv.young; /* set by the callbacks during the walk */
+}
+
+struct gmap_unmap_priv {
+	struct gmap *gmap; /* gmap whose entries are being unmapped */
+	struct kvm_memory_slot *slot; /* memslot used to turn a gfn into a host address */
+};
+
+static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w)
+{
+	struct gmap_unmap_priv *priv = w->priv;
+	struct folio *folio = NULL;
+	unsigned long vmaddr;
+	union pgste pgste;
+
+	pgste = pgste_get_lock(ptep);
+	if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) {
+		vmaddr = __gfn_to_hva_memslot(priv->slot, gfn);
+		gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr);
+	}
+	if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
+		folio = pfn_folio(ptep->h.pfra); /* looked up before the entry is emptied */
+	pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn);
+	pgste_set_unlock(ptep, pgste);
+	if (folio) /* the conversion runs after the PGSTE lock was dropped */
+		uv_convert_from_secure_folio(folio);
+
+	return 0;
+}
+
+static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct gmap_unmap_priv *priv = walk->priv;
+	struct folio *folio = NULL;
+
+	if (crstep->h.fc) { /* entries without fc set are left alone by this callback */
+		if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
+			folio = phys_to_folio(crste_origin_large(*crstep));
+		gmap_crstep_xchg(priv->gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn);
+		if (folio) /* looked up above, before the entry was emptied */
+			uv_convert_from_secure_folio(folio);
+	}
+
+	return 0;
+}
+
+/**
+ * gmap_unmap_gfn_range() - Unmap a range of guest addresses.
+ * @gmap: The gmap to act on.
+ * @slot: The memslot in which the range is located.
+ * @start: The first gfn to unmap.
+ * @end: The gfn after the last one to unmap.
+ *
+ * Context: Called with the kvm mmu write lock held.
+ * Return: Always false.
+ */
+bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
+{
+	const struct dat_walk_ops ops = {
+		.pte_entry = _gmap_unmap_pte,
+		.pmd_entry = _gmap_unmap_crste,
+		.pud_entry = _gmap_unmap_crste,
+	};
+	struct gmap_unmap_priv priv = {
+		.gmap = gmap,
+		.slot = slot,
+	};
+
+	lockdep_assert_held_write(&gmap->kvm->mmu_lock);
+
+	_dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); /* result is not used */
+	return false;
+}
+
+static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn,
+						  struct gmap *gmap)
+{
+	union pte pte = READ_ONCE(*ptep);
+
+	if (!pte.s.pr || (pte.h.p && !pte.s.sd)) /* pr clear, or protected with sd clear */
+		return pgste;
+
+	/*
+	 * If this page contains one or more prefixes of vCPUS that are currently
+	 * running, do not reset the protection, leave it marked as dirty.
+	 */
+	if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) {
+		pte.h.p = 1;
+		pte.s.sd = 0;
+		pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn);
+	}
+
+	mark_page_dirty(gmap->kvm, gfn); /* logged as dirty whether or not the pte was changed */
+
+	return pgste;
+}
+
+static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end,
+					  struct dat_walk *walk)
+{
+	struct gmap *gmap = walk->priv;
+	union pgste pgste;
+
+	pgste = pgste_get_lock(ptep); /* the helper runs with the PGSTE lock held */
+	pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap);
+	pgste_set_unlock(ptep, pgste);
+	return 0;
+}
+
+static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end,
+					    struct dat_walk *walk)
+{
+	struct gmap *gmap = walk->priv;
+	union crste crste, new;
+
+	if (fatal_signal_pending(current))
+		return 1; /* NOTE(review): non-zero presumably stops the walk - confirm */
+	crste = READ_ONCE(*table);
+	if (!crste.h.fc)
+		return 0;
+	if (crste.h.p && !crste.s.fc1.sd) /* protected with sd clear: nothing to do */
+		return 0;
+
+	/*
+	 * If this large page contains one or more prefixes of vCPUs that are
+	 * currently running, do not reset the protection, leave it marked as
+	 * dirty.
+	 */
+	if (!crste.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) {
+		new = crste;
+		new.h.p = 1;
+		new.s.fc1.sd = 0;
+		gmap_crstep_xchg(gmap, table, new, gfn);
+	}
+
+	for ( ; gfn < end; gfn++) /* log every gfn from @gfn up to @end as dirty */
+		mark_page_dirty(gmap->kvm, gfn);
+
+	return 0;
+}
+
+void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end)
+{
+	const struct dat_walk_ops walk_ops = {
+		.pte_entry = _pte_test_and_clear_softdirty,
+		.pmd_entry = _crste_test_and_clear_softdirty,
+		.pud_entry = _crste_test_and_clear_softdirty,
+	};
+
+	lockdep_assert_held(&gmap->kvm->mmu_lock); /* the caller must hold the kvm mmu_lock */
+
+	_dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap); /* @gmap is the walk priv */
+}
+
+static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f)
+{
+	union crste newcrste, oldcrste = READ_ONCE(*f->crstep);
+
+	/* Somehow the crste is not large anymore, let the slow path deal with it. */
+	if (!oldcrste.h.fc)
+		return 1;
+
+	f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn));
+	f->writable = oldcrste.s.fc1.w;
+
+	/* Appropriate permissions already (race with another handler), nothing to do. */
+	if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p))
+		return 0;
+
+	if (!f->write_attempt || oldcrste.s.fc1.w) {
+		f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d; /* w and d set: as a write */
+		newcrste = oldcrste;
+		newcrste.h.i = 0;
+		newcrste.s.fc1.y = 1;
+		if (f->write_attempt) {
+			newcrste.h.p = 0;
+			newcrste.s.fc1.d = 1;
+			newcrste.s.fc1.sd = 1;
+		}
+		if (!oldcrste.s.fc1.d && newcrste.s.fc1.d) /* d goes from 0 to 1 */
+			SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
+		/* In case of races, let the slow path deal with it. */
+		return !dat_crstep_xchg_atomic(f->crstep, oldcrste, newcrste, f->gfn, asce);
+	}
+	/* Trying to write on a read-only page, let the slow path deal with it. */
+	return 1;
+}
+
+static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
+					struct guest_fault *f)
+{
+	union pte newpte, oldpte = READ_ONCE(*f->ptep);
+
+	f->pfn = oldpte.h.pfra;
+	f->writable = oldpte.s.w;
+
+	/* Appropriate permissions already (race with another handler), nothing to do. */
+	if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p))
+		return 0;
+	/* pr not set, or trying to write on a read-only page: let the slow path deal with it. */
+	if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w))
+		return 1;
+
+	newpte = oldpte;
+	newpte.h.i = 0;
+	newpte.s.y = 1;
+	if (f->write_attempt) {
+		newpte.h.p = 0;
+		newpte.s.d = 1;
+		newpte.s.sd = 1;
+	}
+	if (!oldpte.s.d && newpte.s.d) /* d goes from 0 to 1 */
+		SetPageDirty(pfn_to_page(newpte.h.pfra));
+	*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);
+
+	return 0;
+}
+
+/**
+ * gmap_try_fixup_minor() - Try to fixup a minor gmap fault.
+ * @gmap: The gmap whose fault needs to be resolved.
+ * @fault: Describes the fault that is being resolved.
+ *
+ * A minor fault is a fault that can be resolved quickly within gmap.
+ * The page is already mapped, the fault is only due to dirty/young tracking.
+ *
+ * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could
+ *         not be resolved and needs to go through the slow path.
+ */
+int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault)
+{
+	union pgste pgste;
+	int rc;
+
+	lockdep_assert_held(&gmap->kvm->mmu_lock);
+
+	rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE,
+			    &fault->crstep, &fault->ptep);
+	/* If a PTE or a leaf CRSTE could not be reached, slow path. */
+	if (rc)
+		return 1;
+
+	if (fault->ptep) {
+		pgste = pgste_get_lock(fault->ptep);
+		rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault);
+		if (!rc && fault->callback)
+			fault->callback(fault); /* runs with the PGSTE lock still held */
+		pgste_set_unlock(fault->ptep, pgste);
+	} else {
+		rc = gmap_handle_minor_crste_fault(gmap->asce, fault);
+		if (!rc && fault->callback) /* the callback only runs if the fixup succeeded */
+			fault->callback(fault);
+	}
+	return rc;
+}
+
+static inline bool gmap_2g_allowed(struct gmap *gmap, gfn_t gfn)
+{
+	return false; /* unconditional: the 2G case in gmap_link() is never selected */
+}
+
+static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn)
+{
+	return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags); /* @gfn is not looked at */
+}
+
+int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f)
+{
+	unsigned int order;
+	int rc, level;
+
+	lockdep_assert_held(&gmap->kvm->mmu_lock);
+	/* Pick the largest level permitted by both the folio order and the gmap. */
+	level = TABLE_TYPE_PAGE_TABLE;
+	if (f->page) {
+		order = folio_order(page_folio(f->page));
+		if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f->gfn))
+			level = TABLE_TYPE_REGION3;
+		else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn))
+			level = TABLE_TYPE_SEGMENT;
+	}
+	rc = dat_link(mc, gmap->asce, level, uses_skeys(gmap), f);
+	KVM_BUG_ON(rc == -EINVAL, gmap->kvm); /* -EINVAL from dat_link() counts as a KVM bug */
+	return rc;
+}
+
+static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
+			     gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
+{
+	struct page_table *pt;
+	union crste newcrste;
+	union crste *crstep;
+	union pte *ptep;
+	int rc;
+
+	if (force_alloc) /* first look up @p_gfn in the parent gmap */
+		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC,
+				    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
+	else
+		rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE,
+				    TABLE_TYPE_SEGMENT, &crstep, &ptep);
+	if (rc)
+		return rc;
+	if (!ptep) { /* no page table was reached: build an invalid entry from @p_gfn */
+		newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT);
+		newcrste.h.i = 1;
+		newcrste.h.fc0.tl = 1;
+	} else { /* point the new entry at the parent's page table, and tag it with @p_gfn */
+		pt = pte_table_start(ptep);
+		dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT));
+		newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT);
+	}
+	rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT,
+			    &crstep, &ptep); /* now locate the entry for @c_gfn in this gmap */
+	if (rc)
+		return rc;
+	dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce);
+	return 0;
+}
+
+static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp)
+{
+	union pte *ptep;
+	int rc;
+
+	rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE,
+			    TABLE_TYPE_SEGMENT, crstepp, &ptep);
+	if (rc || (!ptep && !crste_is_ucas(**crstepp)))
+		return -EREMOTE; /* walk failed, or no page table and not a ucas entry */
+	if (!ptep)
+		return 1; /* ucas entry without a page table; @gaddr is left unchanged */
+	*gaddr &= ~_SEGMENT_MASK;
+	*gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT;
+	return 0;
+}
+
+/**
+ * gmap_ucas_translate() - Translate a vcpu address into a host gmap address
+ * @mc: The memory cache to be used for allocations.
+ * @gmap: The per-cpu gmap.
+ * @gaddr: Pointer to the address to be translated, will get overwritten with
+ *         the translated address in case of success.
+ * Translates the per-vCPU guest address into a fake guest address, which can
+ * then be used with the fake memslots that are identity mapping userspace.
+ * This allows ucontrol VMs to use the normal fault resolution path, like
+ * normal VMs.
+ *
+ * Return: %0 on success, %-EREMOTE if not translatable, or a cache topup error.
+ */
+int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr)
+{
+	gpa_t translated_address;
+	union crste *crstep;
+	gfn_t gfn;
+	int rc;
+
+	gfn = gpa_to_gfn(*gaddr);
+
+	scoped_guard(read_lock, &gmap->kvm->mmu_lock) {
+		rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep);
+		if (rc <= 0) /* translated or failed; only rc > 0 needs the write-locked path */
+			return rc;
+	}
+	do {
+		scoped_guard(write_lock, &gmap->kvm->mmu_lock) {
+			rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); /* redo under write lock */
+			if (rc <= 0)
+				return rc;
+			translated_address = (*gaddr & ~_SEGMENT_MASK) |
+					     (crstep->val & _SEGMENT_MASK);
+			rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true);
+		}
+		if (!rc) {
+			*gaddr = translated_address;
+			return 0;
+		}
+		if (rc != -ENOMEM)
+			return -EREMOTE;
+		rc = kvm_s390_mmu_cache_topup(mc); /* refill outside of the lock, then retry */
+		if (rc)
+			return rc;
+	} while (1);
+	return 0; /* not reached: the loop above is only left through a return */
+}
+
+int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count)
+{
+	struct kvm_s390_mmu_cache *mc;
+	int rc = 0; /* a count of 0 maps nothing and succeeds */
+
+	mc = kvm_s390_new_mmu_cache();
+	if (!mc)
+		return -ENOMEM;
+	/* Map @count entries of _PAGE_ENTRIES gfns each; every exit path frees the cache. */
+	while (count) {
+		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
+			rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false);
+		if (rc == -ENOMEM) { /* refill the cache outside of the lock, then retry this entry */
+			rc = kvm_s390_mmu_cache_topup(mc);
+			if (rc)
+				break;
+			continue;
+		}
+		if (rc)
+			break;
+		count--;
+		c_gfn += _PAGE_ENTRIES;
+		p_gfn += _PAGE_ENTRIES;
+	}
+	kvm_s390_free_mmu_cache(mc);
+	return rc; /* 0 if everything was mapped, otherwise the first error */
+}
+
+static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
+{
+	union crste *crstep;
+	union pte *ptep;
+	int rc;
+
+	rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
+	if (!rc) /* a failed walk is silently ignored */
+		dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce);
+}
+
+void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
+{
+	guard(read_lock)(&gmap->kvm->mmu_lock); /* held until the function returns */
+
+	for ( ; count; count--, c_gfn += _PAGE_ENTRIES) /* @count entries, _PAGE_ENTRIES gfns apart */
+		gmap_ucas_unmap_one(gmap, c_gfn);
+}
+
+static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	struct gmap *gmap = walk->priv;
+	union crste crste, newcrste;
+
+	crste = READ_ONCE(*crstep);
+	newcrste = _CRSTE_EMPTY(crste.h.tt);
+	/* Replace a leaf entry with an empty one; retry if the entry changed meanwhile. */
+	while (crste_leaf(crste)) {
+		if (crste_prefix(crste))
+			gmap_unmap_prefix(gmap, gfn, next);
+		if (crste.s.fc1.vsie_notif)
+			gmap_handle_vsie_unshadow_event(gmap, gfn);
+		if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce))
+			break;
+		crste = READ_ONCE(*crstep);
+	}
+
+	if (need_resched())
+		return next; /* gmap_split_huge_pages() reschedules and resumes from here */
+
+	return 0;
+}
+
+void gmap_split_huge_pages(struct gmap *gmap)
+{
+	const struct dat_walk_ops ops = {
+		.pmd_entry = _gmap_split_crste,
+		.pud_entry = _gmap_split_crste,
+	};
+	gfn_t start = 0;
+
+	do { /* walk in chunks, rescheduling outside of the lock between them */
+		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
+			start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce,
+						    &ops, DAT_WALK_IGN_HOLES, gmap);
+		cond_resched();
+	} while (start); /* a non-zero result is the gfn to resume from */
+}
+
+static int _gmap_enable_skeys(struct gmap *gmap)
+{
+	gfn_t start = 0;
+	int rc;
+
+	if (uses_skeys(gmap)) /* already enabled, nothing to do */
+		return 0;
+
+	set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); /* set first, undone below on failure */
+	rc = gmap_helper_disable_cow_sharing();
+	if (rc) {
+		clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
+		return rc;
+	}
+
+	do { /* reset the keys in chunks, rescheduling outside of the lock between them */
+		scoped_guard(write_lock, &gmap->kvm->mmu_lock)
+			start = dat_reset_skeys(gmap->asce, start);
+		cond_resched();
+	} while (start);
+	return 0;
+}
+
+int gmap_enable_skeys(struct gmap *gmap)
+{
+	int rc;
+
+	mmap_write_lock(gmap->kvm->mm); /* _gmap_enable_skeys() runs under the mmap write lock */
+	rc = _gmap_enable_skeys(gmap);
+	mmap_write_unlock(gmap->kvm->mm);
+	return rc;
+}
+
+static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	if (!ptep->s.pr) /* entries without pr set are skipped */
+		return 0;
+	__kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep)));
+	if (need_resched())
+		return next; /* gmap_pv_destroy_range() reschedules and resumes from here */
+	return 0;
+}
+
+static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	phys_addr_t origin, cur, end;
+
+	if (!crstep->h.fc || !crstep->s.fc1.pr) /* only entries with both fc and pr set */
+		return 0;
+
+	origin = crste_origin_large(*crstep);
+	cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin; /* clamp to the walked range */
+	end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin;
+	for ( ; cur < end; cur += PAGE_SIZE) /* one call per PAGE_SIZE unit */
+		__kvm_s390_pv_destroy_page(phys_to_page(cur));
+	if (need_resched())
+		return next;
+	return 0;
+}
+
+int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible)
+{
+	const struct dat_walk_ops ops = {
+		.pte_entry = _destroy_pages_pte,
+		.pmd_entry = _destroy_pages_crste,
+		.pud_entry = _destroy_pages_crste,
+	};
+
+	do { /* walk in chunks, rescheduling outside of the lock between them */
+		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
+			start = _dat_walk_gfn_range(start, end, gmap->asce, &ops,
+						    DAT_WALK_IGN_HOLES, NULL);
+		if (interruptible && fatal_signal_pending(current))
+			return -EINTR; /* only checked between chunks, and only if @interruptible */
+		cond_resched();
+	} while (start && start < end);
+	return 0;
+}
+
+int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level)
+{
+	struct vsie_rmap *rmap __free(kvfree) = NULL;
+	struct vsie_rmap *temp;
+	void __rcu **slot;
+	int rc = 0;
+
+	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
+	lockdep_assert_held(&sg->host_to_rmap_lock);
+
+	rmap = kzalloc(sizeof(*rmap), GFP_ATOMIC);
+	if (!rmap)
+		return -ENOMEM;
+
+	rmap->r_gfn = r_gfn;
+	rmap->level = level;
+	slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn);
+	if (slot) { /* @p_gfn already has a chain: check for a duplicate, else prepend */
+		rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock);
+		for (temp = rmap->next; temp; temp = temp->next) {
+			if (temp->val == rmap->val)
+				return 0; /* already present; the new rmap is freed via __free() */
+		}
+		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
+	} else {
+		rmap->next = NULL;
+		rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap);
+		if (rc)
+			return rc;
+	}
+	rmap = NULL; /* now referenced by the radix tree, keep __free() from freeing it */
+
+	return 0;
+}
+
+int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
+		      kvm_pfn_t pfn, int level, bool wr)
+{
+	union crste *crstep;
+	union pgste pgste;
+	union pte *ptep;
+	union pte pte;
+	int flags, rc;
+
+	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
+	lockdep_assert_held(&sg->parent->children_lock);
+
+	flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
+	rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags,
+			    TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
+	if (rc)
+		return rc;
+	if (level <= TABLE_TYPE_REGION1) { /* levels above region-1 get no rmap entry */
+		scoped_guard(spinlock, &sg->host_to_rmap_lock)
+			rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level);
+	}
+	if (rc)
+		return rc;
+
+	if (!pgste_get_trylock(ptep, &pgste))
+		return -EAGAIN; /* the PGSTE lock could not be taken; an rmap may already be in */
+	pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false);
+	pte.h.p = 1; /* the parent entry is always written back with p set */
+	pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false);
+	pgste.vsie_notif = 1;
+	pgste_set_unlock(ptep, pgste);
+
+	return 0;
+}
+
+static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+	__atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val); /* atomic OR into the PGSTE */
+	if (need_resched())
+		return next; /* gmap_set_cmma_all_dirty() reschedules and resumes from here */
+	return 0;
+}
+
+void gmap_set_cmma_all_dirty(struct gmap *gmap)
+{
+	const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, };
+	gfn_t gfn = 0;
+
+	do { /* walk from gfn 0 to asce_end() in chunks, rescheduling between them */
+		scoped_guard(read_lock, &gmap->kvm->mmu_lock)
+			gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops,
+						  DAT_WALK_IGN_HOLES, NULL);
+		cond_resched();
+	} while (gfn); /* a non-zero result is the gfn to resume from */
+}
+
+static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
+{
+	unsigned long align = PAGE_SIZE;
+	gpa_t gaddr = gfn_to_gpa(r_gfn);
+	union crste *crstep;
+	union crste crste;
+	union pte *ptep;
+
+	if (level > TABLE_TYPE_PAGE_TABLE) /* each level above covers 11 more address bits */
+		align = 1UL << (11 * level + _SEGMENT_SHIFT);
+	kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align));
+	if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep))
+		return; /* nothing reachable at @level for @r_gfn */
+	if (ptep) {
+		if (READ_ONCE(*ptep).val != _PTE_EMPTY.val)
+			dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
+		return;
+	}
+	crste = READ_ONCE(*crstep);
+	dat_crstep_clear(crstep, r_gfn, sg->asce);
+	if (crste_leaf(crste) || crste.h.i) /* no lower-level table to free */
+		return;
+	if (is_pmd(crste))
+		dat_free_pt(dereference_pmd(crste.pmd));
+	else
+		dat_free_level(dereference_crste(crste), true);
+}
+
+static void gmap_unshadow(struct gmap *sg)
+{
+	struct gmap_cache *gmap_cache, *next;
+
+	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
+	KVM_BUG_ON(!sg->parent, sg->kvm);
+
+	lockdep_assert_held(&sg->parent->children_lock);
+
+	gmap_remove_child(sg);
+	kvm_s390_vsie_gmap_notifier(sg, 0, -1UL);
+	/* Detach every entry on the scb_users list from this shadow gmap. */
+	list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) {
+		gmap_cache->gmap = NULL;
+		list_del(&gmap_cache->list);
+	}
+
+	gmap_put(sg); /* drops one reference to the shadow gmap */
+}
+
+void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
+{
+	struct vsie_rmap *rmap, *rnext, *head;
+	struct gmap *sg, *next;
+
+	list_for_each_entry_safe(sg, next, &parent->children, list) { /* every shadow of @parent */
+		if (!sg->guest_asce.r && gfn >= sg->guest_asce.rsto && /* @gfn in the top table? */
+		    gfn < (gfn_t)sg->guest_asce.rsto + sg->guest_asce.tl + 1) {
+			gmap_unshadow(sg); /* drop the whole shadow gmap */
+			continue;
+		}
+		scoped_guard(spinlock, &sg->host_to_rmap_lock)
+			head = radix_tree_delete(&sg->host_to_rmap, gfn);
+		gmap_for_each_rmap_safe(rmap, rnext, head) {
+			gmap_unshadow_level(sg, rmap->r_gfn, rmap->level);
+			kfree(rmap); /* the chain was removed from the tree, free it here */
+		}
+	}
+}
+
+/**
+ * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables.
+ * @parent: Pointer to the parent gmap.
+ * @asce: ASCE for which the shadow table is created.
+ * @edat_level: Edat level to be used for the shadow translation.
+ *
+ * Context: Called with parent->children_lock held.
+ *
+ * Return: The pointer to a gmap if a shadow table with the given asce and
+ * edat level is already available, otherwise NULL. No reference is taken
+ * on the returned gmap.
+ */
+static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level)
+{
+	struct gmap *sg;
+
+	lockdep_assert_held(&parent->children_lock);
+	list_for_each_entry(sg, &parent->children, list) {
+		if (!gmap_is_shadow_valid(sg, asce, edat_level))
+			continue;
+		return sg; /* the first matching child is returned */
+	}
+	return NULL;
+}
+
+#define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE) /* pages spanned by one CRST table */
+struct gmap_protect_asce_top_level {
+	unsigned long seq; /* kvm->mmu_invalidate_seq as read before faulting in the pages */
+	struct guest_fault f[CRST_TABLE_PAGES]; /* one fault descriptor per table page */
+};
+
+static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
+						struct gmap_protect_asce_top_level *context)
+{
+	int rc, i;
+
+	guard(write_lock)(&sg->kvm->mmu_lock); /* held until the function returns */
+
+	if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
+		return -EAGAIN;
+
+	scoped_guard(spinlock, &sg->parent->children_lock) {
+		for (i = 0; i < CRST_TABLE_PAGES; i++) {
+			if (!context->f[i].valid)
+				continue;
+			rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn,
+					       TABLE_TYPE_REGION1 + 1, context->f[i].writable);
+			if (rc) /* both locks are released by their guards on this early return */
+				return rc;
+		}
+		gmap_add_child(sg->parent, sg); /* linked only after all valid pages are protected */
+	}
+
+	kvm_s390_release_faultin_array(sg->kvm, context->f, false);
+	return 0;
+}
+
+static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
+					       struct gmap_protect_asce_top_level *context)
+{
+	int rc;
+
+	if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f))
+		return -EAGAIN;
+	do { /* refill both caches, then try again if the locked part ran out of memory */
+		rc = kvm_s390_mmu_cache_topup(mc);
+		if (rc)
+			return rc;
+		rc = radix_tree_preload(GFP_KERNEL);
+		if (rc)
+			return rc;
+		rc = __gmap_protect_asce_top_level(mc, sg, context);
+		radix_tree_preload_end(); /* pairs with the successful radix_tree_preload() above */
+	} while (rc == -ENOMEM);
+
+	return rc;
+}
+
+static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg)
+{
+	struct gmap_protect_asce_top_level context = {};
+	union asce asce = sg->guest_asce;
+	int rc;
+
+	KVM_BUG_ON(!is_shadow(sg), sg->kvm);
+
+	context.seq = sg->kvm->mmu_invalidate_seq;
+	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
+	smp_rmb();
+
+	rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false);
+	if (rc > 0)
+		rc = -EFAULT; /* positive results are reported as -EFAULT */
+	if (!rc)
+		rc = _gmap_protect_asce_top_level(mc, sg, &context);
+	if (rc) /* on success the array was already released by the locked helper */
+		kvm_s390_release_faultin_array(sg->kvm, context.f, true);
+	return rc;
+}
+
+/**
+ * gmap_create_shadow() - Create/find a shadow guest address space.
+ * @mc: The cache to use to allocate dat tables.
+ * @parent: Pointer to the parent gmap.
+ * @asce: ASCE for which the shadow table is created.
+ * @edat_level: Edat level to be used for the shadow translation.
+ *
+ * The pages of the top level page table referred by the asce parameter
+ * are set to read-only and marked in the PGSTEs of the kvm process. The
+ * shadow table is removed automatically on any change to the PTE mapping
+ * for the source table. The gmap is returned with one extra reference.
+ * NOTE(review): the real-space path takes mmu_lock inside children_lock,
+ * __gmap_protect_asce_top_level() nests them the other way - confirm.
+ *
+ * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+ * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+ * parent gmap table could not be protected.
+ */
+struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent,
+				union asce asce, int edat_level)
+{
+	struct gmap *sg, *new;
+	int rc;
+
+	scoped_guard(spinlock, &parent->children_lock) {
+		sg = gmap_find_shadow(parent, asce, edat_level);
+		if (sg) {
+			gmap_get(sg); /* the extra reference for the caller */
+			return sg;
+		}
+	}
+	/* Create a new shadow gmap. */
+	new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce));
+	if (!new)
+		return ERR_PTR(-ENOMEM);
+	new->guest_asce = asce;
+	new->edat_level = edat_level;
+	set_bit(GMAP_FLAG_SHADOW, &new->flags);
+
+	scoped_guard(spinlock, &parent->children_lock) {
+		/* Recheck if another CPU created the same shadow. */
+		sg = gmap_find_shadow(parent, asce, edat_level);
+		if (sg) {
+			gmap_put(new); /* drop the reference set up by gmap_new() */
+			gmap_get(sg);
+			return sg;
+		}
+		if (asce.r) {
+			/* Only allow one real-space gmap shadow. */
+			list_for_each_entry(sg, &parent->children, list) {
+				if (sg->guest_asce.r) {
+					scoped_guard(write_lock, &parent->kvm->mmu_lock)
+						gmap_unshadow(sg);
+					break; /* the list was modified, stop iterating */
+				}
+			}
+			gmap_add_child(parent, new);
+			/* Nothing to protect, return right away. */
+			gmap_get(new);
+			return new;
+		}
+	}
+
+	gmap_get(new); /* the extra reference for the caller */
+	new->parent = parent;
+	/* Protect while inserting, protects against invalidation races. */
+	rc = gmap_protect_asce_top_level(mc, new);
+	if (rc) {
+		new->parent = NULL;
+		gmap_put(new); /* both the caller's and the initial reference are dropped */
+		gmap_put(new);
+		return ERR_PTR(rc);
+	}
+	return new;
+}
diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h
new file mode 100644
index 000000000000..ccb5cd751e31
--- /dev/null
+++ b/arch/s390/kvm/gmap.h
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  KVM guest address space mapping code
+ *
+ *    Copyright IBM Corp. 2007, 2016, 2025
+ *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *               Claudio Imbrenda <imbrenda@linux.ibm.com>
+ */
+
+#ifndef ARCH_KVM_S390_GMAP_H
+#define ARCH_KVM_S390_GMAP_H
+
+#include "dat.h"
+
+/**
+ * enum gmap_flags - Flags of a gmap.
+ *
+ * @GMAP_FLAG_SHADOW: The gmap is a vsie shadow gmap.
+ * @GMAP_FLAG_OWNS_PAGETABLES: The gmap owns all dat levels; normally 1, is 0
+ *                             only for ucontrol per-cpu gmaps, since they
+ *                             share the page tables with the main gmap.
+ * @GMAP_FLAG_IS_UCONTROL: The gmap is ucontrol (main gmap or per-cpu gmap).
+ * @GMAP_FLAG_ALLOW_HPAGE_1M: 1M hugepages are allowed for this gmap,
+ *                            independently of the page size used by userspace.
+ * @GMAP_FLAG_ALLOW_HPAGE_2G: 2G hugepages are allowed for this gmap,
+ *                            independently of the page size used by userspace.
+ * @GMAP_FLAG_PFAULT_ENABLED: Pfault is enabled for the gmap.
+ * @GMAP_FLAG_USES_SKEYS: If the guest uses storage keys.
+ * @GMAP_FLAG_USES_CMM: Whether the guest uses CMMA.
+ * @GMAP_FLAG_EXPORT_ON_UNMAP: Whether to export guest pages when unmapping.
+ */
+enum gmap_flags {
+	GMAP_FLAG_SHADOW = 0,
+	GMAP_FLAG_OWNS_PAGETABLES,
+	GMAP_FLAG_IS_UCONTROL,
+	GMAP_FLAG_ALLOW_HPAGE_1M,
+	GMAP_FLAG_ALLOW_HPAGE_2G,
+	GMAP_FLAG_PFAULT_ENABLED,
+	GMAP_FLAG_USES_SKEYS,
+	GMAP_FLAG_USES_CMM,
+	GMAP_FLAG_EXPORT_ON_UNMAP,
+};
+
+/**
+ * struct gmap - Guest address space.
+ * @flags: Bitmap of enum gmap_flags bits, accessed with set_bit()/test_bit().
+ * @edat_level: The edat level of this shadow gmap.
+ * @kvm: The vm.
+ * @asce: The ASCE used by this gmap.
+ * @list: List head used in children gmaps for the children gmap list.
+ * @children_lock: Protects children and scb_users.
+ * @children: List of child gmaps of this gmap.
+ * @scb_users: List of vsie_scb that use this shadow gmap.
+ * @parent: Parent gmap of a child gmap.
+ * @guest_asce: Original ASCE of this shadow gmap.
+ * @host_to_rmap_lock: Protects host_to_rmap.
+ * @host_to_rmap: Radix tree mapping host addresses to guest addresses.
+ * @refcount: Reference count; gmap_put() calls gmap_dispose() when it drops to 0.
+ */
+struct gmap {
+	unsigned long flags;
+	unsigned char edat_level;
+	struct kvm *kvm;
+	union asce asce;
+	struct list_head list;
+	spinlock_t children_lock;	/* Protects: children, scb_users */
+	struct list_head children;
+	struct list_head scb_users;
+	struct gmap *parent;
+	union asce guest_asce;
+	spinlock_t host_to_rmap_lock;	/* Protects host_to_rmap */
+	struct radix_tree_root host_to_rmap;
+	refcount_t refcount;
+};
+
+struct gmap_cache {
+	struct list_head list;
+	struct gmap *gmap;
+};
+
+#define gmap_for_each_rmap_safe(pos, n, head) \
+	for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
+
+int s390_replace_asce(struct gmap *gmap);
+bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint);
+bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end);
+bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end);
+int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault);
+struct gmap *gmap_new(struct kvm *kvm, gfn_t limit);
+struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit);
+void gmap_remove_child(struct gmap *child);
+void gmap_dispose(struct gmap *gmap);
+int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *fault);
+void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end);
+int gmap_set_limit(struct gmap *gmap, gfn_t limit);
+int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr);
+int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count);
+void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count);
+int gmap_enable_skeys(struct gmap *gmap);
+int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible);
+int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level);
+int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
+		      kvm_pfn_t pfn, int level, bool wr);
+void gmap_set_cmma_all_dirty(struct gmap *gmap);
+void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn);
+struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent,
+				union asce asce, int edat_level);
+void gmap_split_huge_pages(struct gmap *gmap);
+
+static inline bool uses_skeys(struct gmap *gmap)
+{
+	return test_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
+}
+
+static inline bool uses_cmm(struct gmap *gmap)
+{
+	return test_bit(GMAP_FLAG_USES_CMM, &gmap->flags);
+}
+
+static inline bool pfault_enabled(struct gmap *gmap)
+{
+	return test_bit(GMAP_FLAG_PFAULT_ENABLED, &gmap->flags);
+}
+
+static inline bool is_ucontrol(struct gmap *gmap)
+{
+	return test_bit(GMAP_FLAG_IS_UCONTROL, &gmap->flags);
+}
+
+static inline bool is_shadow(struct gmap *gmap)
+{
+	return test_bit(GMAP_FLAG_SHADOW, &gmap->flags);
+}
+
+static inline bool owns_page_tables(struct gmap *gmap)
+{
+	return test_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);
+}
+
+static inline struct gmap *gmap_put(struct gmap *gmap)
+{
+	if (refcount_dec_and_test(&gmap->refcount))
+		gmap_dispose(gmap);
+	return NULL;
+}
+
+static inline void gmap_get(struct gmap *gmap)
+{
+	WARN_ON_ONCE(unlikely(!refcount_inc_not_zero(&gmap->refcount)));
+}
+
+static inline void gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
+{
+	scoped_guard(spinlock, &parent->children_lock)
+		_gmap_handle_vsie_unshadow_event(parent, gfn);
+}
+
+static inline bool gmap_mkold_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end)
+{
+	return _gmap_unmap_prefix(gmap, gfn, end, true);
+}
+
+static inline bool gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end)
+{
+	return _gmap_unmap_prefix(gmap, gfn, end, false);
+}
+
+static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte,
+					  union pgste pgste, gfn_t gfn, bool needs_lock)
+{
+	lockdep_assert_held(&gmap->kvm->mmu_lock);
+	if (!needs_lock)
+		lockdep_assert_held(&gmap->children_lock);
+	else
+		lockdep_assert_not_held(&gmap->children_lock);
+
+	if (pgste.prefix_notif && (newpte.h.p || newpte.h.i)) {
+		pgste.prefix_notif = 0;
+		gmap_unmap_prefix(gmap, gfn, gfn + 1);
+	}
+	if (pgste.vsie_notif && (ptep->h.p != newpte.h.p || newpte.h.i)) {
+		pgste.vsie_notif = 0;
+		if (needs_lock)
+			gmap_handle_vsie_unshadow_event(gmap, gfn);
+		else
+			_gmap_handle_vsie_unshadow_event(gmap, gfn);
+	}
+	return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap));
+}
+
+static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte,
+					 union pgste pgste, gfn_t gfn)
+{
+	return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true);
+}
+
+static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
+				     gfn_t gfn, bool needs_lock)
+{
+	unsigned long align = 1UL << (8 + (is_pmd(*crstep) ? 0 : 11));	/* in gfns */
+
+	lockdep_assert_held(&gmap->kvm->mmu_lock);
+	if (!needs_lock)
+		lockdep_assert_held(&gmap->children_lock);
+	/* The entry maps 2^8 gfns (pmd) or 2^19 gfns; notify for the whole aligned range. */
+	gfn = ALIGN_DOWN(gfn, align);
+	if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) {
+		ne.s.fc1.prefix_notif = 0;
+		gmap_unmap_prefix(gmap, gfn, gfn + align);
+	}
+	if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif &&
+	    (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) {
+		ne.s.fc1.vsie_notif = 0;
+		if (needs_lock)
+			gmap_handle_vsie_unshadow_event(gmap, gfn);
+		else
+			_gmap_handle_vsie_unshadow_event(gmap, gfn);
+	}
+	dat_crstep_xchg(crstep, ne, gfn, gmap->asce);
+}
+
+static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
+				    gfn_t gfn)
+{
+	_gmap_crstep_xchg(gmap, crstep, ne, gfn, true);	/* children_lock is taken as needed */
+}
+
+/**
+ * gmap_is_shadow_valid() - check if a shadow guest address space matches the
+ *                          given properties and is still valid.
+ * @sg: Pointer to the shadow guest address space structure.
+ * @asce: ASCE for which the shadow table is requested.
+ * @edat_level: Edat level to be used for the shadow translation.
+ *
+ * Return: true if the gmap shadow is still valid and matches the given
+ * properties and the caller can continue using it; false otherwise, the
+ * caller has to request a new shadow gmap in this case.
+ */
+static inline bool gmap_is_shadow_valid(struct gmap *sg, union asce asce, int edat_level)
+{
+	return sg->guest_asce.val == asce.val && sg->edat_level == edat_level;
+}
+
+#endif /* ARCH_KVM_S390_GMAP_H */
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 420ae62977e2..39aff324203e 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -21,6 +21,7 @@
 #include "gaccess.h"
 #include "trace.h"
 #include "trace-s390.h"
+#include "faultin.h"
 
 u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu)
 {
@@ -367,8 +368,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
 					      reg2, &srcaddr, GACC_FETCH, 0);
 	if (rc)
 		return kvm_s390_inject_prog_cond(vcpu, rc);
-	rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0);
-	if (rc != 0)
+
+	do {
+		rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(srcaddr), false);
+	} while (rc == -EAGAIN);
+	if (rc)
 		return rc;
 
 	/* Ensure that the source is paged-in, no actual access -> no key checking */
@@ -376,8 +380,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
 					      reg1, &dstaddr, GACC_STORE, 0);
 	if (rc)
 		return kvm_s390_inject_prog_cond(vcpu, rc);
-	rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE);
-	if (rc != 0)
+
+	do {
+		rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(dstaddr), true);
+	} while (rc == -EAGAIN);
+	if (rc)
 		return rc;
 
 	kvm_s390_retry_instr(vcpu);
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 249cdc822ec5..1c2bb5cd7e12 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -26,7 +26,6 @@
 #include <linux/uaccess.h>
 #include <asm/sclp.h>
 #include <asm/isc.h>
-#include <asm/gmap.h>
 #include <asm/nmi.h>
 #include <asm/airq.h>
 #include <asm/tpi.h>
@@ -34,6 +33,7 @@
 #include "gaccess.h"
 #include "trace-s390.h"
 #include "pci.h"
+#include "gmap.h"
 
 #define PFAULT_INIT 0x0600
 #define PFAULT_DONE 0x0680
@@ -2632,12 +2632,12 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
 	case KVM_DEV_FLIC_APF_ENABLE:
 		if (kvm_is_ucontrol(dev->kvm))
 			return -EINVAL;
-		dev->kvm->arch.gmap->pfault_enabled = 1;
+		set_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags);
 		break;
 	case KVM_DEV_FLIC_APF_DISABLE_WAIT:
 		if (kvm_is_ucontrol(dev->kvm))
 			return -EINVAL;
-		dev->kvm->arch.gmap->pfault_enabled = 0;
+		clear_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags);
 		/*
 		 * Make sure no async faults are in transition when
 		 * clearing the queues. So we don't need to worry
@@ -2768,13 +2768,13 @@ static int adapter_indicators_set(struct kvm *kvm,
 	bit = get_ind_bit(adapter_int->ind_addr,
 			  adapter_int->ind_offset, adapter->swap);
 	set_bit(bit, map);
-	mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT);
+	mark_page_dirty(kvm, adapter_int->ind_gaddr >> PAGE_SHIFT);
 	set_page_dirty_lock(ind_page);
 	map = page_address(summary_page);
 	bit = get_ind_bit(adapter_int->summary_addr,
 			  adapter_int->summary_offset, adapter->swap);
 	summary_set = test_and_set_bit(bit, map);
-	mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT);
+	mark_page_dirty(kvm, adapter_int->summary_gaddr >> PAGE_SHIFT);
 	set_page_dirty_lock(summary_page);
 	srcu_read_unlock(&kvm->srcu, idx);
 
@@ -2870,7 +2870,9 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i))
 			return -EFAULT;
 		e->adapter.summary_addr = uaddr_s;
+		e->adapter.summary_gaddr = ue->u.adapter.summary_addr;
 		e->adapter.ind_addr = uaddr_i;
+		e->adapter.ind_gaddr = ue->u.adapter.ind_addr;
 		e->adapter.summary_offset = ue->u.adapter.summary_offset;
 		e->adapter.ind_offset = ue->u.adapter.ind_offset;
 		e->adapter.adapter_id = ue->u.adapter.adapter_id;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 56a50524b3ee..de645025db0f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -40,7 +40,6 @@
 #include <asm/lowcore.h>
 #include <asm/machine.h>
 #include <asm/stp.h>
-#include <asm/gmap.h>
 #include <asm/gmap_helpers.h>
 #include <asm/nmi.h>
 #include <asm/isc.h>
@@ -53,6 +52,8 @@
 #include <asm/uv.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
+#include "gmap.h"
+#include "faultin.h"
 #include "pci.h"
 
 #define CREATE_TRACE_POINTS
@@ -264,16 +265,11 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS)
 /* available subfunctions indicated via query / "test bit" */
 static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
 
-static struct gmap_notifier gmap_notifier;
-static struct gmap_notifier vsie_gmap_notifier;
 debug_info_t *kvm_s390_dbf;
 debug_info_t *kvm_s390_dbf_uv;
 
 /* Section: not file related */
 /* forward declarations */
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
-			      unsigned long end);
-
 static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta)
 {
 	u8 delta_idx = 0;
@@ -529,10 +525,6 @@ static int __init __kvm_s390_init(void)
 	if (rc)
 		goto err_gib;
 
-	gmap_notifier.notifier_call = kvm_gmap_notifier;
-	gmap_register_pte_notifier(&gmap_notifier);
-	vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
-	gmap_register_pte_notifier(&vsie_gmap_notifier);
 	atomic_notifier_chain_register(&s390_epoch_delta_notifier,
 				       &kvm_clock_notifier);
 
@@ -552,8 +544,6 @@ err_kvm_uv:
 
 static void __kvm_s390_exit(void)
 {
-	gmap_unregister_pte_notifier(&gmap_notifier);
-	gmap_unregister_pte_notifier(&vsie_gmap_notifier);
 	atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
 					 &kvm_clock_notifier);
 
@@ -564,12 +554,43 @@ static void __kvm_s390_exit(void)
 	debug_unregister(kvm_s390_dbf_uv);
 }
 
+/*
+ * Run storage key operation @op on the guest frame of @addr under the mmu_lock
+ * (read). SSKE (helper result >= 0) and ISKE (helper result 0) return the key
+ * the helper left in @skey; RRBE returns a positive helper result shifted left
+ * by one. Every other helper result is returned as is, an unknown @op gives
+ * -EINVAL.
+ */
+static int kvm_s390_keyop(struct kvm_s390_mmu_cache *mc, struct kvm *kvm, int op,
+			  unsigned long addr, union skey skey)
+{
+	union asce asce = kvm->arch.gmap->asce;
+	gfn_t gfn = gpa_to_gfn(addr);
+	int rc;
+
+	guard(read_lock)(&kvm->mmu_lock);
+
+	switch (op) {
+	case KVM_S390_KEYOP_SSKE:
+		rc = dat_cond_set_storage_key(mc, asce, gfn, skey, &skey, 0, 0, 0);
+		return rc < 0 ? rc : skey.skey;
+	case KVM_S390_KEYOP_ISKE:
+		rc = dat_get_storage_key(asce, gfn, &skey);
+		return rc ? rc : skey.skey;
+	case KVM_S390_KEYOP_RRBE:
+		rc = dat_reset_reference_bit(asce, gfn);
+		return rc > 0 ? rc << 1 : rc;
+	default:
+		return -EINVAL;
+	}
+}
+
 /* Section: device related */
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg)
 {
 	if (ioctl == KVM_S390_ENABLE_SIE)
-		return s390_enable_sie();
+		return 0;
 	return -EINVAL;
 }
 
@@ -608,6 +629,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_S390_DIAG318:
 	case KVM_CAP_IRQFD_RESAMPLE:
 	case KVM_CAP_S390_USER_OPEREXEC:
+	case KVM_CAP_S390_KEYOP:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
@@ -698,32 +720,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 
 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
-	int i;
-	gfn_t cur_gfn, last_gfn;
-	unsigned long gaddr, vmaddr;
-	struct gmap *gmap = kvm->arch.gmap;
-	DECLARE_BITMAP(bitmap, _PAGE_ENTRIES);
-
-	/* Loop over all guest segments */
-	cur_gfn = memslot->base_gfn;
-	last_gfn = memslot->base_gfn + memslot->npages;
-	for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) {
-		gaddr = gfn_to_gpa(cur_gfn);
-		vmaddr = gfn_to_hva_memslot(memslot, cur_gfn);
-		if (kvm_is_error_hva(vmaddr))
-			continue;
-
-		bitmap_zero(bitmap, _PAGE_ENTRIES);
-		gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
-		for (i = 0; i < _PAGE_ENTRIES; i++) {
-			if (test_bit(i, bitmap))
-				mark_page_dirty(kvm, cur_gfn + i);
-		}
+	gfn_t last_gfn = memslot->base_gfn + memslot->npages;
 
-		if (fatal_signal_pending(current))
-			return;
-		cond_resched();
-	}
+	scoped_guard(read_lock, &kvm->mmu_lock)
+		gmap_sync_dirty_log(kvm->arch.gmap, memslot->base_gfn, last_gfn);
 }
 
 /* Section: vm related */
@@ -883,9 +883,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 			r = -EINVAL;
 		else {
 			r = 0;
-			mmap_write_lock(kvm->mm);
-			kvm->mm->context.allow_gmap_hpage_1m = 1;
-			mmap_write_unlock(kvm->mm);
+			set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags);
 			/*
 			 * We might have to create fake 4k page
 			 * tables. To avoid that the hardware works on
@@ -958,7 +956,7 @@ static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 	int ret;
-	unsigned int idx;
+
 	switch (attr->attr) {
 	case KVM_S390_VM_MEM_ENABLE_CMMA:
 		ret = -ENXIO;
@@ -969,8 +967,6 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 		mutex_lock(&kvm->lock);
 		if (kvm->created_vcpus)
 			ret = -EBUSY;
-		else if (kvm->mm->context.allow_gmap_hpage_1m)
-			ret = -EINVAL;
 		else {
 			kvm->arch.use_cmma = 1;
 			/* Not compatible with cmma. */
@@ -979,7 +975,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 		}
 		mutex_unlock(&kvm->lock);
 		break;
-	case KVM_S390_VM_MEM_CLR_CMMA:
+	case KVM_S390_VM_MEM_CLR_CMMA: {
+		gfn_t start_gfn = 0;
+
 		ret = -ENXIO;
 		if (!sclp.has_cmma)
 			break;
@@ -988,13 +986,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 			break;
 
 		VM_EVENT(kvm, 3, "%s", "RESET: CMMA states");
-		mutex_lock(&kvm->lock);
-		idx = srcu_read_lock(&kvm->srcu);
-		s390_reset_cmma(kvm->arch.gmap->mm);
-		srcu_read_unlock(&kvm->srcu, idx);
-		mutex_unlock(&kvm->lock);
+		do {
+			start_gfn = dat_reset_cmma(kvm->arch.gmap->asce, start_gfn);
+			cond_resched();
+		} while (start_gfn);
 		ret = 0;
 		break;
+	}
 	case KVM_S390_VM_MEM_LIMIT_SIZE: {
 		unsigned long new_limit;
 
@@ -1011,29 +1009,12 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
 		if (!new_limit)
 			return -EINVAL;
 
-		/* gmap_create takes last usable address */
-		if (new_limit != KVM_S390_NO_MEM_LIMIT)
-			new_limit -= 1;
-
 		ret = -EBUSY;
-		mutex_lock(&kvm->lock);
-		if (!kvm->created_vcpus) {
-			/* gmap_create will round the limit up */
-			struct gmap *new = gmap_create(current->mm, new_limit);
-
-			if (!new) {
-				ret = -ENOMEM;
-			} else {
-				gmap_remove(kvm->arch.gmap);
-				new->private = kvm;
-				kvm->arch.gmap = new;
-				ret = 0;
-			}
-		}
-		mutex_unlock(&kvm->lock);
+		if (!kvm->created_vcpus)
+			ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit));
 		VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit);
 		VM_EVENT(kvm, 3, "New guest asce: 0x%p",
-			 (void *) kvm->arch.gmap->asce);
+			 (void *)kvm->arch.gmap->asce.val);
 		break;
 	}
 	default:
@@ -1198,19 +1179,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
 		kvm->arch.migration_mode = 1;
 		return 0;
 	}
-	/* mark all the pages in active slots as dirty */
 	kvm_for_each_memslot(ms, bkt, slots) {
 		if (!ms->dirty_bitmap)
 			return -EINVAL;
-		/*
-		 * The second half of the bitmap is only used on x86,
-		 * and would be wasted otherwise, so we put it to good
-		 * use here to keep track of the state of the storage
-		 * attributes.
-		 */
-		memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms));
 		ram_pages += ms->npages;
 	}
+	/* mark all the pages as dirty */
+	gmap_set_cmma_all_dirty(kvm->arch.gmap);
 	atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages);
 	kvm->arch.migration_mode = 1;
 	kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
@@ -2116,40 +2091,32 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
 
 static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 {
-	uint8_t *keys;
-	uint64_t hva;
-	int srcu_idx, i, r = 0;
+	union skey *keys;
+	int i, r = 0;
 
 	if (args->flags != 0)
 		return -EINVAL;
 
 	/* Is this guest using storage keys? */
-	if (!mm_uses_skeys(current->mm))
+	if (!uses_skeys(kvm->arch.gmap))
 		return KVM_S390_GET_SKEYS_NONE;
 
 	/* Enforce sane limit on memory allocation */
 	if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
 		return -EINVAL;
 
-	keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT);
+	keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT);
 	if (!keys)
 		return -ENOMEM;
 
-	mmap_read_lock(current->mm);
-	srcu_idx = srcu_read_lock(&kvm->srcu);
-	for (i = 0; i < args->count; i++) {
-		hva = gfn_to_hva(kvm, args->start_gfn + i);
-		if (kvm_is_error_hva(hva)) {
-			r = -EFAULT;
-			break;
+	scoped_guard(read_lock, &kvm->mmu_lock) {
+		for (i = 0; i < args->count; i++) {
+			r = dat_get_storage_key(kvm->arch.gmap->asce,
+						args->start_gfn + i, keys + i);
+			if (r)
+				break;
 		}
-
-		r = get_guest_storage_key(current->mm, hva, &keys[i]);
-		if (r)
-			break;
 	}
-	srcu_read_unlock(&kvm->srcu, srcu_idx);
-	mmap_read_unlock(current->mm);
 
 	if (!r) {
 		r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
@@ -2164,10 +2131,9 @@ static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 
 static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 {
-	uint8_t *keys;
-	uint64_t hva;
-	int srcu_idx, i, r = 0;
-	bool unlocked;
+	struct kvm_s390_mmu_cache *mc;
+	union skey *keys;
+	int i, r = 0;
 
 	if (args->flags != 0)
 		return -EINVAL;
@@ -2176,7 +2142,7 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 	if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
 		return -EINVAL;
 
-	keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT);
+	keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT);
 	if (!keys)
 		return -ENOMEM;
 
@@ -2188,159 +2154,41 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
 	}
 
 	/* Enable storage key handling for the guest */
-	r = s390_enable_skey();
+	r = gmap_enable_skeys(kvm->arch.gmap);
 	if (r)
 		goto out;
 
-	i = 0;
-	mmap_read_lock(current->mm);
-	srcu_idx = srcu_read_lock(&kvm->srcu);
-        while (i < args->count) {
-		unlocked = false;
-		hva = gfn_to_hva(kvm, args->start_gfn + i);
-		if (kvm_is_error_hva(hva)) {
-			r = -EFAULT;
-			break;
-		}
-
+	r = -EINVAL;
+	for (i = 0; i < args->count; i++) {
 		/* Lowest order bit is reserved */
-		if (keys[i] & 0x01) {
-			r = -EINVAL;
-			break;
-		}
-
-		r = set_guest_storage_key(current->mm, hva, keys[i], 0);
-		if (r) {
-			r = fixup_user_fault(current->mm, hva,
-					     FAULT_FLAG_WRITE, &unlocked);
-			if (r)
-				break;
-		}
-		if (!r)
-			i++;
-	}
-	srcu_read_unlock(&kvm->srcu, srcu_idx);
-	mmap_read_unlock(current->mm);
-out:
-	kvfree(keys);
-	return r;
-}
-
-/*
- * Base address and length must be sent at the start of each block, therefore
- * it's cheaper to send some clean data, as long as it's less than the size of
- * two longs.
- */
-#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *))
-/* for consistency */
-#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
-
-static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
-			      u8 *res, unsigned long bufsize)
-{
-	unsigned long pgstev, hva, cur_gfn = args->start_gfn;
-
-	args->count = 0;
-	while (args->count < bufsize) {
-		hva = gfn_to_hva(kvm, cur_gfn);
-		/*
-		 * We return an error if the first value was invalid, but we
-		 * return successfully if at least one value was copied.
-		 */
-		if (kvm_is_error_hva(hva))
-			return args->count ? 0 : -EFAULT;
-		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
-			pgstev = 0;
-		res[args->count++] = (pgstev >> 24) & 0x43;
-		cur_gfn++;
-	}
-
-	return 0;
-}
-
-static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots,
-						     gfn_t gfn)
-{
-	return ____gfn_to_memslot(slots, gfn, true);
-}
-
-static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots,
-					      unsigned long cur_gfn)
-{
-	struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn);
-	unsigned long ofs = cur_gfn - ms->base_gfn;
-	struct rb_node *mnode = &ms->gfn_node[slots->node_idx];
-
-	if (ms->base_gfn + ms->npages <= cur_gfn) {
-		mnode = rb_next(mnode);
-		/* If we are above the highest slot, wrap around */
-		if (!mnode)
-			mnode = rb_first(&slots->gfn_tree);
-
-		ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
-		ofs = 0;
+		if (keys[i].zero)
+			goto out;
 	}
 
-	if (cur_gfn < ms->base_gfn)
-		ofs = 0;
-
-	ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs);
-	while (ofs >= ms->npages && (mnode = rb_next(mnode))) {
-		ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
-		ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages);
+	mc = kvm_s390_new_mmu_cache();
+	if (!mc) {
+		r = -ENOMEM;
+		goto out;
 	}
-	return ms->base_gfn + ofs;
-}
-
-static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
-			     u8 *res, unsigned long bufsize)
-{
-	unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev;
-	struct kvm_memslots *slots = kvm_memslots(kvm);
-	struct kvm_memory_slot *ms;
 
-	if (unlikely(kvm_memslots_empty(slots)))
-		return 0;
-
-	cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn);
-	ms = gfn_to_memslot(kvm, cur_gfn);
-	args->count = 0;
-	args->start_gfn = cur_gfn;
-	if (!ms)
-		return 0;
-	next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
-	mem_end = kvm_s390_get_gfn_end(slots);
-
-	while (args->count < bufsize) {
-		hva = gfn_to_hva(kvm, cur_gfn);
-		if (kvm_is_error_hva(hva))
-			return 0;
-		/* Decrement only if we actually flipped the bit to 0 */
-		if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms)))
-			atomic64_dec(&kvm->arch.cmma_dirty_pages);
-		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
-			pgstev = 0;
-		/* Save the value */
-		res[args->count++] = (pgstev >> 24) & 0x43;
-		/* If the next bit is too far away, stop. */
-		if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE)
-			return 0;
-		/* If we reached the previous "next", find the next one */
-		if (cur_gfn == next_gfn)
-			next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
-		/* Reached the end of memory or of the buffer, stop */
-		if ((next_gfn >= mem_end) ||
-		    (next_gfn - args->start_gfn >= bufsize))
-			return 0;
-		cur_gfn++;
-		/* Reached the end of the current memslot, take the next one. */
-		if (cur_gfn - ms->base_gfn >= ms->npages) {
-			ms = gfn_to_memslot(kvm, cur_gfn);
-			if (!ms)
-				return 0;
+	r = 0;
+	do {
+		r = kvm_s390_mmu_cache_topup(mc);
+		if (r == -ENOMEM)
+			break;
+		scoped_guard(read_lock, &kvm->mmu_lock) {
+			for (i = 0 ; i < args->count; i++) {
+				r = dat_set_storage_key(mc, kvm->arch.gmap->asce,
+							args->start_gfn + i, keys[i], 0);
+				if (r)
+					break;
+			}
 		}
-	}
-	return 0;
+	} while (r == -ENOMEM);
+	kvm_s390_free_mmu_cache(mc);
+out:
+	kvfree(keys);
+	return r;
 }
 
 /*
@@ -2354,8 +2202,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
 static int kvm_s390_get_cmma_bits(struct kvm *kvm,
 				  struct kvm_s390_cmma_log *args)
 {
-	unsigned long bufsize;
-	int srcu_idx, peek, ret;
+	int peek, ret;
 	u8 *values;
 
 	if (!kvm->arch.use_cmma)
@@ -2368,8 +2215,8 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
 	if (!peek && !kvm->arch.migration_mode)
 		return -EINVAL;
 	/* CMMA is disabled or was not used, or the buffer has length zero */
-	bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX);
-	if (!bufsize || !kvm->mm->context.uses_cmm) {
+	args->count = min(args->count, KVM_S390_CMMA_SIZE_MAX);
+	if (!args->count || !uses_cmm(kvm->arch.gmap)) {
 		memset(args, 0, sizeof(*args));
 		return 0;
 	}
@@ -2379,18 +2226,18 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
 		return 0;
 	}
 
-	values = vmalloc(bufsize);
+	values = vmalloc(args->count);
 	if (!values)
 		return -ENOMEM;
 
-	mmap_read_lock(kvm->mm);
-	srcu_idx = srcu_read_lock(&kvm->srcu);
-	if (peek)
-		ret = kvm_s390_peek_cmma(kvm, args, values, bufsize);
-	else
-		ret = kvm_s390_get_cmma(kvm, args, values, bufsize);
-	srcu_read_unlock(&kvm->srcu, srcu_idx);
-	mmap_read_unlock(kvm->mm);
+	scoped_guard(read_lock, &kvm->mmu_lock) {
+		if (peek)
+			ret = dat_peek_cmma(args->start_gfn, kvm->arch.gmap->asce, &args->count,
+					    values);
+		else
+			ret = dat_get_cmma(kvm->arch.gmap->asce, &args->start_gfn, &args->count,
+					   values, &kvm->arch.cmma_dirty_pages);
+	}
 
 	if (kvm->arch.migration_mode)
 		args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages);
@@ -2412,11 +2259,9 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
 static int kvm_s390_set_cmma_bits(struct kvm *kvm,
 				  const struct kvm_s390_cmma_log *args)
 {
-	unsigned long hva, mask, pgstev, i;
-	uint8_t *bits;
-	int srcu_idx, r = 0;
-
-	mask = args->mask;
+	struct kvm_s390_mmu_cache *mc;
+	u8 *bits = NULL;
+	int r = 0;
 
 	if (!kvm->arch.use_cmma)
 		return -ENXIO;
@@ -2430,9 +2275,12 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
 	if (args->count == 0)
 		return 0;
 
+	mc = kvm_s390_new_mmu_cache();
+	if (!mc)
+		return -ENOMEM;
+	r = -ENOMEM;	/* returned through "out" if the vmalloc() below fails */
 	bits = vmalloc(array_size(sizeof(*bits), args->count));
 	if (!bits)
-		return -ENOMEM;
-
+		goto out;
 	r = copy_from_user(bits, (void __user *)args->values, args->count);
 	if (r) {
@@ -2440,29 +2288,19 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
 		goto out;
 	}
 
-	mmap_read_lock(kvm->mm);
-	srcu_idx = srcu_read_lock(&kvm->srcu);
-	for (i = 0; i < args->count; i++) {
-		hva = gfn_to_hva(kvm, args->start_gfn + i);
-		if (kvm_is_error_hva(hva)) {
-			r = -EFAULT;
+	do {
+		r = kvm_s390_mmu_cache_topup(mc);
+		if (r)
 			break;
+		scoped_guard(read_lock, &kvm->mmu_lock) {
+			r = dat_set_cmma_bits(mc, kvm->arch.gmap->asce, args->start_gfn,
+					      args->count, args->mask, bits);
 		}
+	} while (r == -ENOMEM);
 
-		pgstev = bits[i];
-		pgstev = pgstev << 24;
-		mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT;
-		set_pgste_bits(kvm->mm, hva, mask, pgstev);
-	}
-	srcu_read_unlock(&kvm->srcu, srcu_idx);
-	mmap_read_unlock(kvm->mm);
-
-	if (!kvm->mm->context.uses_cmm) {
-		mmap_write_lock(kvm->mm);
-		kvm->mm->context.uses_cmm = 1;
-		mmap_write_unlock(kvm->mm);
-	}
+	set_bit(GMAP_FLAG_USES_CMM, &kvm->arch.gmap->flags);
 out:
+	kvm_s390_free_mmu_cache(mc);
 	vfree(bits);
 	return r;
 }
@@ -2671,6 +2509,13 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
 			break;
 
 		mmap_write_lock(kvm->mm);
+		/*
+		 * Disable creation of new THPs. Existing THPs can stay, they
+		 * will be split when any part of them gets imported.
+		 */
+		mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, kvm->mm);
+		mm_flags_set(MMF_DISABLE_THP_COMPLETELY, kvm->mm);
+		set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags);
 		r = gmap_helper_disable_cow_sharing();
 		mmap_write_unlock(kvm->mm);
 		if (r)
@@ -2744,9 +2589,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
 		if (copy_from_user(&parms, argp, sizeof(parms)))
 			break;
 
-		/* Currently restricted to 8KB */
+		/* Currently restricted to 1MiB */
 		r = -EINVAL;
-		if (parms.length > PAGE_SIZE * 2)
+		if (parms.length > SZ_1M)
 			break;
 
 		r = -ENOMEM;
@@ -2900,9 +2745,9 @@ static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_fla
 static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop)
 {
 	void __user *uaddr = (void __user *)mop->buf;
+	void *tmpbuf __free(kvfree) = NULL;
 	enum gacc_mode acc_mode;
-	void *tmpbuf = NULL;
-	int r, srcu_idx;
+	int r;
 
 	r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION |
 					KVM_S390_MEMOP_F_CHECK_ONLY);
@@ -2915,52 +2760,32 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop)
 			return -ENOMEM;
 	}
 
-	srcu_idx = srcu_read_lock(&kvm->srcu);
+	acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE;
 
-	if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) {
-		r = PGM_ADDRESSING;
-		goto out_unlock;
-	}
+	scoped_guard(srcu, &kvm->srcu) {
+		if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)
+			return check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key);
 
-	acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE;
-	if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
-		r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key);
-		goto out_unlock;
-	}
-	if (acc_mode == GACC_FETCH) {
+		if (acc_mode == GACC_STORE && copy_from_user(tmpbuf, uaddr, mop->size))
+			return -EFAULT;
 		r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
-					      mop->size, GACC_FETCH, mop->key);
+					      mop->size, acc_mode, mop->key);
 		if (r)
-			goto out_unlock;
-		if (copy_to_user(uaddr, tmpbuf, mop->size))
-			r = -EFAULT;
-	} else {
-		if (copy_from_user(tmpbuf, uaddr, mop->size)) {
-			r = -EFAULT;
-			goto out_unlock;
-		}
-		r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
-					      mop->size, GACC_STORE, mop->key);
+			return r;
+		if (acc_mode != GACC_STORE && copy_to_user(uaddr, tmpbuf, mop->size))
+			return -EFAULT;
 	}
-
-out_unlock:
-	srcu_read_unlock(&kvm->srcu, srcu_idx);
-
-	vfree(tmpbuf);
-	return r;
+	return 0;
 }
 
 static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop)
 {
 	void __user *uaddr = (void __user *)mop->buf;
 	void __user *old_addr = (void __user *)mop->old_addr;
-	union {
-		__uint128_t quad;
-		char raw[sizeof(__uint128_t)];
-	} old = { .quad = 0}, new = { .quad = 0 };
-	unsigned int off_in_quad = sizeof(new) - mop->size;
-	int r, srcu_idx;
-	bool success;
+	union kvm_s390_quad old = { .sixteen = 0 };
+	union kvm_s390_quad new = { .sixteen = 0 };
+	bool success = false;
+	int r;
 
 	r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION);
 	if (r)
@@ -2972,25 +2797,18 @@ static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *m
 	 */
 	if (mop->size > sizeof(new))
 		return -EINVAL;
-	if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size))
+	if (copy_from_user(&new, uaddr, mop->size))
 		return -EFAULT;
-	if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size))
+	if (copy_from_user(&old, old_addr, mop->size))
 		return -EFAULT;
 
-	srcu_idx = srcu_read_lock(&kvm->srcu);
+	scoped_guard(srcu, &kvm->srcu) {
+		r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old, new,
+					       mop->key, &success);
 
-	if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) {
-		r = PGM_ADDRESSING;
-		goto out_unlock;
+		if (!success && copy_to_user(old_addr, &old, mop->size))
+			return -EFAULT;
 	}
-
-	r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad,
-				       new.quad, mop->key, &success);
-	if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size))
-		r = -EFAULT;
-
-out_unlock:
-	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	return r;
 }
 
@@ -3145,6 +2963,32 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 			r = -EFAULT;
 		break;
 	}
+	case KVM_S390_KEYOP: {
+		struct kvm_s390_mmu_cache *mc;
+		struct kvm_s390_keyop kop;
+		union skey skey;
+
+		if (copy_from_user(&kop, argp, sizeof(kop))) {
+			r = -EFAULT;
+			break;
+		}
+		skey.skey = kop.key;
+
+		mc = kvm_s390_new_mmu_cache();
+		if (!mc)
+			return -ENOMEM;
+
+		r = kvm_s390_keyop(mc, kvm, kop.operation, kop.guest_addr, skey);
+		kvm_s390_free_mmu_cache(mc);
+		if (r < 0)
+			break;
+
+		kop.key = r;
+		r = 0;
+		if (copy_to_user(argp, &kop, sizeof(kop)))
+			r = -EFAULT;
+		break;
+	}
 	case KVM_S390_ZPCI_OP: {
 		struct kvm_s390_zpci_op args;
 
@@ -3330,6 +3174,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	char debug_name[16];
 	int i, rc;
 
+	mutex_init(&kvm->arch.pv.import_lock);
+
 	rc = -EINVAL;
 #ifdef CONFIG_KVM_S390_UCONTROL
 	if (type & ~KVM_VM_S390_UCONTROL)
@@ -3340,11 +3186,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	if (type)
 		goto out_err;
 #endif
-
-	rc = s390_enable_sie();
-	if (rc)
-		goto out_err;
-
 	rc = -ENOMEM;
 
 	if (!sclp.has_64bscao)
@@ -3418,6 +3259,19 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
 	VM_EVENT(kvm, 3, "vm created with type %lu", type);
 
+	/*
+	 * sclp.hamax may be U64_MAX (the code this replaces special-cased that
+	 * value); adding 1 would then wrap to 0 and yield a zero memory limit.
+	 */
+	if ((type & KVM_VM_S390_UCONTROL) || sclp.hamax == U64_MAX)
+		kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT;
+	else
+		kvm->arch.mem_limit = sclp.hamax + 1;
+	kvm->arch.gmap = gmap_new(kvm, gpa_to_gfn(kvm->arch.mem_limit));
+	if (!kvm->arch.gmap)
+		goto out_err;
+	clear_bit(GMAP_FLAG_PFAULT_ENABLED, &kvm->arch.gmap->flags);
+
 	if (type & KVM_VM_S390_UCONTROL) {
 		struct kvm_userspace_memory_region2 fake_memslot = {
 			.slot = KVM_S390_UCONTROL_MEMSLOT,
@@ -3427,23 +3274,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 			.flags = 0,
 		};
 
-		kvm->arch.gmap = NULL;
-		kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT;
 		/* one flat fake memslot covering the whole address-space */
 		mutex_lock(&kvm->slots_lock);
 		KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm);
 		mutex_unlock(&kvm->slots_lock);
+		set_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags);
 	} else {
-		if (sclp.hamax == U64_MAX)
-			kvm->arch.mem_limit = TASK_SIZE_MAX;
-		else
-			kvm->arch.mem_limit = min_t(unsigned long, TASK_SIZE_MAX,
-						    sclp.hamax + 1);
-		kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
-		if (!kvm->arch.gmap)
-			goto out_err;
-		kvm->arch.gmap->private = kvm;
-		kvm->arch.gmap->pfault_enabled = 0;
+		struct crst_table *table = dereference_asce(kvm->arch.gmap->asce);
+
+		crst_table_init((void *)table, _CRSTE_HOLE(table->crstes[0].h.tt).val);
 	}
 
 	kvm->arch.use_pfmfi = sclp.has_pfmfi;
@@ -3477,8 +3316,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 		sca_del_vcpu(vcpu);
 	kvm_s390_update_topology_change_report(vcpu->kvm, 1);
 
-	if (kvm_is_ucontrol(vcpu->kvm))
-		gmap_remove(vcpu->arch.gmap);
+	if (kvm_is_ucontrol(vcpu->kvm)) {
+		scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock)
+			gmap_remove_child(vcpu->arch.gmap);
+		vcpu->arch.gmap = gmap_put(vcpu->arch.gmap);
+	}
 
 	if (vcpu->kvm->arch.use_cmma)
 		kvm_s390_vcpu_unsetup_cmma(vcpu);
@@ -3486,6 +3328,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	if (kvm_s390_pv_cpu_get_handle(vcpu))
 		kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc);
 	free_page((unsigned long)(vcpu->arch.sie_block));
+	kvm_s390_free_mmu_cache(vcpu->arch.mc);
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
@@ -3512,25 +3355,14 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 
 	debug_unregister(kvm->arch.dbf);
 	free_page((unsigned long)kvm->arch.sie_page2);
-	if (!kvm_is_ucontrol(kvm))
-		gmap_remove(kvm->arch.gmap);
 	kvm_s390_destroy_adapters(kvm);
 	kvm_s390_clear_float_irqs(kvm);
 	kvm_s390_vsie_destroy(kvm);
+	kvm->arch.gmap = gmap_put(kvm->arch.gmap);
 	KVM_EVENT(3, "vm 0x%p destroyed", kvm);
 }
 
 /* Section: vcpu related */
-static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.gmap = gmap_create(current->mm, -1UL);
-	if (!vcpu->arch.gmap)
-		return -ENOMEM;
-	vcpu->arch.gmap->private = vcpu->kvm;
-
-	return 0;
-}
-
 static void sca_del_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -3871,9 +3703,15 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	int rc;
 
 	BUILD_BUG_ON(sizeof(struct sie_page) != 4096);
+	vcpu->arch.mc = kvm_s390_new_mmu_cache();
+	if (!vcpu->arch.mc)
+		return -ENOMEM;
 	sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT);
-	if (!sie_page)
+	if (!sie_page) {
+		kvm_s390_free_mmu_cache(vcpu->arch.mc);
+		vcpu->arch.mc = NULL;
 		return -ENOMEM;
+	}
 
 	vcpu->arch.sie_block = &sie_page->sie_block;
 	vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb);
@@ -3915,8 +3753,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 		vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS;
 
 	if (kvm_is_ucontrol(vcpu->kvm)) {
-		rc = __kvm_ucontrol_vcpu_init(vcpu);
-		if (rc)
+		rc = -ENOMEM;
+		vcpu->arch.gmap = gmap_new_child(vcpu->kvm->arch.gmap, -1UL);
+		if (!vcpu->arch.gmap)
 			goto out_free_sie_block;
 	}
 
@@ -3932,8 +3771,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	return 0;
 
 out_ucontrol_uninit:
-	if (kvm_is_ucontrol(vcpu->kvm))
-		gmap_remove(vcpu->arch.gmap);
+	if (kvm_is_ucontrol(vcpu->kvm)) {
+		scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock)
+			gmap_remove_child(vcpu->arch.gmap);
+		vcpu->arch.gmap = gmap_put(vcpu->arch.gmap);
+	}
 out_free_sie_block:
 	free_page((unsigned long)(vcpu->arch.sie_block));
+	/* Allocated at the top of this function; must not outlive a failed create */
+	kvm_s390_free_mmu_cache(vcpu->arch.mc);
 	return rc;
@@ -3997,32 +3838,6 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
 	kvm_s390_vcpu_request(vcpu);
 }
 
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
-			      unsigned long end)
-{
-	struct kvm *kvm = gmap->private;
-	struct kvm_vcpu *vcpu;
-	unsigned long prefix;
-	unsigned long i;
-
-	trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap));
-
-	if (gmap_is_shadow(gmap))
-		return;
-	if (start >= 1UL << 31)
-		/* We are only interested in prefix pages */
-		return;
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		/* match against both prefix pages */
-		prefix = kvm_s390_get_prefix(vcpu);
-		if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
-			VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
-				   start, end);
-			kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
-		}
-	}
-}
-
 bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
 {
 	/* do not poll with more than halt_poll_max_steal percent of steal time */
@@ -4404,72 +4219,41 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu)
 	return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS);
 }
 
-static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags)
-{
-	struct kvm *kvm = gmap->private;
-	gfn_t gfn = gpa_to_gfn(gaddr);
-	bool unlocked;
-	hva_t vmaddr;
-	gpa_t tmp;
-	int rc;
-
-	if (kvm_is_ucontrol(kvm)) {
-		tmp = __gmap_translate(gmap, gaddr);
-		gfn = gpa_to_gfn(tmp);
-	}
-
-	vmaddr = gfn_to_hva(kvm, gfn);
-	rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
-	if (!rc)
-		rc = __gmap_link(gmap, gaddr, vmaddr);
-	return rc;
-}
-
-/**
- * __kvm_s390_mprotect_many() - Apply specified protection to guest pages
- * @gmap: the gmap of the guest
- * @gpa: the starting guest address
- * @npages: how many pages to protect
- * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
- * @bits: pgste notification bits to set
- *
- * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one()
- *
- * Context: kvm->srcu and gmap->mm need to be held in read mode
- */
-int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot,
-			     unsigned long bits)
+static int vcpu_ucontrol_translate(struct kvm_vcpu *vcpu, gpa_t *gaddr)
 {
-	unsigned int fault_flag = (prot & PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
-	gpa_t end = gpa + npages * PAGE_SIZE;
 	int rc;
 
-	for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) {
-		rc = gmap_protect_one(gmap, gpa, prot, bits);
-		if (rc == -EAGAIN) {
-			__kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag);
-			rc = gmap_protect_one(gmap, gpa, prot, bits);
+	if (kvm_is_ucontrol(vcpu->kvm)) {
+		rc = gmap_ucas_translate(vcpu->arch.mc, vcpu->arch.gmap, gaddr);
+		if (rc == -EREMOTE) {
+			vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
+			vcpu->run->s390_ucontrol.trans_exc_code = *gaddr;
+			vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION;
 		}
-		if (rc < 0)
-			return rc;
+		return rc;
 	}
-
 	return 0;
 }
 
-static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu)
+static int kvm_s390_fixup_prefix(struct kvm_vcpu *vcpu)
 {
 	gpa_t gaddr = kvm_s390_get_prefix(vcpu);
-	int idx, rc;
-
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
-	mmap_read_lock(vcpu->arch.gmap->mm);
+	gfn_t gfn;
+	int rc;
 
-	rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT);
+	if (vcpu_ucontrol_translate(vcpu, &gaddr))
+		return -EREMOTE;
+	gfn = gpa_to_gfn(gaddr);
 
-	mmap_read_unlock(vcpu->arch.gmap->mm);
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn, true);
+	if (rc)
+		return rc;
+	rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn + 1, true);
+	if (rc)
+		return rc;
 
+	scoped_guard(write_lock, &vcpu->kvm->mmu_lock)
+		rc = dat_set_prefix_notif_bit(vcpu->kvm->arch.gmap->asce, gfn);
 	return rc;
 }
 
@@ -4489,7 +4273,7 @@ retry:
 	if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) {
 		int rc;
 
-		rc = kvm_s390_mprotect_notify_prefix(vcpu);
+		rc = kvm_s390_fixup_prefix(vcpu);
 		if (rc) {
 			kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
 			return rc;
@@ -4538,8 +4322,7 @@ retry:
 		 * Re-enable CMM virtualization if CMMA is available and
 		 * CMM has been used.
 		 */
-		if ((vcpu->kvm->arch.use_cmma) &&
-		    (vcpu->kvm->mm->context.uses_cmm))
+		if (vcpu->kvm->arch.use_cmma && uses_cmm(vcpu->arch.gmap))
 			vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
 		goto retry;
 	}
@@ -4635,7 +4418,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
 	return true;
 }
 
-static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
+bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
 {
 	hva_t hva;
 	struct kvm_arch_async_pf arch;
@@ -4651,7 +4434,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
 		return false;
 	if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK))
 		return false;
-	if (!vcpu->arch.gmap->pfault_enabled)
+	if (!pfault_enabled(vcpu->arch.gmap))
 		return false;
 
 	hva = gfn_to_hva(vcpu->kvm, current->thread.gmap_teid.addr);
@@ -4744,98 +4527,25 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu)
 		current->thread.gmap_int_code, current->thread.gmap_teid.val);
 }
 
-/*
- * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu
- * @vcpu: the vCPU whose gmap is to be fixed up
- * @gfn: the guest frame number used for memslots (including fake memslots)
- * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps
- * @foll: FOLL_* flags
- *
- * Return: 0 on success, < 0 in case of error.
- * Context: The mm lock must not be held before calling. May sleep.
- */
-int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int foll)
-{
-	struct kvm_memory_slot *slot;
-	unsigned int fault_flags;
-	bool writable, unlocked;
-	unsigned long vmaddr;
-	struct page *page;
-	kvm_pfn_t pfn;
+static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, gpa_t gaddr, bool wr)
+{
+	struct guest_fault f = {
+		.write_attempt = wr,
+		.attempt_pfault = pfault_enabled(vcpu->arch.gmap),
+	};
 	int rc;
 
-	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
-		return vcpu_post_run_addressing_exception(vcpu);
-
-	fault_flags = foll & FOLL_WRITE ? FAULT_FLAG_WRITE : 0;
-	if (vcpu->arch.gmap->pfault_enabled)
-		foll |= FOLL_NOWAIT;
-	vmaddr = __gfn_to_hva_memslot(slot, gfn);
-
-try_again:
-	pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page);
+	if (vcpu_ucontrol_translate(vcpu, &gaddr))
+		return -EREMOTE;
+	f.gfn = gpa_to_gfn(gaddr);
 
-	/* Access outside memory, inject addressing exception */
-	if (is_noslot_pfn(pfn))
+	rc = kvm_s390_faultin_gfn(vcpu, NULL, &f);
+	if (rc <= 0)
+		return rc;
+	if (rc == PGM_ADDRESSING)
 		return vcpu_post_run_addressing_exception(vcpu);
-	/* Signal pending: try again */
-	if (pfn == KVM_PFN_ERR_SIGPENDING)
-		return -EAGAIN;
-
-	/* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */
-	if (pfn == KVM_PFN_ERR_NEEDS_IO) {
-		trace_kvm_s390_major_guest_pfault(vcpu);
-		if (kvm_arch_setup_async_pf(vcpu))
-			return 0;
-		vcpu->stat.pfault_sync++;
-		/* Could not setup async pfault, try again synchronously */
-		foll &= ~FOLL_NOWAIT;
-		goto try_again;
-	}
-	/* Any other error */
-	if (is_error_pfn(pfn))
-		return -EFAULT;
-
-	/* Success */
-	mmap_read_lock(vcpu->arch.gmap->mm);
-	/* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */
-	rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked);
-	if (!rc)
-		rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr);
-	scoped_guard(spinlock, &vcpu->kvm->mmu_lock) {
-		kvm_release_faultin_page(vcpu->kvm, page, false, writable);
-	}
-	mmap_read_unlock(vcpu->arch.gmap->mm);
-	return rc;
-}
-
-static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int foll)
-{
-	unsigned long gaddr_tmp;
-	gfn_t gfn;
-
-	gfn = gpa_to_gfn(gaddr);
-	if (kvm_is_ucontrol(vcpu->kvm)) {
-		/*
-		 * This translates the per-vCPU guest address into a
-		 * fake guest address, which can then be used with the
-		 * fake memslots that are identity mapping userspace.
-		 * This allows ucontrol VMs to use the normal fault
-		 * resolution path, like normal VMs.
-		 */
-		mmap_read_lock(vcpu->arch.gmap->mm);
-		gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr);
-		mmap_read_unlock(vcpu->arch.gmap->mm);
-		if (gaddr_tmp == -EFAULT) {
-			vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
-			vcpu->run->s390_ucontrol.trans_exc_code = gaddr;
-			vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION;
-			return -EREMOTE;
-		}
-		gfn = gpa_to_gfn(gaddr_tmp);
-	}
-	return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, foll);
+	KVM_BUG_ON(rc, vcpu->kvm);
+	return -EINVAL;
 }
 
 static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
@@ -5012,7 +4722,7 @@ xfer_to_guest_mode_check:
 
 		exit_reason = kvm_s390_enter_exit_sie(vcpu->arch.sie_block,
 						      vcpu->run->s.regs.gprs,
-						      vcpu->arch.gmap->asce);
+						      vcpu->arch.gmap->asce.val);
 
 		__enable_cpu_timer_accounting(vcpu);
 		guest_timing_exit_irqoff();
@@ -5547,8 +5257,8 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu,
 				 struct kvm_s390_mem_op *mop)
 {
 	void __user *uaddr = (void __user *)mop->buf;
+	void *tmpbuf __free(kvfree) = NULL;
 	enum gacc_mode acc_mode;
-	void *tmpbuf = NULL;
 	int r;
 
 	r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION |
@@ -5570,32 +5280,21 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu,
 	if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
 		r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size,
 				    acc_mode, mop->key);
-		goto out_inject;
-	}
-	if (acc_mode == GACC_FETCH) {
+	} else if (acc_mode == GACC_FETCH) {
 		r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
 					mop->size, mop->key);
-		if (r)
-			goto out_inject;
-		if (copy_to_user(uaddr, tmpbuf, mop->size)) {
-			r = -EFAULT;
-			goto out_free;
-		}
+		if (!r && copy_to_user(uaddr, tmpbuf, mop->size))
+			return -EFAULT;
 	} else {
-		if (copy_from_user(tmpbuf, uaddr, mop->size)) {
-			r = -EFAULT;
-			goto out_free;
-		}
+		if (copy_from_user(tmpbuf, uaddr, mop->size))
+			return -EFAULT;
 		r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
 					 mop->size, mop->key);
 	}
 
-out_inject:
 	if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0)
 		kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
 
-out_free:
-	vfree(tmpbuf);
 	return r;
 }
 
@@ -5785,37 +5484,39 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 #ifdef CONFIG_KVM_S390_UCONTROL
 	case KVM_S390_UCAS_MAP: {
-		struct kvm_s390_ucas_mapping ucasmap;
+		struct kvm_s390_ucas_mapping ucas;
 
-		if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) {
-			r = -EFAULT;
+		r = -EFAULT;
+		if (copy_from_user(&ucas, argp, sizeof(ucas)))
 			break;
-		}
 
-		if (!kvm_is_ucontrol(vcpu->kvm)) {
-			r = -EINVAL;
+		r = -EINVAL;
+		if (!kvm_is_ucontrol(vcpu->kvm))
+			break;
+		if (!IS_ALIGNED(ucas.user_addr | ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE))
 			break;
-		}
 
-		r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr,
-				     ucasmap.vcpu_addr, ucasmap.length);
+		r = gmap_ucas_map(vcpu->arch.gmap, gpa_to_gfn(ucas.user_addr),
+				  gpa_to_gfn(ucas.vcpu_addr),
+				  ucas.length >> _SEGMENT_SHIFT);
 		break;
 	}
 	case KVM_S390_UCAS_UNMAP: {
-		struct kvm_s390_ucas_mapping ucasmap;
+		struct kvm_s390_ucas_mapping ucas;
 
-		if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) {
-			r = -EFAULT;
+		r = -EFAULT;
+		if (copy_from_user(&ucas, argp, sizeof(ucas)))
 			break;
-		}
 
-		if (!kvm_is_ucontrol(vcpu->kvm)) {
-			r = -EINVAL;
+		r = -EINVAL;
+		if (!kvm_is_ucontrol(vcpu->kvm))
+			break;
+		if (!IS_ALIGNED(ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE))
 			break;
-		}
 
-		r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr,
-			ucasmap.length);
+		gmap_ucas_unmap(vcpu->arch.gmap, gpa_to_gfn(ucas.vcpu_addr),
+				ucas.length >> _SEGMENT_SHIFT);
+		r = 0;
 		break;
 	}
 #endif
@@ -5988,37 +5689,89 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 				const struct kvm_memory_slot *new,
 				enum kvm_mr_change change)
 {
+	struct kvm_s390_mmu_cache *mc = NULL;
 	int rc = 0;
 
-	if (kvm_is_ucontrol(kvm))
+	if (change == KVM_MR_FLAGS_ONLY)
 		return;
 
-	switch (change) {
-	case KVM_MR_DELETE:
-		rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE,
-					old->npages * PAGE_SIZE);
-		break;
-	case KVM_MR_MOVE:
-		rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE,
-					old->npages * PAGE_SIZE);
-		if (rc)
+	mc = kvm_s390_new_mmu_cache();
+	if (!mc) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	scoped_guard(write_lock, &kvm->mmu_lock) {
+		switch (change) {
+		case KVM_MR_DELETE:
+			rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages);
 			break;
-		fallthrough;
-	case KVM_MR_CREATE:
-		rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr,
-				      new->base_gfn * PAGE_SIZE,
-				      new->npages * PAGE_SIZE);
-		break;
-	case KVM_MR_FLAGS_ONLY:
-		break;
-	default:
-		WARN(1, "Unknown KVM MR CHANGE: %d\n", change);
+		case KVM_MR_MOVE:
+			rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages);
+			if (rc)
+				break;
+			fallthrough;
+		case KVM_MR_CREATE:
+			rc = dat_create_slot(mc, kvm->arch.gmap->asce, new->base_gfn, new->npages);
+			break;
+		case KVM_MR_FLAGS_ONLY:
+			break;
+		default:
+			WARN(1, "Unknown KVM MR CHANGE: %d\n", change);
+		}
 	}
+out:
 	if (rc)
 		pr_warn("failed to commit memory region\n");
+	kvm_s390_free_mmu_cache(mc);
 	return;
 }
 
+/**
+ * kvm_test_age_gfn() - check whether a range contains young pages
+ * @kvm: the kvm instance
+ * @range: the range of guest frames whose young status is to be tested
+ *
+ * Context: called by KVM common code without holding the kvm mmu lock
+ * Return: true if any page in the given range is young, otherwise false.
+ */
+bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	guard(read_lock)(&kvm->mmu_lock);
+	return dat_test_age_gfn(kvm->arch.gmap->asce, range->start, range->end);
+}
+
+/**
+ * kvm_age_gfn() - clear the young status of a range of guest frames
+ * @kvm: the kvm instance
+ * @range: the range of guest frames whose young status is to be cleared
+ *
+ * Context: called by KVM common code without holding the kvm mmu lock
+ * Return: true if any page in the given range was young, otherwise false.
+ */
+bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	guard(read_lock)(&kvm->mmu_lock);
+	return gmap_age_gfn(kvm->arch.gmap, range->start, range->end);
+}
+
+/**
+ * kvm_unmap_gfn_range() - Unmap a range of guest addresses
+ * @kvm: the kvm instance
+ * @range: the range of guest page frames to invalidate
+ *
+ * Thin wrapper: the result of gmap_unmap_gfn_range() is passed through as is.
+ * The intent is that no additional TLB flush is ever requested, since DAT
+ * table changes use the DAT manipulation instructions, keeping TLBs coherent.
+ *
+ * Context: called by KVM common code with the kvm mmu write lock held
+ * Return: the value of gmap_unmap_gfn_range(), expected to be false
+ */
+bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	return gmap_unmap_gfn_range(kvm->arch.gmap, range->slot, range->start, range->end);
+}
+
 static inline unsigned long nonhyp_mask(int i)
 {
 	unsigned int nonhyp_fai = (sclp.hmfai << i * 2) >> 30;
@@ -6035,11 +5788,6 @@ static int __init kvm_s390_init(void)
 		return -ENODEV;
 	}
 
-	if (nested && hpage) {
-		pr_info("A KVM host that supports nesting cannot back its KVM guests with huge pages\n");
-		return -EINVAL;
-	}
-
 	for (i = 0; i < 16; i++)
 		kvm_s390_fac_base[i] |=
 			stfle_fac_list[i] & nonhyp_mask(i);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 65c950760993..bf1d7798c1af 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -19,9 +19,19 @@
 #include <asm/facility.h>
 #include <asm/processor.h>
 #include <asm/sclp.h>
+#include "dat.h"
+#include "gmap.h"
 
 #define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
 
+union kvm_s390_quad {
+	__uint128_t sixteen;
+	unsigned long eight;
+	unsigned int four;
+	unsigned short two;
+	unsigned char one;
+};
+
 static inline void kvm_s390_fpu_store(struct kvm_run *run)
 {
 	fpu_stfpc(&run->s.regs.fpc);
@@ -106,9 +116,7 @@ static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
 static inline int kvm_is_ucontrol(struct kvm *kvm)
 {
 #ifdef CONFIG_KVM_S390_UCONTROL
-	if (kvm->arch.gmap)
-		return 0;
-	return 1;
+	return test_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags);
 #else
 	return 0;
 #endif
@@ -432,14 +440,9 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu);
 /* implemented in vsie.c */
 int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
 void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
-void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
-				 unsigned long end);
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end);
 void kvm_s390_vsie_init(struct kvm *kvm);
 void kvm_s390_vsie_destroy(struct kvm *kvm);
-int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
-
-/* implemented in gmap-vsie.c */
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level);
 
 /* implemented in sigp.c */
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
@@ -461,14 +464,10 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
 void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
 __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
 int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc);
-int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags);
 int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot,
 			     unsigned long bits);
 
-static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags)
-{
-	return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags);
-}
+bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu);
 
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 0b14d894f38a..a3250ad83a8e 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -21,13 +21,14 @@
 #include <asm/ebcdic.h>
 #include <asm/sysinfo.h>
 #include <asm/page-states.h>
-#include <asm/gmap.h>
 #include <asm/ptrace.h>
 #include <asm/sclp.h>
 #include <asm/ap.h>
+#include <asm/gmap_helpers.h>
 #include "gaccess.h"
 #include "kvm-s390.h"
 #include "trace.h"
+#include "gmap.h"
 
 static int handle_ri(struct kvm_vcpu *vcpu)
 {
@@ -222,7 +223,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.skey_enabled)
 		return 0;
 
-	rc = s390_enable_skey();
+	rc = gmap_enable_skeys(vcpu->arch.gmap);
 	VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
 	if (rc)
 		return rc;
@@ -255,10 +256,9 @@ static int try_handle_skey(struct kvm_vcpu *vcpu)
 
 static int handle_iske(struct kvm_vcpu *vcpu)
 {
-	unsigned long gaddr, vmaddr;
-	unsigned char key;
+	unsigned long gaddr;
 	int reg1, reg2;
-	bool unlocked;
+	union skey key;
 	int rc;
 
 	vcpu->stat.instruction_iske++;
@@ -275,37 +275,21 @@ static int handle_iske(struct kvm_vcpu *vcpu)
 	gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
 	gaddr = kvm_s390_logical_to_effective(vcpu, gaddr);
 	gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
-	vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr));
-	if (kvm_is_error_hva(vmaddr))
-		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-retry:
-	unlocked = false;
-	mmap_read_lock(current->mm);
-	rc = get_guest_storage_key(current->mm, vmaddr, &key);
-
-	if (rc) {
-		rc = fixup_user_fault(current->mm, vmaddr,
-				      FAULT_FLAG_WRITE, &unlocked);
-		if (!rc) {
-			mmap_read_unlock(current->mm);
-			goto retry;
-		}
-	}
-	mmap_read_unlock(current->mm);
-	if (rc == -EFAULT)
-		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+	scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+		rc = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr), &key);
+	if (rc > 0)
+		return kvm_s390_inject_program_int(vcpu, rc);
 	if (rc < 0)
 		return rc;
 	vcpu->run->s.regs.gprs[reg1] &= ~0xff;
-	vcpu->run->s.regs.gprs[reg1] |= key;
+	vcpu->run->s.regs.gprs[reg1] |= key.skey;
 	return 0;
 }
 
 static int handle_rrbe(struct kvm_vcpu *vcpu)
 {
-	unsigned long vmaddr, gaddr;
+	unsigned long gaddr;
 	int reg1, reg2;
-	bool unlocked;
 	int rc;
 
 	vcpu->stat.instruction_rrbe++;
@@ -322,24 +306,10 @@ static int handle_rrbe(struct kvm_vcpu *vcpu)
 	gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
 	gaddr = kvm_s390_logical_to_effective(vcpu, gaddr);
 	gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
-	vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr));
-	if (kvm_is_error_hva(vmaddr))
-		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-retry:
-	unlocked = false;
-	mmap_read_lock(current->mm);
-	rc = reset_guest_reference_bit(current->mm, vmaddr);
-	if (rc < 0) {
-		rc = fixup_user_fault(current->mm, vmaddr,
-				      FAULT_FLAG_WRITE, &unlocked);
-		if (!rc) {
-			mmap_read_unlock(current->mm);
-			goto retry;
-		}
-	}
-	mmap_read_unlock(current->mm);
-	if (rc == -EFAULT)
-		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+	scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+		rc = dat_reset_reference_bit(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr));
+	if (rc > 0)
+		return kvm_s390_inject_program_int(vcpu, rc);
 	if (rc < 0)
 		return rc;
 	kvm_s390_set_psw_cc(vcpu, rc);
@@ -354,9 +324,8 @@ static int handle_sske(struct kvm_vcpu *vcpu)
 {
 	unsigned char m3 = vcpu->arch.sie_block->ipb >> 28;
 	unsigned long start, end;
-	unsigned char key, oldkey;
+	union skey key, oldkey;
 	int reg1, reg2;
-	bool unlocked;
 	int rc;
 
 	vcpu->stat.instruction_sske++;
@@ -377,7 +346,7 @@ static int handle_sske(struct kvm_vcpu *vcpu)
 
 	kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
 
-	key = vcpu->run->s.regs.gprs[reg1] & 0xfe;
+	key.skey = vcpu->run->s.regs.gprs[reg1] & 0xfe;
 	start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
 	start = kvm_s390_logical_to_effective(vcpu, start);
 	if (m3 & SSKE_MB) {
@@ -389,27 +358,17 @@ static int handle_sske(struct kvm_vcpu *vcpu)
 	}
 
 	while (start != end) {
-		unsigned long vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
-		unlocked = false;
-
-		if (kvm_is_error_hva(vmaddr))
-			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-
-		mmap_read_lock(current->mm);
-		rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey,
-						m3 & SSKE_NQ, m3 & SSKE_MR,
-						m3 & SSKE_MC);
-
-		if (rc < 0) {
-			rc = fixup_user_fault(current->mm, vmaddr,
-					      FAULT_FLAG_WRITE, &unlocked);
-			rc = !rc ? -EAGAIN : rc;
+		scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
+			rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce,
+						      gpa_to_gfn(start), key, &oldkey,
+						      m3 & SSKE_NQ, m3 & SSKE_MR, m3 & SSKE_MC);
 		}
-		mmap_read_unlock(current->mm);
-		if (rc == -EFAULT)
+		if (rc > 1)
 			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-		if (rc == -EAGAIN)
+		if (rc == -ENOMEM) {
+			kvm_s390_mmu_cache_topup(vcpu->arch.mc);
 			continue;
+		}
 		if (rc < 0)
 			return rc;
 		start += PAGE_SIZE;
@@ -422,7 +381,7 @@ static int handle_sske(struct kvm_vcpu *vcpu)
 		} else {
 			kvm_s390_set_psw_cc(vcpu, rc);
 			vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL;
-			vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8;
+			vcpu->run->s.regs.gprs[reg1] |= (u64)oldkey.skey << 8;
 		}
 	}
 	if (m3 & SSKE_MB) {
@@ -1082,7 +1041,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 	bool mr = false, mc = false, nq;
 	int reg1, reg2;
 	unsigned long start, end;
-	unsigned char key;
+	union skey key;
 
 	vcpu->stat.instruction_pfmf++;
 
@@ -1110,7 +1069,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 	}
 
 	nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ;
-	key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
+	key.skey = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
 	start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
 	start = kvm_s390_logical_to_effective(vcpu, start);
 
@@ -1141,14 +1100,6 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 	}
 
 	while (start != end) {
-		unsigned long vmaddr;
-		bool unlocked = false;
-
-		/* Translate guest address to host address */
-		vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
-		if (kvm_is_error_hva(vmaddr))
-			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-
 		if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
 			if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE))
 				return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
@@ -1159,19 +1110,17 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 
 			if (rc)
 				return rc;
-			mmap_read_lock(current->mm);
-			rc = cond_set_guest_storage_key(current->mm, vmaddr,
-							key, NULL, nq, mr, mc);
-			if (rc < 0) {
-				rc = fixup_user_fault(current->mm, vmaddr,
-						      FAULT_FLAG_WRITE, &unlocked);
-				rc = !rc ? -EAGAIN : rc;
+			scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
+				rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce,
+							      gpa_to_gfn(start), key,
+							      NULL, nq, mr, mc);
 			}
-			mmap_read_unlock(current->mm);
-			if (rc == -EFAULT)
-				return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-			if (rc == -EAGAIN)
+			if (rc > 1)
+				return kvm_s390_inject_program_int(vcpu, rc);
+			if (rc == -ENOMEM) {
+				kvm_s390_mmu_cache_topup(vcpu->arch.mc);
 				continue;
+			}
 			if (rc < 0)
 				return rc;
 		}
@@ -1195,8 +1144,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
 {
 	int r1, r2, nappended, entries;
-	unsigned long gfn, hva, res, pgstev, ptev;
+	union essa_state state;
 	unsigned long *cbrlo;
+	unsigned long gfn;
+	bool dirtied;
 
 	/*
 	 * We don't need to set SD.FPF.SK to 1 here, because if we have a
@@ -1205,33 +1156,12 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
 
 	kvm_s390_get_regs_rre(vcpu, &r1, &r2);
 	gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT;
-	hva = gfn_to_hva(vcpu->kvm, gfn);
 	entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
 
-	if (kvm_is_error_hva(hva))
-		return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-
-	nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev);
-	if (nappended < 0) {
-		res = orc ? 0x10 : 0;
-		vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */
+	nappended = dat_perform_essa(vcpu->arch.gmap->asce, gfn, orc, &state, &dirtied);
+	vcpu->run->s.regs.gprs[r1] = state.val;
+	if (nappended < 0)
 		return 0;
-	}
-	res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22;
-	/*
-	 * Set the block-content state part of the result. 0 means resident, so
-	 * nothing to do if the page is valid. 2 is for preserved pages
-	 * (non-present and non-zero), and 3 for zero pages (non-present and
-	 * zero).
-	 */
-	if (ptev & _PAGE_INVALID) {
-		res |= 2;
-		if (pgstev & _PGSTE_GPS_ZERO)
-			res |= 1;
-	}
-	if (pgstev & _PGSTE_GPS_NODAT)
-		res |= 0x20;
-	vcpu->run->s.regs.gprs[r1] = res;
 	/*
 	 * It is possible that all the normal 511 slots were full, in which case
 	 * we will now write in the 512th slot, which is reserved for host use.
@@ -1243,17 +1173,34 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
 		cbrlo[entries] = gfn << PAGE_SHIFT;
 	}
 
-	if (orc) {
-		struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn);
-
-		/* Increment only if we are really flipping the bit */
-		if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms)))
-			atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
-	}
+	if (dirtied)
+		atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
 
 	return nappended;
 }
 
+static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len)
+{
+	union crste *crstep;
+	union pgste pgste;
+	union pte *ptep;
+	int i;
+	/* Zap each listed page that is not pr and whose PGSTE says unused or zero. */
+	lockdep_assert_held(&vcpu->kvm->mmu_lock);
+	/* NOTE(review): cbrl[i] is a guest address - confirm the zap helper expects one. */
+	for (i = 0; i < len; i++) {
+		if (dat_entry_walk(NULL, gpa_to_gfn(cbrl[i]), vcpu->arch.gmap->asce,
+				   0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep))
+			continue;	/* nonzero walk result: skip this entry */
+		if (!ptep || ptep->s.pr)
+			continue;	/* no pte, or pr set (presumably "present") */
+		pgste = pgste_get_lock(ptep);
+		if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero)
+			gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]);
+		pgste_set_unlock(ptep, pgste);
+	}
+}
+
 static int handle_essa(struct kvm_vcpu *vcpu)
 {
 	lockdep_assert_held(&vcpu->kvm->srcu);
@@ -1261,11 +1208,9 @@ static int handle_essa(struct kvm_vcpu *vcpu)
 	/* entries expected to be 1FF */
 	int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
 	unsigned long *cbrlo;
-	struct gmap *gmap;
 	int i, orc;
 
 	VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries);
-	gmap = vcpu->arch.gmap;
 	vcpu->stat.instruction_essa++;
 	if (!vcpu->kvm->arch.use_cmma)
 		return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
@@ -1289,11 +1234,7 @@ static int handle_essa(struct kvm_vcpu *vcpu)
 		 * value really needs to be written to; if the value is
 		 * already correct, we do nothing and avoid the lock.
 		 */
-		if (vcpu->kvm->mm->context.uses_cmm == 0) {
-			mmap_write_lock(vcpu->kvm->mm);
-			vcpu->kvm->mm->context.uses_cmm = 1;
-			mmap_write_unlock(vcpu->kvm->mm);
-		}
+		set_bit(GMAP_FLAG_USES_CMM, &vcpu->arch.gmap->flags);
 		/*
 		 * If we are here, we are supposed to have CMMA enabled in
 		 * the SIE block. Enabling CMMA works on a per-CPU basis,
@@ -1307,20 +1248,22 @@ static int handle_essa(struct kvm_vcpu *vcpu)
 		/* Retry the ESSA instruction */
 		kvm_s390_retry_instr(vcpu);
 	} else {
-		mmap_read_lock(vcpu->kvm->mm);
-		i = __do_essa(vcpu, orc);
-		mmap_read_unlock(vcpu->kvm->mm);
+		scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+			i = __do_essa(vcpu, orc);
 		if (i < 0)
 			return i;
 		/* Account for the possible extra cbrl entry */
 		entries += i;
 	}
-	vcpu->arch.sie_block->cbrlo &= PAGE_MASK;	/* reset nceo */
+	/* reset nceo */
+	vcpu->arch.sie_block->cbrlo &= PAGE_MASK;
 	cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
-	mmap_read_lock(gmap->mm);
-	for (i = 0; i < entries; ++i)
-		__gmap_zap(gmap, cbrlo[i]);
-	mmap_read_unlock(gmap->mm);
+
+	mmap_read_lock(vcpu->kvm->mm);
+	scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+		_essa_clear_cbrl(vcpu, cbrlo, entries);
+	mmap_read_unlock(vcpu->kvm->mm);
+
 	return 0;
 }
 
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 6ba5a0305e25..461b413c76a3 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -12,13 +12,16 @@
 #include <linux/minmax.h>
 #include <linux/pagemap.h>
 #include <linux/sched/signal.h>
-#include <asm/gmap.h>
 #include <asm/uv.h>
 #include <asm/mman.h>
 #include <linux/pagewalk.h>
 #include <linux/sched/mm.h>
 #include <linux/mmu_notifier.h>
 #include "kvm-s390.h"
+#include "dat.h"
+#include "gaccess.h"
+#include "gmap.h"
+#include "faultin.h"
 
 bool kvm_s390_pv_is_protected(struct kvm *kvm)
 {
@@ -35,6 +38,85 @@ bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
 EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected);
 
 /**
+ * should_export_before_import() - Determine whether an export is needed
+ * before an import-like operation.
+ * @uvcb: The Ultravisor control block of the UVC to be performed.
+ * @mm: The mm of the process.
+ *
+ * Exporting before every import-like operation is needed for shared pages,
+ * which don't trigger a secure storage exception when accessed from a
+ * different guest.
+ *
+ * Although it is treated as an import-like operation, the Unpin Page UVC is
+ * not an actual import, so it never needs an export.
+ *
+ * An export is also not needed when there is only one protected VM, because
+ * the page cannot belong to the wrong VM in that case (there is no "other
+ * VM" it can belong to).
+ *
+ * Return: %true if an export is needed before every import, otherwise %false.
+ */
+static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
+{
+	/*
+	 * The misc feature indicates, among other things, that importing a
+	 * shared page from a different protected VM will automatically also
+	 * transfer its ownership.
+	 */
+	if (uv_has_feature(BIT_UV_FEAT_MISC))
+		return false;
+	if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)	/* not an actual import */
+		return false;
+	return atomic_read(&mm->context.protected_count) > 1;	/* more than one protected VM */
+}
+/* Private data shared between kvm_s390_pv_make_secure() and its guest_fault callback. */
+struct pv_make_secure {
+	void *uvcb;		/* UV control block handed to __make_folio_secure() */
+	struct folio *folio;	/* set, with a reference held, on -E2BIG or -EBUSY */
+	int rc;			/* attempt result; -EAGAIN if folio_trylock() failed */
+	bool needs_export;	/* call uv_convert_from_secure() before the attempt */
+};
+/* Make @folio secure using priv->uvcb; its visible caller holds the folio lock. */
+static int __kvm_s390_pv_make_secure(struct guest_fault *f, struct folio *folio)
+{
+	struct pv_make_secure *priv = f->priv;
+	int rc;
+
+	if (priv->needs_export)	/* runs before the folio size checks below */
+		uv_convert_from_secure(folio_to_phys(folio));
+	/* hugetlb folios are refused with -EFAULT; other large folios yield -E2BIG. */
+	if (folio_test_hugetlb(folio))
+		return -EFAULT;
+	if (folio_test_large(folio))
+		return -E2BIG;
+	/* When f->page is unset, hold a folio reference across __make_folio_secure(). */
+	if (!f->page)
+		folio_get(folio);
+	rc = __make_folio_secure(folio, priv->uvcb);
+	if (!f->page)
+		folio_put(folio);
+
+	return rc;
+}
+/* guest_fault callback: try to make the faulted-in pfn secure; result goes to priv->rc. */
+static void _kvm_s390_pv_make_secure(struct guest_fault *f)
+{
+	struct pv_make_secure *priv = f->priv;
+	struct folio *folio;
+
+	folio = pfn_folio(f->pfn);
+	priv->rc = -EAGAIN;	/* reported when the folio lock cannot be taken */
+	if (folio_trylock(folio)) {
+		priv->rc = __kvm_s390_pv_make_secure(f, folio);
+		if (priv->rc == -E2BIG || priv->rc == -EBUSY) {
+			priv->folio = folio;	/* hand the folio back to the caller ... */
+			folio_get(folio);	/* ... with a reference it must drop */
+		}
+		folio_unlock(folio);
+	}
+}
+
+/**
  * kvm_s390_pv_make_secure() - make one guest page secure
  * @kvm: the guest
  * @gaddr: the guest address that needs to be made secure
@@ -45,14 +127,34 @@ EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected);
  */
 int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb)
 {
-	unsigned long vmaddr;
+	struct pv_make_secure priv = { .uvcb = uvcb };
+	struct guest_fault f = {
+		.write_attempt = true,
+		.gfn = gpa_to_gfn(gaddr),
+		.callback = _kvm_s390_pv_make_secure,	/* stores its result in priv.rc */
+		.priv = &priv,
+	};
+	int rc;
 
 	lockdep_assert_held(&kvm->srcu);
 
-	vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr));
-	if (kvm_is_error_hva(vmaddr))
-		return -EFAULT;
-	return make_hva_secure(kvm->mm, vmaddr, uvcb);
+	priv.needs_export = should_export_before_import(uvcb, kvm->mm);
+	/* The fault-in and any follow-up folio handling run under pv.import_lock. */
+	scoped_guard(mutex, &kvm->arch.pv.import_lock) {
+		rc = kvm_s390_faultin_gfn(NULL, kvm, &f);
+		/* On success report the callback's result, unless a folio was handed back. */
+		if (!rc) {
+			rc = priv.rc;
+			if (priv.folio) {
+				rc = s390_wiggle_split_folio(kvm->mm, priv.folio);
+				if (!rc)
+					rc = -EAGAIN;	/* helper succeeded: still report -EAGAIN */
+			}
+		}
+	}
+	if (priv.folio)
+		folio_put(priv.folio);	/* drop the reference taken in the callback */
+	return rc;
 }
 
 int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr)
@@ -299,35 +401,6 @@ done_fast:
 	return 0;
 }
 
-/**
- * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory.
- * @kvm: the VM whose memory is to be cleared.
- *
- * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot.
- * The CPUs of the protected VM need to be destroyed beforehand.
- */
-static void kvm_s390_destroy_lower_2g(struct kvm *kvm)
-{
-	const unsigned long pages_2g = SZ_2G / PAGE_SIZE;
-	struct kvm_memory_slot *slot;
-	unsigned long len;
-	int srcu_idx;
-
-	srcu_idx = srcu_read_lock(&kvm->srcu);
-
-	/* Take the memslot containing guest absolute address 0 */
-	slot = gfn_to_memslot(kvm, 0);
-	/* Clear all slots or parts thereof that are below 2GB */
-	while (slot && slot->base_gfn < pages_2g) {
-		len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE;
-		s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len);
-		/* Take the next memslot */
-		slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages);
-	}
-
-	srcu_read_unlock(&kvm->srcu, srcu_idx);
-}
-
 static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
 {
 	struct uv_cb_destroy_fast uvcb = {
@@ -342,7 +415,6 @@ static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
 		*rc = uvcb.header.rc;
 	if (rrc)
 		*rrc = uvcb.header.rrc;
-	WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
 	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x",
 		     uvcb.header.rc, uvcb.header.rrc);
 	WARN_ONCE(cc && uvcb.header.rc != 0x104,
@@ -391,7 +463,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
 		return -EINVAL;
 
 	/* Guest with segment type ASCE, refuse to destroy asynchronously */
-	if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
+	if (kvm->arch.gmap->asce.dt == TABLE_TYPE_SEGMENT)
 		return -EINVAL;
 
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
@@ -404,8 +476,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
 		priv->stor_var = kvm->arch.pv.stor_var;
 		priv->stor_base = kvm->arch.pv.stor_base;
 		priv->handle = kvm_s390_pv_get_handle(kvm);
-		priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table;
-		WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
+		priv->old_gmap_table = (unsigned long)dereference_asce(kvm->arch.gmap->asce);
 		if (s390_replace_asce(kvm->arch.gmap))
 			res = -ENOMEM;
 	}
@@ -415,7 +486,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
 		return res;
 	}
 
-	kvm_s390_destroy_lower_2g(kvm);
+	gmap_pv_destroy_range(kvm->arch.gmap, 0, gpa_to_gfn(SZ_2G), false);
 	kvm_s390_clear_pv_state(kvm);
 	kvm->arch.pv.set_aside = priv;
 
@@ -449,7 +520,6 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
 
 	cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
 			   UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
-	WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
 	if (!cc) {
 		atomic_dec(&kvm->mm->context.protected_count);
 		kvm_s390_pv_dealloc_vm(kvm);
@@ -532,7 +602,7 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
 	 * cleanup has been performed.
 	 */
 	if (need_zap && mmget_not_zero(kvm->mm)) {
-		s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
+		gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), false);
 		mmput(kvm->mm);
 	}
 
@@ -570,7 +640,7 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
 		return -EINVAL;
 
 	/* When a fatal signal is received, stop immediately */
-	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
+	if (gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), true))
 		goto done;
 	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
 		ret = -EIO;
@@ -609,6 +679,7 @@ static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
 	r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
 	if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm))
 		kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy);
+	set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags);
 }
 
 static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = {
@@ -642,7 +713,7 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
 	/* Inputs */
 	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
 	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
-	uvcb.guest_asce = kvm->arch.gmap->asce;
+	uvcb.guest_asce = kvm->arch.gmap->asce.val;
 	uvcb.guest_sca = virt_to_phys(kvm->arch.sca);
 	uvcb.conf_base_stor_origin =
 		virt_to_phys((void *)kvm->arch.pv.stor_base);
@@ -650,6 +721,9 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
 	uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap;
 	uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr;
 
+	clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags);
+	gmap_split_huge_pages(kvm->arch.gmap);
+
 	cc = uv_call_sched(0, (u64)&uvcb);
 	*rc = uvcb.header.rc;
 	*rrc = uvcb.header.rrc;
@@ -669,7 +743,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
 		}
 		return -EIO;
 	}
-	kvm->arch.gmap->guest_handle = uvcb.guest_handle;
 	return 0;
 }
 
@@ -704,26 +777,14 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
 		.tweak[1] = offset,
 	};
 	int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb);
-	unsigned long vmaddr;
-	bool unlocked;
 
 	*rc = uvcb.header.rc;
 	*rrc = uvcb.header.rrc;
 
 	if (ret == -ENXIO) {
-		mmap_read_lock(kvm->mm);
-		vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr));
-		if (kvm_is_error_hva(vmaddr)) {
-			ret = -EFAULT;
-		} else {
-			ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
-			if (!ret)
-				ret = __gmap_link(kvm->arch.gmap, addr, vmaddr);
-		}
-		mmap_read_unlock(kvm->mm);
+		ret = kvm_s390_faultin_gfn_simple(NULL, kvm, gpa_to_gfn(addr), true);
 		if (!ret)
 			return -EAGAIN;
-		return ret;
 	}
 
 	if (ret && ret != -EAGAIN)
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index b526621d2a1b..d249b10044eb 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -15,7 +15,6 @@
 #include <linux/io.h>
 #include <linux/mman.h>
 
-#include <asm/gmap.h>
 #include <asm/mmu_context.h>
 #include <asm/sclp.h>
 #include <asm/nmi.h>
@@ -23,6 +22,7 @@
 #include <asm/facility.h>
 #include "kvm-s390.h"
 #include "gaccess.h"
+#include "gmap.h"
 
 enum vsie_page_flags {
 	VSIE_PAGE_IN_USE = 0,
@@ -41,8 +41,11 @@ struct vsie_page {
 	 * are reused conditionally, should be accessed via READ_ONCE.
 	 */
 	struct kvm_s390_sie_block *scb_o;	/* 0x0218 */
-	/* the shadow gmap in use by the vsie_page */
-	struct gmap *gmap;			/* 0x0220 */
+	/*
+	 * Flags: must be set/cleared atomically after the vsie page can be
+	 * looked up by other CPUs.
+	 */
+	unsigned long flags;			/* 0x0220 */
 	/* address of the last reported fault to guest2 */
 	unsigned long fault_addr;		/* 0x0228 */
 	/* calculated guest addresses of satellite control blocks */
@@ -57,33 +60,14 @@ struct vsie_page {
 	 * radix tree.
 	 */
 	gpa_t scb_gpa;				/* 0x0258 */
-	/*
-	 * Flags: must be set/cleared atomically after the vsie page can be
-	 * looked up by other CPUs.
-	 */
-	unsigned long flags;			/* 0x0260 */
-	__u8 reserved[0x0700 - 0x0268];		/* 0x0268 */
+	/* the shadow gmap in use by the vsie_page */
+	struct gmap_cache gmap_cache;		/* 0x0260 */
+	__u8 reserved[0x0700 - 0x0278];		/* 0x0278 */
 	struct kvm_s390_crypto_cb crycb;	/* 0x0700 */
 	__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE];	/* 0x0800 */
 };
 
-/**
- * gmap_shadow_valid() - check if a shadow guest address space matches the
- *                       given properties and is still valid
- * @sg: pointer to the shadow guest address space structure
- * @asce: ASCE for which the shadow table is requested
- * @edat_level: edat level to be used for the shadow translation
- *
- * Returns 1 if the gmap shadow is still valid and matches the given
- * properties, the caller can continue using it. Returns 0 otherwise; the
- * caller has to request a new shadow gmap in this case.
- */
-int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
-{
-	if (sg->removed)
-		return 0;
-	return sg->orig_asce == asce && sg->edat_level == edat_level;
-}
+static_assert(sizeof(struct vsie_page) == PAGE_SIZE);
 
 /* trigger a validity icpt for the given scb */
 static int set_validity_icpt(struct kvm_s390_sie_block *scb,
@@ -612,26 +596,17 @@ out:
 	return rc;
 }
 
-void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
-				 unsigned long end)
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end)
 {
-	struct kvm *kvm = gmap->private;
-	struct vsie_page *cur;
+	struct vsie_page *cur, *next;
 	unsigned long prefix;
-	int i;
 
-	if (!gmap_is_shadow(gmap))
-		return;
+	KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &gmap->flags), gmap->kvm);
 	/*
 	 * Only new shadow blocks are added to the list during runtime,
 	 * therefore we can safely reference them all the time.
 	 */
-	for (i = 0; i < kvm->arch.vsie.page_count; i++) {
-		cur = READ_ONCE(kvm->arch.vsie.pages[i]);
-		if (!cur)
-			continue;
-		if (READ_ONCE(cur->gmap) != gmap)
-			continue;
+	list_for_each_entry_safe(cur, next, &gmap->scb_users, gmap_cache.list) {
 		prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
 		/* with mso/msl, the prefix lies at an offset */
 		prefix += cur->scb_s.mso;
@@ -652,7 +627,7 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
  *          - -EAGAIN if the caller can retry immediately
  *          - -ENOMEM if out of memory
  */
-static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 	u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
@@ -667,10 +642,9 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	/* with mso/msl, the prefix lies at offset *mso* */
 	prefix += scb_s->mso;
 
-	rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
+	rc = gaccess_shadow_fault(vcpu, sg, prefix, NULL, true);
 	if (!rc && (scb_s->ecb & ECB_TE))
-		rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-					   prefix + PAGE_SIZE, NULL);
+		rc = gaccess_shadow_fault(vcpu, sg, prefix + PAGE_SIZE, NULL, true);
 	/*
 	 * We don't have to mprotect, we will be called for all unshadows.
 	 * SIE will detect if protection applies and trigger a validity.
@@ -951,8 +925,9 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
  *          - > 0 if control has to be given to guest 2
  *          - < 0 if an error occurred
  */
-static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
 {
+	bool wr = kvm_s390_cur_gmap_fault_is_write();
 	int rc;
 
 	if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION)
@@ -960,12 +935,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		return inject_fault(vcpu, PGM_PROTECTION,
 				    current->thread.gmap_teid.addr * PAGE_SIZE, 1);
 
-	rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-				   current->thread.gmap_teid.addr * PAGE_SIZE, NULL);
+	rc = gaccess_shadow_fault(vcpu, sg, current->thread.gmap_teid.addr * PAGE_SIZE, NULL, wr);
 	if (rc > 0) {
 		rc = inject_fault(vcpu, rc,
-				  current->thread.gmap_teid.addr * PAGE_SIZE,
-				  kvm_s390_cur_gmap_fault_is_write());
+				  current->thread.gmap_teid.addr * PAGE_SIZE, wr);
 		if (rc >= 0)
 			vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE;
 	}
@@ -978,12 +951,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
  *
  * Will ignore any errors. The next SIE fault will do proper fault handling.
  */
-static void handle_last_fault(struct kvm_vcpu *vcpu,
-			      struct vsie_page *vsie_page)
+static void handle_last_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
 {
 	if (vsie_page->fault_addr)
-		kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
-				      vsie_page->fault_addr, NULL);
+		gaccess_shadow_fault(vcpu, sg, vsie_page->fault_addr, NULL, true);
 	vsie_page->fault_addr = 0;
 }
 
@@ -1065,11 +1036,12 @@ static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
 	}
 }
 
-static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
-	unsigned long pei_dest, pei_src, src, dest, mask, prefix;
+	unsigned long src, dest, mask, prefix;
 	u64 *pei_block = &vsie_page->scb_o->mcic;
+	union mvpg_pei pei_dest, pei_src;
 	int edat, rc_dest, rc_src;
 	union ctlreg0 cr0;
 
@@ -1083,8 +1055,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
 	src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
 
-	rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
-	rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
+	rc_dest = gaccess_shadow_fault(vcpu, sg, dest, &pei_dest, true);
+	rc_src = gaccess_shadow_fault(vcpu, sg, src, &pei_src, false);
 	/*
 	 * Either everything went well, or something non-critical went wrong
 	 * e.g. because of a race. In either case, simply retry.
@@ -1119,8 +1091,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
 	}
 	if (!rc_dest && !rc_src) {
-		pei_block[0] = pei_dest;
-		pei_block[1] = pei_src;
+		pei_block[0] = pei_dest.val;
+		pei_block[1] = pei_src.val;
 		return 1;
 	}
 
@@ -1144,7 +1116,7 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
  *          - > 0 if control has to be given to guest 2
  *          - < 0 if an error occurred
  */
-static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
 	__releases(vcpu->kvm->srcu)
 	__acquires(vcpu->kvm->srcu)
 {
@@ -1153,7 +1125,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	int guest_bp_isolation;
 	int rc = 0;
 
-	handle_last_fault(vcpu, vsie_page);
+	handle_last_fault(vcpu, vsie_page, sg);
 
 	kvm_vcpu_srcu_read_unlock(vcpu);
 
@@ -1191,7 +1163,7 @@ xfer_to_guest_mode_check:
 			goto xfer_to_guest_mode_check;
 		}
 		guest_timing_enter_irqoff();
-		rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
+		rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, sg->asce.val);
 		guest_timing_exit_irqoff();
 		local_irq_enable();
 	}
@@ -1215,7 +1187,7 @@ skip_sie:
 	if (rc > 0)
 		rc = 0; /* we could still have an icpt */
 	else if (current->thread.gmap_int_code)
-		return handle_fault(vcpu, vsie_page);
+		return handle_fault(vcpu, vsie_page, sg);
 
 	switch (scb_s->icptcode) {
 	case ICPT_INST:
@@ -1233,7 +1205,7 @@ skip_sie:
 		break;
 	case ICPT_PARTEXEC:
 		if (scb_s->ipa == 0xb254)
-			rc = vsie_handle_mvpg(vcpu, vsie_page);
+			rc = vsie_handle_mvpg(vcpu, vsie_page, sg);
 		break;
 	}
 	return rc;
@@ -1241,43 +1213,67 @@ skip_sie:
 
 static void release_gmap_shadow(struct vsie_page *vsie_page)
 {
-	if (vsie_page->gmap)
-		gmap_put(vsie_page->gmap);
-	WRITE_ONCE(vsie_page->gmap, NULL);
+	struct gmap *gmap = vsie_page->gmap_cache.gmap;
+
+	lockdep_assert_held(&gmap->kvm->arch.gmap->children_lock);
+	/* gmap is dereferenced above, so callers must only call this with a cached gmap. */
+	list_del(&vsie_page->gmap_cache.list);
+	vsie_page->gmap_cache.gmap = NULL;
 	prefix_unmapped(vsie_page);
+	/* No vsie page left on scb_users: remove the shadow as a child and put it. */
+	if (list_empty(&gmap->scb_users)) {
+		gmap_remove_child(gmap);
+		gmap_put(gmap);
+	}
 }
 
-static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
-			       struct vsie_page *vsie_page)
+static struct gmap *acquire_gmap_shadow(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
-	unsigned long asce;
 	union ctlreg0 cr0;
 	struct gmap *gmap;
+	union asce asce;	/* taken from gcr[1]; the shadow is looked up/created for it */
 	int edat;
 
-	asce = vcpu->arch.sie_block->gcr[1];
+	asce.val = vcpu->arch.sie_block->gcr[1];
 	cr0.val = vcpu->arch.sie_block->gcr[0];
 	edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
 	edat += edat && test_kvm_facility(vcpu->kvm, 78);
 
-	/*
-	 * ASCE or EDAT could have changed since last icpt, or the gmap
-	 * we're holding has been unshadowed. If the gmap is still valid,
-	 * we can safely reuse it.
-	 */
-	if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) {
-		vcpu->kvm->stat.gmap_shadow_reuse++;
-		return 0;
+	scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) {
+		gmap = vsie_page->gmap_cache.gmap;
+		if (gmap) {
+			/*
+			 * ASCE or EDAT could have changed since last icpt, or the gmap
+			 * we're holding has been unshadowed. If the gmap is still valid,
+			 * we can safely reuse it.
+			 */
+			if (gmap_is_shadow_valid(gmap, asce, edat)) {
+				vcpu->kvm->stat.gmap_shadow_reuse++;
+				gmap_get(gmap);	/* extra reference for the caller */
+				return gmap;
+			}
+			/* release the old shadow and mark the prefix as unmapped */
+			release_gmap_shadow(vsie_page);
+		}
 	}
-
-	/* release the old shadow - if any, and mark the prefix as unmapped */
-	release_gmap_shadow(vsie_page);
-	gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+again:
+	gmap = gmap_create_shadow(vcpu->arch.mc, vcpu->kvm->arch.gmap, asce, edat);
 	if (IS_ERR(gmap))
-		return PTR_ERR(gmap);
-	vcpu->kvm->stat.gmap_shadow_create++;
-	WRITE_ONCE(vsie_page->gmap, gmap);
-	return 0;
+		return gmap;	/* ERR_PTR value, returned as is */
+	scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) {
+		/* unlikely race: a shadow got cached meanwhile, release it first */
+		if (vsie_page->gmap_cache.gmap)
+			release_gmap_shadow(vsie_page);
+		if (!gmap->parent) {
+			gmap_put(gmap);
+			goto again;	/* shadow has no parent: create a new one */
+		}
+		vcpu->kvm->stat.gmap_shadow_create++;
+		list_add(&vsie_page->gmap_cache.list, &gmap->scb_users);
+		vsie_page->gmap_cache.gmap = gmap;
+		prefix_unmapped(vsie_page);
+	}
+	return gmap;	/* NOTE(review): no gmap_get() here unlike the reuse path - confirm */
 }
 
 /*
@@ -1330,15 +1326,20 @@ static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
 static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	struct gmap *sg;
 	int rc = 0;
 
 	while (1) {
-		rc = acquire_gmap_shadow(vcpu, vsie_page);
+		sg = acquire_gmap_shadow(vcpu, vsie_page);
+		if (IS_ERR(sg)) {
+			rc = PTR_ERR(sg);
+			sg = NULL;
+		}
 		if (!rc)
-			rc = map_prefix(vcpu, vsie_page);
+			rc = map_prefix(vcpu, vsie_page, sg);
 		if (!rc) {
 			update_intervention_requests(vsie_page);
-			rc = do_vsie_run(vcpu, vsie_page);
+			rc = do_vsie_run(vcpu, vsie_page, sg);
 		}
 		atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
 
@@ -1361,6 +1362,9 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			kvm_s390_rewind_psw(vcpu, 4);
 			break;
 		}
+		if (sg)
+			sg = gmap_put(sg);
+		cond_resched();
 	}
 
 	if (rc == -EFAULT) {
@@ -1457,8 +1461,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
 	vsie_page->scb_gpa = ULONG_MAX;
 
 	/* Double use of the same address or allocation failure. */
-	if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9,
-			      vsie_page)) {
+	if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, vsie_page)) {
 		put_vsie_page(vsie_page);
 		mutex_unlock(&kvm->arch.vsie.mutex);
 		return NULL;
@@ -1467,7 +1470,12 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
 	mutex_unlock(&kvm->arch.vsie.mutex);
 
 	memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
-	release_gmap_shadow(vsie_page);
+	if (vsie_page->gmap_cache.gmap) {
+		scoped_guard(spinlock, &kvm->arch.gmap->children_lock)
+			if (vsie_page->gmap_cache.gmap)
+				release_gmap_shadow(vsie_page);
+	}
+	prefix_unmapped(vsie_page);
 	vsie_page->fault_addr = 0;
 	vsie_page->scb_s.ihcpu = 0xffffU;
 	return vsie_page;
@@ -1498,11 +1506,13 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
 	}
 
 	vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
-	if (IS_ERR(vsie_page))
+	if (IS_ERR(vsie_page)) {
 		return PTR_ERR(vsie_page);
-	else if (!vsie_page)
+	} else if (!vsie_page) {
 		/* double use of sie control block - simply do nothing */
+		kvm_s390_rewind_psw(vcpu, 4);
 		return 0;
+	}
 
 	rc = pin_scb(vcpu, vsie_page, scb_addr);
 	if (rc)
@@ -1543,8 +1553,10 @@ void kvm_s390_vsie_destroy(struct kvm *kvm)
 	mutex_lock(&kvm->arch.vsie.mutex);
 	for (i = 0; i < kvm->arch.vsie.page_count; i++) {
 		vsie_page = kvm->arch.vsie.pages[i];
+		scoped_guard(spinlock, &kvm->arch.gmap->children_lock)
+			if (vsie_page->gmap_cache.gmap)
+				release_gmap_shadow(vsie_page);
 		kvm->arch.vsie.pages[i] = NULL;
-		release_gmap_shadow(vsie_page);
 		/* free the radix tree entry */
 		if (vsie_page->scb_gpa != ULONG_MAX)
 			radix_tree_delete(&kvm->arch.vsie.addr_to_page,
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index 1a6ba105e071..0ac2f3998b14 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -34,136 +34,19 @@ void debug_user_asce(int exit)
 }
 #endif /*CONFIG_DEBUG_ENTRY */
 
-union oac {
-	unsigned int val;
-	struct {
-		struct {
-			unsigned short key : 4;
-			unsigned short	   : 4;
-			unsigned short as  : 2;
-			unsigned short	   : 4;
-			unsigned short k   : 1;
-			unsigned short a   : 1;
-		} oac1;
-		struct {
-			unsigned short key : 4;
-			unsigned short	   : 4;
-			unsigned short as  : 2;
-			unsigned short	   : 4;
-			unsigned short k   : 1;
-			unsigned short a   : 1;
-		} oac2;
-	};
-};
-
-static uaccess_kmsan_or_inline __must_check unsigned long
-raw_copy_from_user_key(void *to, const void __user *from, unsigned long size, unsigned long key)
-{
-	unsigned long osize;
-	union oac spec = {
-		.oac2.key = key,
-		.oac2.as = PSW_BITS_AS_SECONDARY,
-		.oac2.k = 1,
-		.oac2.a = 1,
-	};
-	int cc;
-
-	while (1) {
-		osize = size;
-		asm_inline volatile(
-			"	lr	%%r0,%[spec]\n"
-			"0:	mvcos	%[to],%[from],%[size]\n"
-			"1:	nopr	%%r7\n"
-			CC_IPM(cc)
-			EX_TABLE_UA_MVCOS_FROM(0b, 0b)
-			EX_TABLE_UA_MVCOS_FROM(1b, 0b)
-			: CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char *)to)
-			: [spec] "d" (spec.val), [from] "Q" (*(const char __user *)from)
-			: CC_CLOBBER_LIST("memory", "0"));
-		if (CC_TRANSFORM(cc) == 0)
-			return osize - size;
-		size -= 4096;
-		to += 4096;
-		from += 4096;
-	}
-}
-
-unsigned long _copy_from_user_key(void *to, const void __user *from,
-				  unsigned long n, unsigned long key)
-{
-	unsigned long res = n;
-
-	might_fault();
-	if (!should_fail_usercopy()) {
-		instrument_copy_from_user_before(to, from, n);
-		res = raw_copy_from_user_key(to, from, n, key);
-		instrument_copy_from_user_after(to, from, n, res);
-	}
-	if (unlikely(res))
-		memset(to + (n - res), 0, res);
-	return res;
-}
-EXPORT_SYMBOL(_copy_from_user_key);
-
-static uaccess_kmsan_or_inline __must_check unsigned long
-raw_copy_to_user_key(void __user *to, const void *from, unsigned long size, unsigned long key)
-{
-	unsigned long osize;
-	union oac spec = {
-		.oac1.key = key,
-		.oac1.as = PSW_BITS_AS_SECONDARY,
-		.oac1.k = 1,
-		.oac1.a = 1,
-	};
-	int cc;
-
-	while (1) {
-		osize = size;
-		asm_inline volatile(
-			"	lr	%%r0,%[spec]\n"
-			"0:	mvcos	%[to],%[from],%[size]\n"
-			"1:	nopr	%%r7\n"
-			CC_IPM(cc)
-			EX_TABLE_UA_MVCOS_TO(0b, 0b)
-			EX_TABLE_UA_MVCOS_TO(1b, 0b)
-			: CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char __user *)to)
-			: [spec] "d" (spec.val), [from] "Q" (*(const char *)from)
-			: CC_CLOBBER_LIST("memory", "0"));
-		if (CC_TRANSFORM(cc) == 0)
-			return osize - size;
-		size -= 4096;
-		to += 4096;
-		from += 4096;
-	}
-}
-
-unsigned long _copy_to_user_key(void __user *to, const void *from,
-				unsigned long n, unsigned long key)
-{
-	might_fault();
-	if (should_fail_usercopy())
-		return n;
-	instrument_copy_to_user(to, from, n);
-	return raw_copy_to_user_key(to, from, n, key);
-}
-EXPORT_SYMBOL(_copy_to_user_key);
-
 #define CMPXCHG_USER_KEY_MAX_LOOPS 128
 
-static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsigned int *uval,
-						    unsigned int old, unsigned int new,
-						    unsigned int mask, unsigned long key)
+static nokprobe_inline int __cmpxchg_key_small(void *address, unsigned int *uval,
+					       unsigned int old, unsigned int new,
+					       unsigned int mask, unsigned long key)
 {
 	unsigned long count;
 	unsigned int prev;
-	bool sacf_flag;
 	int rc = 0;
 
 	skey_regions_initialize();
-	sacf_flag = enable_sacf_uaccess();
 	asm_inline volatile(
 		"20:	spka	0(%[key])\n"
-		"	sacf	256\n"
 		"	llill	%[count],%[max_loops]\n"
 		"0:	l	%[prev],%[address]\n"
 		"1:	nr	%[prev],%[mask]\n"
@@ -178,8 +61,7 @@ static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsig
 		"	nr	%[tmp],%[mask]\n"
 		"	jnz	5f\n"
 		"	brct	%[count],2b\n"
-		"5:	sacf	768\n"
-		"	spka	%[default_key]\n"
+		"5:	spka	%[default_key]\n"
 		"21:\n"
 		EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev])
 		EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev])
@@ -197,16 +79,16 @@ static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsig
 		[default_key] "J" (PAGE_DEFAULT_KEY),
 		[max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS)
 		: "memory", "cc");
-	disable_sacf_uaccess(sacf_flag);
 	*uval = prev;
 	if (!count)
 		rc = -EAGAIN;
 	return rc;
 }
 
-int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval,
-				  unsigned char old, unsigned char new, unsigned long key)
+int __kprobes __cmpxchg_key1(void *addr, unsigned char *uval, unsigned char old,
+			     unsigned char new, unsigned long key)
 {
+	unsigned long address = (unsigned long)addr;
 	unsigned int prev, shift, mask, _old, _new;
 	int rc;
 
@@ -215,15 +97,16 @@ int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval,
 	_old = (unsigned int)old << shift;
 	_new = (unsigned int)new << shift;
 	mask = ~(0xff << shift);
-	rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key);
+	rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key);
 	*uval = prev >> shift;
 	return rc;
 }
-EXPORT_SYMBOL(__cmpxchg_user_key1);
+EXPORT_SYMBOL(__cmpxchg_key1);
 
-int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval,
-				  unsigned short old, unsigned short new, unsigned long key)
+int __kprobes __cmpxchg_key2(void *addr, unsigned short *uval, unsigned short old,
+			     unsigned short new, unsigned long key)
 {
+	unsigned long address = (unsigned long)addr;
 	unsigned int prev, shift, mask, _old, _new;
 	int rc;
 
@@ -232,27 +115,23 @@ int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval,
 	_old = (unsigned int)old << shift;
 	_new = (unsigned int)new << shift;
 	mask = ~(0xffff << shift);
-	rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key);
+	rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key);
 	*uval = prev >> shift;
 	return rc;
 }
-EXPORT_SYMBOL(__cmpxchg_user_key2);
+EXPORT_SYMBOL(__cmpxchg_key2);
 
-int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval,
-				  unsigned int old, unsigned int new, unsigned long key)
+int __kprobes __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old,
+			     unsigned int new, unsigned long key)
 {
 	unsigned int prev = old;
-	bool sacf_flag;
 	int rc = 0;
 
 	skey_regions_initialize();
-	sacf_flag = enable_sacf_uaccess();
 	asm_inline volatile(
 		"20:	spka	0(%[key])\n"
-		"	sacf	256\n"
 		"0:	cs	%[prev],%[new],%[address]\n"
-		"1:	sacf	768\n"
-		"	spka	%[default_key]\n"
+		"1:	spka	%[default_key]\n"
 		"21:\n"
 		EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])
 		EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
@@ -264,27 +143,22 @@ int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval,
 		[key] "a" (key << 4),
 		[default_key] "J" (PAGE_DEFAULT_KEY)
 		: "memory", "cc");
-	disable_sacf_uaccess(sacf_flag);
 	*uval = prev;
 	return rc;
 }
-EXPORT_SYMBOL(__cmpxchg_user_key4);
+EXPORT_SYMBOL(__cmpxchg_key4);
 
-int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval,
-				  unsigned long old, unsigned long new, unsigned long key)
+int __kprobes __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old,
+			     unsigned long new, unsigned long key)
 {
 	unsigned long prev = old;
-	bool sacf_flag;
 	int rc = 0;
 
 	skey_regions_initialize();
-	sacf_flag = enable_sacf_uaccess();
 	asm_inline volatile(
 		"20:	spka	0(%[key])\n"
-		"	sacf	256\n"
 		"0:	csg	%[prev],%[new],%[address]\n"
-		"1:	sacf	768\n"
-		"	spka	%[default_key]\n"
+		"1:	spka	%[default_key]\n"
 		"21:\n"
 		EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])
 		EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
@@ -296,27 +170,22 @@ int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval,
 		[key] "a" (key << 4),
 		[default_key] "J" (PAGE_DEFAULT_KEY)
 		: "memory", "cc");
-	disable_sacf_uaccess(sacf_flag);
 	*uval = prev;
 	return rc;
 }
-EXPORT_SYMBOL(__cmpxchg_user_key8);
+EXPORT_SYMBOL(__cmpxchg_key8);
 
-int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval,
-				   __uint128_t old, __uint128_t new, unsigned long key)
+int __kprobes __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old,
+			      __uint128_t new, unsigned long key)
 {
 	__uint128_t prev = old;
-	bool sacf_flag;
 	int rc = 0;
 
 	skey_regions_initialize();
-	sacf_flag = enable_sacf_uaccess();
 	asm_inline volatile(
 		"20:	spka	0(%[key])\n"
-		"	sacf	256\n"
 		"0:	cdsg	%[prev],%[new],%[address]\n"
-		"1:	sacf	768\n"
-		"	spka	%[default_key]\n"
+		"1:	spka	%[default_key]\n"
 		"21:\n"
 		EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev])
 		EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev])
@@ -328,8 +197,7 @@ int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval,
 		[key] "a" (key << 4),
 		[default_key] "J" (PAGE_DEFAULT_KEY)
 		: "memory", "cc");
-	disable_sacf_uaccess(sacf_flag);
 	*uval = prev;
 	return rc;
 }
-EXPORT_SYMBOL(__cmpxchg_user_key16);
+EXPORT_SYMBOL(__cmpxchg_key16);
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index bd0401cc7ca5..193899c39ca7 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -10,7 +10,6 @@ obj-$(CONFIG_CMM)		+= cmm.o
 obj-$(CONFIG_DEBUG_VIRTUAL)	+= physaddr.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_PTDUMP)		+= dump_pagetables.o
-obj-$(CONFIG_PGSTE)		+= gmap.o
 obj-$(CONFIG_PFAULT)		+= pfault.o
 
 obj-$(subst m,y,$(CONFIG_KVM))	+= gmap_helpers.o
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index e2e13778c36a..a52aa7a99b6b 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -403,7 +403,7 @@ void do_dat_exception(struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(do_dat_exception);
 
-#if IS_ENABLED(CONFIG_PGSTE)
+#if IS_ENABLED(CONFIG_KVM)
 
 void do_secure_storage_access(struct pt_regs *regs)
 {
@@ -470,4 +470,4 @@ void do_secure_storage_access(struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(do_secure_storage_access);
 
-#endif /* CONFIG_PGSTE */
+#endif /* CONFIG_KVM */
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
deleted file mode 100644
index dd85bcca817d..000000000000
--- a/arch/s390/mm/gmap.c
+++ /dev/null
@@ -1,2436 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *  KVM guest address space mapping code
- *
- *    Copyright IBM Corp. 2007, 2020
- *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- *		 David Hildenbrand <david@redhat.com>
- *		 Janosch Frank <frankja@linux.vnet.ibm.com>
- */
-
-#include <linux/cpufeature.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/pagewalk.h>
-#include <linux/swap.h>
-#include <linux/smp.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/swapops.h>
-#include <linux/ksm.h>
-#include <linux/mman.h>
-#include <linux/pgtable.h>
-#include <asm/page-states.h>
-#include <asm/pgalloc.h>
-#include <asm/machine.h>
-#include <asm/gmap_helpers.h>
-#include <asm/gmap.h>
-#include <asm/page.h>
-
-/*
- * The address is saved in a radix tree directly; NULL would be ambiguous,
- * since 0 is a valid address, and NULL is returned when nothing was found.
- * The lower bits are ignored by all users of the macro, so it can be used
- * to distinguish a valid address 0 from a NULL.
- */
-#define VALID_GADDR_FLAG 1
-#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG)
-#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG)
-
-#define GMAP_SHADOW_FAKE_TABLE 1ULL
-
-static struct page *gmap_alloc_crst(void)
-{
-	struct page *page;
-
-	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
-	if (!page)
-		return NULL;
-	__arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER);
-	return page;
-}
-
-/**
- * gmap_alloc - allocate and initialize a guest address space
- * @limit: maximum address of the gmap address space
- *
- * Returns a guest address space structure.
- */
-struct gmap *gmap_alloc(unsigned long limit)
-{
-	struct gmap *gmap;
-	struct page *page;
-	unsigned long *table;
-	unsigned long etype, atype;
-
-	if (limit < _REGION3_SIZE) {
-		limit = _REGION3_SIZE - 1;
-		atype = _ASCE_TYPE_SEGMENT;
-		etype = _SEGMENT_ENTRY_EMPTY;
-	} else if (limit < _REGION2_SIZE) {
-		limit = _REGION2_SIZE - 1;
-		atype = _ASCE_TYPE_REGION3;
-		etype = _REGION3_ENTRY_EMPTY;
-	} else if (limit < _REGION1_SIZE) {
-		limit = _REGION1_SIZE - 1;
-		atype = _ASCE_TYPE_REGION2;
-		etype = _REGION2_ENTRY_EMPTY;
-	} else {
-		limit = -1UL;
-		atype = _ASCE_TYPE_REGION1;
-		etype = _REGION1_ENTRY_EMPTY;
-	}
-	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
-	if (!gmap)
-		goto out;
-	INIT_LIST_HEAD(&gmap->children);
-	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
-	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
-	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
-	spin_lock_init(&gmap->guest_table_lock);
-	spin_lock_init(&gmap->shadow_lock);
-	refcount_set(&gmap->ref_count, 1);
-	page = gmap_alloc_crst();
-	if (!page)
-		goto out_free;
-	table = page_to_virt(page);
-	crst_table_init(table, etype);
-	gmap->table = table;
-	gmap->asce = atype | _ASCE_TABLE_LENGTH |
-		_ASCE_USER_BITS | __pa(table);
-	gmap->asce_end = limit;
-	return gmap;
-
-out_free:
-	kfree(gmap);
-out:
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(gmap_alloc);
-
-/**
- * gmap_create - create a guest address space
- * @mm: pointer to the parent mm_struct
- * @limit: maximum size of the gmap address space
- *
- * Returns a guest address space structure.
- */
-struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
-{
-	struct gmap *gmap;
-	unsigned long gmap_asce;
-
-	gmap = gmap_alloc(limit);
-	if (!gmap)
-		return NULL;
-	gmap->mm = mm;
-	spin_lock(&mm->context.lock);
-	list_add_rcu(&gmap->list, &mm->context.gmap_list);
-	if (list_is_singular(&mm->context.gmap_list))
-		gmap_asce = gmap->asce;
-	else
-		gmap_asce = -1UL;
-	WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
-	spin_unlock(&mm->context.lock);
-	return gmap;
-}
-EXPORT_SYMBOL_GPL(gmap_create);
-
-static void gmap_flush_tlb(struct gmap *gmap)
-{
-	__tlb_flush_idte(gmap->asce);
-}
-
-static void gmap_radix_tree_free(struct radix_tree_root *root)
-{
-	struct radix_tree_iter iter;
-	unsigned long indices[16];
-	unsigned long index;
-	void __rcu **slot;
-	int i, nr;
-
-	/* A radix tree is freed by deleting all of its entries */
-	index = 0;
-	do {
-		nr = 0;
-		radix_tree_for_each_slot(slot, root, &iter, index) {
-			indices[nr] = iter.index;
-			if (++nr == 16)
-				break;
-		}
-		for (i = 0; i < nr; i++) {
-			index = indices[i];
-			radix_tree_delete(root, index);
-		}
-	} while (nr > 0);
-}
-
-static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
-{
-	struct gmap_rmap *rmap, *rnext, *head;
-	struct radix_tree_iter iter;
-	unsigned long indices[16];
-	unsigned long index;
-	void __rcu **slot;
-	int i, nr;
-
-	/* A radix tree is freed by deleting all of its entries */
-	index = 0;
-	do {
-		nr = 0;
-		radix_tree_for_each_slot(slot, root, &iter, index) {
-			indices[nr] = iter.index;
-			if (++nr == 16)
-				break;
-		}
-		for (i = 0; i < nr; i++) {
-			index = indices[i];
-			head = radix_tree_delete(root, index);
-			gmap_for_each_rmap_safe(rmap, rnext, head)
-				kfree(rmap);
-		}
-	} while (nr > 0);
-}
-
-static void gmap_free_crst(unsigned long *table, bool free_ptes)
-{
-	bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0;
-	int i;
-
-	if (is_segment) {
-		if (!free_ptes)
-			goto out;
-		for (i = 0; i < _CRST_ENTRIES; i++)
-			if (!(table[i] & _SEGMENT_ENTRY_INVALID))
-				page_table_free_pgste(page_ptdesc(phys_to_page(table[i])));
-	} else {
-		for (i = 0; i < _CRST_ENTRIES; i++)
-			if (!(table[i] & _REGION_ENTRY_INVALID))
-				gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes);
-	}
-
-out:
-	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
-}
-
-/**
- * gmap_free - free a guest address space
- * @gmap: pointer to the guest address space structure
- *
- * No locks required. There are no references to this gmap anymore.
- */
-void gmap_free(struct gmap *gmap)
-{
-	/* Flush tlb of all gmaps (if not already done for shadows) */
-	if (!(gmap_is_shadow(gmap) && gmap->removed))
-		gmap_flush_tlb(gmap);
-	/* Free all segment & region tables. */
-	gmap_free_crst(gmap->table, gmap_is_shadow(gmap));
-
-	gmap_radix_tree_free(&gmap->guest_to_host);
-	gmap_radix_tree_free(&gmap->host_to_guest);
-
-	/* Free additional data for a shadow gmap */
-	if (gmap_is_shadow(gmap)) {
-		gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
-		/* Release reference to the parent */
-		gmap_put(gmap->parent);
-	}
-
-	kfree(gmap);
-}
-EXPORT_SYMBOL_GPL(gmap_free);
-
-/**
- * gmap_get - increase reference counter for guest address space
- * @gmap: pointer to the guest address space structure
- *
- * Returns the gmap pointer
- */
-struct gmap *gmap_get(struct gmap *gmap)
-{
-	refcount_inc(&gmap->ref_count);
-	return gmap;
-}
-EXPORT_SYMBOL_GPL(gmap_get);
-
-/**
- * gmap_put - decrease reference counter for guest address space
- * @gmap: pointer to the guest address space structure
- *
- * If the reference counter reaches zero the guest address space is freed.
- */
-void gmap_put(struct gmap *gmap)
-{
-	if (refcount_dec_and_test(&gmap->ref_count))
-		gmap_free(gmap);
-}
-EXPORT_SYMBOL_GPL(gmap_put);
-
-/**
- * gmap_remove - remove a guest address space but do not free it yet
- * @gmap: pointer to the guest address space structure
- */
-void gmap_remove(struct gmap *gmap)
-{
-	struct gmap *sg, *next;
-	unsigned long gmap_asce;
-
-	/* Remove all shadow gmaps linked to this gmap */
-	if (!list_empty(&gmap->children)) {
-		spin_lock(&gmap->shadow_lock);
-		list_for_each_entry_safe(sg, next, &gmap->children, list) {
-			list_del(&sg->list);
-			gmap_put(sg);
-		}
-		spin_unlock(&gmap->shadow_lock);
-	}
-	/* Remove gmap from the pre-mm list */
-	spin_lock(&gmap->mm->context.lock);
-	list_del_rcu(&gmap->list);
-	if (list_empty(&gmap->mm->context.gmap_list))
-		gmap_asce = 0;
-	else if (list_is_singular(&gmap->mm->context.gmap_list))
-		gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
-					     struct gmap, list)->asce;
-	else
-		gmap_asce = -1UL;
-	WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
-	spin_unlock(&gmap->mm->context.lock);
-	synchronize_rcu();
-	/* Put reference */
-	gmap_put(gmap);
-}
-EXPORT_SYMBOL_GPL(gmap_remove);
-
-/*
- * gmap_alloc_table is assumed to be called with mmap_lock held
- */
-static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
-			    unsigned long init, unsigned long gaddr)
-{
-	struct page *page;
-	unsigned long *new;
-
-	/* since we dont free the gmap table until gmap_free we can unlock */
-	page = gmap_alloc_crst();
-	if (!page)
-		return -ENOMEM;
-	new = page_to_virt(page);
-	crst_table_init(new, init);
-	spin_lock(&gmap->guest_table_lock);
-	if (*table & _REGION_ENTRY_INVALID) {
-		*table = __pa(new) | _REGION_ENTRY_LENGTH |
-			(*table & _REGION_ENTRY_TYPE_MASK);
-		page = NULL;
-	}
-	spin_unlock(&gmap->guest_table_lock);
-	if (page)
-		__free_pages(page, CRST_ALLOC_ORDER);
-	return 0;
-}
-
-static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr)
-{
-	return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
-}
-
-static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr)
-{
-	return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
-}
-
-static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr,
-				       unsigned long *gaddr)
-{
-	*gaddr = host_to_guest_delete(gmap, vmaddr);
-	if (IS_GADDR_VALID(*gaddr))
-		return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1);
-	return NULL;
-}
-
-/**
- * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
- * @gmap: pointer to the guest address space structure
- * @vmaddr: address in the host process address space
- *
- * Returns 1 if a TLB flush is required
- */
-static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
-{
-	unsigned long gaddr;
-	int flush = 0;
-	pmd_t *pmdp;
-
-	BUG_ON(gmap_is_shadow(gmap));
-	spin_lock(&gmap->guest_table_lock);
-
-	pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
-	if (pmdp) {
-		flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY);
-		*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
-	}
-
-	spin_unlock(&gmap->guest_table_lock);
-	return flush;
-}
-
-/**
- * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
- * @gmap: pointer to the guest address space structure
- * @gaddr: address in the guest address space
- *
- * Returns 1 if a TLB flush is required
- */
-static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
-{
-	unsigned long vmaddr;
-
-	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
-						   gaddr >> PMD_SHIFT);
-	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
-}
-
-/**
- * gmap_unmap_segment - unmap segment from the guest address space
- * @gmap: pointer to the guest address space structure
- * @to: address in the guest address space
- * @len: length of the memory area to unmap
- *
- * Returns 0 if the unmap succeeded, -EINVAL if not.
- */
-int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
-{
-	unsigned long off;
-	int flush;
-
-	BUG_ON(gmap_is_shadow(gmap));
-	if ((to | len) & (PMD_SIZE - 1))
-		return -EINVAL;
-	if (len == 0 || to + len < to)
-		return -EINVAL;
-
-	flush = 0;
-	mmap_write_lock(gmap->mm);
-	for (off = 0; off < len; off += PMD_SIZE)
-		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
-	mmap_write_unlock(gmap->mm);
-	if (flush)
-		gmap_flush_tlb(gmap);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(gmap_unmap_segment);
-
-/**
- * gmap_map_segment - map a segment to the guest address space
- * @gmap: pointer to the guest address space structure
- * @from: source address in the parent address space
- * @to: target address in the guest address space
- * @len: length of the memory area to map
- *
- * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
- */
-int gmap_map_segment(struct gmap *gmap, unsigned long from,
-		     unsigned long to, unsigned long len)
-{
-	unsigned long off;
-	int flush;
-
-	BUG_ON(gmap_is_shadow(gmap));
-	if ((from | to | len) & (PMD_SIZE - 1))
-		return -EINVAL;
-	if (len == 0 || from + len < from || to + len < to ||
-	    from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
-		return -EINVAL;
-
-	flush = 0;
-	mmap_write_lock(gmap->mm);
-	for (off = 0; off < len; off += PMD_SIZE) {
-		/* Remove old translation */
-		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
-		/* Store new translation */
-		if (radix_tree_insert(&gmap->guest_to_host,
-				      (to + off) >> PMD_SHIFT,
-				      (void *) from + off))
-			break;
-	}
-	mmap_write_unlock(gmap->mm);
-	if (flush)
-		gmap_flush_tlb(gmap);
-	if (off >= len)
-		return 0;
-	gmap_unmap_segment(gmap, to, len);
-	return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(gmap_map_segment);
-
-/**
- * __gmap_translate - translate a guest address to a user space address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- *
- * Returns user space address which corresponds to the guest address or
- * -EFAULT if no such mapping exists.
- * This function does not establish potentially missing page table entries.
- * The mmap_lock of the mm that belongs to the address space must be held
- * when this function gets called.
- *
- * Note: Can also be called for shadow gmaps.
- */
-unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
-{
-	unsigned long vmaddr;
-
-	vmaddr = (unsigned long)
-		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
-	/* Note: guest_to_host is empty for a shadow gmap */
-	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
-}
-EXPORT_SYMBOL_GPL(__gmap_translate);
-
-/**
- * gmap_unlink - disconnect a page table from the gmap shadow tables
- * @mm: pointer to the parent mm_struct
- * @table: pointer to the host page table
- * @vmaddr: vm address associated with the host page table
- */
-void gmap_unlink(struct mm_struct *mm, unsigned long *table,
-		 unsigned long vmaddr)
-{
-	struct gmap *gmap;
-	int flush;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
-		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
-		if (flush)
-			gmap_flush_tlb(gmap);
-	}
-	rcu_read_unlock();
-}
-
-static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
-			   unsigned long gaddr);
-
-/**
- * __gmap_link - set up shadow page tables to connect a host to a guest address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- * @vmaddr: vm address
- *
- * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
- * if the vm address is already mapped to a different guest segment.
- * The mmap_lock of the mm that belongs to the address space must be held
- * when this function gets called.
- */
-int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
-{
-	struct mm_struct *mm;
-	unsigned long *table;
-	spinlock_t *ptl;
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	u64 unprot;
-	int rc;
-
-	BUG_ON(gmap_is_shadow(gmap));
-	/* Create higher level tables in the gmap page table */
-	table = gmap->table;
-	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
-		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
-		if ((*table & _REGION_ENTRY_INVALID) &&
-		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
-				     gaddr & _REGION1_MASK))
-			return -ENOMEM;
-		table = __va(*table & _REGION_ENTRY_ORIGIN);
-	}
-	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
-		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
-		if ((*table & _REGION_ENTRY_INVALID) &&
-		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
-				     gaddr & _REGION2_MASK))
-			return -ENOMEM;
-		table = __va(*table & _REGION_ENTRY_ORIGIN);
-	}
-	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
-		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
-		if ((*table & _REGION_ENTRY_INVALID) &&
-		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
-				     gaddr & _REGION3_MASK))
-			return -ENOMEM;
-		table = __va(*table & _REGION_ENTRY_ORIGIN);
-	}
-	table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
-	/* Walk the parent mm page table */
-	mm = gmap->mm;
-	pgd = pgd_offset(mm, vmaddr);
-	VM_BUG_ON(pgd_none(*pgd));
-	p4d = p4d_offset(pgd, vmaddr);
-	VM_BUG_ON(p4d_none(*p4d));
-	pud = pud_offset(p4d, vmaddr);
-	VM_BUG_ON(pud_none(*pud));
-	/* large puds cannot yet be handled */
-	if (pud_leaf(*pud))
-		return -EFAULT;
-	pmd = pmd_offset(pud, vmaddr);
-	VM_BUG_ON(pmd_none(*pmd));
-	/* Are we allowed to use huge pages? */
-	if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
-		return -EFAULT;
-	/* Link gmap segment table entry location to page table. */
-	rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
-	if (rc)
-		return rc;
-	ptl = pmd_lock(mm, pmd);
-	spin_lock(&gmap->guest_table_lock);
-	if (*table == _SEGMENT_ENTRY_EMPTY) {
-		rc = radix_tree_insert(&gmap->host_to_guest,
-				       vmaddr >> PMD_SHIFT,
-				       (void *)MAKE_VALID_GADDR(gaddr));
-		if (!rc) {
-			if (pmd_leaf(*pmd)) {
-				*table = (pmd_val(*pmd) &
-					  _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
-					| _SEGMENT_ENTRY_GMAP_UC
-					| _SEGMENT_ENTRY;
-			} else
-				*table = (pmd_val(*pmd) &
-					_SEGMENT_ENTRY_HARDWARE_BITS)
-					| _SEGMENT_ENTRY;
-		}
-	} else if (*table & _SEGMENT_ENTRY_PROTECT &&
-		   !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
-		unprot = (u64)*table;
-		unprot &= ~_SEGMENT_ENTRY_PROTECT;
-		unprot |= _SEGMENT_ENTRY_GMAP_UC;
-		gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
-	}
-	spin_unlock(&gmap->guest_table_lock);
-	spin_unlock(ptl);
-	radix_tree_preload_end();
-	return rc;
-}
-EXPORT_SYMBOL(__gmap_link);
-
-/*
- * this function is assumed to be called with mmap_lock held
- */
-void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
-{
-	unsigned long vmaddr;
-
-	mmap_assert_locked(gmap->mm);
-
-	/* Find the vm address for the guest address */
-	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
-						   gaddr >> PMD_SHIFT);
-	if (vmaddr) {
-		vmaddr |= gaddr & ~PMD_MASK;
-		gmap_helper_zap_one_page(gmap->mm, vmaddr);
-	}
-}
-EXPORT_SYMBOL_GPL(__gmap_zap);
-
-static LIST_HEAD(gmap_notifier_list);
-static DEFINE_SPINLOCK(gmap_notifier_lock);
-
-/**
- * gmap_register_pte_notifier - register a pte invalidation callback
- * @nb: pointer to the gmap notifier block
- */
-void gmap_register_pte_notifier(struct gmap_notifier *nb)
-{
-	spin_lock(&gmap_notifier_lock);
-	list_add_rcu(&nb->list, &gmap_notifier_list);
-	spin_unlock(&gmap_notifier_lock);
-}
-EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
-
-/**
- * gmap_unregister_pte_notifier - remove a pte invalidation callback
- * @nb: pointer to the gmap notifier block
- */
-void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
-{
-	spin_lock(&gmap_notifier_lock);
-	list_del_rcu(&nb->list);
-	spin_unlock(&gmap_notifier_lock);
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
-
-/**
- * gmap_call_notifier - call all registered invalidation callbacks
- * @gmap: pointer to guest mapping meta data structure
- * @start: start virtual address in the guest address space
- * @end: end virtual address in the guest address space
- */
-static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
-			       unsigned long end)
-{
-	struct gmap_notifier *nb;
-
-	list_for_each_entry(nb, &gmap_notifier_list, list)
-		nb->notifier_call(gmap, start, end);
-}
-
-/**
- * gmap_table_walk - walk the gmap page tables
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: virtual address in the guest address space
- * @level: page table level to stop at
- *
- * Returns a table entry pointer for the given guest address and @level
- * @level=0 : returns a pointer to a page table table entry (or NULL)
- * @level=1 : returns a pointer to a segment table entry (or NULL)
- * @level=2 : returns a pointer to a region-3 table entry (or NULL)
- * @level=3 : returns a pointer to a region-2 table entry (or NULL)
- * @level=4 : returns a pointer to a region-1 table entry (or NULL)
- *
- * Returns NULL if the gmap page tables could not be walked to the
- * requested level.
- *
- * Note: Can also be called for shadow gmaps.
- */
-unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level)
-{
-	const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
-	unsigned long *table = gmap->table;
-
-	if (gmap_is_shadow(gmap) && gmap->removed)
-		return NULL;
-
-	if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
-		return NULL;
-
-	if (asce_type != _ASCE_TYPE_REGION1 &&
-	    gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
-		return NULL;
-
-	switch (asce_type) {
-	case _ASCE_TYPE_REGION1:
-		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
-		if (level == 4)
-			break;
-		if (*table & _REGION_ENTRY_INVALID)
-			return NULL;
-		table = __va(*table & _REGION_ENTRY_ORIGIN);
-		fallthrough;
-	case _ASCE_TYPE_REGION2:
-		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
-		if (level == 3)
-			break;
-		if (*table & _REGION_ENTRY_INVALID)
-			return NULL;
-		table = __va(*table & _REGION_ENTRY_ORIGIN);
-		fallthrough;
-	case _ASCE_TYPE_REGION3:
-		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
-		if (level == 2)
-			break;
-		if (*table & _REGION_ENTRY_INVALID)
-			return NULL;
-		table = __va(*table & _REGION_ENTRY_ORIGIN);
-		fallthrough;
-	case _ASCE_TYPE_SEGMENT:
-		table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
-		if (level == 1)
-			break;
-		if (*table & _REGION_ENTRY_INVALID)
-			return NULL;
-		table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
-		table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT;
-	}
-	return table;
-}
-EXPORT_SYMBOL(gmap_table_walk);
-
-/**
- * gmap_pte_op_walk - walk the gmap page table, get the page table lock
- *		      and return the pte pointer
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: virtual address in the guest address space
- * @ptl: pointer to the spinlock pointer
- *
- * Returns a pointer to the locked pte for a guest address, or NULL
- */
-static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
-			       spinlock_t **ptl)
-{
-	unsigned long *table;
-
-	BUG_ON(gmap_is_shadow(gmap));
-	/* Walk the gmap page table, lock and get pte pointer */
-	table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
-	if (!table || *table & _SEGMENT_ENTRY_INVALID)
-		return NULL;
-	return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
-}
-
-/**
- * gmap_pte_op_fixup - force a page in and connect the gmap page table
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: virtual address in the guest address space
- * @vmaddr: address in the host process address space
- * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
- *
- * Returns 0 if the caller can retry __gmap_translate (might fail again),
- * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
- * up or connecting the gmap page table.
- */
-static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
-			     unsigned long vmaddr, int prot)
-{
-	struct mm_struct *mm = gmap->mm;
-	unsigned int fault_flags;
-	bool unlocked = false;
-
-	BUG_ON(gmap_is_shadow(gmap));
-	fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
-	if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
-		return -EFAULT;
-	if (unlocked)
-		/* lost mmap_lock, caller has to retry __gmap_translate */
-		return 0;
-	/* Connect the page tables */
-	return __gmap_link(gmap, gaddr, vmaddr);
-}
-
-/**
- * gmap_pte_op_end - release the page table lock
- * @ptep: pointer to the locked pte
- * @ptl: pointer to the page table spinlock
- */
-static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl)
-{
-	pte_unmap_unlock(ptep, ptl);
-}
-
-/**
- * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
- *		      and return the pmd pointer
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: virtual address in the guest address space
- *
- * Returns a pointer to the pmd for a guest address, or NULL
- */
-static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
-{
-	pmd_t *pmdp;
-
-	BUG_ON(gmap_is_shadow(gmap));
-	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
-	if (!pmdp)
-		return NULL;
-
-	/* without huge pages, there is no need to take the table lock */
-	if (!gmap->mm->context.allow_gmap_hpage_1m)
-		return pmd_none(*pmdp) ? NULL : pmdp;
-
-	spin_lock(&gmap->guest_table_lock);
-	if (pmd_none(*pmdp)) {
-		spin_unlock(&gmap->guest_table_lock);
-		return NULL;
-	}
-
-	/* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
-	if (!pmd_leaf(*pmdp))
-		spin_unlock(&gmap->guest_table_lock);
-	return pmdp;
-}
-
-/**
- * gmap_pmd_op_end - release the guest_table_lock if needed
- * @gmap: pointer to the guest mapping meta data structure
- * @pmdp: pointer to the pmd
- */
-static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
-{
-	if (pmd_leaf(*pmdp))
-		spin_unlock(&gmap->guest_table_lock);
-}
-
-/*
- * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
- * @pmdp: pointer to the pmd to be protected
- * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
- * @bits: notification bits to set
- *
- * Returns:
- * 0 if successfully protected
- * -EAGAIN if a fixup is needed
- * -EINVAL if unsupported notifier bits have been specified
- *
- * Expected to be called with sg->mm->mmap_lock in read and
- * guest_table_lock held.
- */
-static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
-			    pmd_t *pmdp, int prot, unsigned long bits)
-{
-	int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
-	int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
-	pmd_t new = *pmdp;
-
-	/* Fixup needed */
-	if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
-		return -EAGAIN;
-
-	if (prot == PROT_NONE && !pmd_i) {
-		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
-		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
-	}
-
-	if (prot == PROT_READ && !pmd_p) {
-		new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
-		new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
-		gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
-	}
-
-	if (bits & GMAP_NOTIFY_MPROT)
-		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
-
-	/* Shadow GMAP protection needs split PMDs */
-	if (bits & GMAP_NOTIFY_SHADOW)
-		return -EINVAL;
-
-	return 0;
-}
-
-/*
- * gmap_protect_pte - remove access rights to memory and set pgste bits
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: virtual address in the guest address space
- * @pmdp: pointer to the pmd associated with the pte
- * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
- * @bits: notification bits to set
- *
- * Returns 0 if successfully protected, -ENOMEM if out of memory and
- * -EAGAIN if a fixup is needed.
- *
- * Expected to be called with sg->mm->mmap_lock in read
- */
-static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
-			    pmd_t *pmdp, int prot, unsigned long bits)
-{
-	int rc;
-	pte_t *ptep;
-	spinlock_t *ptl;
-	unsigned long pbits = 0;
-
-	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
-		return -EAGAIN;
-
-	ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
-	if (!ptep)
-		return -ENOMEM;
-
-	pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
-	pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
-	/* Protect and unlock. */
-	rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
-	gmap_pte_op_end(ptep, ptl);
-	return rc;
-}
-
-/*
- * gmap_protect_range - remove access rights to memory and set pgste bits
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: virtual address in the guest address space
- * @len: size of area
- * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
- * @bits: pgste notification bits to set
- *
- * Returns:
- *   PAGE_SIZE if a small page was successfully protected;
- *   HPAGE_SIZE if a large page was successfully protected;
- *   -ENOMEM if out of memory;
- *   -EFAULT if gaddr is invalid (or mapping for shadows is missing);
- *   -EAGAIN if the guest mapping is missing and should be fixed by the caller.
- *
- * Context: Called with sg->mm->mmap_lock in read.
- */
-int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits)
-{
-	pmd_t *pmdp;
-	int rc = 0;
-
-	BUG_ON(gmap_is_shadow(gmap));
-
-	pmdp = gmap_pmd_op_walk(gmap, gaddr);
-	if (!pmdp)
-		return -EAGAIN;
-
-	if (!pmd_leaf(*pmdp)) {
-		rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits);
-		if (!rc)
-			rc = PAGE_SIZE;
-	} else {
-		rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits);
-		if (!rc)
-			rc = HPAGE_SIZE;
-	}
-	gmap_pmd_op_end(gmap, pmdp);
-
-	return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_protect_one);
-
-/**
- * gmap_read_table - get an unsigned long value from a guest page table using
- *                   absolute addressing, without marking the page referenced.
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: virtual address in the guest address space
- * @val: pointer to the unsigned long value to return
- *
- * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
- * if reading using the virtual address failed. -EINVAL if called on a gmap
- * shadow.
- *
- * Called with gmap->mm->mmap_lock in read.
- */
-int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
-{
-	unsigned long address, vmaddr;
-	spinlock_t *ptl;
-	pte_t *ptep, pte;
-	int rc;
-
-	if (gmap_is_shadow(gmap))
-		return -EINVAL;
-
-	while (1) {
-		rc = -EAGAIN;
-		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
-		if (ptep) {
-			pte = *ptep;
-			if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
-				address = pte_val(pte) & PAGE_MASK;
-				address += gaddr & ~PAGE_MASK;
-				*val = *(unsigned long *)__va(address);
-				set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
-				/* Do *NOT* clear the _PAGE_INVALID bit! */
-				rc = 0;
-			}
-			gmap_pte_op_end(ptep, ptl);
-		}
-		if (!rc)
-			break;
-		vmaddr = __gmap_translate(gmap, gaddr);
-		if (IS_ERR_VALUE(vmaddr)) {
-			rc = vmaddr;
-			break;
-		}
-		rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
-		if (rc)
-			break;
-	}
-	return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_read_table);
-
-/**
- * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
- * @sg: pointer to the shadow guest address space structure
- * @vmaddr: vm address associated with the rmap
- * @rmap: pointer to the rmap structure
- *
- * Called with the sg->guest_table_lock
- */
-static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
-				    struct gmap_rmap *rmap)
-{
-	struct gmap_rmap *temp;
-	void __rcu **slot;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
-	if (slot) {
-		rmap->next = radix_tree_deref_slot_protected(slot,
-							&sg->guest_table_lock);
-		for (temp = rmap->next; temp; temp = temp->next) {
-			if (temp->raddr == rmap->raddr) {
-				kfree(rmap);
-				return;
-			}
-		}
-		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
-	} else {
-		rmap->next = NULL;
-		radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
-				  rmap);
-	}
-}
-
-/**
- * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
- * @sg: pointer to the shadow guest address space structure
- * @raddr: rmap address in the shadow gmap
- * @paddr: address in the parent guest address space
- * @len: length of the memory area to protect
- *
- * Returns 0 if successfully protected and the rmap was created, -ENOMEM
- * if out of memory and -EFAULT if paddr is invalid.
- */
-static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
-			     unsigned long paddr, unsigned long len)
-{
-	struct gmap *parent;
-	struct gmap_rmap *rmap;
-	unsigned long vmaddr;
-	spinlock_t *ptl;
-	pte_t *ptep;
-	int rc;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	parent = sg->parent;
-	while (len) {
-		vmaddr = __gmap_translate(parent, paddr);
-		if (IS_ERR_VALUE(vmaddr))
-			return vmaddr;
-		rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
-		if (!rmap)
-			return -ENOMEM;
-		rmap->raddr = raddr;
-		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
-		if (rc) {
-			kfree(rmap);
-			return rc;
-		}
-		rc = -EAGAIN;
-		ptep = gmap_pte_op_walk(parent, paddr, &ptl);
-		if (ptep) {
-			spin_lock(&sg->guest_table_lock);
-			rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
-					     PGSTE_VSIE_BIT);
-			if (!rc)
-				gmap_insert_rmap(sg, vmaddr, rmap);
-			spin_unlock(&sg->guest_table_lock);
-			gmap_pte_op_end(ptep, ptl);
-		}
-		radix_tree_preload_end();
-		if (rc) {
-			kfree(rmap);
-			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
-			if (rc)
-				return rc;
-			continue;
-		}
-		paddr += PAGE_SIZE;
-		len -= PAGE_SIZE;
-	}
-	return 0;
-}
-
-#define _SHADOW_RMAP_MASK	0x7
-#define _SHADOW_RMAP_REGION1	0x5
-#define _SHADOW_RMAP_REGION2	0x4
-#define _SHADOW_RMAP_REGION3	0x3
-#define _SHADOW_RMAP_SEGMENT	0x2
-#define _SHADOW_RMAP_PGTABLE	0x1
-
-/**
- * gmap_idte_one - invalidate a single region or segment table entry
- * @asce: region or segment table *origin* + table-type bits
- * @vaddr: virtual address to identify the table entry to flush
- *
- * The invalid bit of a single region or segment table entry is set
- * and the associated TLB entries depending on the entry are flushed.
- * The table-type of the @asce identifies the portion of the @vaddr
- * that is used as the invalidation index.
- */
-static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
-{
-	asm volatile(
-		"	idte	%0,0,%1"
-		: : "a" (asce), "a" (vaddr) : "cc", "memory");
-}
-
-/**
- * gmap_unshadow_page - remove a page from a shadow page table
- * @sg: pointer to the shadow guest address space structure
- * @raddr: rmap address in the shadow guest address space
- *
- * Called with the sg->guest_table_lock
- */
-static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
-{
-	unsigned long *table;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
-	if (!table || *table & _PAGE_INVALID)
-		return;
-	gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1);
-	ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
-}
-
-/**
- * __gmap_unshadow_pgt - remove all entries from a shadow page table
- * @sg: pointer to the shadow guest address space structure
- * @raddr: rmap address in the shadow guest address space
- * @pgt: pointer to the start of a shadow page table
- *
- * Called with the sg->guest_table_lock
- */
-static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
-				unsigned long *pgt)
-{
-	int i;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE)
-		pgt[i] = _PAGE_INVALID;
-}
-
-/**
- * gmap_unshadow_pgt - remove a shadow page table from a segment entry
- * @sg: pointer to the shadow guest address space structure
- * @raddr: address in the shadow guest address space
- *
- * Called with the sg->guest_table_lock
- */
-static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
-{
-	unsigned long *ste;
-	phys_addr_t sto, pgt;
-	struct ptdesc *ptdesc;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
-	if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
-		return;
-	gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
-	sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
-	gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
-	pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
-	*ste = _SEGMENT_ENTRY_EMPTY;
-	__gmap_unshadow_pgt(sg, raddr, __va(pgt));
-	/* Free page table */
-	ptdesc = page_ptdesc(phys_to_page(pgt));
-	page_table_free_pgste(ptdesc);
-}
-
-/**
- * __gmap_unshadow_sgt - remove all entries from a shadow segment table
- * @sg: pointer to the shadow guest address space structure
- * @raddr: rmap address in the shadow guest address space
- * @sgt: pointer to the start of a shadow segment table
- *
- * Called with the sg->guest_table_lock
- */
-static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
-				unsigned long *sgt)
-{
-	struct ptdesc *ptdesc;
-	phys_addr_t pgt;
-	int i;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
-		if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
-			continue;
-		pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
-		sgt[i] = _SEGMENT_ENTRY_EMPTY;
-		__gmap_unshadow_pgt(sg, raddr, __va(pgt));
-		/* Free page table */
-		ptdesc = page_ptdesc(phys_to_page(pgt));
-		page_table_free_pgste(ptdesc);
-	}
-}
-
-/**
- * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
- * @sg: pointer to the shadow guest address space structure
- * @raddr: rmap address in the shadow guest address space
- *
- * Called with the shadow->guest_table_lock
- */
-static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
-{
-	unsigned long r3o, *r3e;
-	phys_addr_t sgt;
-	struct page *page;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
-	if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
-		return;
-	gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
-	r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
-	gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
-	sgt = *r3e & _REGION_ENTRY_ORIGIN;
-	*r3e = _REGION3_ENTRY_EMPTY;
-	__gmap_unshadow_sgt(sg, raddr, __va(sgt));
-	/* Free segment table */
-	page = phys_to_page(sgt);
-	__free_pages(page, CRST_ALLOC_ORDER);
-}
-
-/**
- * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
- * @sg: pointer to the shadow guest address space structure
- * @raddr: address in the shadow guest address space
- * @r3t: pointer to the start of a shadow region-3 table
- *
- * Called with the sg->guest_table_lock
- */
-static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
-				unsigned long *r3t)
-{
-	struct page *page;
-	phys_addr_t sgt;
-	int i;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
-		if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
-			continue;
-		sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
-		r3t[i] = _REGION3_ENTRY_EMPTY;
-		__gmap_unshadow_sgt(sg, raddr, __va(sgt));
-		/* Free segment table */
-		page = phys_to_page(sgt);
-		__free_pages(page, CRST_ALLOC_ORDER);
-	}
-}
-
-/**
- * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
- * @sg: pointer to the shadow guest address space structure
- * @raddr: rmap address in the shadow guest address space
- *
- * Called with the sg->guest_table_lock
- */
-static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
-{
-	unsigned long r2o, *r2e;
-	phys_addr_t r3t;
-	struct page *page;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
-	if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
-		return;
-	gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
-	r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
-	gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
-	r3t = *r2e & _REGION_ENTRY_ORIGIN;
-	*r2e = _REGION2_ENTRY_EMPTY;
-	__gmap_unshadow_r3t(sg, raddr, __va(r3t));
-	/* Free region 3 table */
-	page = phys_to_page(r3t);
-	__free_pages(page, CRST_ALLOC_ORDER);
-}
-
-/**
- * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
- * @sg: pointer to the shadow guest address space structure
- * @raddr: rmap address in the shadow guest address space
- * @r2t: pointer to the start of a shadow region-2 table
- *
- * Called with the sg->guest_table_lock
- */
-static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
-				unsigned long *r2t)
-{
-	phys_addr_t r3t;
-	struct page *page;
-	int i;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
-		if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
-			continue;
-		r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
-		r2t[i] = _REGION2_ENTRY_EMPTY;
-		__gmap_unshadow_r3t(sg, raddr, __va(r3t));
-		/* Free region 3 table */
-		page = phys_to_page(r3t);
-		__free_pages(page, CRST_ALLOC_ORDER);
-	}
-}
-
-/**
- * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
- * @sg: pointer to the shadow guest address space structure
- * @raddr: rmap address in the shadow guest address space
- *
- * Called with the sg->guest_table_lock
- */
-static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
-{
-	unsigned long r1o, *r1e;
-	struct page *page;
-	phys_addr_t r2t;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
-	if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
-		return;
-	gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
-	r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
-	gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
-	r2t = *r1e & _REGION_ENTRY_ORIGIN;
-	*r1e = _REGION1_ENTRY_EMPTY;
-	__gmap_unshadow_r2t(sg, raddr, __va(r2t));
-	/* Free region 2 table */
-	page = phys_to_page(r2t);
-	__free_pages(page, CRST_ALLOC_ORDER);
-}
-
-/**
- * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
- * @sg: pointer to the shadow guest address space structure
- * @raddr: rmap address in the shadow guest address space
- * @r1t: pointer to the start of a shadow region-1 table
- *
- * Called with the shadow->guest_table_lock
- */
-static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
-				unsigned long *r1t)
-{
-	unsigned long asce;
-	struct page *page;
-	phys_addr_t r2t;
-	int i;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	asce = __pa(r1t) | _ASCE_TYPE_REGION1;
-	for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
-		if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
-			continue;
-		r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
-		__gmap_unshadow_r2t(sg, raddr, __va(r2t));
-		/* Clear entry and flush translation r1t -> r2t */
-		gmap_idte_one(asce, raddr);
-		r1t[i] = _REGION1_ENTRY_EMPTY;
-		/* Free region 2 table */
-		page = phys_to_page(r2t);
-		__free_pages(page, CRST_ALLOC_ORDER);
-	}
-}
-
-/**
- * gmap_unshadow - remove a shadow page table completely
- * @sg: pointer to the shadow guest address space structure
- *
- * Called with sg->guest_table_lock
- */
-void gmap_unshadow(struct gmap *sg)
-{
-	unsigned long *table;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	if (sg->removed)
-		return;
-	sg->removed = 1;
-	gmap_call_notifier(sg, 0, -1UL);
-	gmap_flush_tlb(sg);
-	table = __va(sg->asce & _ASCE_ORIGIN);
-	switch (sg->asce & _ASCE_TYPE_MASK) {
-	case _ASCE_TYPE_REGION1:
-		__gmap_unshadow_r1t(sg, 0, table);
-		break;
-	case _ASCE_TYPE_REGION2:
-		__gmap_unshadow_r2t(sg, 0, table);
-		break;
-	case _ASCE_TYPE_REGION3:
-		__gmap_unshadow_r3t(sg, 0, table);
-		break;
-	case _ASCE_TYPE_SEGMENT:
-		__gmap_unshadow_sgt(sg, 0, table);
-		break;
-	}
-}
-EXPORT_SYMBOL(gmap_unshadow);
-
-/**
- * gmap_shadow_r2t - create an empty shadow region 2 table
- * @sg: pointer to the shadow guest address space structure
- * @saddr: faulting address in the shadow gmap
- * @r2t: parent gmap address of the region 2 table to get shadowed
- * @fake: r2t references contiguous guest memory block, not a r2t
- *
- * The r2t parameter specifies the address of the source table. The
- * four pages of the source table are made read-only in the parent gmap
- * address space. A write to the source table area @r2t will automatically
- * remove the shadow r2 table and all of its descendants.
- *
- * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
- * shadow table structure is incomplete, -ENOMEM if out of memory and
- * -EFAULT if an address in the parent gmap could not be resolved.
- *
- * Called with sg->mm->mmap_lock in read.
- */
-int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
-		    int fake)
-{
-	unsigned long raddr, origin, offset, len;
-	unsigned long *table;
-	phys_addr_t s_r2t;
-	struct page *page;
-	int rc;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	/* Allocate a shadow region second table */
-	page = gmap_alloc_crst();
-	if (!page)
-		return -ENOMEM;
-	s_r2t = page_to_phys(page);
-	/* Install shadow region second table */
-	spin_lock(&sg->guest_table_lock);
-	table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
-	if (!table) {
-		rc = -EAGAIN;		/* Race with unshadow */
-		goto out_free;
-	}
-	if (!(*table & _REGION_ENTRY_INVALID)) {
-		rc = 0;			/* Already established */
-		goto out_free;
-	} else if (*table & _REGION_ENTRY_ORIGIN) {
-		rc = -EAGAIN;		/* Race with shadow */
-		goto out_free;
-	}
-	crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
-	/* mark as invalid as long as the parent table is not protected */
-	*table = s_r2t | _REGION_ENTRY_LENGTH |
-		 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
-	if (sg->edat_level >= 1)
-		*table |= (r2t & _REGION_ENTRY_PROTECT);
-	if (fake) {
-		/* nothing to protect for fake tables */
-		*table &= ~_REGION_ENTRY_INVALID;
-		spin_unlock(&sg->guest_table_lock);
-		return 0;
-	}
-	spin_unlock(&sg->guest_table_lock);
-	/* Make r2t read-only in parent gmap page table */
-	raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
-	origin = r2t & _REGION_ENTRY_ORIGIN;
-	offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
-	len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
-	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
-	spin_lock(&sg->guest_table_lock);
-	if (!rc) {
-		table = gmap_table_walk(sg, saddr, 4);
-		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
-			rc = -EAGAIN;		/* Race with unshadow */
-		else
-			*table &= ~_REGION_ENTRY_INVALID;
-	} else {
-		gmap_unshadow_r2t(sg, raddr);
-	}
-	spin_unlock(&sg->guest_table_lock);
-	return rc;
-out_free:
-	spin_unlock(&sg->guest_table_lock);
-	__free_pages(page, CRST_ALLOC_ORDER);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
-
-/**
- * gmap_shadow_r3t - create a shadow region 3 table
- * @sg: pointer to the shadow guest address space structure
- * @saddr: faulting address in the shadow gmap
- * @r3t: parent gmap address of the region 3 table to get shadowed
- * @fake: r3t references contiguous guest memory block, not a r3t
- *
- * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
- * shadow table structure is incomplete, -ENOMEM if out of memory and
- * -EFAULT if an address in the parent gmap could not be resolved.
- *
- * Called with sg->mm->mmap_lock in read.
- */
-int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
-		    int fake)
-{
-	unsigned long raddr, origin, offset, len;
-	unsigned long *table;
-	phys_addr_t s_r3t;
-	struct page *page;
-	int rc;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	/* Allocate a shadow region second table */
-	page = gmap_alloc_crst();
-	if (!page)
-		return -ENOMEM;
-	s_r3t = page_to_phys(page);
-	/* Install shadow region second table */
-	spin_lock(&sg->guest_table_lock);
-	table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
-	if (!table) {
-		rc = -EAGAIN;		/* Race with unshadow */
-		goto out_free;
-	}
-	if (!(*table & _REGION_ENTRY_INVALID)) {
-		rc = 0;			/* Already established */
-		goto out_free;
-	} else if (*table & _REGION_ENTRY_ORIGIN) {
-		rc = -EAGAIN;		/* Race with shadow */
-		goto out_free;
-	}
-	crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
-	/* mark as invalid as long as the parent table is not protected */
-	*table = s_r3t | _REGION_ENTRY_LENGTH |
-		 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
-	if (sg->edat_level >= 1)
-		*table |= (r3t & _REGION_ENTRY_PROTECT);
-	if (fake) {
-		/* nothing to protect for fake tables */
-		*table &= ~_REGION_ENTRY_INVALID;
-		spin_unlock(&sg->guest_table_lock);
-		return 0;
-	}
-	spin_unlock(&sg->guest_table_lock);
-	/* Make r3t read-only in parent gmap page table */
-	raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
-	origin = r3t & _REGION_ENTRY_ORIGIN;
-	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
-	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
-	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
-	spin_lock(&sg->guest_table_lock);
-	if (!rc) {
-		table = gmap_table_walk(sg, saddr, 3);
-		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
-			rc = -EAGAIN;		/* Race with unshadow */
-		else
-			*table &= ~_REGION_ENTRY_INVALID;
-	} else {
-		gmap_unshadow_r3t(sg, raddr);
-	}
-	spin_unlock(&sg->guest_table_lock);
-	return rc;
-out_free:
-	spin_unlock(&sg->guest_table_lock);
-	__free_pages(page, CRST_ALLOC_ORDER);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
-
-/**
- * gmap_shadow_sgt - create a shadow segment table
- * @sg: pointer to the shadow guest address space structure
- * @saddr: faulting address in the shadow gmap
- * @sgt: parent gmap address of the segment table to get shadowed
- * @fake: sgt references contiguous guest memory block, not a sgt
- *
- * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
- * shadow table structure is incomplete, -ENOMEM if out of memory and
- * -EFAULT if an address in the parent gmap could not be resolved.
- *
- * Called with sg->mm->mmap_lock in read.
- */
-int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
-		    int fake)
-{
-	unsigned long raddr, origin, offset, len;
-	unsigned long *table;
-	phys_addr_t s_sgt;
-	struct page *page;
-	int rc;
-
-	BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
-	/* Allocate a shadow segment table */
-	page = gmap_alloc_crst();
-	if (!page)
-		return -ENOMEM;
-	s_sgt = page_to_phys(page);
-	/* Install shadow region second table */
-	spin_lock(&sg->guest_table_lock);
-	table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
-	if (!table) {
-		rc = -EAGAIN;		/* Race with unshadow */
-		goto out_free;
-	}
-	if (!(*table & _REGION_ENTRY_INVALID)) {
-		rc = 0;			/* Already established */
-		goto out_free;
-	} else if (*table & _REGION_ENTRY_ORIGIN) {
-		rc = -EAGAIN;		/* Race with shadow */
-		goto out_free;
-	}
-	crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
-	/* mark as invalid as long as the parent table is not protected */
-	*table = s_sgt | _REGION_ENTRY_LENGTH |
-		 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
-	if (sg->edat_level >= 1)
-		*table |= sgt & _REGION_ENTRY_PROTECT;
-	if (fake) {
-		/* nothing to protect for fake tables */
-		*table &= ~_REGION_ENTRY_INVALID;
-		spin_unlock(&sg->guest_table_lock);
-		return 0;
-	}
-	spin_unlock(&sg->guest_table_lock);
-	/* Make sgt read-only in parent gmap page table */
-	raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
-	origin = sgt & _REGION_ENTRY_ORIGIN;
-	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
-	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
-	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
-	spin_lock(&sg->guest_table_lock);
-	if (!rc) {
-		table = gmap_table_walk(sg, saddr, 2);
-		if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
-			rc = -EAGAIN;		/* Race with unshadow */
-		else
-			*table &= ~_REGION_ENTRY_INVALID;
-	} else {
-		gmap_unshadow_sgt(sg, raddr);
-	}
-	spin_unlock(&sg->guest_table_lock);
-	return rc;
-out_free:
-	spin_unlock(&sg->guest_table_lock);
-	__free_pages(page, CRST_ALLOC_ORDER);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
-
-static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr)
-{
-	unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc));
-
-	pgstes += _PAGE_ENTRIES;
-
-	pgstes[0] &= ~PGSTE_ST2_MASK;
-	pgstes[1] &= ~PGSTE_ST2_MASK;
-	pgstes[2] &= ~PGSTE_ST2_MASK;
-	pgstes[3] &= ~PGSTE_ST2_MASK;
-
-	pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK;
-	pgstes[1] |= pgt_addr & PGSTE_ST2_MASK;
-	pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK;
-	pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK;
-}
-
-/**
- * gmap_shadow_pgt - instantiate a shadow page table
- * @sg: pointer to the shadow guest address space structure
- * @saddr: faulting address in the shadow gmap
- * @pgt: parent gmap address of the page table to get shadowed
- * @fake: pgt references contiguous guest memory block, not a pgtable
- *
- * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
- * shadow table structure is incomplete, -ENOMEM if out of memory,
- * -EFAULT if an address in the parent gmap could not be resolved and
- *
- * Called with gmap->mm->mmap_lock in read
- */
-int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
-		    int fake)
-{
-	unsigned long raddr, origin;
-	unsigned long *table;
-	struct ptdesc *ptdesc;
-	phys_addr_t s_pgt;
-	int rc;
-
-	BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
-	/* Allocate a shadow page table */
-	ptdesc = page_table_alloc_pgste(sg->mm);
-	if (!ptdesc)
-		return -ENOMEM;
-	origin = pgt & _SEGMENT_ENTRY_ORIGIN;
-	if (fake)
-		origin |= GMAP_SHADOW_FAKE_TABLE;
-	gmap_pgste_set_pgt_addr(ptdesc, origin);
-	s_pgt = page_to_phys(ptdesc_page(ptdesc));
-	/* Install shadow page table */
-	spin_lock(&sg->guest_table_lock);
-	table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
-	if (!table) {
-		rc = -EAGAIN;		/* Race with unshadow */
-		goto out_free;
-	}
-	if (!(*table & _SEGMENT_ENTRY_INVALID)) {
-		rc = 0;			/* Already established */
-		goto out_free;
-	} else if (*table & _SEGMENT_ENTRY_ORIGIN) {
-		rc = -EAGAIN;		/* Race with shadow */
-		goto out_free;
-	}
-	/* mark as invalid as long as the parent table is not protected */
-	*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
-		 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
-	if (fake) {
-		/* nothing to protect for fake tables */
-		*table &= ~_SEGMENT_ENTRY_INVALID;
-		spin_unlock(&sg->guest_table_lock);
-		return 0;
-	}
-	spin_unlock(&sg->guest_table_lock);
-	/* Make pgt read-only in parent gmap page table (not the pgste) */
-	raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
-	origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
-	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
-	spin_lock(&sg->guest_table_lock);
-	if (!rc) {
-		table = gmap_table_walk(sg, saddr, 1);
-		if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
-			rc = -EAGAIN;		/* Race with unshadow */
-		else
-			*table &= ~_SEGMENT_ENTRY_INVALID;
-	} else {
-		gmap_unshadow_pgt(sg, raddr);
-	}
-	spin_unlock(&sg->guest_table_lock);
-	return rc;
-out_free:
-	spin_unlock(&sg->guest_table_lock);
-	page_table_free_pgste(ptdesc);
-	return rc;
-
-}
-EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
-
-/**
- * gmap_shadow_page - create a shadow page mapping
- * @sg: pointer to the shadow guest address space structure
- * @saddr: faulting address in the shadow gmap
- * @pte: pte in parent gmap address space to get shadowed
- *
- * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
- * shadow table structure is incomplete, -ENOMEM if out of memory and
- * -EFAULT if an address in the parent gmap could not be resolved.
- *
- * Called with sg->mm->mmap_lock in read.
- */
-int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
-{
-	struct gmap *parent;
-	struct gmap_rmap *rmap;
-	unsigned long vmaddr, paddr;
-	spinlock_t *ptl;
-	pte_t *sptep, *tptep;
-	int prot;
-	int rc;
-
-	BUG_ON(!gmap_is_shadow(sg));
-	parent = sg->parent;
-	prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
-
-	rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
-	if (!rmap)
-		return -ENOMEM;
-	rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
-
-	while (1) {
-		paddr = pte_val(pte) & PAGE_MASK;
-		vmaddr = __gmap_translate(parent, paddr);
-		if (IS_ERR_VALUE(vmaddr)) {
-			rc = vmaddr;
-			break;
-		}
-		rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
-		if (rc)
-			break;
-		rc = -EAGAIN;
-		sptep = gmap_pte_op_walk(parent, paddr, &ptl);
-		if (sptep) {
-			spin_lock(&sg->guest_table_lock);
-			/* Get page table pointer */
-			tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
-			if (!tptep) {
-				spin_unlock(&sg->guest_table_lock);
-				gmap_pte_op_end(sptep, ptl);
-				radix_tree_preload_end();
-				break;
-			}
-			rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
-			if (rc > 0) {
-				/* Success and a new mapping */
-				gmap_insert_rmap(sg, vmaddr, rmap);
-				rmap = NULL;
-				rc = 0;
-			}
-			gmap_pte_op_end(sptep, ptl);
-			spin_unlock(&sg->guest_table_lock);
-		}
-		radix_tree_preload_end();
-		if (!rc)
-			break;
-		rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
-		if (rc)
-			break;
-	}
-	kfree(rmap);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_shadow_page);
-
-/*
- * gmap_shadow_notify - handle notifications for shadow gmap
- *
- * Called with sg->parent->shadow_lock.
- */
-static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
-			       unsigned long gaddr)
-{
-	struct gmap_rmap *rmap, *rnext, *head;
-	unsigned long start, end, bits, raddr;
-
-	BUG_ON(!gmap_is_shadow(sg));
-
-	spin_lock(&sg->guest_table_lock);
-	if (sg->removed) {
-		spin_unlock(&sg->guest_table_lock);
-		return;
-	}
-	/* Check for top level table */
-	start = sg->orig_asce & _ASCE_ORIGIN;
-	end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
-	if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
-	    gaddr < end) {
-		/* The complete shadow table has to go */
-		gmap_unshadow(sg);
-		spin_unlock(&sg->guest_table_lock);
-		list_del(&sg->list);
-		gmap_put(sg);
-		return;
-	}
-	/* Remove the page table tree from on specific entry */
-	head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
-	gmap_for_each_rmap_safe(rmap, rnext, head) {
-		bits = rmap->raddr & _SHADOW_RMAP_MASK;
-		raddr = rmap->raddr ^ bits;
-		switch (bits) {
-		case _SHADOW_RMAP_REGION1:
-			gmap_unshadow_r2t(sg, raddr);
-			break;
-		case _SHADOW_RMAP_REGION2:
-			gmap_unshadow_r3t(sg, raddr);
-			break;
-		case _SHADOW_RMAP_REGION3:
-			gmap_unshadow_sgt(sg, raddr);
-			break;
-		case _SHADOW_RMAP_SEGMENT:
-			gmap_unshadow_pgt(sg, raddr);
-			break;
-		case _SHADOW_RMAP_PGTABLE:
-			gmap_unshadow_page(sg, raddr);
-			break;
-		}
-		kfree(rmap);
-	}
-	spin_unlock(&sg->guest_table_lock);
-}
-
-/**
- * ptep_notify - call all invalidation callbacks for a specific pte.
- * @mm: pointer to the process mm_struct
- * @vmaddr: virtual address in the process address space
- * @pte: pointer to the page table entry
- * @bits: bits from the pgste that caused the notify call
- *
- * This function is assumed to be called with the page table lock held
- * for the pte to notify.
- */
-void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
-		 pte_t *pte, unsigned long bits)
-{
-	unsigned long offset, gaddr = 0;
-	struct gmap *gmap, *sg, *next;
-
-	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
-	offset = offset * (PAGE_SIZE / sizeof(pte_t));
-	rcu_read_lock();
-	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
-		spin_lock(&gmap->guest_table_lock);
-		gaddr = host_to_guest_lookup(gmap, vmaddr) + offset;
-		spin_unlock(&gmap->guest_table_lock);
-		if (!IS_GADDR_VALID(gaddr))
-			continue;
-
-		if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
-			spin_lock(&gmap->shadow_lock);
-			list_for_each_entry_safe(sg, next,
-						 &gmap->children, list)
-				gmap_shadow_notify(sg, vmaddr, gaddr);
-			spin_unlock(&gmap->shadow_lock);
-		}
-		if (bits & PGSTE_IN_BIT)
-			gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
-	}
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(ptep_notify);
-
-static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
-			     unsigned long gaddr)
-{
-	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
-	gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
-}
-
-/**
- * gmap_pmdp_xchg - exchange a gmap pmd with another
- * @gmap: pointer to the guest address space structure
- * @pmdp: pointer to the pmd entry
- * @new: replacement entry
- * @gaddr: the affected guest address
- *
- * This function is assumed to be called with the guest_table_lock
- * held.
- */
-static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
-			   unsigned long gaddr)
-{
-	gaddr &= HPAGE_MASK;
-	pmdp_notify_gmap(gmap, pmdp, gaddr);
-	new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
-	if (machine_has_tlb_guest())
-		__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
-			    IDTE_GLOBAL);
-	else
-		__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
-	set_pmd(pmdp, new);
-}
-
-static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
-			    int purge)
-{
-	pmd_t *pmdp;
-	struct gmap *gmap;
-	unsigned long gaddr;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
-		spin_lock(&gmap->guest_table_lock);
-		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
-		if (pmdp) {
-			pmdp_notify_gmap(gmap, pmdp, gaddr);
-			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
-						   _SEGMENT_ENTRY_GMAP_UC |
-						   _SEGMENT_ENTRY));
-			if (purge)
-				__pmdp_cspg(pmdp);
-			set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
-		}
-		spin_unlock(&gmap->guest_table_lock);
-	}
-	rcu_read_unlock();
-}
-
-/**
- * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
- *                        flushing
- * @mm: pointer to the process mm_struct
- * @vmaddr: virtual address in the process address space
- */
-void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
-{
-	gmap_pmdp_clear(mm, vmaddr, 0);
-}
-EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
-
-/**
- * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
- * @mm: pointer to the process mm_struct
- * @vmaddr: virtual address in the process address space
- */
-void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
-{
-	unsigned long gaddr;
-	struct gmap *gmap;
-	pmd_t *pmdp;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
-		spin_lock(&gmap->guest_table_lock);
-		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
-		if (pmdp) {
-			pmdp_notify_gmap(gmap, pmdp, gaddr);
-			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
-						   _SEGMENT_ENTRY_GMAP_UC |
-						   _SEGMENT_ENTRY));
-			if (machine_has_tlb_guest())
-				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
-					    gmap->asce, IDTE_LOCAL);
-			else
-				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
-			*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
-		}
-		spin_unlock(&gmap->guest_table_lock);
-	}
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
-
-/**
- * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
- * @mm: pointer to the process mm_struct
- * @vmaddr: virtual address in the process address space
- */
-void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
-{
-	unsigned long gaddr;
-	struct gmap *gmap;
-	pmd_t *pmdp;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
-		spin_lock(&gmap->guest_table_lock);
-		pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
-		if (pmdp) {
-			pmdp_notify_gmap(gmap, pmdp, gaddr);
-			WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
-						   _SEGMENT_ENTRY_GMAP_UC |
-						   _SEGMENT_ENTRY));
-			if (machine_has_tlb_guest())
-				__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
-					    gmap->asce, IDTE_GLOBAL);
-			else
-				__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
-			*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
-		}
-		spin_unlock(&gmap->guest_table_lock);
-	}
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
-
-/**
- * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
- * @gmap: pointer to guest address space
- * @pmdp: pointer to the pmd to be tested
- * @gaddr: virtual address in the guest address space
- *
- * This function is assumed to be called with the guest_table_lock
- * held.
- */
-static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
-					  unsigned long gaddr)
-{
-	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
-		return false;
-
-	/* Already protected memory, which did not change is clean */
-	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
-	    !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
-		return false;
-
-	/* Clear UC indication and reset protection */
-	set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
-	gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
-	return true;
-}
-
-/**
- * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
- * @gmap: pointer to guest address space
- * @bitmap: dirty bitmap for this pmd
- * @gaddr: virtual address in the guest address space
- * @vmaddr: virtual address in the host address space
- *
- * This function is assumed to be called with the guest_table_lock
- * held.
- */
-void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
-			     unsigned long gaddr, unsigned long vmaddr)
-{
-	int i;
-	pmd_t *pmdp;
-	pte_t *ptep;
-	spinlock_t *ptl;
-
-	pmdp = gmap_pmd_op_walk(gmap, gaddr);
-	if (!pmdp)
-		return;
-
-	if (pmd_leaf(*pmdp)) {
-		if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
-			bitmap_fill(bitmap, _PAGE_ENTRIES);
-	} else {
-		for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
-			ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
-			if (!ptep)
-				continue;
-			if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
-				set_bit(i, bitmap);
-			pte_unmap_unlock(ptep, ptl);
-		}
-	}
-	gmap_pmd_op_end(gmap, pmdp);
-}
-EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
-				    unsigned long end, struct mm_walk *walk)
-{
-	struct vm_area_struct *vma = walk->vma;
-
-	split_huge_pmd(vma, pmd, addr);
-	return 0;
-}
-
-static const struct mm_walk_ops thp_split_walk_ops = {
-	.pmd_entry	= thp_split_walk_pmd_entry,
-	.walk_lock	= PGWALK_WRLOCK_VERIFY,
-};
-
-static inline void thp_split_mm(struct mm_struct *mm)
-{
-	struct vm_area_struct *vma;
-	VMA_ITERATOR(vmi, mm, 0);
-
-	for_each_vma(vmi, vma) {
-		vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
-		walk_page_vma(vma, &thp_split_walk_ops, NULL);
-	}
-	mm->def_flags |= VM_NOHUGEPAGE;
-}
-#else
-static inline void thp_split_mm(struct mm_struct *mm)
-{
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-/*
- * switch on pgstes for its userspace process (for kvm)
- */
-int s390_enable_sie(void)
-{
-	struct mm_struct *mm = current->mm;
-
-	/* Do we have pgstes? if yes, we are done */
-	if (mm_has_pgste(mm))
-		return 0;
-	mmap_write_lock(mm);
-	mm->context.has_pgste = 1;
-	/* split thp mappings and disable thp for future mappings */
-	thp_split_mm(mm);
-	mmap_write_unlock(mm);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(s390_enable_sie);
-
-/*
- * Enable storage key handling from now on and initialize the storage
- * keys with the default key.
- */
-static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
-				  unsigned long next, struct mm_walk *walk)
-{
-	/* Clear storage key */
-	ptep_zap_key(walk->mm, addr, pte);
-	return 0;
-}
-
-/*
- * Give a chance to schedule after setting a key to 256 pages.
- * We only hold the mm lock, which is a rwsem and the kvm srcu.
- * Both can sleep.
- */
-static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
-				  unsigned long next, struct mm_walk *walk)
-{
-	cond_resched();
-	return 0;
-}
-
-static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
-				      unsigned long hmask, unsigned long next,
-				      struct mm_walk *walk)
-{
-	pmd_t *pmd = (pmd_t *)pte;
-	unsigned long start, end;
-	struct folio *folio = page_folio(pmd_page(*pmd));
-
-	/*
-	 * The write check makes sure we do not set a key on shared
-	 * memory. This is needed as the walker does not differentiate
-	 * between actual guest memory and the process executable or
-	 * shared libraries.
-	 */
-	if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
-	    !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
-		return 0;
-
-	start = pmd_val(*pmd) & HPAGE_MASK;
-	end = start + HPAGE_SIZE;
-	__storage_key_init_range(start, end);
-	set_bit(PG_arch_1, &folio->flags.f);
-	cond_resched();
-	return 0;
-}
-
-static const struct mm_walk_ops enable_skey_walk_ops = {
-	.hugetlb_entry		= __s390_enable_skey_hugetlb,
-	.pte_entry		= __s390_enable_skey_pte,
-	.pmd_entry		= __s390_enable_skey_pmd,
-	.walk_lock		= PGWALK_WRLOCK,
-};
-
-int s390_enable_skey(void)
-{
-	struct mm_struct *mm = current->mm;
-	int rc = 0;
-
-	mmap_write_lock(mm);
-	if (mm_uses_skeys(mm))
-		goto out_up;
-
-	mm->context.uses_skeys = 1;
-	rc = gmap_helper_disable_cow_sharing();
-	if (rc) {
-		mm->context.uses_skeys = 0;
-		goto out_up;
-	}
-	walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
-
-out_up:
-	mmap_write_unlock(mm);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(s390_enable_skey);
-
-/*
- * Reset CMMA state, make all pages stable again.
- */
-static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
-			     unsigned long next, struct mm_walk *walk)
-{
-	ptep_zap_unused(walk->mm, addr, pte, 1);
-	return 0;
-}
-
-static const struct mm_walk_ops reset_cmma_walk_ops = {
-	.pte_entry		= __s390_reset_cmma,
-	.walk_lock		= PGWALK_WRLOCK,
-};
-
-void s390_reset_cmma(struct mm_struct *mm)
-{
-	mmap_write_lock(mm);
-	walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
-	mmap_write_unlock(mm);
-}
-EXPORT_SYMBOL_GPL(s390_reset_cmma);
-
-#define GATHER_GET_PAGES 32
-
-struct reset_walk_state {
-	unsigned long next;
-	unsigned long count;
-	unsigned long pfns[GATHER_GET_PAGES];
-};
-
-static int s390_gather_pages(pte_t *ptep, unsigned long addr,
-			     unsigned long next, struct mm_walk *walk)
-{
-	struct reset_walk_state *p = walk->private;
-	pte_t pte = READ_ONCE(*ptep);
-
-	if (pte_present(pte)) {
-		/* we have a reference from the mapping, take an extra one */
-		get_page(phys_to_page(pte_val(pte)));
-		p->pfns[p->count] = phys_to_pfn(pte_val(pte));
-		p->next = next;
-		p->count++;
-	}
-	return p->count >= GATHER_GET_PAGES;
-}
-
-static const struct mm_walk_ops gather_pages_ops = {
-	.pte_entry = s390_gather_pages,
-	.walk_lock = PGWALK_RDLOCK,
-};
-
-/*
- * Call the Destroy secure page UVC on each page in the given array of PFNs.
- * Each page needs to have an extra reference, which will be released here.
- */
-void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
-{
-	struct folio *folio;
-	unsigned long i;
-
-	for (i = 0; i < count; i++) {
-		folio = pfn_folio(pfns[i]);
-		/* we always have an extra reference */
-		uv_destroy_folio(folio);
-		/* get rid of the extra reference */
-		folio_put(folio);
-		cond_resched();
-	}
-}
-EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
-
-/**
- * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
- * in the given range of the given address space.
- * @mm: the mm to operate on
- * @start: the start of the range
- * @end: the end of the range
- * @interruptible: if not 0, stop when a fatal signal is received
- *
- * Walk the given range of the given address space and call the destroy
- * secure page UVC on each page. Optionally exit early if a fatal signal is
- * pending.
- *
- * Return: 0 on success, -EINTR if the function stopped before completing
- */
-int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
-			    unsigned long end, bool interruptible)
-{
-	struct reset_walk_state state = { .next = start };
-	int r = 1;
-
-	while (r > 0) {
-		state.count = 0;
-		mmap_read_lock(mm);
-		r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
-		mmap_read_unlock(mm);
-		cond_resched();
-		s390_uv_destroy_pfns(state.count, state.pfns);
-		if (interruptible && fatal_signal_pending(current))
-			return -EINTR;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
-
-/**
- * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
- * @gmap: the gmap whose ASCE needs to be replaced
- *
- * If the ASCE is a SEGMENT type then this function will return -EINVAL,
- * otherwise the pointers in the host_to_guest radix tree will keep pointing
- * to the wrong pages, causing use-after-free and memory corruption.
- * If the allocation of the new top level page table fails, the ASCE is not
- * replaced.
- * In any case, the old ASCE is always removed from the gmap CRST list.
- * Therefore the caller has to make sure to save a pointer to it
- * beforehand, unless a leak is actually intended.
- */
-int s390_replace_asce(struct gmap *gmap)
-{
-	unsigned long asce;
-	struct page *page;
-	void *table;
-
-	/* Replacing segment type ASCEs would cause serious issues */
-	if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
-		return -EINVAL;
-
-	page = gmap_alloc_crst();
-	if (!page)
-		return -ENOMEM;
-	table = page_to_virt(page);
-	memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
-
-	/* Set new table origin while preserving existing ASCE control bits */
-	asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
-	WRITE_ONCE(gmap->asce, asce);
-	WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
-	WRITE_ONCE(gmap->table, table);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(s390_replace_asce);
diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
index d41b19925a5a..d653c64b869a 100644
--- a/arch/s390/mm/gmap_helpers.c
+++ b/arch/s390/mm/gmap_helpers.c
@@ -15,7 +15,6 @@
 #include <linux/pagewalk.h>
 #include <linux/ksm.h>
 #include <asm/gmap_helpers.h>
-#include <asm/pgtable.h>
 
 /**
  * ptep_zap_softleaf_entry() - discard a software leaf entry.
@@ -47,9 +46,7 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
 void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
 {
 	struct vm_area_struct *vma;
-	unsigned long pgstev;
 	spinlock_t *ptl;
-	pgste_t pgste;
 	pte_t *ptep;
 
 	mmap_assert_locked(mm);
@@ -64,18 +61,8 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
 	if (unlikely(!ptep))
 		return;
 	if (pte_swap(*ptep)) {
-		preempt_disable();
-		pgste = pgste_get_lock(ptep);
-		pgstev = pgste_val(pgste);
-
-		if ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
-		    (pgstev & _PGSTE_GPS_ZERO)) {
-			ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
-			pte_clear(mm, vmaddr, ptep);
-		}
-
-		pgste_set_unlock(ptep, pgste);
-		preempt_enable();
+		ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
+		pte_clear(mm, vmaddr, ptep);
 	}
 	pte_unmap_unlock(ptep, ptl);
 }
@@ -108,6 +95,85 @@ void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned lo
 }
 EXPORT_SYMBOL_GPL(gmap_helper_discard);
 
+/**
+ * gmap_helper_try_set_pte_unused() - mark a pte entry as unused
+ * @mm: the mm
+ * @vmaddr: the userspace address whose pte is to be marked
+ *
+ * Mark the pte corresponding to the given address as unused. This will cause
+ * core mm code to just drop this page instead of swapping it.
+ *
+ * This function needs to be called with interrupts disabled (for example
+ * while holding a spinlock), or while holding the mmap lock. Normally this
+ * function is called as a result of an unmap operation, and thus KVM common
+ * code will already hold kvm->mmu_lock in write mode.
+ *
+ * Context: Needs to be called while holding the mmap lock or with interrupts
+ *          disabled.
+ */
+void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr)
+{
+	pmd_t *pmdp, pmd, pmdval;
+	pud_t *pudp, pud;
+	p4d_t *p4dp, p4d;
+	pgd_t *pgdp, pgd;
+	spinlock_t *ptl;	/* Lock for the host (userspace) page table */
+	pte_t *ptep;
+
+	pgdp = pgd_offset(mm, vmaddr);
+	pgd = pgdp_get(pgdp);
+	if (pgd_none(pgd) || !pgd_present(pgd))
+		return;
+
+	p4dp = p4d_offset(pgdp, vmaddr);
+	p4d = p4dp_get(p4dp);
+	if (p4d_none(p4d) || !p4d_present(p4d))
+		return;
+
+	pudp = pud_offset(p4dp, vmaddr);
+	pud = pudp_get(pudp);
+	if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud))
+		return;
+
+	pmdp = pmd_offset(pudp, vmaddr);
+	pmd = pmdp_get_lockless(pmdp);
+	if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd))
+		return;
+
+	ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl);
+	if (!ptep)
+		return;
+
+	/*
+	 * Several paths exist that take the ptl lock and then call the
+	 * mmu_notifier, which takes the mmu_lock. The unmap path, instead,
+	 * takes the mmu_lock in write mode first, and then potentially
+	 * calls this function, which takes the ptl lock. This can lead to a
+	 * deadlock.
+	 * The unused page mechanism is only an optimization, if the
+	 * _PAGE_UNUSED bit is not set, the unused page is swapped as normal
+	 * instead of being discarded.
+	 * If the lock is contended the bit is not set and the deadlock is
+	 * avoided.
+	 */
+	if (spin_trylock(ptl)) {
+		/*
+		 * Make sure the pte we are touching is still the correct
+		 * one. In theory this check should not be needed, but
+		 * better safe than sorry.
+		 * Disabling interrupts or holding the mmap lock is enough to
+		 * guarantee that no concurrent updates to the page tables
+		 * are possible.
+		 */
+		if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp))))
+			__atomic64_or(_PAGE_UNUSED, (long *)ptep);
+		spin_unlock(ptl);
+	}
+
+	pte_unmap(ptep);
+}
+EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused);
+
 static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
 				   unsigned long end, struct mm_walk *walk)
 {
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index d42e61c7594e..35a898e15b1c 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -135,29 +135,6 @@ static inline pte_t __rste_to_pte(unsigned long rste)
 	return __pte(pteval);
 }
 
-static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
-{
-	struct folio *folio;
-	unsigned long size, paddr;
-
-	if (!mm_uses_skeys(mm) ||
-	    rste & _SEGMENT_ENTRY_INVALID)
-		return;
-
-	if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
-		folio = page_folio(pud_page(__pud(rste)));
-		size = PUD_SIZE;
-		paddr = rste & PUD_MASK;
-	} else {
-		folio = page_folio(pmd_page(__pmd(rste)));
-		size = PMD_SIZE;
-		paddr = rste & PMD_MASK;
-	}
-
-	if (!test_and_set_bit(PG_arch_1, &folio->flags.f))
-		__storage_key_init_range(paddr, paddr + size);
-}
-
 void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t pte)
 {
@@ -173,7 +150,6 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 	} else if (likely(pte_present(pte)))
 		rste |= _SEGMENT_ENTRY_LARGE;
 
-	clear_huge_pte_skeys(mm, rste);
 	set_pte(ptep, __pte(rste));
 }
 
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index 01f9b39e65f5..5bee173db72e 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -13,6 +13,7 @@
 #include <asm/page.h>
 
 int __bootdata_preserved(cmma_flag);
+EXPORT_SYMBOL(cmma_flag);
 
 void arch_free_page(struct page *page, int order)
 {
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index d3ce04a4b248..bb29c38ae624 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -16,13 +16,6 @@
 #include <asm/asm.h>
 #include <asm/set_memory.h>
 
-static inline unsigned long sske_frame(unsigned long addr, unsigned char skey)
-{
-	asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0"
-		     : [addr] "+a" (addr) : [skey] "d" (skey));
-	return addr;
-}
-
 void __storage_key_init_range(unsigned long start, unsigned long end)
 {
 	unsigned long boundary, size;
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 7df23528c01b..7ac44543e051 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -114,30 +114,6 @@ err_p4d:
 	return -ENOMEM;
 }
 
-#ifdef CONFIG_PGSTE
-
-struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm)
-{
-	struct ptdesc *ptdesc;
-	u64 *table;
-
-	ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0);
-	if (ptdesc) {
-		table = (u64 *)ptdesc_address(ptdesc);
-		__arch_set_page_dat(table, 1);
-		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
-		memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
-	}
-	return ptdesc;
-}
-
-void page_table_free_pgste(struct ptdesc *ptdesc)
-{
-	pagetable_free(ptdesc);
-}
-
-#endif /* CONFIG_PGSTE */
-
 unsigned long *page_table_alloc_noprof(struct mm_struct *mm)
 {
 	gfp_t gfp = GFP_KERNEL_ACCOUNT;
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 666adcd681ab..4acd8b140c4b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -24,7 +24,6 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 #include <asm/page-states.h>
-#include <asm/pgtable.h>
 #include <asm/machine.h>
 
 pgprot_t pgprot_writecombine(pgprot_t prot)
@@ -116,149 +115,14 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
 	return old;
 }
 
-static inline pgste_t pgste_get(pte_t *ptep)
-{
-	unsigned long pgste = 0;
-#ifdef CONFIG_PGSTE
-	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
-#endif
-	return __pgste(pgste);
-}
-
-static inline void pgste_set(pte_t *ptep, pgste_t pgste)
-{
-#ifdef CONFIG_PGSTE
-	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
-#endif
-}
-
-static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
-				       struct mm_struct *mm)
-{
-#ifdef CONFIG_PGSTE
-	unsigned long address, bits, skey;
-
-	if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID)
-		return pgste;
-	address = pte_val(pte) & PAGE_MASK;
-	skey = (unsigned long) page_get_storage_key(address);
-	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
-	/* Transfer page changed & referenced bit to guest bits in pgste */
-	pgste = set_pgste_bit(pgste, bits << 48); /* GR bit & GC bit */
-	/* Copy page access key and fetch protection bit to pgste */
-	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
-	pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
-#endif
-	return pgste;
-
-}
-
-static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
-				 struct mm_struct *mm)
-{
-#ifdef CONFIG_PGSTE
-	unsigned long address;
-	unsigned long nkey;
-
-	if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID)
-		return;
-	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
-	address = pte_val(entry) & PAGE_MASK;
-	/*
-	 * Set page access key and fetch protection bit from pgste.
-	 * The guest C/R information is still in the PGSTE, set real
-	 * key C/R to 0.
-	 */
-	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
-	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
-	page_set_storage_key(address, nkey, 0);
-#endif
-}
-
-static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
-{
-#ifdef CONFIG_PGSTE
-	if ((pte_val(entry) & _PAGE_PRESENT) &&
-	    (pte_val(entry) & _PAGE_WRITE) &&
-	    !(pte_val(entry) & _PAGE_INVALID)) {
-		if (!machine_has_esop()) {
-			/*
-			 * Without enhanced suppression-on-protection force
-			 * the dirty bit on for all writable ptes.
-			 */
-			entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
-			entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
-		}
-		if (!(pte_val(entry) & _PAGE_PROTECT))
-			/* This pte allows write access, set user-dirty */
-			pgste = set_pgste_bit(pgste, PGSTE_UC_BIT);
-	}
-#endif
-	set_pte(ptep, entry);
-	return pgste;
-}
-
-static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
-				       unsigned long addr,
-				       pte_t *ptep, pgste_t pgste)
-{
-#ifdef CONFIG_PGSTE
-	unsigned long bits;
-
-	bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
-	if (bits) {
-		pgste = __pgste(pgste_val(pgste) ^ bits);
-		ptep_notify(mm, addr, ptep, bits);
-	}
-#endif
-	return pgste;
-}
-
-static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
-				      unsigned long addr, pte_t *ptep)
-{
-	pgste_t pgste = __pgste(0);
-
-	if (mm_has_pgste(mm)) {
-		pgste = pgste_get_lock(ptep);
-		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
-	}
-	return pgste;
-}
-
-static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
-				    unsigned long addr, pte_t *ptep,
-				    pgste_t pgste, pte_t old, pte_t new)
-{
-	if (mm_has_pgste(mm)) {
-		if (pte_val(old) & _PAGE_INVALID)
-			pgste_set_key(ptep, pgste, new, mm);
-		if (pte_val(new) & _PAGE_INVALID) {
-			pgste = pgste_update_all(old, pgste, mm);
-			if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
-			    _PGSTE_GPS_USAGE_UNUSED)
-				old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
-		}
-		pgste = pgste_set_pte(ptep, pgste, new);
-		pgste_set_unlock(ptep, pgste);
-	} else {
-		set_pte(ptep, new);
-	}
-	return old;
-}
-
 pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
 		       pte_t *ptep, pte_t new)
 {
-	pgste_t pgste;
 	pte_t old;
-	int nodat;
 
 	preempt_disable();
-	pgste = ptep_xchg_start(mm, addr, ptep);
-	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
-	old = ptep_flush_direct(mm, addr, ptep, nodat);
-	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
+	old = ptep_flush_direct(mm, addr, ptep, 1);
+	set_pte(ptep, new);
 	preempt_enable();
 	return old;
 }
@@ -292,15 +156,11 @@ EXPORT_SYMBOL(ptep_reset_dat_prot);
 pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t new)
 {
-	pgste_t pgste;
 	pte_t old;
-	int nodat;
 
 	preempt_disable();
-	pgste = ptep_xchg_start(mm, addr, ptep);
-	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
-	old = ptep_flush_lazy(mm, addr, ptep, nodat);
-	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
+	old = ptep_flush_lazy(mm, addr, ptep, 1);
+	set_pte(ptep, new);
 	preempt_enable();
 	return old;
 }
@@ -309,47 +169,22 @@ EXPORT_SYMBOL(ptep_xchg_lazy);
 pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t *ptep)
 {
-	pgste_t pgste;
-	pte_t old;
-	int nodat;
-	struct mm_struct *mm = vma->vm_mm;
-
-	pgste = ptep_xchg_start(mm, addr, ptep);
-	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
-	old = ptep_flush_lazy(mm, addr, ptep, nodat);
-	if (mm_has_pgste(mm)) {
-		pgste = pgste_update_all(old, pgste, mm);
-		pgste_set(ptep, pgste);
-	}
-	return old;
+	return ptep_flush_lazy(vma->vm_mm, addr, ptep, 1);
 }
 
 void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t *ptep, pte_t old_pte, pte_t pte)
 {
-	pgste_t pgste;
-	struct mm_struct *mm = vma->vm_mm;
-
-	if (mm_has_pgste(mm)) {
-		pgste = pgste_get(ptep);
-		pgste_set_key(ptep, pgste, pte, mm);
-		pgste = pgste_set_pte(ptep, pgste, pte);
-		pgste_set_unlock(ptep, pgste);
-	} else {
-		set_pte(ptep, pte);
-	}
+	set_pte(ptep, pte);
 }
 
 static inline void pmdp_idte_local(struct mm_struct *mm,
 				   unsigned long addr, pmd_t *pmdp)
 {
 	if (machine_has_tlb_guest())
-		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
-			    mm->context.asce, IDTE_LOCAL);
+		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_LOCAL);
 	else
 		__pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
-	if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
-		gmap_pmdp_idte_local(mm, addr);
 }
 
 static inline void pmdp_idte_global(struct mm_struct *mm,
@@ -358,12 +193,8 @@ static inline void pmdp_idte_global(struct mm_struct *mm,
 	if (machine_has_tlb_guest()) {
 		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
 			    mm->context.asce, IDTE_GLOBAL);
-		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
-			gmap_pmdp_idte_global(mm, addr);
 	} else {
 		__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
-		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
-			gmap_pmdp_idte_global(mm, addr);
 	}
 }
 
@@ -398,8 +229,6 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
 			  cpumask_of(smp_processor_id()))) {
 		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
 		mm->context.flush_mm = 1;
-		if (mm_has_pgste(mm))
-			gmap_pmdp_invalidate(mm, addr);
 	} else {
 		pmdp_idte_global(mm, addr, pmdp);
 	}
@@ -407,40 +236,6 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
 	return old;
 }
 
-#ifdef CONFIG_PGSTE
-static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
-{
-	struct vm_area_struct *vma;
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-
-	/* We need a valid VMA, otherwise this is clearly a fault. */
-	vma = vma_lookup(mm, addr);
-	if (!vma)
-		return -EFAULT;
-
-	pgd = pgd_offset(mm, addr);
-	if (!pgd_present(*pgd))
-		return -ENOENT;
-
-	p4d = p4d_offset(pgd, addr);
-	if (!p4d_present(*p4d))
-		return -ENOENT;
-
-	pud = pud_offset(p4d, addr);
-	if (!pud_present(*pud))
-		return -ENOENT;
-
-	/* Large PUDs are not supported yet. */
-	if (pud_leaf(*pud))
-		return -EFAULT;
-
-	*pmdp = pmd_offset(pud, addr);
-	return 0;
-}
-#endif
-
 pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
 		       pmd_t *pmdp, pmd_t new)
 {
@@ -558,598 +353,3 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 	return pgtable;
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-#ifdef CONFIG_PGSTE
-void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, pte_t entry)
-{
-	pgste_t pgste;
-
-	/* the mm_has_pgste() check is done in set_pte_at() */
-	preempt_disable();
-	pgste = pgste_get_lock(ptep);
-	pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO);
-	pgste_set_key(ptep, pgste, entry, mm);
-	pgste = pgste_set_pte(ptep, pgste, entry);
-	pgste_set_unlock(ptep, pgste);
-	preempt_enable();
-}
-
-void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	pgste_t pgste;
-
-	preempt_disable();
-	pgste = pgste_get_lock(ptep);
-	pgste = set_pgste_bit(pgste, PGSTE_IN_BIT);
-	pgste_set_unlock(ptep, pgste);
-	preempt_enable();
-}
-
-/**
- * ptep_force_prot - change access rights of a locked pte
- * @mm: pointer to the process mm_struct
- * @addr: virtual address in the guest address space
- * @ptep: pointer to the page table entry
- * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
- * @bit: pgste bit to set (e.g. for notification)
- *
- * Returns 0 if the access rights were changed and -EAGAIN if the current
- * and requested access rights are incompatible.
- */
-int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
-		    pte_t *ptep, int prot, unsigned long bit)
-{
-	pte_t entry;
-	pgste_t pgste;
-	int pte_i, pte_p, nodat;
-
-	pgste = pgste_get_lock(ptep);
-	entry = *ptep;
-	/* Check pte entry after all locks have been acquired */
-	pte_i = pte_val(entry) & _PAGE_INVALID;
-	pte_p = pte_val(entry) & _PAGE_PROTECT;
-	if ((pte_i && (prot != PROT_NONE)) ||
-	    (pte_p && (prot & PROT_WRITE))) {
-		pgste_set_unlock(ptep, pgste);
-		return -EAGAIN;
-	}
-	/* Change access rights and set pgste bit */
-	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
-	if (prot == PROT_NONE && !pte_i) {
-		ptep_flush_direct(mm, addr, ptep, nodat);
-		pgste = pgste_update_all(entry, pgste, mm);
-		entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
-	}
-	if (prot == PROT_READ && !pte_p) {
-		ptep_flush_direct(mm, addr, ptep, nodat);
-		entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
-		entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
-	}
-	pgste = set_pgste_bit(pgste, bit);
-	pgste = pgste_set_pte(ptep, pgste, entry);
-	pgste_set_unlock(ptep, pgste);
-	return 0;
-}
-
-int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
-		    pte_t *sptep, pte_t *tptep, pte_t pte)
-{
-	pgste_t spgste, tpgste;
-	pte_t spte, tpte;
-	int rc = -EAGAIN;
-
-	if (!(pte_val(*tptep) & _PAGE_INVALID))
-		return 0;	/* already shadowed */
-	spgste = pgste_get_lock(sptep);
-	spte = *sptep;
-	if (!(pte_val(spte) & _PAGE_INVALID) &&
-	    !((pte_val(spte) & _PAGE_PROTECT) &&
-	      !(pte_val(pte) & _PAGE_PROTECT))) {
-		spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT);
-		tpgste = pgste_get_lock(tptep);
-		tpte = __pte((pte_val(spte) & PAGE_MASK) |
-			     (pte_val(pte) & _PAGE_PROTECT));
-		/* don't touch the storage key - it belongs to parent pgste */
-		tpgste = pgste_set_pte(tptep, tpgste, tpte);
-		pgste_set_unlock(tptep, tpgste);
-		rc = 1;
-	}
-	pgste_set_unlock(sptep, spgste);
-	return rc;
-}
-
-void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
-{
-	pgste_t pgste;
-	int nodat;
-
-	pgste = pgste_get_lock(ptep);
-	/* notifier is called by the caller */
-	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
-	ptep_flush_direct(mm, saddr, ptep, nodat);
-	/* don't touch the storage key - it belongs to parent pgste */
-	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
-	pgste_set_unlock(ptep, pgste);
-}
-
-static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
-{
-	if (softleaf_is_swap(entry))
-		dec_mm_counter(mm, MM_SWAPENTS);
-	else if (softleaf_is_migration(entry)) {
-		struct folio *folio = softleaf_to_folio(entry);
-
-		dec_mm_counter(mm, mm_counter(folio));
-	}
-	free_swap_and_cache(entry);
-}
-
-void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, int reset)
-{
-	unsigned long pgstev;
-	pgste_t pgste;
-	pte_t pte;
-
-	/* Zap unused and logically-zero pages */
-	preempt_disable();
-	pgste = pgste_get_lock(ptep);
-	pgstev = pgste_val(pgste);
-	pte = *ptep;
-	if (!reset && pte_swap(pte) &&
-	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
-	     (pgstev & _PGSTE_GPS_ZERO))) {
-		ptep_zap_softleaf_entry(mm, softleaf_from_pte(pte));
-		pte_clear(mm, addr, ptep);
-	}
-	if (reset)
-		pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
-	pgste_set_unlock(ptep, pgste);
-	preempt_enable();
-}
-
-void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	unsigned long ptev;
-	pgste_t pgste;
-
-	/* Clear storage key ACC and F, but set R/C */
-	preempt_disable();
-	pgste = pgste_get_lock(ptep);
-	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
-	pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT);
-	ptev = pte_val(*ptep);
-	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
-		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
-	pgste_set_unlock(ptep, pgste);
-	preempt_enable();
-}
-
-/*
- * Test and reset if a guest page is dirty
- */
-bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
-		       pte_t *ptep)
-{
-	pgste_t pgste;
-	pte_t pte;
-	bool dirty;
-	int nodat;
-
-	pgste = pgste_get_lock(ptep);
-	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
-	pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT);
-	pte = *ptep;
-	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
-		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
-		nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
-		ptep_ipte_global(mm, addr, ptep, nodat);
-		if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE))
-			pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
-		else
-			pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
-		set_pte(ptep, pte);
-	}
-	pgste_set_unlock(ptep, pgste);
-	return dirty;
-}
-EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);
-
-int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
-			  unsigned char key, bool nq)
-{
-	unsigned long keyul, paddr;
-	spinlock_t *ptl;
-	pgste_t old, new;
-	pmd_t *pmdp;
-	pte_t *ptep;
-
-	/*
-	 * If we don't have a PTE table and if there is no huge page mapped,
-	 * we can ignore attempts to set the key to 0, because it already is 0.
-	 */
-	switch (pmd_lookup(mm, addr, &pmdp)) {
-	case -ENOENT:
-		return key ? -EFAULT : 0;
-	case 0:
-		break;
-	default:
-		return -EFAULT;
-	}
-again:
-	ptl = pmd_lock(mm, pmdp);
-	if (!pmd_present(*pmdp)) {
-		spin_unlock(ptl);
-		return key ? -EFAULT : 0;
-	}
-
-	if (pmd_leaf(*pmdp)) {
-		paddr = pmd_val(*pmdp) & HPAGE_MASK;
-		paddr |= addr & ~HPAGE_MASK;
-		/*
-		 * Huge pmds need quiescing operations, they are
-		 * always mapped.
-		 */
-		page_set_storage_key(paddr, key, 1);
-		spin_unlock(ptl);
-		return 0;
-	}
-	spin_unlock(ptl);
-
-	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
-	if (!ptep)
-		goto again;
-	new = old = pgste_get_lock(ptep);
-	new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT |
-				   PGSTE_ACC_BITS | PGSTE_FP_BIT);
-	keyul = (unsigned long) key;
-	new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48);
-	new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
-	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
-		unsigned long bits, skey;
-
-		paddr = pte_val(*ptep) & PAGE_MASK;
-		skey = (unsigned long) page_get_storage_key(paddr);
-		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
-		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
-		/* Set storage key ACC and FP */
-		page_set_storage_key(paddr, skey, !nq);
-		/* Merge host changed & referenced into pgste  */
-		new = set_pgste_bit(new, bits << 52);
-	}
-	/* changing the guest storage key is considered a change of the page */
-	if ((pgste_val(new) ^ pgste_val(old)) &
-	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
-		new = set_pgste_bit(new, PGSTE_UC_BIT);
-
-	pgste_set_unlock(ptep, new);
-	pte_unmap_unlock(ptep, ptl);
-	return 0;
-}
-EXPORT_SYMBOL(set_guest_storage_key);
-
-/*
- * Conditionally set a guest storage key (handling csske).
- * oldkey will be updated when either mr or mc is set and a pointer is given.
- *
- * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest
- * storage key was updated and -EFAULT on access errors.
- */
-int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
-			       unsigned char key, unsigned char *oldkey,
-			       bool nq, bool mr, bool mc)
-{
-	unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
-	int rc;
-
-	/* we can drop the pgste lock between getting and setting the key */
-	if (mr | mc) {
-		rc = get_guest_storage_key(current->mm, addr, &tmp);
-		if (rc)
-			return rc;
-		if (oldkey)
-			*oldkey = tmp;
-		if (!mr)
-			mask |= _PAGE_REFERENCED;
-		if (!mc)
-			mask |= _PAGE_CHANGED;
-		if (!((tmp ^ key) & mask))
-			return 0;
-	}
-	rc = set_guest_storage_key(current->mm, addr, key, nq);
-	return rc < 0 ? rc : 1;
-}
-EXPORT_SYMBOL(cond_set_guest_storage_key);
-
-/*
- * Reset a guest reference bit (rrbe), returning the reference and changed bit.
- *
- * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
- */
-int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
-{
-	spinlock_t *ptl;
-	unsigned long paddr;
-	pgste_t old, new;
-	pmd_t *pmdp;
-	pte_t *ptep;
-	int cc = 0;
-
-	/*
-	 * If we don't have a PTE table and if there is no huge page mapped,
-	 * the storage key is 0 and there is nothing for us to do.
-	 */
-	switch (pmd_lookup(mm, addr, &pmdp)) {
-	case -ENOENT:
-		return 0;
-	case 0:
-		break;
-	default:
-		return -EFAULT;
-	}
-again:
-	ptl = pmd_lock(mm, pmdp);
-	if (!pmd_present(*pmdp)) {
-		spin_unlock(ptl);
-		return 0;
-	}
-
-	if (pmd_leaf(*pmdp)) {
-		paddr = pmd_val(*pmdp) & HPAGE_MASK;
-		paddr |= addr & ~HPAGE_MASK;
-		cc = page_reset_referenced(paddr);
-		spin_unlock(ptl);
-		return cc;
-	}
-	spin_unlock(ptl);
-
-	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
-	if (!ptep)
-		goto again;
-	new = old = pgste_get_lock(ptep);
-	/* Reset guest reference bit only */
-	new = clear_pgste_bit(new, PGSTE_GR_BIT);
-
-	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
-		paddr = pte_val(*ptep) & PAGE_MASK;
-		cc = page_reset_referenced(paddr);
-		/* Merge real referenced bit into host-set */
-		new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT);
-	}
-	/* Reflect guest's logical view, not physical */
-	cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
-	/* Changing the guest storage key is considered a change of the page */
-	if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
-		new = set_pgste_bit(new, PGSTE_UC_BIT);
-
-	pgste_set_unlock(ptep, new);
-	pte_unmap_unlock(ptep, ptl);
-	return cc;
-}
-EXPORT_SYMBOL(reset_guest_reference_bit);
-
-int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
-			  unsigned char *key)
-{
-	unsigned long paddr;
-	spinlock_t *ptl;
-	pgste_t pgste;
-	pmd_t *pmdp;
-	pte_t *ptep;
-
-	/*
-	 * If we don't have a PTE table and if there is no huge page mapped,
-	 * the storage key is 0.
-	 */
-	*key = 0;
-
-	switch (pmd_lookup(mm, addr, &pmdp)) {
-	case -ENOENT:
-		return 0;
-	case 0:
-		break;
-	default:
-		return -EFAULT;
-	}
-again:
-	ptl = pmd_lock(mm, pmdp);
-	if (!pmd_present(*pmdp)) {
-		spin_unlock(ptl);
-		return 0;
-	}
-
-	if (pmd_leaf(*pmdp)) {
-		paddr = pmd_val(*pmdp) & HPAGE_MASK;
-		paddr |= addr & ~HPAGE_MASK;
-		*key = page_get_storage_key(paddr);
-		spin_unlock(ptl);
-		return 0;
-	}
-	spin_unlock(ptl);
-
-	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
-	if (!ptep)
-		goto again;
-	pgste = pgste_get_lock(ptep);
-	*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
-	paddr = pte_val(*ptep) & PAGE_MASK;
-	if (!(pte_val(*ptep) & _PAGE_INVALID))
-		*key = page_get_storage_key(paddr);
-	/* Reflect guest's logical view, not physical */
-	*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
-	pgste_set_unlock(ptep, pgste);
-	pte_unmap_unlock(ptep, ptl);
-	return 0;
-}
-EXPORT_SYMBOL(get_guest_storage_key);
-
-/**
- * pgste_perform_essa - perform ESSA actions on the PGSTE.
- * @mm: the memory context. It must have PGSTEs, no check is performed here!
- * @hva: the host virtual address of the page whose PGSTE is to be processed
- * @orc: the specific action to perform, see the ESSA_SET_* macros.
- * @oldpte: the PTE will be saved there if the pointer is not NULL.
- * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL.
- *
- * Return: 1 if the page is to be added to the CBRL, otherwise 0,
- *	   or < 0 in case of error. -EINVAL is returned for invalid values
- *	   of orc, -EFAULT for invalid addresses.
- */
-int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
-			unsigned long *oldpte, unsigned long *oldpgste)
-{
-	struct vm_area_struct *vma;
-	unsigned long pgstev;
-	spinlock_t *ptl;
-	pgste_t pgste;
-	pte_t *ptep;
-	int res = 0;
-
-	WARN_ON_ONCE(orc > ESSA_MAX);
-	if (unlikely(orc > ESSA_MAX))
-		return -EINVAL;
-
-	vma = vma_lookup(mm, hva);
-	if (!vma || is_vm_hugetlb_page(vma))
-		return -EFAULT;
-	ptep = get_locked_pte(mm, hva, &ptl);
-	if (unlikely(!ptep))
-		return -EFAULT;
-	pgste = pgste_get_lock(ptep);
-	pgstev = pgste_val(pgste);
-	if (oldpte)
-		*oldpte = pte_val(*ptep);
-	if (oldpgste)
-		*oldpgste = pgstev;
-
-	switch (orc) {
-	case ESSA_GET_STATE:
-		break;
-	case ESSA_SET_STABLE:
-		pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
-		pgstev |= _PGSTE_GPS_USAGE_STABLE;
-		break;
-	case ESSA_SET_UNUSED:
-		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
-		pgstev |= _PGSTE_GPS_USAGE_UNUSED;
-		if (pte_val(*ptep) & _PAGE_INVALID)
-			res = 1;
-		break;
-	case ESSA_SET_VOLATILE:
-		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
-		pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
-		if (pte_val(*ptep) & _PAGE_INVALID)
-			res = 1;
-		break;
-	case ESSA_SET_POT_VOLATILE:
-		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
-		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
-			pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE;
-			break;
-		}
-		if (pgstev & _PGSTE_GPS_ZERO) {
-			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
-			break;
-		}
-		if (!(pgstev & PGSTE_GC_BIT)) {
-			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
-			res = 1;
-			break;
-		}
-		break;
-	case ESSA_SET_STABLE_RESIDENT:
-		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
-		pgstev |= _PGSTE_GPS_USAGE_STABLE;
-		/*
-		 * Since the resident state can go away any time after this
-		 * call, we will not make this page resident. We can revisit
-		 * this decision if a guest will ever start using this.
-		 */
-		break;
-	case ESSA_SET_STABLE_IF_RESIDENT:
-		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
-			pgstev &= ~_PGSTE_GPS_USAGE_MASK;
-			pgstev |= _PGSTE_GPS_USAGE_STABLE;
-		}
-		break;
-	case ESSA_SET_STABLE_NODAT:
-		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
-		pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
-		break;
-	default:
-		/* we should never get here! */
-		break;
-	}
-	/* If we are discarding a page, set it to logical zero */
-	if (res)
-		pgstev |= _PGSTE_GPS_ZERO;
-
-	pgste = __pgste(pgstev);
-	pgste_set_unlock(ptep, pgste);
-	pte_unmap_unlock(ptep, ptl);
-	return res;
-}
-EXPORT_SYMBOL(pgste_perform_essa);
-
-/**
- * set_pgste_bits - set specific PGSTE bits.
- * @mm: the memory context. It must have PGSTEs, no check is performed here!
- * @hva: the host virtual address of the page whose PGSTE is to be processed
- * @bits: a bitmask representing the bits that will be touched
- * @value: the values of the bits to be written. Only the bits in the mask
- *	   will be written.
- *
- * Return: 0 on success, < 0 in case of error.
- */
-int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
-			unsigned long bits, unsigned long value)
-{
-	struct vm_area_struct *vma;
-	spinlock_t *ptl;
-	pgste_t new;
-	pte_t *ptep;
-
-	vma = vma_lookup(mm, hva);
-	if (!vma || is_vm_hugetlb_page(vma))
-		return -EFAULT;
-	ptep = get_locked_pte(mm, hva, &ptl);
-	if (unlikely(!ptep))
-		return -EFAULT;
-	new = pgste_get_lock(ptep);
-
-	new = clear_pgste_bit(new, bits);
-	new = set_pgste_bit(new, value & bits);
-
-	pgste_set_unlock(ptep, new);
-	pte_unmap_unlock(ptep, ptl);
-	return 0;
-}
-EXPORT_SYMBOL(set_pgste_bits);
-
-/**
- * get_pgste - get the current PGSTE for the given address.
- * @mm: the memory context. It must have PGSTEs, no check is performed here!
- * @hva: the host virtual address of the page whose PGSTE is to be processed
- * @pgstep: will be written with the current PGSTE for the given address.
- *
- * Return: 0 on success, < 0 in case of error.
- */
-int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
-{
-	struct vm_area_struct *vma;
-	spinlock_t *ptl;
-	pte_t *ptep;
-
-	vma = vma_lookup(mm, hva);
-	if (!vma || is_vm_hugetlb_page(vma))
-		return -EFAULT;
-	ptep = get_locked_pte(mm, hva, &ptl);
-	if (unlikely(!ptep))
-		return -EFAULT;
-	*pgstep = pgste_val(pgste_get(ptep));
-	pte_unmap_unlock(ptep, ptl);
-	return 0;
-}
-EXPORT_SYMBOL(get_pgste);
-#endif