diff options
| author | Paolo Bonzini <pbonzini@redhat.com> | 2026-02-11 18:52:27 +0100 |
|---|---|---|
| committer | Paolo Bonzini <pbonzini@redhat.com> | 2026-02-11 18:52:27 +0100 |
| commit | b1195183ed42f1522fae3fe44ebee3af437aa000 (patch) | |
| tree | a0ea8797c7b816bf50fe57b0a549da325afbd88d /arch | |
| parent | bf2c3138ae3694d4687cbe451c774c288ae2ad06 (diff) | |
| parent | e3372ffb5f9e2dda3da259b768aab6271672b90d (diff) | |
Merge tag 'kvm-s390-next-7.0-1' of https://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into HEAD
- gmap rewrite: completely new memory management for kvm/s390
- vSIE improvement
- maintainership change for s390 vfio-pci
- small quality of life improvement for protected guests
Diffstat (limited to 'arch')
44 files changed, 5562 insertions, 5490 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 0e5fad5f06ca..961cbf023c1b 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -32,9 +32,6 @@ config GENERIC_BUG_RELATIVE_POINTERS config GENERIC_LOCKBREAK def_bool y if PREEMPTION -config PGSTE - def_bool y if KVM - config AUDIT_ARCH def_bool y diff --git a/arch/s390/include/asm/dat-bits.h b/arch/s390/include/asm/dat-bits.h index 8d65eec2f124..c40874e0e426 100644 --- a/arch/s390/include/asm/dat-bits.h +++ b/arch/s390/include/asm/dat-bits.h @@ -9,6 +9,32 @@ #ifndef _S390_DAT_BITS_H #define _S390_DAT_BITS_H +/* + * vaddress union in order to easily decode a virtual address into its + * region first index, region second index etc. parts. + */ +union vaddress { + unsigned long addr; + struct { + unsigned long rfx : 11; + unsigned long rsx : 11; + unsigned long rtx : 11; + unsigned long sx : 11; + unsigned long px : 8; + unsigned long bx : 12; + }; + struct { + unsigned long rfx01 : 2; + unsigned long : 9; + unsigned long rsx01 : 2; + unsigned long : 9; + unsigned long rtx01 : 2; + unsigned long : 9; + unsigned long sx01 : 2; + unsigned long : 29; + }; +}; + union asce { unsigned long val; struct { @@ -98,7 +124,8 @@ union region3_table_entry { struct { unsigned long : 53; unsigned long fc: 1; /* Format-Control */ - unsigned long : 4; + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 3; unsigned long i : 1; /* Region-Invalid Bit */ unsigned long cr: 1; /* Common-Region Bit */ unsigned long tt: 2; /* Table-Type Bits */ @@ -140,7 +167,8 @@ union segment_table_entry { struct { unsigned long : 53; unsigned long fc: 1; /* Format-Control */ - unsigned long : 4; + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long : 3; unsigned long i : 1; /* Segment-Invalid Bit */ unsigned long cs: 1; /* Common-Segment Bit */ unsigned long tt: 2; /* Table-Type Bits */ diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h deleted file mode 100644 index 66c5808fd011..000000000000 --- a/arch/s390/include/asm/gmap.h +++ /dev/null @@ -1,174 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * KVM guest address space mapping code - * - * Copyright IBM Corp. 2007, 2016 - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - */ - -#ifndef _ASM_S390_GMAP_H -#define _ASM_S390_GMAP_H - -#include <linux/radix-tree.h> -#include <linux/refcount.h> - -/* Generic bits for GMAP notification on DAT table entry changes. */ -#define GMAP_NOTIFY_SHADOW 0x2 -#define GMAP_NOTIFY_MPROT 0x1 - -/* Status bits only for huge segment entries */ -#define _SEGMENT_ENTRY_GMAP_IN 0x0800 /* invalidation notify bit */ -#define _SEGMENT_ENTRY_GMAP_UC 0x0002 /* dirty (migration) */ - -/** - * struct gmap_struct - guest address space - * @list: list head for the mm->context gmap list - * @mm: pointer to the parent mm_struct - * @guest_to_host: radix tree with guest to host address translation - * @host_to_guest: radix tree with pointer to segment table entries - * @guest_table_lock: spinlock to protect all entries in the guest page table - * @ref_count: reference counter for the gmap structure - * @table: pointer to the page directory - * @asce: address space control element for gmap page table - * @pfault_enabled: defines if pfaults are applicable for the guest - * @guest_handle: protected virtual machine handle for the ultravisor - * @host_to_rmap: radix tree with gmap_rmap lists - * @children: list of shadow gmap structures - * @shadow_lock: spinlock to protect the shadow gmap list - * @parent: pointer to the parent gmap for shadow guest address spaces - * @orig_asce: ASCE for which the shadow page table has been created - * @edat_level: edat level to be used for the shadow translation - * @removed: flag to indicate if a shadow guest address space has been removed - * @initialized: flag to indicate if a shadow guest address space can be used - */ -struct gmap { - struct list_head list; - struct mm_struct *mm; - struct radix_tree_root guest_to_host; - struct radix_tree_root host_to_guest; - spinlock_t guest_table_lock; - refcount_t ref_count; - unsigned long *table; - unsigned long asce; - unsigned long asce_end; - void *private; - bool pfault_enabled; - /* only set for protected virtual machines */ - unsigned long guest_handle; - /* Additional data for shadow guest address spaces */ - struct radix_tree_root host_to_rmap; - struct list_head children; - spinlock_t shadow_lock; - struct gmap *parent; - unsigned long orig_asce; - int edat_level; - bool removed; - bool initialized; -}; - -/** - * struct gmap_rmap - reverse mapping for shadow page table entries - * @next: pointer to next rmap in the list - * @raddr: virtual rmap address in the shadow guest address space - */ -struct gmap_rmap { - struct gmap_rmap *next; - unsigned long raddr; -}; - -#define gmap_for_each_rmap(pos, head) \ - for (pos = (head); pos; pos = pos->next) - -#define gmap_for_each_rmap_safe(pos, n, head) \ - for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n) - -/** - * struct gmap_notifier - notify function block for page invalidation - * @notifier_call: address of callback function - */ -struct gmap_notifier { - struct list_head list; - struct rcu_head rcu; - void (*notifier_call)(struct gmap *gmap, unsigned long start, - unsigned long end); -}; - -static inline int gmap_is_shadow(struct gmap *gmap) -{ - return !!gmap->parent; -} - -struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); -void gmap_remove(struct gmap *gmap); -struct gmap *gmap_get(struct gmap *gmap); -void gmap_put(struct gmap *gmap); -void gmap_free(struct gmap *gmap); -struct gmap *gmap_alloc(unsigned long limit); - -int gmap_map_segment(struct gmap *gmap, unsigned long from, - unsigned long to, unsigned long len); -int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); -unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); -int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); -void __gmap_zap(struct gmap *, unsigned long gaddr); -void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); - -int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); - -void gmap_unshadow(struct gmap *sg); -int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, - int fake); -int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, - int fake); -int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, - int fake); -int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, - int fake); -int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); - -void gmap_register_pte_notifier(struct gmap_notifier *); -void gmap_unregister_pte_notifier(struct gmap_notifier *); - -int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits); - -void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], - unsigned long gaddr, unsigned long vmaddr); -int s390_replace_asce(struct gmap *gmap); -void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); -int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, - unsigned long end, bool interruptible); -unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level); - -/** - * s390_uv_destroy_range - Destroy a range of pages in the given mm. - * @mm: the mm on which to operate on - * @start: the start of the range - * @end: the end of the range - * - * This function will call cond_sched, so it should not generate stalls, but - * it will otherwise only return when it completed. - */ -static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - (void)__s390_uv_destroy_range(mm, start, end, false); -} - -/** - * s390_uv_destroy_range_interruptible - Destroy a range of pages in the - * given mm, but stop when a fatal signal is received. - * @mm: the mm on which to operate on - * @start: the start of the range - * @end: the end of the range - * - * This function will call cond_sched, so it should not generate stalls. If - * a fatal signal is received, it will return with -EINTR immediately, - * without finishing destroying the whole range. Upon successful - * completion, 0 is returned. - */ -static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - return __s390_uv_destroy_range(mm, start, end, true); -} -#endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h index 5356446a61c4..2d3ae421077e 100644 --- a/arch/s390/include/asm/gmap_helpers.h +++ b/arch/s390/include/asm/gmap_helpers.h @@ -11,5 +11,6 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr); void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end); int gmap_helper_disable_cow_sharing(void); +void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr); #endif /* _ASM_S390_GMAP_HELPERS_H */ diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 69131736daaa..6983e52eaf81 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -37,12 +37,6 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, return __huge_ptep_get_and_clear(mm, addr, ptep); } -static inline void arch_clear_hugetlb_flags(struct folio *folio) -{ - clear_bit(PG_arch_1, &folio->flags.f); -} -#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags - #define __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index ae1223264d3c..64a50f0862aa 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -27,6 +27,7 @@ #include <asm/isc.h> #include <asm/guarded_storage.h> +#define KVM_HAVE_MMU_RWLOCK #define KVM_MAX_VCPUS 255 #define KVM_INTERNAL_MEM_SLOTS 1 @@ -441,6 +442,7 @@ struct kvm_vcpu_arch { bool acrs_loaded; struct kvm_s390_pv_vcpu pv; union diag318_info diag318_info; + struct kvm_s390_mmu_cache *mc; }; struct kvm_vm_stat { @@ -630,8 +632,12 @@ struct kvm_s390_pv { void *set_aside; struct list_head need_cleanup; struct mmu_notifier mmu_notifier; + /* Protects against concurrent import-like operations */ + struct mutex import_lock; }; +struct kvm_s390_mmu_cache; + struct kvm_arch { struct esca_block *sca; debug_info_t *dbf; @@ -671,6 +677,7 @@ struct kvm_arch { struct kvm_s390_pv pv; struct list_head kzdev_list; spinlock_t kzdev_list_lock; + struct kvm_s390_mmu_cache *mc; }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index f07e49b419ab..d4fd7bf3692e 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -19,23 +19,10 @@ typedef struct { /* The mmu context belongs to a secure guest. */ atomic_t protected_count; /* - * The following bitfields need a down_write on the mm - * semaphore when they are written to. As they are only - * written once, they can be read without a lock. - */ - /* The mmu context uses extended page tables. */ - unsigned int has_pgste:1; - /* The mmu context uses storage keys. */ - unsigned int uses_skeys:1; - /* The mmu context uses CMM. */ - unsigned int uses_cmm:1; - /* * The mmu context allows COW-sharing of memory pages (KSM, zeropage). * Note that COW-sharing during fork() is currently always allowed. */ unsigned int allow_cow_sharing:1; - /* The gmaps associated with this context are allowed to use huge pages. */ - unsigned int allow_gmap_hpage_1m:1; } mm_context_t; #define INIT_MM_CONTEXT(name) \ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index d9b8501bc93d..bd1ef5e2d2eb 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -29,12 +29,8 @@ static inline int init_new_context(struct task_struct *tsk, atomic_set(&mm->context.protected_count, 0); mm->context.gmap_asce = 0; mm->context.flush_mm = 0; -#ifdef CONFIG_PGSTE - mm->context.has_pgste = 0; - mm->context.uses_skeys = 0; - mm->context.uses_cmm = 0; +#if IS_ENABLED(CONFIG_KVM) mm->context.allow_cow_sharing = 1; - mm->context.allow_gmap_hpage_1m = 0; #endif switch (mm->context.asce_limit) { default: diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index c1d63b613bf9..6de2f4d25b63 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -78,7 +78,6 @@ static inline void copy_page(void *to, void *from) #ifdef STRICT_MM_TYPECHECKS typedef struct { unsigned long pgprot; } pgprot_t; -typedef struct { unsigned long pgste; } pgste_t; typedef struct { unsigned long pte; } pte_t; typedef struct { unsigned long pmd; } pmd_t; typedef struct { unsigned long pud; } pud_t; @@ -94,7 +93,6 @@ static __always_inline unsigned long name ## _val(name ## _t name) \ #else /* STRICT_MM_TYPECHECKS */ typedef unsigned long pgprot_t; -typedef unsigned long pgste_t; typedef unsigned long pte_t; typedef unsigned long pmd_t; typedef unsigned long pud_t; @@ -110,7 +108,6 @@ static __always_inline unsigned long name ## _val(name ## _t name) \ #endif /* STRICT_MM_TYPECHECKS */ DEFINE_PGVAL_FUNC(pgprot) -DEFINE_PGVAL_FUNC(pgste) DEFINE_PGVAL_FUNC(pte) DEFINE_PGVAL_FUNC(pmd) DEFINE_PGVAL_FUNC(pud) @@ -120,7 +117,6 @@ DEFINE_PGVAL_FUNC(pgd) typedef pte_t *pgtable_t; #define __pgprot(x) ((pgprot_t) { (x) } ) -#define __pgste(x) ((pgste_t) { (x) } ) #define __pte(x) ((pte_t) { (x) } ) #define __pmd(x) ((pmd_t) { (x) } ) #define __pud(x) ((pud_t) { (x) } ) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index a16e65072371..a5de9e61ea9e 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -27,10 +27,6 @@ unsigned long *page_table_alloc_noprof(struct mm_struct *); #define page_table_alloc(...) alloc_hooks(page_table_alloc_noprof(__VA_ARGS__)) void page_table_free(struct mm_struct *, unsigned long *); -struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm); -#define page_table_alloc_pgste(...) alloc_hooks(page_table_alloc_pgste_noprof(__VA_ARGS__)) -void page_table_free_pgste(struct ptdesc *ptdesc); - static inline void crst_table_init(unsigned long *crst, unsigned long entry) { memset64((u64 *)crst, entry, _CRST_ENTRIES); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index bca9b29778c3..1c3c3be93be9 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -413,28 +413,6 @@ void setup_protection_map(void); * SW-bits: y young, d dirty, r read, w write */ -/* Page status table bits for virtualization */ -#define PGSTE_ACC_BITS 0xf000000000000000UL -#define PGSTE_FP_BIT 0x0800000000000000UL -#define PGSTE_PCL_BIT 0x0080000000000000UL -#define PGSTE_HR_BIT 0x0040000000000000UL -#define PGSTE_HC_BIT 0x0020000000000000UL -#define PGSTE_GR_BIT 0x0004000000000000UL -#define PGSTE_GC_BIT 0x0002000000000000UL -#define PGSTE_ST2_MASK 0x0000ffff00000000UL -#define PGSTE_UC_BIT 0x0000000000008000UL /* user dirty (migration) */ -#define PGSTE_IN_BIT 0x0000000000004000UL /* IPTE notify bit */ -#define PGSTE_VSIE_BIT 0x0000000000002000UL /* ref'd in a shadow table */ - -/* Guest Page State used for virtualization */ -#define _PGSTE_GPS_ZERO 0x0000000080000000UL -#define _PGSTE_GPS_NODAT 0x0000000040000000UL -#define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL -#define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL -#define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL -#define _PGSTE_GPS_USAGE_POT_VOLATILE 0x0000000002000000UL -#define _PGSTE_GPS_USAGE_VOLATILE _PGSTE_GPS_USAGE_MASK - /* * A user page table pointer has the space-switch-event bit, the * private-space-control bit and the storage-alteration-event-control @@ -566,34 +544,15 @@ static inline bool mm_pmd_folded(struct mm_struct *mm) } #define mm_pmd_folded(mm) mm_pmd_folded(mm) -static inline int mm_has_pgste(struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - if (unlikely(mm->context.has_pgste)) - return 1; -#endif - return 0; -} - static inline int mm_is_protected(struct mm_struct *mm) { -#ifdef CONFIG_PGSTE +#if IS_ENABLED(CONFIG_KVM) if (unlikely(atomic_read(&mm->context.protected_count))) return 1; #endif return 0; } -static inline pgste_t clear_pgste_bit(pgste_t pgste, unsigned long mask) -{ - return __pgste(pgste_val(pgste) & ~mask); -} - -static inline pgste_t set_pgste_bit(pgste_t pgste, unsigned long mask) -{ - return __pgste(pgste_val(pgste) | mask); -} - static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) { return __pte(pte_val(pte) & ~pgprot_val(prot)); @@ -632,22 +591,13 @@ static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot) #define mm_forbids_zeropage mm_forbids_zeropage static inline int mm_forbids_zeropage(struct mm_struct *mm) { -#ifdef CONFIG_PGSTE +#if IS_ENABLED(CONFIG_KVM) if (!mm->context.allow_cow_sharing) return 1; #endif return 0; } -static inline int mm_uses_skeys(struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - if (mm->context.uses_skeys) - return 1; -#endif - return 0; -} - /** * cspg() - Compare and Swap and Purge (CSPG) * @ptr: Pointer to the value to be exchanged @@ -1136,6 +1086,13 @@ static inline pte_t pte_mkhuge(pte_t pte) } #endif +static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) +{ + asm volatile("sske %[skey],%[addr],1" + : [addr] "+a" (addr) : [skey] "d" (skey)); + return addr; +} + #define IPTE_GLOBAL 0 #define IPTE_LOCAL 1 @@ -1232,7 +1189,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); /* At this point the reference through the mapping is still present */ if (mm_is_protected(mm) && pte_present(res)) - uv_convert_from_secure_pte(res); + WARN_ON_ONCE(uv_convert_from_secure_pte(res)); return res; } @@ -1250,7 +1207,7 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); /* At this point the reference through the mapping is still present */ if (mm_is_protected(vma->vm_mm) && pte_present(res)) - uv_convert_from_secure_pte(res); + WARN_ON_ONCE(uv_convert_from_secure_pte(res)); return res; } @@ -1287,9 +1244,10 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, /* * If something went wrong and the page could not be destroyed, or * if this is not a mm teardown, the slower export is used as - * fallback instead. + * fallback instead. If even that fails, print a warning and leak + * the page, to avoid crashing the whole system. */ - uv_convert_from_secure_pte(res); + WARN_ON_ONCE(uv_convert_from_secure_pte(res)); return res; } @@ -1348,50 +1306,13 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, { if (pte_same(*ptep, entry)) return 0; - if (cpu_has_rdp() && !mm_has_pgste(vma->vm_mm) && pte_allow_rdp(*ptep, entry)) + if (cpu_has_rdp() && pte_allow_rdp(*ptep, entry)) ptep_reset_dat_prot(vma->vm_mm, addr, ptep, entry); else ptep_xchg_direct(vma->vm_mm, addr, ptep, entry); return 1; } -/* - * Additional functions to handle KVM guest page tables - */ -void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry); -void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void ptep_notify(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, unsigned long bits); -int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr, - pte_t *ptep, int prot, unsigned long bit); -void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, - pte_t *ptep , int reset); -void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, - pte_t *sptep, pte_t *tptep, pte_t pte); -void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); - -bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long address, - pte_t *ptep); -int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char key, bool nq); -int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char key, unsigned char *oldkey, - bool nq, bool mr, bool mc); -int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr); -int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char *key); - -int set_pgste_bits(struct mm_struct *mm, unsigned long addr, - unsigned long bits, unsigned long value); -int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep); -int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, - unsigned long *oldpte, unsigned long *oldpgste); -void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr); -void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr); -void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr); - #define pgprot_writecombine pgprot_writecombine pgprot_t pgprot_writecombine(pgprot_t prot); @@ -1406,23 +1327,12 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, { if (pte_present(entry)) entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED)); - if (mm_has_pgste(mm)) { - for (;;) { - ptep_set_pte_at(mm, addr, ptep, entry); - if (--nr == 0) - break; - ptep++; - entry = __pte(pte_val(entry) + PAGE_SIZE); - addr += PAGE_SIZE; - } - } else { - for (;;) { - set_pte(ptep, entry); - if (--nr == 0) - break; - ptep++; - entry = __pte(pte_val(entry) + PAGE_SIZE); - } + for (;;) { + set_pte(ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); } } #define set_ptes set_ptes @@ -2015,9 +1925,6 @@ extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t p extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot); extern void vmem_unmap_4k_page(unsigned long addr); extern pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc); -extern int s390_enable_sie(void); -extern int s390_enable_skey(void); -extern void s390_reset_cmma(struct mm_struct *mm); /* s390 has a private copy of get unmapped area to deal with cache synonyms */ #define HAVE_ARCH_UNMAPPED_AREA @@ -2026,40 +1933,4 @@ extern void s390_reset_cmma(struct mm_struct *mm); #define pmd_pgtable(pmd) \ ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) -static inline unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt) -{ - unsigned long *pgstes, res; - - pgstes = pgt + _PAGE_ENTRIES; - - res = (pgstes[0] & PGSTE_ST2_MASK) << 16; - res |= pgstes[1] & PGSTE_ST2_MASK; - res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16; - res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32; - - return res; -} - -static inline pgste_t pgste_get_lock(pte_t *ptep) -{ - unsigned long value = 0; -#ifdef CONFIG_PGSTE - unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); - - do { - value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); - } while (value & PGSTE_PCL_BIT); - value |= PGSTE_PCL_BIT; -#endif - return __pgste(value); -} - -static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - barrier(); - WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); -#endif -} - #endif /* _S390_PAGE_H */ diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 1e50f6f1ad9d..7354b42ee994 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -36,7 +36,6 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, #include <asm/tlbflush.h> #include <asm-generic/tlb.h> -#include <asm/gmap.h> /* * Release the page cache reference for a pte removed by @@ -85,8 +84,6 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_pmds = 1; - if (mm_has_pgste(tlb->mm)) - gmap_unlink(tlb->mm, (unsigned long *)pte, address); tlb_remove_ptdesc(tlb, virt_to_ptdesc(pte)); } diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index c5e02addcd67..dff035372601 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -471,65 +471,15 @@ do { \ #define arch_get_kernel_nofault __mvc_kernel_nofault #define arch_put_kernel_nofault __mvc_kernel_nofault -void __cmpxchg_user_key_called_with_bad_pointer(void); - -int __cmpxchg_user_key1(unsigned long address, unsigned char *uval, - unsigned char old, unsigned char new, unsigned long key); -int __cmpxchg_user_key2(unsigned long address, unsigned short *uval, - unsigned short old, unsigned short new, unsigned long key); -int __cmpxchg_user_key4(unsigned long address, unsigned int *uval, - unsigned int old, unsigned int new, unsigned long key); -int __cmpxchg_user_key8(unsigned long address, unsigned long *uval, - unsigned long old, unsigned long new, unsigned long key); -int __cmpxchg_user_key16(unsigned long address, __uint128_t *uval, - __uint128_t old, __uint128_t new, unsigned long key); - -static __always_inline int _cmpxchg_user_key(unsigned long address, void *uval, - __uint128_t old, __uint128_t new, - unsigned long key, int size) -{ - switch (size) { - case 1: return __cmpxchg_user_key1(address, uval, old, new, key); - case 2: return __cmpxchg_user_key2(address, uval, old, new, key); - case 4: return __cmpxchg_user_key4(address, uval, old, new, key); - case 8: return __cmpxchg_user_key8(address, uval, old, new, key); - case 16: return __cmpxchg_user_key16(address, uval, old, new, key); - default: __cmpxchg_user_key_called_with_bad_pointer(); - } - return 0; -} - -/** - * cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys - * @ptr: User space address of value to compare to @old and exchange with - * @new. Must be aligned to sizeof(*@ptr). - * @uval: Address where the old value of *@ptr is written to. - * @old: Old value. Compared to the content pointed to by @ptr in order to - * determine if the exchange occurs. The old value read from *@ptr is - * written to *@uval. - * @new: New value to place at *@ptr. - * @key: Access key to use for checking storage key protection. - * - * Perform a cmpxchg on a user space target, honoring storage key protection. - * @key alone determines how key checking is performed, neither - * storage-protection-override nor fetch-protection-override apply. - * The caller must compare *@uval and @old to determine if values have been - * exchanged. In case of an exception *@uval is set to zero. - * - * Return: 0: cmpxchg executed - * -EFAULT: an exception happened when trying to access *@ptr - * -EAGAIN: maxed out number of retries (byte and short only) - */ -#define cmpxchg_user_key(ptr, uval, old, new, key) \ -({ \ - __typeof__(ptr) __ptr = (ptr); \ - __typeof__(uval) __uval = (uval); \ - \ - BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \ - might_fault(); \ - __chk_user_ptr(__ptr); \ - _cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \ - (old), (new), (key), sizeof(*(__ptr))); \ -}) +int __cmpxchg_key1(void *address, unsigned char *uval, unsigned char old, + unsigned char new, unsigned long key); +int __cmpxchg_key2(void *address, unsigned short *uval, unsigned short old, + unsigned short new, unsigned long key); +int __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old, + unsigned int new, unsigned long key); +int __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old, + unsigned long new, unsigned long key); +int __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old, + __uint128_t new, unsigned long key); #endif /* __S390_UACCESS_H */ diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 8018549a1ad2..d919e69662f5 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -631,7 +631,8 @@ int uv_pin_shared(unsigned long paddr); int uv_destroy_folio(struct folio *folio); int uv_destroy_pte(pte_t pte); int uv_convert_from_secure_pte(pte_t pte); -int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb); +int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio); +int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb); int uv_convert_from_secure(unsigned long paddr); int uv_convert_from_secure_folio(struct folio *folio); diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index ed46950be86f..a284f98d9716 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -134,14 +134,15 @@ static int uv_destroy(unsigned long paddr) */ int uv_destroy_folio(struct folio *folio) { + unsigned long i; int rc; - /* Large folios cannot be secure */ - if (unlikely(folio_test_large(folio))) - return 0; - folio_get(folio); - rc = uv_destroy(folio_to_phys(folio)); + for (i = 0; i < (1 << folio_order(folio)); i++) { + rc = uv_destroy(folio_to_phys(folio) + i * PAGE_SIZE); + if (rc) + break; + } if (!rc) clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); @@ -183,14 +184,15 @@ EXPORT_SYMBOL_GPL(uv_convert_from_secure); */ int uv_convert_from_secure_folio(struct folio *folio) { + unsigned long i; int rc; - /* Large folios cannot be secure */ - if (unlikely(folio_test_large(folio))) - return 0; - folio_get(folio); - rc = uv_convert_from_secure(folio_to_phys(folio)); + for (i = 0; i < (1 << folio_order(folio)); i++) { + rc = uv_convert_from_secure(folio_to_phys(folio) + i * PAGE_SIZE); + if (rc) + break; + } if (!rc) clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); @@ -207,39 +209,6 @@ int uv_convert_from_secure_pte(pte_t pte) return uv_convert_from_secure_folio(pfn_folio(pte_pfn(pte))); } -/** - * should_export_before_import - Determine whether an export is needed - * before an import-like operation - * @uvcb: the Ultravisor control block of the UVC to be performed - * @mm: the mm of the process - * - * Returns whether an export is needed before every import-like operation. - * This is needed for shared pages, which don't trigger a secure storage - * exception when accessed from a different guest. - * - * Although considered as one, the Unpin Page UVC is not an actual import, - * so it is not affected. - * - * No export is needed also when there is only one protected VM, because the - * page cannot belong to the wrong VM in that case (there is no "other VM" - * it can belong to). - * - * Return: true if an export is needed before every import, otherwise false. - */ -static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) -{ - /* - * The misc feature indicates, among other things, that importing a - * shared page from a different protected VM will automatically also - * transfer its ownership. - */ - if (uv_has_feature(BIT_UV_FEAT_MISC)) - return false; - if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) - return false; - return atomic_read(&mm->context.protected_count) > 1; -} - /* * Calculate the expected ref_count for a folio that would otherwise have no * further pins. This was cribbed from similar functions in other places in @@ -279,7 +248,7 @@ static int expected_folio_refs(struct folio *folio) * (it's the same logic as split_folio()), and the folio must be * locked. */ -static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) +int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) { int expected, cc = 0; @@ -309,20 +278,7 @@ static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) return -EAGAIN; return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; } - -static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct uv_cb_header *uvcb) -{ - int rc; - - if (!folio_trylock(folio)) - return -EAGAIN; - if (should_export_before_import(uvcb, mm)) - uv_convert_from_secure(folio_to_phys(folio)); - rc = __make_folio_secure(folio, uvcb); - folio_unlock(folio); - - return rc; -} +EXPORT_SYMBOL(__make_folio_secure); /** * s390_wiggle_split_folio() - try to drain extra references to a folio and @@ -337,7 +293,7 @@ static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct u * but another attempt can be made; * -EINVAL in case of other folio splitting errors. See split_folio(). */ -static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio) +int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio) { int rc, tried_splits; @@ -409,56 +365,7 @@ static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio) } return -EAGAIN; } - -int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb) -{ - struct vm_area_struct *vma; - struct folio_walk fw; - struct folio *folio; - int rc; - - mmap_read_lock(mm); - vma = vma_lookup(mm, hva); - if (!vma) { - mmap_read_unlock(mm); - return -EFAULT; - } - folio = folio_walk_start(&fw, vma, hva, 0); - if (!folio) { - mmap_read_unlock(mm); - return -ENXIO; - } - - folio_get(folio); - /* - * Secure pages cannot be huge and userspace should not combine both. - * In case userspace does it anyway this will result in an -EFAULT for - * the unpack. The guest is thus never reaching secure mode. - * If userspace plays dirty tricks and decides to map huge pages at a - * later point in time, it will receive a segmentation fault or - * KVM_RUN will return -EFAULT. - */ - if (folio_test_hugetlb(folio)) - rc = -EFAULT; - else if (folio_test_large(folio)) - rc = -E2BIG; - else if (!pte_write(fw.pte) || (pte_val(fw.pte) & _PAGE_INVALID)) - rc = -ENXIO; - else - rc = make_folio_secure(mm, folio, uvcb); - folio_walk_end(&fw, vma); - mmap_read_unlock(mm); - - if (rc == -E2BIG || rc == -EBUSY) { - rc = s390_wiggle_split_folio(mm, folio); - if (!rc) - rc = -EAGAIN; - } - folio_put(folio); - - return rc; -} -EXPORT_SYMBOL_GPL(make_hva_secure); +EXPORT_SYMBOL_GPL(s390_wiggle_split_folio); /* * To be called with the folio locked or with an extra reference! This will @@ -470,21 +377,18 @@ int arch_make_folio_accessible(struct folio *folio) { int rc = 0; - /* Large folios cannot be secure */ - if (unlikely(folio_test_large(folio))) - return 0; - /* - * PG_arch_1 is used in 2 places: - * 1. for storage keys of hugetlb folios and KVM - * 2. As an indication that this small folio might be secure. This can - * overindicate, e.g. we set the bit before calling - * convert_to_secure. - * As secure pages are never large folios, both variants can co-exists. + * PG_arch_1 is used as an indication that this small folio might be + * secure. This can overindicate, e.g. we set the bit before calling + * convert_to_secure. */ if (!test_bit(PG_arch_1, &folio->flags.f)) return 0; + /* Large folios cannot be secure. */ + if (WARN_ON_ONCE(folio_test_large(folio))) + return -EFAULT; + rc = uv_pin_shared(folio_to_phys(folio)); if (!rc) { clear_bit(PG_arch_1, &folio->flags.f); diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index f4ec8c1ce214..917ac740513e 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -30,6 +30,8 @@ config KVM select KVM_VFIO select MMU_NOTIFIER select VIRT_XFER_TO_GUEST_WORK + select KVM_GENERIC_MMU_NOTIFIER + select KVM_MMU_LOCKLESS_AGING help Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 9a723c48b05a..dac9d53b23d8 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -8,7 +8,8 @@ include $(srctree)/virt/kvm/Makefile.kvm ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o +kvm-y += dat.o gmap.o faultin.o kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c new file mode 100644 index 000000000000..129dc55a4a0d --- /dev/null +++ b/arch/s390/kvm/dat.c @@ -0,0 +1,1391 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2020, 2024 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + * Martin Schwidefsky <schwidefsky@de.ibm.com> + * David Hildenbrand <david@redhat.com> + * Janosch Frank <frankja@linux.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/pagewalk.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/swapops.h> +#include <linux/ksm.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/pgtable.h> +#include <linux/kvm_types.h> +#include <linux/kvm_host.h> +#include <linux/pgalloc.h> + +#include <asm/page-states.h> +#include <asm/tlb.h> +#include "dat.h" + +int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc) +{ + void *o; + + for ( ; mc->n_crsts < KVM_S390_MMU_CACHE_N_CRSTS; mc->n_crsts++) { + o = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER); + if (!o) + return -ENOMEM; + mc->crsts[mc->n_crsts] = o; + } + for ( ; mc->n_pts < KVM_S390_MMU_CACHE_N_PTS; mc->n_pts++) { + o = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + if (!o) + return -ENOMEM; + mc->pts[mc->n_pts] = o; + } + for ( ; mc->n_rmaps < KVM_S390_MMU_CACHE_N_RMAPS; mc->n_rmaps++) { + o = kzalloc(sizeof(*mc->rmaps[0]), GFP_KERNEL_ACCOUNT); + if (!o) + return -ENOMEM; + mc->rmaps[mc->n_rmaps] = o; + } + return 0; +} + +static inline struct page_table *dat_alloc_pt_noinit(struct kvm_s390_mmu_cache *mc) +{ + struct page_table *res; + + res = kvm_s390_mmu_cache_alloc_pt(mc); + if (res) + __arch_set_page_dat(res, 1); + return res; +} + +static inline struct crst_table *dat_alloc_crst_noinit(struct kvm_s390_mmu_cache *mc) +{ + struct crst_table *res; + + res = kvm_s390_mmu_cache_alloc_crst(mc); + if (res) + __arch_set_page_dat(res, 1UL << CRST_ALLOC_ORDER); + return res; +} + +struct crst_table *dat_alloc_crst_sleepable(unsigned long init) +{ + struct page *page; + void *virt; + + page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_COMP, CRST_ALLOC_ORDER); + if (!page) + return NULL; + virt = page_to_virt(page); + __arch_set_page_dat(virt, 1UL << CRST_ALLOC_ORDER); + crst_table_init(virt, init); + return virt; +} + +void dat_free_level(struct crst_table *table, bool owns_ptes) +{ + unsigned int i; + + for (i = 0; i < _CRST_ENTRIES; i++) { + if (table->crstes[i].h.fc || table->crstes[i].h.i) + continue; + if (!is_pmd(table->crstes[i])) + dat_free_level(dereference_crste(table->crstes[i]), owns_ptes); + else if (owns_ptes) + dat_free_pt(dereference_pmd(table->crstes[i].pmd)); + } + dat_free_crst(table); +} + +int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype) +{ + struct crst_table *table; + union crste crste; + + while (asce->dt > newtype) { + table = dereference_asce(*asce); + crste = table->crstes[0]; + if (crste.h.fc) + return 0; + if (!crste.h.i) { + asce->rsto = crste.h.fc0.to; + dat_free_crst(table); + } else { + crste.h.tt--; + crst_table_init((void *)table, crste.val); + } + asce->dt--; + } + while (asce->dt < newtype) { + crste = _crste_fc0(asce->rsto, asce->dt + 1); + table = dat_alloc_crst_noinit(mc); + if (!table) + return -ENOMEM; + crst_table_init((void *)table, _CRSTE_HOLE(crste.h.tt).val); + table->crstes[0] = crste; + asce->rsto = __pa(table) >> PAGE_SHIFT; + asce->dt++; + } + return 0; +} + +/** + * dat_crstep_xchg() - Exchange a gmap CRSTE with another. + * @crstep: Pointer to the CRST entry + * @new: Replacement entry. + * @gfn: The affected guest address. + * @asce: The ASCE of the address space. + * + * Context: This function is assumed to be called with kvm->mmu_lock held. + */ +void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce) +{ + if (crstep->h.i) { + WRITE_ONCE(*crstep, new); + return; + } else if (cpu_has_edat2()) { + crdte_crste(crstep, *crstep, new, gfn, asce); + return; + } + + if (machine_has_tlb_guest()) + idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL); + else + idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL); + WRITE_ONCE(*crstep, new); +} + +/** + * dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another. + * @crstep: Pointer to the CRST entry. + * @old: Expected old value. + * @new: Replacement entry. + * @gfn: The affected guest address. + * @asce: The asce of the address space. + * + * This function is needed to atomically exchange a CRSTE that potentially + * maps a prefix area, without having to invalidate it inbetween. + * + * Context: This function is assumed to be called with kvm->mmu_lock held. + * + * Return: %true if the exchange was successful. + */ +bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn, + union asce asce) +{ + if (old.h.i) + return arch_try_cmpxchg((long *)crstep, &old.val, new.val); + if (cpu_has_edat2()) + return crdte_crste(crstep, old, new, gfn, asce); + return cspg_crste(crstep, old, new); +} + +static void dat_set_storage_key_from_pgste(union pte pte, union pgste pgste) +{ + union skey nkey = { .acc = pgste.acc, .fp = pgste.fp }; + + page_set_storage_key(pte_origin(pte), nkey.skey, 0); +} + +static void dat_move_storage_key(union pte old, union pte new) +{ + page_set_storage_key(pte_origin(new), page_get_storage_key(pte_origin(old)), 1); +} + +static union pgste dat_save_storage_key_into_pgste(union pte pte, union pgste pgste) +{ + union skey skey; + + skey.skey = page_get_storage_key(pte_origin(pte)); + + pgste.acc = skey.acc; + pgste.fp = skey.fp; + pgste.gr |= skey.r; + pgste.gc |= skey.c; + + return pgste; +} + +union pgste __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, gfn_t gfn, + union asce asce, bool uses_skeys) +{ + union pte old = READ_ONCE(*ptep); + + /* Updating only the software bits while holding the pgste lock. */ + if (!((ptep->val ^ new.val) & ~_PAGE_SW_BITS)) { + WRITE_ONCE(ptep->swbyte, new.swbyte); + return pgste; + } + + if (!old.h.i) { + unsigned long opts = IPTE_GUEST_ASCE | (pgste.nodat ? IPTE_NODAT : 0); + + if (machine_has_tlb_guest()) + __ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, opts, asce.val, IPTE_GLOBAL); + else + __ptep_ipte(gfn_to_gpa(gfn), (void *)ptep, 0, 0, IPTE_GLOBAL); + } + + if (uses_skeys) { + if (old.h.i && !new.h.i) + /* Invalid to valid: restore storage keys from PGSTE. */ + dat_set_storage_key_from_pgste(new, pgste); + else if (!old.h.i && new.h.i) + /* Valid to invalid: save storage keys to PGSTE. */ + pgste = dat_save_storage_key_into_pgste(old, pgste); + else if (!old.h.i && !new.h.i) + /* Valid to valid: move storage keys. */ + if (old.h.pfra != new.h.pfra) + dat_move_storage_key(old, new); + /* Invalid to invalid: nothing to do. */ + } + + WRITE_ONCE(*ptep, new); + return pgste; +} + +/* + * dat_split_ste() - Split a segment table entry into page table entries. + * + * Context: This function is assumed to be called with kvm->mmu_lock held. + * + * Return: 0 in case of success, -ENOMEM if running out of memory. + */ +static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t gfn, + union asce asce, bool uses_skeys) +{ + union pgste pgste_init; + struct page_table *pt; + union pmd new, old; + union pte init; + int i; + + BUG_ON(!mc); + old = READ_ONCE(*pmdp); + + /* Already split, nothing to do. */ + if (!old.h.i && !old.h.fc) + return 0; + + pt = dat_alloc_pt_noinit(mc); + if (!pt) + return -ENOMEM; + new.val = virt_to_phys(pt); + + while (old.h.i || old.h.fc) { + init.val = pmd_origin_large(old); + init.h.p = old.h.p; + init.h.i = old.h.i; + init.s.d = old.s.fc1.d; + init.s.w = old.s.fc1.w; + init.s.y = old.s.fc1.y; + init.s.sd = old.s.fc1.sd; + init.s.pr = old.s.fc1.pr; + pgste_init.val = 0; + if (old.h.fc) { + for (i = 0; i < _PAGE_ENTRIES; i++) + pt->ptes[i].val = init.val | i * PAGE_SIZE; + /* No need to take locks as the page table is not installed yet. */ + pgste_init.prefix_notif = old.s.fc1.prefix_notif; + pgste_init.pcl = uses_skeys && init.h.i; + dat_init_pgstes(pt, pgste_init.val); + } else { + dat_init_page_table(pt, init.val, 0); + } + + if (dat_pmdp_xchg_atomic(pmdp, old, new, gfn, asce)) { + if (!pgste_init.pcl) + return 0; + for (i = 0; i < _PAGE_ENTRIES; i++) { + union pgste pgste = pt->pgstes[i]; + + pgste = dat_save_storage_key_into_pgste(pt->ptes[i], pgste); + pgste_set_unlock(pt->ptes + i, pgste); + } + return 0; + } + old = READ_ONCE(*pmdp); + } + + dat_free_pt(pt); + return 0; +} + +/* + * dat_split_crste() - Split a crste into smaller crstes. + * + * Context: This function is assumed to be called with kvm->mmu_lock held. + * + * Return: %0 in case of success, %-ENOMEM if running out of memory. + */ +static int dat_split_crste(struct kvm_s390_mmu_cache *mc, union crste *crstep, + gfn_t gfn, union asce asce, bool uses_skeys) +{ + struct crst_table *table; + union crste old, new, init; + int i; + + old = READ_ONCE(*crstep); + if (is_pmd(old)) + return dat_split_ste(mc, &crstep->pmd, gfn, asce, uses_skeys); + + BUG_ON(!mc); + + /* Already split, nothing to do. */ + if (!old.h.i && !old.h.fc) + return 0; + + table = dat_alloc_crst_noinit(mc); + if (!table) + return -ENOMEM; + + new.val = virt_to_phys(table); + new.h.tt = old.h.tt; + new.h.fc0.tl = _REGION_ENTRY_LENGTH; + + while (old.h.i || old.h.fc) { + init = old; + init.h.tt--; + if (old.h.fc) { + for (i = 0; i < _CRST_ENTRIES; i++) + table->crstes[i].val = init.val | i * HPAGE_SIZE; + } else { + crst_table_init((void *)table, init.val); + } + if (dat_crstep_xchg_atomic(crstep, old, new, gfn, asce)) + return 0; + old = READ_ONCE(*crstep); + } + + dat_free_crst(table); + return 0; +} + +/** + * dat_entry_walk() - Walk the gmap page tables. + * @mc: Cache to use to allocate dat tables, if needed; can be NULL if neither + * %DAT_WALK_SPLIT or %DAT_WALK_ALLOC is specified in @flags. + * @gfn: Guest frame. + * @asce: The ASCE of the address space. + * @flags: Flags from WALK_* macros. + * @walk_level: Level to walk to, from LEVEL_* macros. + * @last: Will be filled the last visited non-pte DAT entry. + * @ptepp: Will be filled the last visited pte entry, if any, otherwise NULL. + * + * Returns a table entry pointer for the given guest address and @walk_level. + * + * The @flags have the following meanings: + * * %DAT_WALK_IGN_HOLES: consider holes as normal table entries + * * %DAT_WALK_ALLOC: allocate new tables to reach the requested level, if needed + * * %DAT_WALK_SPLIT: split existing large pages to reach the requested level, if needed + * * %DAT_WALK_LEAF: return successfully whenever a large page is encountered + * * %DAT_WALK_ANY: return successfully even if the requested level could not be reached + * * %DAT_WALK_CONTINUE: walk to the requested level with the specified flags, and then try to + * continue walking to ptes with only DAT_WALK_ANY + * * %DAT_WALK_USES_SKEYS: storage keys are in use + * + * Context: called with kvm->mmu_lock held. + * + * Return: + * * %PGM_ADDRESSING if the requested address lies outside memory + * * a PIC number if the requested address lies in a memory hole of type _DAT_TOKEN_PIC + * * %-EFAULT if the requested address lies inside a memory hole of a different type + * * %-EINVAL if the given ASCE is not compatible with the requested level + * * %-EFBIG if the requested level could not be reached because a larger frame was found + * * %-ENOENT if the requested level could not be reached for other reasons + * * %-ENOMEM if running out of memory while allocating or splitting a table + */ +int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags, + int walk_level, union crste **last, union pte **ptepp) +{ + union vaddress vaddr = { .addr = gfn_to_gpa(gfn) }; + bool continue_anyway = flags & DAT_WALK_CONTINUE; + bool uses_skeys = flags & DAT_WALK_USES_SKEYS; + bool ign_holes = flags & DAT_WALK_IGN_HOLES; + bool allocate = flags & DAT_WALK_ALLOC; + bool split = flags & DAT_WALK_SPLIT; + bool leaf = flags & DAT_WALK_LEAF; + bool any = flags & DAT_WALK_ANY; + struct page_table *pgtable; + struct crst_table *table; + union crste entry; + int rc; + + *last = NULL; + *ptepp = NULL; + if (WARN_ON_ONCE(unlikely(!asce.val))) + return -EINVAL; + if (WARN_ON_ONCE(unlikely(walk_level > asce.dt))) + return -EINVAL; + if (!asce_contains_gfn(asce, gfn)) + return PGM_ADDRESSING; + + table = dereference_asce(asce); + if (asce.dt >= ASCE_TYPE_REGION1) { + *last = table->crstes + vaddr.rfx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION1)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (walk_level == TABLE_TYPE_REGION1) + return 0; + if (entry.pgd.h.i) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + table = dereference_crste(entry.pgd); + } + + if (asce.dt >= ASCE_TYPE_REGION2) { + *last = table->crstes + vaddr.rsx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION2)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (walk_level == TABLE_TYPE_REGION2) + return 0; + if (entry.p4d.h.i) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + table = dereference_crste(entry.p4d); + } + + if (asce.dt >= ASCE_TYPE_REGION3) { + *last = table->crstes + vaddr.rtx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_REGION3)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (walk_level == TABLE_TYPE_REGION3 && + continue_anyway && !entry.pud.h.fc && !entry.h.i) { + walk_level = TABLE_TYPE_PAGE_TABLE; + allocate = false; + } + if (walk_level == TABLE_TYPE_REGION3 || ((leaf || any) && entry.pud.h.fc)) + return 0; + if (entry.pud.h.i && !entry.pud.h.fc) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + if (walk_level <= TABLE_TYPE_SEGMENT && entry.pud.h.fc) { + if (!split) + return -EFBIG; + rc = dat_split_crste(mc, *last, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + table = dereference_crste(entry.pud); + } + + *last = table->crstes + vaddr.sx; + entry = READ_ONCE(**last); + if (WARN_ON_ONCE(entry.h.tt != TABLE_TYPE_SEGMENT)) + return -EINVAL; + if (crste_hole(entry) && !ign_holes) + return entry.tok.type == _DAT_TOKEN_PIC ? entry.tok.par : -EFAULT; + if (continue_anyway && !entry.pmd.h.fc && !entry.h.i) { + walk_level = TABLE_TYPE_PAGE_TABLE; + allocate = false; + } + if (walk_level == TABLE_TYPE_SEGMENT || ((leaf || any) && entry.pmd.h.fc)) + return 0; + + if (entry.pmd.h.i && !entry.pmd.h.fc) { + if (!allocate) + return any ? 0 : -ENOENT; + rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + if (walk_level <= TABLE_TYPE_PAGE_TABLE && entry.pmd.h.fc) { + if (!split) + return -EFBIG; + rc = dat_split_ste(mc, &(*last)->pmd, gfn, asce, uses_skeys); + if (rc) + return rc; + entry = READ_ONCE(**last); + } + pgtable = dereference_pmd(entry.pmd); + *ptepp = pgtable->ptes + vaddr.px; + if (pte_hole(**ptepp) && !ign_holes) + return (*ptepp)->tok.type == _DAT_TOKEN_PIC ? (*ptepp)->tok.par : -EFAULT; + return 0; +} + +static long dat_pte_walk_range(gfn_t gfn, gfn_t end, struct page_table *table, struct dat_walk *w) +{ + unsigned int idx = gfn & (_PAGE_ENTRIES - 1); + long rc = 0; + + for ( ; gfn < end; idx++, gfn++) { + if (pte_hole(READ_ONCE(table->ptes[idx]))) { + if (!(w->flags & DAT_WALK_IGN_HOLES)) + return -EFAULT; + if (!(w->flags & DAT_WALK_ANY)) + continue; + } + + rc = w->ops->pte_entry(table->ptes + idx, gfn, gfn + 1, w); + if (rc) + break; + } + return rc; +} + +static long dat_crste_walk_range(gfn_t start, gfn_t end, struct crst_table *table, + struct dat_walk *walk) +{ + unsigned long idx, cur_shift, cur_size; + dat_walk_op the_op; + union crste crste; + gfn_t cur, next; + long rc = 0; + + cur_shift = 8 + table->crstes[0].h.tt * 11; + idx = (start >> cur_shift) & (_CRST_ENTRIES - 1); + cur_size = 1UL << cur_shift; + + for (cur = ALIGN_DOWN(start, cur_size); cur < end; idx++, cur = next) { + next = cur + cur_size; + walk->last = table->crstes + idx; + crste = READ_ONCE(*walk->last); + + if (crste_hole(crste)) { + if (!(walk->flags & DAT_WALK_IGN_HOLES)) + return -EFAULT; + if (!(walk->flags & DAT_WALK_ANY)) + continue; + } + + the_op = walk->ops->crste_ops[crste.h.tt]; + if (the_op) { + rc = the_op(walk->last, cur, next, walk); + crste = READ_ONCE(*walk->last); + } + if (rc) + break; + if (!crste.h.i && !crste.h.fc) { + if (!is_pmd(crste)) + rc = dat_crste_walk_range(max(start, cur), min(end, next), + _dereference_crste(crste), walk); + else if (walk->ops->pte_entry) + rc = dat_pte_walk_range(max(start, cur), min(end, next), + dereference_pmd(crste.pmd), walk); + } + } + return rc; +} + +/** + * _dat_walk_gfn_range() - Walk DAT tables. + * @start: The first guest page frame to walk. + * @end: The guest page frame immediately after the last one to walk. + * @asce: The ASCE of the guest mapping. + * @ops: The gmap_walk_ops that will be used to perform the walk. + * @flags: Flags from WALK_* (currently only WALK_IGN_HOLES is supported). + * @priv: Will be passed as-is to the callbacks. + * + * Any callback returning non-zero causes the walk to stop immediately. + * + * Return: %-EINVAL in case of error, %-EFAULT if @start is too high for the + * given ASCE unless the DAT_WALK_IGN_HOLES flag is specified, + * otherwise it returns whatever the callbacks return. + */ +long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce, + const struct dat_walk_ops *ops, int flags, void *priv) +{ + struct crst_table *table = dereference_asce(asce); + struct dat_walk walk = { + .ops = ops, + .asce = asce, + .priv = priv, + .flags = flags, + .start = start, + .end = end, + }; + + if (WARN_ON_ONCE(unlikely(!asce.val))) + return -EINVAL; + if (!asce_contains_gfn(asce, start)) + return (flags & DAT_WALK_IGN_HOLES) ? 0 : -EFAULT; + + return dat_crste_walk_range(start, min(end, asce_end(asce)), table, &walk); +} + +int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + int rc; + + skey->skey = 0; + rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) { + union crste crste; + + crste = READ_ONCE(*crstep); + if (!crste.h.fc || !crste.s.fc1.pr) + return 0; + skey->skey = page_get_storage_key(large_crste_to_phys(crste, gfn)); + return 0; + } + pgste = pgste_get_lock(ptep); + if (ptep->h.i) { + skey->acc = pgste.acc; + skey->fp = pgste.fp; + } else { + skey->skey = page_get_storage_key(pte_origin(*ptep)); + } + skey->r |= pgste.gr; + skey->c |= pgste.gc; + pgste_set_unlock(ptep, pgste); + return 0; +} + +static void dat_update_ptep_sd(union pgste old, union pgste pgste, union pte *ptep) +{ + if (pgste.acc != old.acc || pgste.fp != old.fp || pgste.gr != old.gr || pgste.gc != old.gc) + __atomic64_or(_PAGE_SD, &ptep->val); +} + +int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + union skey skey, bool nq) +{ + union pgste pgste, old; + union crste *crstep; + union pte *ptep; + int rc; + + rc = dat_entry_walk(mc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE, + &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) { + page_set_storage_key(large_crste_to_phys(*crstep, gfn), skey.skey, !nq); + return 0; + } + + old = pgste_get_lock(ptep); + pgste = old; + + pgste.acc = skey.acc; + pgste.fp = skey.fp; + pgste.gc = skey.c; + pgste.gr = skey.r; + + if (!ptep->h.i) { + union skey old_skey; + + old_skey.skey = page_get_storage_key(pte_origin(*ptep)); + pgste.hc |= old_skey.c; + pgste.hr |= old_skey.r; + old_skey.c = old.gc; + old_skey.r = old.gr; + skey.r = 0; + skey.c = 0; + page_set_storage_key(pte_origin(*ptep), skey.skey, !nq); + } + + dat_update_ptep_sd(old, pgste, ptep); + pgste_set_unlock(ptep, pgste); + return 0; +} + +static bool page_cond_set_storage_key(phys_addr_t paddr, union skey skey, union skey *oldkey, + bool nq, bool mr, bool mc) +{ + oldkey->skey = page_get_storage_key(paddr); + if (oldkey->acc == skey.acc && oldkey->fp == skey.fp && + (oldkey->r == skey.r || mr) && (oldkey->c == skey.c || mc)) + return false; + page_set_storage_key(paddr, skey.skey, !nq); + return true; +} + +int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn, + union skey skey, union skey *oldkey, bool nq, bool mr, bool mc) +{ + union pgste pgste, old; + union crste *crstep; + union skey prev; + union pte *ptep; + int rc; + + rc = dat_entry_walk(mmc, gfn, asce, DAT_WALK_LEAF_ALLOC, TABLE_TYPE_PAGE_TABLE, + &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) + return page_cond_set_storage_key(large_crste_to_phys(*crstep, gfn), skey, oldkey, + nq, mr, mc); + + old = pgste_get_lock(ptep); + pgste = old; + + rc = 1; + pgste.acc = skey.acc; + pgste.fp = skey.fp; + pgste.gc = skey.c; + pgste.gr = skey.r; + + if (!ptep->h.i) { + rc = page_cond_set_storage_key(pte_origin(*ptep), skey, &prev, nq, mr, mc); + pgste.hc |= prev.c; + pgste.hr |= prev.r; + prev.c |= old.gc; + prev.r |= old.gr; + } else { + prev.acc = old.acc; + prev.fp = old.fp; + prev.c = old.gc; + prev.r = old.gr; + } + if (oldkey) + *oldkey = prev; + + dat_update_ptep_sd(old, pgste, ptep); + pgste_set_unlock(ptep, pgste); + return rc; +} + +int dat_reset_reference_bit(union asce asce, gfn_t gfn) +{ + union pgste pgste, old; + union crste *crstep; + union pte *ptep; + int rc; + + rc = dat_entry_walk(NULL, gfn, asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + if (rc) + return rc; + + if (!ptep) { + union crste crste = READ_ONCE(*crstep); + + if (!crste.h.fc || !crste.s.fc1.pr) + return 0; + return page_reset_referenced(large_crste_to_phys(*crstep, gfn)); + } + old = pgste_get_lock(ptep); + pgste = old; + + if (!ptep->h.i) { + rc = page_reset_referenced(pte_origin(*ptep)); + pgste.hr = rc >> 1; + } + rc |= (pgste.gr << 1) | pgste.gc; + pgste.gr = 0; + + dat_update_ptep_sd(old, pgste, ptep); + pgste_set_unlock(ptep, pgste); + return rc; +} + +static long dat_reset_skeys_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste.acc = 0; + pgste.fp = 0; + pgste.gr = 0; + pgste.gc = 0; + if (ptep->s.pr) + page_set_storage_key(pte_origin(*ptep), PAGE_DEFAULT_KEY, 1); + pgste_set_unlock(ptep, pgste); + + if (need_resched()) + return next; + return 0; +} + +static long dat_reset_skeys_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + phys_addr_t addr, end, origin = crste_origin_large(*crstep); + + if (!crstep->h.fc || !crstep->s.fc1.pr) + return 0; + + addr = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin; + end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin; + while (ALIGN(addr + 1, _SEGMENT_SIZE) <= end) + addr = sske_frame(addr, PAGE_DEFAULT_KEY); + for ( ; addr < end; addr += PAGE_SIZE) + page_set_storage_key(addr, PAGE_DEFAULT_KEY, 1); + + if (need_resched()) + return next; + return 0; +} + +long dat_reset_skeys(union asce asce, gfn_t start) +{ + const struct dat_walk_ops ops = { + .pte_entry = dat_reset_skeys_pte, + .pmd_entry = dat_reset_skeys_crste, + .pud_entry = dat_reset_skeys_crste, + }; + + return _dat_walk_gfn_range(start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, NULL); +} + +struct slot_priv { + unsigned long token; + struct kvm_s390_mmu_cache *mc; +}; + +static long _dat_slot_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct slot_priv *p = walk->priv; + union crste dummy = { .val = p->token }; + union pte new_pte, pte = READ_ONCE(*ptep); + + new_pte = _PTE_TOK(dummy.tok.type, dummy.tok.par); + + /* Table entry already in the desired state. */ + if (pte.val == new_pte.val) + return 0; + + dat_ptep_xchg(ptep, new_pte, gfn, walk->asce, false); + return 0; +} + +static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union crste new_crste, crste = READ_ONCE(*crstep); + struct slot_priv *p = walk->priv; + + new_crste.val = p->token; + new_crste.h.tt = crste.h.tt; + + /* Table entry already in the desired state. */ + if (crste.val == new_crste.val) + return 0; + + /* This table entry needs to be updated. */ + if (walk->start <= gfn && walk->end >= next) { + dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce); + /* A lower level table was present, needs to be freed. */ + if (!crste.h.fc && !crste.h.i) { + if (is_pmd(crste)) + dat_free_pt(dereference_pmd(crste.pmd)); + else + dat_free_level(dereference_crste(crste), true); + } + return 0; + } + + /* A lower level table is present, things will handled there. */ + if (!crste.h.fc && !crste.h.i) + return 0; + /* Split (install a lower level table), and handle things there. */ + return dat_split_crste(p->mc, crstep, gfn, walk->asce, false); +} + +static const struct dat_walk_ops dat_slot_ops = { + .pte_entry = _dat_slot_pte, + .crste_ops = { _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, }, +}; + +int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end, + u16 type, u16 param) +{ + struct slot_priv priv = { + .token = _CRSTE_TOK(0, type, param).val, + .mc = mc, + }; + + return _dat_walk_gfn_range(start, end, asce, &dat_slot_ops, + DAT_WALK_IGN_HOLES | DAT_WALK_ANY, &priv); +} + +static void pgste_set_unlock_multiple(union pte *first, int n, union pgste *pgstes) +{ + int i; + + for (i = 0; i < n; i++) { + if (!pgstes[i].pcl) + break; + pgste_set_unlock(first + i, pgstes[i]); + } +} + +static bool pgste_get_trylock_multiple(union pte *first, int n, union pgste *pgstes) +{ + int i; + + for (i = 0; i < n; i++) { + if (!pgste_get_trylock(first + i, pgstes + i)) + break; + } + if (i == n) + return true; + pgste_set_unlock_multiple(first, n, pgstes); + return false; +} + +unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param) +{ + union pgste pgstes[4] = {}; + unsigned long res = 0; + int i, n; + + n = param.len + 1; + + while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes)) + cpu_relax(); + + for (i = 0; i < n; i++) + res = res << 16 | pgstes[i].val16; + + pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes); + return res; +} + +void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val) +{ + union pgste pgstes[4] = {}; + int i, n; + + n = param.len + 1; + + while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes)) + cpu_relax(); + + for (i = param.len; i >= 0; i--) { + pgstes[i].val16 = val; + val = val >> 16; + } + + pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes); +} + +static long _dat_test_young_pte(union pte *ptep, gfn_t start, gfn_t end, struct dat_walk *walk) +{ + return ptep->s.y; +} + +static long _dat_test_young_crste(union crste *crstep, gfn_t start, gfn_t end, + struct dat_walk *walk) +{ + return crstep->h.fc && crstep->s.fc1.y; +} + +static const struct dat_walk_ops test_age_ops = { + .pte_entry = _dat_test_young_pte, + .pmd_entry = _dat_test_young_crste, + .pud_entry = _dat_test_young_crste, +}; + +/** + * dat_test_age_gfn() - Test young. + * @asce: The ASCE whose address range is to be tested. + * @start: The first guest frame of the range to check. + * @end: The guest frame after the last in the range. + * + * Context: called by KVM common code with the kvm mmu write lock held. + * + * Return: %true if any page in the given range is young, otherwise %false. + */ +bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end) +{ + return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0; +} + +int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level, + bool uses_skeys, struct guest_fault *f) +{ + union crste oldval, newval; + union pte newpte, oldpte; + union pgste pgste; + int rc = 0; + + rc = dat_entry_walk(mc, f->gfn, asce, DAT_WALK_ALLOC_CONTINUE, level, &f->crstep, &f->ptep); + if (rc == -EINVAL || rc == -ENOMEM) + return rc; + if (rc) + return -EAGAIN; + + if (WARN_ON_ONCE(unlikely(get_level(f->crstep, f->ptep) > level))) + return -EINVAL; + + if (f->ptep) { + pgste = pgste_get_lock(f->ptep); + oldpte = *f->ptep; + newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page); + newpte.s.sd = oldpte.s.sd; + oldpte.s.sd = 0; + if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) { + pgste = __dat_ptep_xchg(f->ptep, pgste, newpte, f->gfn, asce, uses_skeys); + if (f->callback) + f->callback(f); + } else { + rc = -EAGAIN; + } + pgste_set_unlock(f->ptep, pgste); + } else { + oldval = READ_ONCE(*f->crstep); + newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable, + f->write_attempt | oldval.s.fc1.d); + newval.s.fc1.sd = oldval.s.fc1.sd; + if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val && + crste_origin_large(oldval) != crste_origin_large(newval)) + return -EAGAIN; + if (!dat_crstep_xchg_atomic(f->crstep, oldval, newval, f->gfn, asce)) + return -EAGAIN; + if (f->callback) + f->callback(f); + } + + return rc; +} + +static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union crste crste = READ_ONCE(*crstep); + int *n = walk->priv; + + if (!crste.h.fc || crste.h.i || crste.h.p) + return 0; + + *n = 2; + if (crste.s.fc1.prefix_notif) + return 0; + crste.s.fc1.prefix_notif = 1; + dat_crstep_xchg(crstep, crste, gfn, walk->asce); + return 0; +} + +static long dat_set_pn_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + int *n = walk->priv; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + if (!ptep->h.i && !ptep->h.p) { + pgste.prefix_notif = 1; + *n += 1; + } + pgste_set_unlock(ptep, pgste); + return 0; +} + +int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn) +{ + static const struct dat_walk_ops ops = { + .pte_entry = dat_set_pn_pte, + .pmd_entry = dat_set_pn_crste, + .pud_entry = dat_set_pn_crste, + }; + + int n = 0; + + _dat_walk_gfn_range(gfn, gfn + 2, asce, &ops, DAT_WALK_IGN_HOLES, &n); + if (n != 2) + return -EAGAIN; + return 0; +} + +/** + * dat_perform_essa() - Perform ESSA actions on the PGSTE. + * @asce: The asce to operate on. + * @gfn: The guest page frame to operate on. + * @orc: The specific action to perform, see the ESSA_SET_* macros. + * @state: The storage attributes to be returned to the guest. + * @dirty: Returns whether the function dirtied a previously clean entry. + * + * Context: Called with kvm->mmu_lock held. + * + * Return: + * * %1 if the page state has been altered and the page is to be added to the CBRL + * * %0 if the page state has been altered, but the page is not to be added to the CBRL + * * %-1 if the page state has not been altered and the page is not to be added to the CBRL + */ +int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + int res = 0; + + if (dat_entry_walk(NULL, gfn, asce, 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) { + *state = (union essa_state) { .exception = 1 }; + return -1; + } + + pgste = pgste_get_lock(ptep); + + *state = (union essa_state) { + .content = (ptep->h.i << 1) + (ptep->h.i && pgste.zero), + .nodat = pgste.nodat, + .usage = pgste.usage, + }; + + switch (orc) { + case ESSA_GET_STATE: + res = -1; + break; + case ESSA_SET_STABLE: + pgste.usage = PGSTE_GPS_USAGE_STABLE; + pgste.nodat = 0; + break; + case ESSA_SET_UNUSED: + pgste.usage = PGSTE_GPS_USAGE_UNUSED; + if (ptep->h.i) + res = 1; + break; + case ESSA_SET_VOLATILE: + pgste.usage = PGSTE_GPS_USAGE_VOLATILE; + if (ptep->h.i) + res = 1; + break; + case ESSA_SET_POT_VOLATILE: + if (!ptep->h.i) { + pgste.usage = PGSTE_GPS_USAGE_POT_VOLATILE; + } else if (pgste.zero) { + pgste.usage = PGSTE_GPS_USAGE_VOLATILE; + } else if (!pgste.gc) { + pgste.usage = PGSTE_GPS_USAGE_VOLATILE; + res = 1; + } + break; + case ESSA_SET_STABLE_RESIDENT: + pgste.usage = PGSTE_GPS_USAGE_STABLE; + /* + * Since the resident state can go away any time after this + * call, we will not make this page resident. We can revisit + * this decision if a guest will ever start using this. + */ + break; + case ESSA_SET_STABLE_IF_RESIDENT: + if (!ptep->h.i) + pgste.usage = PGSTE_GPS_USAGE_STABLE; + break; + case ESSA_SET_STABLE_NODAT: + pgste.usage = PGSTE_GPS_USAGE_STABLE; + pgste.nodat = 1; + break; + default: + WARN_ONCE(1, "Invalid ORC!"); + res = -1; + break; + } + /* If we are discarding a page, set it to logical zero. */ + pgste.zero = res == 1; + if (orc > 0) { + *dirty = !pgste.cmma_d; + pgste.cmma_d = 1; + } + + pgste_set_unlock(ptep, pgste); + + return res; +} + +static long dat_reset_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste.usage = 0; + pgste.nodat = 0; + pgste.cmma_d = 0; + pgste_set_unlock(ptep, pgste); + if (need_resched()) + return next; + return 0; +} + +long dat_reset_cmma(union asce asce, gfn_t start) +{ + const struct dat_walk_ops dat_reset_cmma_ops = { + .pte_entry = dat_reset_cmma_pte, + }; + + return _dat_walk_gfn_range(start, asce_end(asce), asce, &dat_reset_cmma_ops, + DAT_WALK_IGN_HOLES, NULL); +} + +struct dat_get_cmma_state { + gfn_t start; + gfn_t end; + unsigned int count; + u8 *values; + atomic64_t *remaining; +}; + +static long __dat_peek_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_get_cmma_state *state = walk->priv; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + state->values[gfn - walk->start] = pgste.usage | (pgste.nodat << 6); + pgste_set_unlock(ptep, pgste); + state->end = next; + + return 0; +} + +static long __dat_peek_cmma_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_get_cmma_state *state = walk->priv; + + if (crstep->h.i) + state->end = min(walk->end, next); + return 0; +} + +int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values) +{ + const struct dat_walk_ops ops = { + .pte_entry = __dat_peek_cmma_pte, + .pmd_entry = __dat_peek_cmma_crste, + .pud_entry = __dat_peek_cmma_crste, + .p4d_entry = __dat_peek_cmma_crste, + .pgd_entry = __dat_peek_cmma_crste, + }; + struct dat_get_cmma_state state = { .values = values, }; + int rc; + + rc = _dat_walk_gfn_range(start, start + *count, asce, &ops, DAT_WALK_DEFAULT, &state); + *count = state.end - start; + /* Return success if at least one value was saved, otherwise an error. */ + return (rc == -EFAULT && *count > 0) ? 0 : rc; +} + +static long __dat_get_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_get_cmma_state *state = walk->priv; + union pgste pgste; + + if (state->start != -1) { + if ((gfn - state->end) > KVM_S390_MAX_BIT_DISTANCE) + return 1; + if (gfn - state->start >= state->count) + return 1; + } + + if (!READ_ONCE(*pgste_of(ptep)).cmma_d) + return 0; + + pgste = pgste_get_lock(ptep); + if (pgste.cmma_d) { + if (state->start == -1) + state->start = gfn; + pgste.cmma_d = 0; + atomic64_dec(state->remaining); + state->values[gfn - state->start] = pgste.usage | pgste.nodat << 6; + state->end = next; + } + pgste_set_unlock(ptep, pgste); + return 0; +} + +int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem) +{ + const struct dat_walk_ops ops = { .pte_entry = __dat_get_cmma_pte, }; + struct dat_get_cmma_state state = { + .remaining = rem, + .values = values, + .count = *count, + .start = -1, + }; + + _dat_walk_gfn_range(*start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, &state); + + if (state.start == -1) { + *count = 0; + } else { + *count = state.end - state.start; + *start = state.start; + } + + return 0; +} + +struct dat_set_cmma_state { + unsigned long mask; + const u8 *bits; +}; + +static long __dat_set_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct dat_set_cmma_state *state = walk->priv; + union pgste pgste, tmp; + + tmp.val = (state->bits[gfn - walk->start] << 24) & state->mask; + + pgste = pgste_get_lock(ptep); + pgste.usage = tmp.usage; + pgste.nodat = tmp.nodat; + pgste_set_unlock(ptep, pgste); + + return 0; +} + +/** + * dat_set_cmma_bits() - Set CMMA bits for a range of guest pages. + * @mc: Cache used for allocations. + * @asce: The ASCE of the guest. + * @gfn: The guest frame of the fist page whose CMMA bits are to set. + * @count: How many pages need to be processed. + * @mask: Which PGSTE bits should be set. + * @bits: Points to an array with the CMMA attributes. + * + * This function sets the CMMA attributes for the given pages. If the input + * buffer has zero length, no action is taken, otherwise the attributes are + * set and the mm->context.uses_cmm flag is set. + * + * Each byte in @bits contains new values for bits 32-39 of the PGSTE. + * Currently, only the fields NT and US are applied. + * + * Return: %0 in case of success, a negative error value otherwise. + */ +int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + unsigned long count, unsigned long mask, const uint8_t *bits) +{ + const struct dat_walk_ops ops = { .pte_entry = __dat_set_cmma_pte, }; + struct dat_set_cmma_state state = { .mask = mask, .bits = bits, }; + union crste *crstep; + union pte *ptep; + gfn_t cur; + int rc; + + for (cur = ALIGN_DOWN(gfn, _PAGE_ENTRIES); cur < gfn + count; cur += _PAGE_ENTRIES) { + rc = dat_entry_walk(mc, cur, asce, DAT_WALK_ALLOC, TABLE_TYPE_PAGE_TABLE, + &crstep, &ptep); + if (rc) + return rc; + } + return _dat_walk_gfn_range(gfn, gfn + count, asce, &ops, DAT_WALK_IGN_HOLES, &state); +} diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h new file mode 100644 index 000000000000..8c7ae07dcc28 --- /dev/null +++ b/arch/s390/kvm/dat.h @@ -0,0 +1,970 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2024, 2025 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + */ + +#ifndef __KVM_S390_DAT_H +#define __KVM_S390_DAT_H + +#include <linux/radix-tree.h> +#include <linux/refcount.h> +#include <linux/io.h> +#include <linux/kvm_types.h> +#include <linux/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/dat-bits.h> + +/* + * Base address and length must be sent at the start of each block, therefore + * it's cheaper to send some clean data, as long as it's less than the size of + * two longs. + */ +#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) +/* For consistency */ +#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) + +#define _ASCE(x) ((union asce) { .val = (x), }) +#define NULL_ASCE _ASCE(0) + +enum { + _DAT_TOKEN_NONE = 0, + _DAT_TOKEN_PIC, +}; + +#define _CRSTE_TOK(l, t, p) ((union crste) { \ + .tok.i = 1, \ + .tok.tt = (l), \ + .tok.type = (t), \ + .tok.par = (p) \ + }) +#define _CRSTE_PIC(l, p) _CRSTE_TOK(l, _DAT_TOKEN_PIC, p) + +#define _CRSTE_HOLE(l) _CRSTE_PIC(l, PGM_ADDRESSING) +#define _CRSTE_EMPTY(l) _CRSTE_TOK(l, _DAT_TOKEN_NONE, 0) + +#define _PMD_EMPTY _CRSTE_EMPTY(TABLE_TYPE_SEGMENT) + +#define _PTE_TOK(t, p) ((union pte) { .tok.i = 1, .tok.type = (t), .tok.par = (p) }) +#define _PTE_EMPTY _PTE_TOK(_DAT_TOKEN_NONE, 0) + +/* This fake table type is used for page table walks (both for normal page tables and vSIE) */ +#define TABLE_TYPE_PAGE_TABLE -1 + +enum dat_walk_flags { + DAT_WALK_USES_SKEYS = 0x40, + DAT_WALK_CONTINUE = 0x20, + DAT_WALK_IGN_HOLES = 0x10, + DAT_WALK_SPLIT = 0x08, + DAT_WALK_ALLOC = 0x04, + DAT_WALK_ANY = 0x02, + DAT_WALK_LEAF = 0x01, + DAT_WALK_DEFAULT = 0 +}; + +#define DAT_WALK_SPLIT_ALLOC (DAT_WALK_SPLIT | DAT_WALK_ALLOC) +#define DAT_WALK_ALLOC_CONTINUE (DAT_WALK_CONTINUE | DAT_WALK_ALLOC) +#define DAT_WALK_LEAF_ALLOC (DAT_WALK_LEAF | DAT_WALK_ALLOC) + +union pte { + unsigned long val; + union page_table_entry h; + struct { + unsigned long :56; /* Hardware bits */ + unsigned long u : 1; /* Page unused */ + unsigned long s : 1; /* Special */ + unsigned long w : 1; /* Writable */ + unsigned long r : 1; /* Readable */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long sd: 1; /* Soft dirty */ + unsigned long pr: 1; /* Present */ + } s; + struct { + unsigned char hwbytes[7]; + unsigned char swbyte; + }; + union { + struct { + unsigned long type :16; /* Token type */ + unsigned long par :16; /* Token parameter */ + unsigned long :20; + unsigned long : 1; /* Must be 0 */ + unsigned long i : 1; /* Must be 1 */ + unsigned long : 2; + unsigned long : 7; + unsigned long pr : 1; /* Must be 0 */ + }; + struct { + unsigned long token:32; /* Token and parameter */ + unsigned long :32; + }; + } tok; +}; + +/* Soft dirty, needed as macro for atomic operations on ptes */ +#define _PAGE_SD 0x002 + +/* Needed as macro to perform atomic operations */ +#define PGSTE_PCL_BIT 0x0080000000000000UL /* PCL lock, HW bit */ +#define PGSTE_CMMA_D_BIT 0x0000000000008000UL /* CMMA dirty soft-bit */ + +enum pgste_gps_usage { + PGSTE_GPS_USAGE_STABLE = 0, + PGSTE_GPS_USAGE_UNUSED, + PGSTE_GPS_USAGE_POT_VOLATILE, + PGSTE_GPS_USAGE_VOLATILE, +}; + +union pgste { + unsigned long val; + struct { + unsigned long acc : 4; + unsigned long fp : 1; + unsigned long : 3; + unsigned long pcl : 1; + unsigned long hr : 1; + unsigned long hc : 1; + unsigned long : 2; + unsigned long gr : 1; + unsigned long gc : 1; + unsigned long : 1; + unsigned long :16; /* val16 */ + unsigned long zero : 1; + unsigned long nodat : 1; + unsigned long : 4; + unsigned long usage : 2; + unsigned long : 8; + unsigned long cmma_d : 1; /* Dirty flag for CMMA bits */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 5; + unsigned long : 8; + }; + struct { + unsigned short hwbytes0; + unsigned short val16; /* Used to store chunked values, see dat_{s,g}et_ptval() */ + unsigned short hwbytes4; + unsigned char flags; /* Maps to the software bits */ + unsigned char hwbyte7; + } __packed; +}; + +union pmd { + unsigned long val; + union segment_table_entry h; + struct { + struct { + unsigned long :44; /* HW */ + unsigned long : 3; /* Unused */ + unsigned long : 1; /* HW */ + unsigned long w : 1; /* Writable soft-bit */ + unsigned long r : 1; /* Readable soft-bit */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long : 3; /* HW */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 1; /* Unused */ + unsigned long : 4; /* HW */ + unsigned long sd : 1; /* Soft-Dirty */ + unsigned long pr : 1; /* Present */ + } fc1; + } s; +}; + +union pud { + unsigned long val; + union region3_table_entry h; + struct { + struct { + unsigned long :33; /* HW */ + unsigned long :14; /* Unused */ + unsigned long : 1; /* HW */ + unsigned long w : 1; /* Writable soft-bit */ + unsigned long r : 1; /* Readable soft-bit */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long : 3; /* HW */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 1; /* Unused */ + unsigned long : 4; /* HW */ + unsigned long sd : 1; /* Soft-Dirty */ + unsigned long pr : 1; /* Present */ + } fc1; + } s; +}; + +union p4d { + unsigned long val; + union region2_table_entry h; +}; + +union pgd { + unsigned long val; + union region1_table_entry h; +}; + +union crste { + unsigned long val; + union { + struct { + unsigned long :52; + unsigned long : 1; + unsigned long fc: 1; + unsigned long p : 1; + unsigned long : 1; + unsigned long : 2; + unsigned long i : 1; + unsigned long : 1; + unsigned long tt: 2; + unsigned long : 2; + }; + struct { + unsigned long to:52; + unsigned long : 1; + unsigned long fc: 1; + unsigned long p : 1; + unsigned long : 1; + unsigned long tf: 2; + unsigned long i : 1; + unsigned long : 1; + unsigned long tt: 2; + unsigned long tl: 2; + } fc0; + struct { + unsigned long :47; + unsigned long av : 1; /* ACCF-Validity Control */ + unsigned long acc: 4; /* Access-Control Bits */ + unsigned long f : 1; /* Fetch-Protection Bit */ + unsigned long fc : 1; /* Format-Control */ + unsigned long p : 1; /* DAT-Protection Bit */ + unsigned long iep: 1; /* Instruction-Execution-Protection */ + unsigned long : 2; + unsigned long i : 1; /* Segment-Invalid Bit */ + unsigned long cs : 1; /* Common-Segment Bit */ + unsigned long tt : 2; /* Table-Type Bits */ + unsigned long : 2; + } fc1; + } h; + struct { + struct { + unsigned long :47; + unsigned long : 1; /* HW (should be 0) */ + unsigned long w : 1; /* Writable */ + unsigned long r : 1; /* Readable */ + unsigned long d : 1; /* Dirty */ + unsigned long y : 1; /* Young */ + unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */ + unsigned long : 3; /* HW */ + unsigned long vsie_notif : 1; /* Referenced in a shadow table */ + unsigned long : 1; + unsigned long : 4; /* HW */ + unsigned long sd : 1; /* Soft-Dirty */ + unsigned long pr : 1; /* Present */ + } fc1; + } s; + union { + struct { + unsigned long type :16; /* Token type */ + unsigned long par :16; /* Token parameter */ + unsigned long :26; + unsigned long i : 1; /* Must be 1 */ + unsigned long : 1; + unsigned long tt : 2; + unsigned long : 1; + unsigned long pr : 1; /* Must be 0 */ + }; + struct { + unsigned long token:32; /* Token and parameter */ + unsigned long :32; + }; + } tok; + union pmd pmd; + union pud pud; + union p4d p4d; + union pgd pgd; +}; + +union skey { + unsigned char skey; + struct { + unsigned char acc :4; + unsigned char fp :1; + unsigned char r :1; + unsigned char c :1; + unsigned char zero:1; + }; +}; + +static_assert(sizeof(union pgste) == sizeof(unsigned long)); +static_assert(sizeof(union pte) == sizeof(unsigned long)); +static_assert(sizeof(union pmd) == sizeof(unsigned long)); +static_assert(sizeof(union pud) == sizeof(unsigned long)); +static_assert(sizeof(union p4d) == sizeof(unsigned long)); +static_assert(sizeof(union pgd) == sizeof(unsigned long)); +static_assert(sizeof(union crste) == sizeof(unsigned long)); +static_assert(sizeof(union skey) == sizeof(char)); + +struct segment_table { + union pmd pmds[_CRST_ENTRIES]; +}; + +struct region3_table { + union pud puds[_CRST_ENTRIES]; +}; + +struct region2_table { + union p4d p4ds[_CRST_ENTRIES]; +}; + +struct region1_table { + union pgd pgds[_CRST_ENTRIES]; +}; + +struct crst_table { + union { + union crste crstes[_CRST_ENTRIES]; + struct segment_table segment; + struct region3_table region3; + struct region2_table region2; + struct region1_table region1; + }; +}; + +struct page_table { + union pte ptes[_PAGE_ENTRIES]; + union pgste pgstes[_PAGE_ENTRIES]; +}; + +static_assert(sizeof(struct crst_table) == _CRST_TABLE_SIZE); +static_assert(sizeof(struct page_table) == PAGE_SIZE); + +struct dat_walk; + +typedef long (*dat_walk_op)(union crste *crste, gfn_t gfn, gfn_t next, struct dat_walk *w); + +struct dat_walk_ops { + union { + dat_walk_op crste_ops[4]; + struct { + dat_walk_op pmd_entry; + dat_walk_op pud_entry; + dat_walk_op p4d_entry; + dat_walk_op pgd_entry; + }; + }; + long (*pte_entry)(union pte *pte, gfn_t gfn, gfn_t next, struct dat_walk *w); +}; + +struct dat_walk { + const struct dat_walk_ops *ops; + union crste *last; + union pte *last_pte; + union asce asce; + gfn_t start; + gfn_t end; + int flags; + void *priv; +}; + +struct ptval_param { + unsigned char offset : 6; + unsigned char len : 2; +}; + +/** + * _pte() - Useful constructor for union pte + * @pfn: the pfn this pte should point to. + * @writable: whether the pte should be writable. + * @dirty: whether the pte should be dirty. + * @special: whether the pte should be marked as special + * + * The pte is also marked as young and present. If the pte is marked as dirty, + * it gets marked as soft-dirty too. If the pte is not dirty, the hardware + * protect bit is set (independently of the write softbit); this way proper + * dirty tracking can be performed. + * + * Return: a union pte value. + */ +static inline union pte _pte(kvm_pfn_t pfn, bool writable, bool dirty, bool special) +{ + union pte res = { .val = PFN_PHYS(pfn) }; + + res.h.p = !dirty; + res.s.y = 1; + res.s.pr = 1; + res.s.w = writable; + res.s.d = dirty; + res.s.sd = dirty; + res.s.s = special; + return res; +} + +static inline union crste _crste_fc0(kvm_pfn_t pfn, int tt) +{ + union crste res = { .val = PFN_PHYS(pfn) }; + + res.h.tt = tt; + res.h.fc0.tl = _REGION_ENTRY_LENGTH; + res.h.fc0.tf = 0; + return res; +} + +/** + * _crste() - Useful constructor for union crste with FC=1 + * @pfn: the pfn this pte should point to. + * @tt: the table type + * @writable: whether the pte should be writable. + * @dirty: whether the pte should be dirty. + * + * The crste is also marked as young and present. If the crste is marked as + * dirty, it gets marked as soft-dirty too. If the crste is not dirty, the + * hardware protect bit is set (independently of the write softbit); this way + * proper dirty tracking can be performed. + * + * Return: a union crste value. + */ +static inline union crste _crste_fc1(kvm_pfn_t pfn, int tt, bool writable, bool dirty) +{ + union crste res = { .val = PFN_PHYS(pfn) & _SEGMENT_MASK }; + + res.h.tt = tt; + res.h.p = !dirty; + res.h.fc = 1; + res.s.fc1.y = 1; + res.s.fc1.pr = 1; + res.s.fc1.w = writable; + res.s.fc1.d = dirty; + res.s.fc1.sd = dirty; + return res; +} + +union essa_state { + unsigned char val; + struct { + unsigned char : 2; + unsigned char nodat : 1; + unsigned char exception : 1; + unsigned char usage : 2; + unsigned char content : 2; + }; +}; + +/** + * struct vsie_rmap - reverse mapping for shadow page table entries + * @next: pointer to next rmap in the list + * @r_gfn: virtual rmap address in the shadow guest address space + */ +struct vsie_rmap { + struct vsie_rmap *next; + union { + unsigned long val; + struct { + long level: 8; + unsigned long : 4; + unsigned long r_gfn:52; + }; + }; +}; + +static_assert(sizeof(struct vsie_rmap) == 2 * sizeof(long)); + +#define KVM_S390_MMU_CACHE_N_CRSTS 6 +#define KVM_S390_MMU_CACHE_N_PTS 2 +#define KVM_S390_MMU_CACHE_N_RMAPS 16 +struct kvm_s390_mmu_cache { + void *crsts[KVM_S390_MMU_CACHE_N_CRSTS]; + void *pts[KVM_S390_MMU_CACHE_N_PTS]; + void *rmaps[KVM_S390_MMU_CACHE_N_RMAPS]; + short int n_crsts; + short int n_pts; + short int n_rmaps; +}; + +struct guest_fault { + gfn_t gfn; /* Guest frame */ + kvm_pfn_t pfn; /* Host PFN */ + struct page *page; /* Host page */ + union pte *ptep; /* Used to resolve the fault, or NULL */ + union crste *crstep; /* Used to resolve the fault, or NULL */ + bool writable; /* Mapping is writable */ + bool write_attempt; /* Write access attempted */ + bool attempt_pfault; /* Attempt a pfault first */ + bool valid; /* This entry contains valid data */ + void (*callback)(struct guest_fault *f); + void *priv; +}; + +/* + * 0 1 2 3 4 5 6 7 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | | PGT_ADDR | + * 8 | VMADDR | | + * 16 | | + * 24 | | + */ +#define MKPTVAL(o, l) ((struct ptval_param) { .offset = (o), .len = ((l) + 1) / 2 - 1}) +#define PTVAL_PGT_ADDR MKPTVAL(4, 8) +#define PTVAL_VMADDR MKPTVAL(8, 6) + +union pgste __must_check __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new, + gfn_t gfn, union asce asce, bool uses_skeys); +bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn, + union asce asce); +void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce); + +long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce, + const struct dat_walk_ops *ops, int flags, void *priv); + +int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags, + int walk_level, union crste **last, union pte **ptepp); +void dat_free_level(struct crst_table *table, bool owns_ptes); +struct crst_table *dat_alloc_crst_sleepable(unsigned long init); +int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype); +int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey); +int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + union skey skey, bool nq); +int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn, + union skey skey, union skey *oldkey, bool nq, bool mr, bool mc); +int dat_reset_reference_bit(union asce asce, gfn_t gfn); +long dat_reset_skeys(union asce asce, gfn_t start); + +unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param); +void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val); + +int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end, + u16 type, u16 param); +int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn); +bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end); +int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level, + bool uses_skeys, struct guest_fault *f); + +int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty); +long dat_reset_cmma(union asce asce, gfn_t start_gfn); +int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values); +int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem); +int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn, + unsigned long count, unsigned long mask, const uint8_t *bits); + +int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc); + +#define GFP_KVM_S390_MMU_CACHE (GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN) + +static inline struct page_table *kvm_s390_mmu_cache_alloc_pt(struct kvm_s390_mmu_cache *mc) +{ + if (mc->n_pts) + return mc->pts[--mc->n_pts]; + return (void *)__get_free_page(GFP_KVM_S390_MMU_CACHE); +} + +static inline struct crst_table *kvm_s390_mmu_cache_alloc_crst(struct kvm_s390_mmu_cache *mc) +{ + if (mc->n_crsts) + return mc->crsts[--mc->n_crsts]; + return (void *)__get_free_pages(GFP_KVM_S390_MMU_CACHE | __GFP_COMP, CRST_ALLOC_ORDER); +} + +static inline struct vsie_rmap *kvm_s390_mmu_cache_alloc_rmap(struct kvm_s390_mmu_cache *mc) +{ + if (mc->n_rmaps) + return mc->rmaps[--mc->n_rmaps]; + return kzalloc(sizeof(struct vsie_rmap), GFP_KVM_S390_MMU_CACHE); +} + +static inline struct crst_table *crste_table_start(union crste *crstep) +{ + return (struct crst_table *)ALIGN_DOWN((unsigned long)crstep, _CRST_TABLE_SIZE); +} + +static inline struct page_table *pte_table_start(union pte *ptep) +{ + return (struct page_table *)ALIGN_DOWN((unsigned long)ptep, _PAGE_TABLE_SIZE); +} + +static inline bool crdte_crste(union crste *crstep, union crste old, union crste new, gfn_t gfn, + union asce asce) +{ + unsigned long dtt = 0x10 | new.h.tt << 2; + void *table = crste_table_start(crstep); + + return crdte(old.val, new.val, table, dtt, gfn_to_gpa(gfn), asce.val); +} + +/** + * idte_crste() - invalidate a crste entry using idte + * @crstep: pointer to the crste to be invalidated + * @gfn: a gfn mapped by the crste + * @opt: options for the idte instruction + * @asce: the asce + * @local: whether the operation is cpu-local + */ +static __always_inline void idte_crste(union crste *crstep, gfn_t gfn, unsigned long opt, + union asce asce, int local) +{ + unsigned long table_origin = __pa(crste_table_start(crstep)); + unsigned long gaddr = gfn_to_gpa(gfn) & HPAGE_MASK; + + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile("idte %[table_origin],0,%[gaddr],%[local]" + : "+m" (*crstep) + : [table_origin] "a" (table_origin), [gaddr] "a" (gaddr), + [local] "i" (local) + : "cc"); + } else { + /* flush with guest asce */ + asm volatile("idte %[table_origin],%[asce],%[gaddr_opt],%[local]" + : "+m" (*crstep) + : [table_origin] "a" (table_origin), [gaddr_opt] "a" (gaddr | opt), + [asce] "a" (asce.val), [local] "i" (local) + : "cc"); + } +} + +static inline void dat_init_pgstes(struct page_table *pt, unsigned long val) +{ + memset64((void *)pt->pgstes, val, PTRS_PER_PTE); +} + +static inline void dat_init_page_table(struct page_table *pt, unsigned long ptes, + unsigned long pgstes) +{ + memset64((void *)pt->ptes, ptes, PTRS_PER_PTE); + dat_init_pgstes(pt, pgstes); +} + +static inline gfn_t asce_end(union asce asce) +{ + return 1ULL << ((asce.dt + 1) * 11 + _SEGMENT_SHIFT - PAGE_SHIFT); +} + +#define _CRSTE(x) ((union crste) { .val = _Generic((x), \ + union pgd : (x).val, \ + union p4d : (x).val, \ + union pud : (x).val, \ + union pmd : (x).val, \ + union crste : (x).val)}) + +#define _CRSTEP(x) ((union crste *)_Generic((*(x)), \ + union pgd : (x), \ + union p4d : (x), \ + union pud : (x), \ + union pmd : (x), \ + union crste : (x))) + +#define _CRSTP(x) ((struct crst_table *)_Generic((*(x)), \ + struct crst_table : (x), \ + struct segment_table : (x), \ + struct region3_table : (x), \ + struct region2_table : (x), \ + struct region1_table : (x))) + +static inline bool asce_contains_gfn(union asce asce, gfn_t gfn) +{ + return gfn < asce_end(asce); +} + +static inline bool is_pmd(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_SEGMENT; +} + +static inline bool is_pud(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_REGION3; +} + +static inline bool is_p4d(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_REGION2; +} + +static inline bool is_pgd(union crste crste) +{ + return crste.h.tt == TABLE_TYPE_REGION1; +} + +static inline phys_addr_t pmd_origin_large(union pmd pmd) +{ + return pmd.val & _SEGMENT_ENTRY_ORIGIN_LARGE; +} + +static inline phys_addr_t pud_origin_large(union pud pud) +{ + return pud.val & _REGION3_ENTRY_ORIGIN_LARGE; +} + +/** + * crste_origin_large() - Return the large frame origin of a large crste + * @crste: The crste whose origin is to be returned. Should be either a + * region-3 table entry or a segment table entry, in both cases with + * FC set to 1 (large pages). + * + * Return: The origin of the large frame pointed to by @crste, or -1 if the + * crste was not large (wrong table type, or FC==0) + */ +static inline phys_addr_t crste_origin_large(union crste crste) +{ + if (unlikely(!crste.h.fc || crste.h.tt > TABLE_TYPE_REGION3)) + return -1; + if (is_pmd(crste)) + return pmd_origin_large(crste.pmd); + return pud_origin_large(crste.pud); +} + +#define crste_origin(x) (_Generic((x), \ + union pmd : (x).val & _SEGMENT_ENTRY_ORIGIN, \ + union pud : (x).val & _REGION_ENTRY_ORIGIN, \ + union p4d : (x).val & _REGION_ENTRY_ORIGIN, \ + union pgd : (x).val & _REGION_ENTRY_ORIGIN)) + +static inline unsigned long pte_origin(union pte pte) +{ + return pte.val & PAGE_MASK; +} + +static inline bool pmd_prefix(union pmd pmd) +{ + return pmd.h.fc && pmd.s.fc1.prefix_notif; +} + +static inline bool pud_prefix(union pud pud) +{ + return pud.h.fc && pud.s.fc1.prefix_notif; +} + +static inline bool crste_leaf(union crste crste) +{ + return (crste.h.tt <= TABLE_TYPE_REGION3) && crste.h.fc; +} + +static inline bool crste_prefix(union crste crste) +{ + return crste_leaf(crste) && crste.s.fc1.prefix_notif; +} + +static inline bool crste_dirty(union crste crste) +{ + return crste_leaf(crste) && crste.s.fc1.d; +} + +static inline union pgste *pgste_of(union pte *pte) +{ + return (union pgste *)(pte + _PAGE_ENTRIES); +} + +static inline bool pte_hole(union pte pte) +{ + return pte.h.i && !pte.tok.pr && pte.tok.type != _DAT_TOKEN_NONE; +} + +static inline bool _crste_hole(union crste crste) +{ + return crste.h.i && !crste.tok.pr && crste.tok.type != _DAT_TOKEN_NONE; +} + +#define crste_hole(x) _crste_hole(_CRSTE(x)) + +static inline bool _crste_none(union crste crste) +{ + return crste.h.i && !crste.tok.pr && crste.tok.type == _DAT_TOKEN_NONE; +} + +#define crste_none(x) _crste_none(_CRSTE(x)) + +static inline phys_addr_t large_pud_to_phys(union pud pud, gfn_t gfn) +{ + return pud_origin_large(pud) | (gfn_to_gpa(gfn) & ~_REGION3_MASK); +} + +static inline phys_addr_t large_pmd_to_phys(union pmd pmd, gfn_t gfn) +{ + return pmd_origin_large(pmd) | (gfn_to_gpa(gfn) & ~_SEGMENT_MASK); +} + +static inline phys_addr_t large_crste_to_phys(union crste crste, gfn_t gfn) +{ + if (unlikely(!crste.h.fc || crste.h.tt > TABLE_TYPE_REGION3)) + return -1; + if (is_pmd(crste)) + return large_pmd_to_phys(crste.pmd, gfn); + return large_pud_to_phys(crste.pud, gfn); +} + +static inline bool cspg_crste(union crste *crstep, union crste old, union crste new) +{ + return cspg(&crstep->val, old.val, new.val); +} + +static inline struct page_table *dereference_pmd(union pmd pmd) +{ + return phys_to_virt(crste_origin(pmd)); +} + +static inline struct segment_table *dereference_pud(union pud pud) +{ + return phys_to_virt(crste_origin(pud)); +} + +static inline struct region3_table *dereference_p4d(union p4d p4d) +{ + return phys_to_virt(crste_origin(p4d)); +} + +static inline struct region2_table *dereference_pgd(union pgd pgd) +{ + return phys_to_virt(crste_origin(pgd)); +} + +static inline struct crst_table *_dereference_crste(union crste crste) +{ + if (unlikely(is_pmd(crste))) + return NULL; + return phys_to_virt(crste_origin(crste.pud)); +} + +#define dereference_crste(x) (_Generic((x), \ + union pud : _dereference_crste(_CRSTE(x)), \ + union p4d : _dereference_crste(_CRSTE(x)), \ + union pgd : _dereference_crste(_CRSTE(x)), \ + union crste : _dereference_crste(_CRSTE(x)))) + +static inline struct crst_table *dereference_asce(union asce asce) +{ + return phys_to_virt(asce.val & _ASCE_ORIGIN); +} + +static inline void asce_flush_tlb(union asce asce) +{ + __tlb_flush_idte(asce.val); +} + +static inline bool pgste_get_trylock(union pte *ptep, union pgste *res) +{ + union pgste *pgstep = pgste_of(ptep); + union pgste old_pgste; + + if (READ_ONCE(pgstep->val) & PGSTE_PCL_BIT) + return false; + old_pgste.val = __atomic64_or_barrier(PGSTE_PCL_BIT, &pgstep->val); + if (old_pgste.pcl) + return false; + old_pgste.pcl = 1; + *res = old_pgste; + return true; +} + +static inline union pgste pgste_get_lock(union pte *ptep) +{ + union pgste res; + + while (!pgste_get_trylock(ptep, &res)) + cpu_relax(); + return res; +} + +static inline void pgste_set_unlock(union pte *ptep, union pgste pgste) +{ + pgste.pcl = 0; + barrier(); + WRITE_ONCE(*pgste_of(ptep), pgste); +} + +static inline void dat_ptep_xchg(union pte *ptep, union pte new, gfn_t gfn, union asce asce, + bool has_skeys) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, asce, has_skeys); + pgste_set_unlock(ptep, pgste); +} + +static inline void dat_ptep_clear(union pte *ptep, gfn_t gfn, union asce asce, bool has_skeys) +{ + dat_ptep_xchg(ptep, _PTE_EMPTY, gfn, asce, has_skeys); +} + +static inline void dat_free_pt(struct page_table *pt) +{ + free_page((unsigned long)pt); +} + +static inline void _dat_free_crst(struct crst_table *table) +{ + free_pages((unsigned long)table, CRST_ALLOC_ORDER); +} + +#define dat_free_crst(x) _dat_free_crst(_CRSTP(x)) + +static inline void kvm_s390_free_mmu_cache(struct kvm_s390_mmu_cache *mc) +{ + if (!mc) + return; + while (mc->n_pts) + dat_free_pt(mc->pts[--mc->n_pts]); + while (mc->n_crsts) + _dat_free_crst(mc->crsts[--mc->n_crsts]); + while (mc->n_rmaps) + kfree(mc->rmaps[--mc->n_rmaps]); + kfree(mc); +} + +DEFINE_FREE(kvm_s390_mmu_cache, struct kvm_s390_mmu_cache *, if (_T) kvm_s390_free_mmu_cache(_T)) + +static inline struct kvm_s390_mmu_cache *kvm_s390_new_mmu_cache(void) +{ + struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL; + + mc = kzalloc(sizeof(*mc), GFP_KERNEL_ACCOUNT); + if (mc && !kvm_s390_mmu_cache_topup(mc)) + return_ptr(mc); + return NULL; +} + +static inline bool dat_pmdp_xchg_atomic(union pmd *pmdp, union pmd old, union pmd new, + gfn_t gfn, union asce asce) +{ + return dat_crstep_xchg_atomic(_CRSTEP(pmdp), _CRSTE(old), _CRSTE(new), gfn, asce); +} + +static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pud new, + gfn_t gfn, union asce asce) +{ + return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce); +} + +static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce asce) +{ + union crste newcrste = _CRSTE_EMPTY(crstep->h.tt); + + dat_crstep_xchg(crstep, newcrste, gfn, asce); +} + +static inline int get_level(union crste *crstep, union pte *ptep) +{ + return ptep ? TABLE_TYPE_PAGE_TABLE : crstep->h.tt; +} + +static inline int dat_delete_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, + unsigned long npages) +{ + return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_PIC, PGM_ADDRESSING); +} + +static inline int dat_create_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, + unsigned long npages) +{ + return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_NONE, 0); +} + +static inline bool crste_is_ucas(union crste crste) +{ + return is_pmd(crste) && crste.h.i && crste.h.fc0.tl == 1 && crste.h.fc == 0; +} + +#endif /* __KVM_S390_DAT_H */ diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 53233dec8cad..d89d1c381522 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -10,13 +10,13 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> -#include <asm/gmap.h> #include <asm/gmap_helpers.h> #include <asm/virtio-ccw.h> #include "kvm-s390.h" #include "trace.h" #include "trace-s390.h" #include "gaccess.h" +#include "gmap.h" static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end) { diff --git a/arch/s390/kvm/faultin.c b/arch/s390/kvm/faultin.c new file mode 100644 index 000000000000..e37cd18200f5 --- /dev/null +++ b/arch/s390/kvm/faultin.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM guest fault handling. + * + * Copyright IBM Corp. 2025 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + */ +#include <linux/kvm_types.h> +#include <linux/kvm_host.h> + +#include "gmap.h" +#include "trace.h" +#include "faultin.h" + +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu); + +/* + * kvm_s390_faultin_gfn() - handle a dat fault. + * @vcpu: The vCPU whose gmap is to be fixed up, or NULL if operating on the VM. + * @kvm: The VM whose gmap is to be fixed up, or NULL if operating on a vCPU. + * @f: The guest fault that needs to be resolved. + * + * Return: + * * 0 on success + * * < 0 in case of error + * * > 0 in case of guest exceptions + * + * Context: + * * The mm lock must not be held before calling + * * kvm->srcu must be held + * * may sleep + */ +int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f) +{ + struct kvm_s390_mmu_cache *local_mc __free(kvm_s390_mmu_cache) = NULL; + struct kvm_s390_mmu_cache *mc = NULL; + struct kvm_memory_slot *slot; + unsigned long inv_seq; + int foll, rc = 0; + + foll = f->write_attempt ? FOLL_WRITE : 0; + foll |= f->attempt_pfault ? FOLL_NOWAIT : 0; + + if (vcpu) { + kvm = vcpu->kvm; + mc = vcpu->arch.mc; + } + + lockdep_assert_held(&kvm->srcu); + + scoped_guard(read_lock, &kvm->mmu_lock) { + if (gmap_try_fixup_minor(kvm->arch.gmap, f) == 0) + return 0; + } + + while (1) { + f->valid = false; + inv_seq = kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); + + if (vcpu) + slot = kvm_vcpu_gfn_to_memslot(vcpu, f->gfn); + else + slot = gfn_to_memslot(kvm, f->gfn); + f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page); + + /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT). */ + if (f->pfn == KVM_PFN_ERR_NEEDS_IO) { + if (unlikely(!f->attempt_pfault)) + return -EAGAIN; + if (unlikely(!vcpu)) + return -EINVAL; + trace_kvm_s390_major_guest_pfault(vcpu); + if (kvm_arch_setup_async_pf(vcpu)) + return 0; + vcpu->stat.pfault_sync++; + /* Could not setup async pfault, try again synchronously. */ + foll &= ~FOLL_NOWAIT; + f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page); + } + + /* Access outside memory, addressing exception. */ + if (is_noslot_pfn(f->pfn)) + return PGM_ADDRESSING; + /* Signal pending: try again. */ + if (f->pfn == KVM_PFN_ERR_SIGPENDING) + return -EAGAIN; + /* Check if it's read-only memory; don't try to actually handle that case. */ + if (f->pfn == KVM_PFN_ERR_RO_FAULT) + return -EOPNOTSUPP; + /* Any other error. */ + if (is_error_pfn(f->pfn)) + return -EFAULT; + + if (!mc) { + local_mc = kvm_s390_new_mmu_cache(); + if (!local_mc) + return -ENOMEM; + mc = local_mc; + } + + /* Loop, will automatically release the faulted page. */ + if (mmu_invalidate_retry_gfn_unsafe(kvm, inv_seq, f->gfn)) { + kvm_release_faultin_page(kvm, f->page, true, false); + continue; + } + + scoped_guard(read_lock, &kvm->mmu_lock) { + if (!mmu_invalidate_retry_gfn(kvm, inv_seq, f->gfn)) { + f->valid = true; + rc = gmap_link(mc, kvm->arch.gmap, f); + kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt); + f->page = NULL; + } + } + kvm_release_faultin_page(kvm, f->page, true, false); + + if (rc == -ENOMEM) { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + } else if (rc != -EAGAIN) { + return rc; + } + } +} + +int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w) +{ + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + int foll = w ? FOLL_WRITE : 0; + + f->write_attempt = w; + f->gfn = gfn; + f->pfn = __kvm_faultin_pfn(slot, gfn, foll, &f->writable, &f->page); + if (is_noslot_pfn(f->pfn)) + return PGM_ADDRESSING; + if (is_sigpending_pfn(f->pfn)) + return -EINTR; + if (f->pfn == KVM_PFN_ERR_NEEDS_IO) + return -EAGAIN; + if (is_error_pfn(f->pfn)) + return -EFAULT; + + f->valid = true; + return 0; +} diff --git a/arch/s390/kvm/faultin.h b/arch/s390/kvm/faultin.h new file mode 100644 index 000000000000..f86176d2769c --- /dev/null +++ b/arch/s390/kvm/faultin.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM guest fault handling. + * + * Copyright IBM Corp. 2025 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + */ + +#ifndef __KVM_S390_FAULTIN_H +#define __KVM_S390_FAULTIN_H + +#include <linux/kvm_host.h> + +#include "dat.h" + +int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f); +int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w); + +static inline int kvm_s390_faultin_gfn_simple(struct kvm_vcpu *vcpu, struct kvm *kvm, + gfn_t gfn, bool wr) +{ + struct guest_fault f = { .gfn = gfn, .write_attempt = wr, }; + + return kvm_s390_faultin_gfn(vcpu, kvm, &f); +} + +static inline int kvm_s390_get_guest_page_and_read_gpa(struct kvm *kvm, struct guest_fault *f, + gpa_t gaddr, unsigned long *val) +{ + int rc; + + rc = kvm_s390_get_guest_page(kvm, f, gpa_to_gfn(gaddr), false); + if (rc) + return rc; + + *val = *(unsigned long *)phys_to_virt(pfn_to_phys(f->pfn) | offset_in_page(gaddr)); + + return 0; +} + +static inline void kvm_s390_release_multiple(struct kvm *kvm, struct guest_fault *guest_faults, + int n, bool ignore) +{ + int i; + + for (i = 0; i < n; i++) { + kvm_release_faultin_page(kvm, guest_faults[i].page, ignore, + guest_faults[i].write_attempt); + guest_faults[i].page = NULL; + } +} + +static inline bool kvm_s390_multiple_faults_need_retry(struct kvm *kvm, unsigned long seq, + struct guest_fault *guest_faults, int n, + bool unsafe) +{ + int i; + + for (i = 0; i < n; i++) { + if (!guest_faults[i].valid) + continue; + if (unsafe && mmu_invalidate_retry_gfn_unsafe(kvm, seq, guest_faults[i].gfn)) + return true; + if (!unsafe && mmu_invalidate_retry_gfn(kvm, seq, guest_faults[i].gfn)) + return true; + } + return false; +} + +static inline int kvm_s390_get_guest_pages(struct kvm *kvm, struct guest_fault *guest_faults, + gfn_t start, int n_pages, bool write_attempt) +{ + int i, rc; + + for (i = 0; i < n_pages; i++) { + rc = kvm_s390_get_guest_page(kvm, guest_faults + i, start + i, write_attempt); + if (rc) + break; + } + return rc; +} + +#define kvm_s390_release_faultin_array(kvm, array, ignore) \ + kvm_s390_release_multiple(kvm, array, ARRAY_SIZE(array), ignore) + +#define kvm_s390_array_needs_retry_unsafe(kvm, seq, array) \ + kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), true) + +#define kvm_s390_array_needs_retry_safe(kvm, seq, array) \ + kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), false) + +#endif /* __KVM_S390_FAULTIN_H */ diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 41ca6b0ee7a9..4630b2a067ea 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -11,41 +11,43 @@ #include <linux/err.h> #include <linux/pgtable.h> #include <linux/bitfield.h> +#include <linux/kvm_host.h> +#include <linux/kvm_types.h> +#include <asm/diag.h> #include <asm/access-regs.h> #include <asm/fault.h> -#include <asm/gmap.h> #include <asm/dat-bits.h> #include "kvm-s390.h" +#include "dat.h" +#include "gmap.h" #include "gaccess.h" +#include "faultin.h" #define GMAP_SHADOW_FAKE_TABLE 1ULL -/* - * vaddress union in order to easily decode a virtual address into its - * region first index, region second index etc. parts. - */ -union vaddress { - unsigned long addr; - struct { - unsigned long rfx : 11; - unsigned long rsx : 11; - unsigned long rtx : 11; - unsigned long sx : 11; - unsigned long px : 8; - unsigned long bx : 12; - }; - struct { - unsigned long rfx01 : 2; - unsigned long : 9; - unsigned long rsx01 : 2; - unsigned long : 9; - unsigned long rtx01 : 2; - unsigned long : 9; - unsigned long sx01 : 2; - unsigned long : 29; - }; +union dat_table_entry { + unsigned long val; + union region1_table_entry pgd; + union region2_table_entry p4d; + union region3_table_entry pud; + union segment_table_entry pmd; + union page_table_entry pte; +}; + +#define WALK_N_ENTRIES 7 +#define LEVEL_MEM -2 +struct pgtwalk { + struct guest_fault raw_entries[WALK_N_ENTRIES]; + gpa_t last_addr; + int level; + bool p; }; +static inline struct guest_fault *get_entries(struct pgtwalk *w) +{ + return w->raw_entries - LEVEL_MEM; +} + /* * raddress union which will contain the result (real or absolute address) * after a page table walk. The rfaa, sfaa and pfra members are used to @@ -107,6 +109,28 @@ struct aste { /* .. more fields there */ }; +union oac { + unsigned int val; + struct { + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac1; + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac2; + }; +}; + int ipte_lock_held(struct kvm *kvm) { if (sclp.has_siif) @@ -423,7 +447,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) } /** - * guest_translate - translate a guest virtual into a guest absolute address + * guest_translate_gva() - translate a guest virtual into a guest absolute address * @vcpu: virtual cpu * @gva: guest virtual address * @gpa: points to where guest physical (absolute) address should be stored @@ -443,9 +467,9 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) * the returned value is the program interruption code as defined * by the architecture */ -static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, - unsigned long *gpa, const union asce asce, - enum gacc_mode mode, enum prot_type *prot) +static unsigned long guest_translate_gva(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long *gpa, const union asce asce, + enum gacc_mode mode, enum prot_type *prot) { union vaddress vaddr = {.addr = gva}; union raddress raddr = {.addr = gva}; @@ -626,31 +650,19 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int vm_check_access_key(struct kvm *kvm, u8 access_key, - enum gacc_mode mode, gpa_t gpa) +static int vm_check_access_key_gpa(struct kvm *kvm, u8 access_key, + enum gacc_mode mode, gpa_t gpa) { - u8 storage_key, access_control; - bool fetch_protected; - unsigned long hva; + union skey storage_key; int r; - if (access_key == 0) - return 0; - - hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); - if (kvm_is_error_hva(hva)) - return PGM_ADDRESSING; - - mmap_read_lock(current->mm); - r = get_guest_storage_key(current->mm, hva, &storage_key); - mmap_read_unlock(current->mm); + scoped_guard(read_lock, &kvm->mmu_lock) + r = dat_get_storage_key(kvm->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key); if (r) return r; - access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); - if (access_control == access_key) + if (access_key == 0 || storage_key.acc == access_key) return 0; - fetch_protected = storage_key & _PAGE_FP_BIT; - if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected) + if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !storage_key.fp) return 0; return PGM_PROTECTION; } @@ -689,12 +701,11 @@ static bool storage_prot_override_applies(u8 access_control) return access_control == PAGE_SPO_ACC; } -static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key, - enum gacc_mode mode, union asce asce, gpa_t gpa, - unsigned long ga, unsigned int len) +static int vcpu_check_access_key_gpa(struct kvm_vcpu *vcpu, u8 access_key, + enum gacc_mode mode, union asce asce, gpa_t gpa, + unsigned long ga, unsigned int len) { - u8 storage_key, access_control; - unsigned long hva; + union skey storage_key; int r; /* access key 0 matches any storage key -> allow */ @@ -704,26 +715,23 @@ static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key, * caller needs to ensure that gfn is accessible, so we can * assume that this cannot fail */ - hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa)); - mmap_read_lock(current->mm); - r = get_guest_storage_key(current->mm, hva, &storage_key); - mmap_read_unlock(current->mm); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + r = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key); if (r) return r; - access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); /* access key matches storage key -> allow */ - if (access_control == access_key) + if (storage_key.acc == access_key) return 0; if (mode == GACC_FETCH || mode == GACC_IFETCH) { /* it is a fetch and fetch protection is off -> allow */ - if (!(storage_key & _PAGE_FP_BIT)) + if (!storage_key.fp) return 0; if (fetch_prot_override_applicable(vcpu, mode, asce) && fetch_prot_override_applies(ga, len)) return 0; } if (storage_prot_override_applicable(vcpu) && - storage_prot_override_applies(access_control)) + storage_prot_override_applies(storage_key.acc)) return 0; return PGM_PROTECTION; } @@ -783,7 +791,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, PROT_TYPE_LA); if (psw_bits(*psw).dat) { - rc = guest_translate(vcpu, ga, &gpa, asce, mode, &prot); + rc = guest_translate_gva(vcpu, ga, &gpa, asce, mode, &prot); if (rc < 0) return rc; } else { @@ -795,8 +803,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, } if (rc) return trans_exc(vcpu, rc, ga, ar, mode, prot); - rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga, - fragment_len); + rc = vcpu_check_access_key_gpa(vcpu, access_key, mode, asce, gpa, ga, fragment_len); if (rc) return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC); if (gpas) @@ -808,8 +815,8 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, return 0; } -static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, - void *data, unsigned int len) +static int access_guest_page_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len) { const unsigned int offset = offset_in_page(gpa); const gfn_t gfn = gpa_to_gfn(gpa); @@ -824,38 +831,79 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, return rc; } -static int -access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, - void *data, unsigned int len, u8 access_key) +static int mvcos_key(void *to, const void *from, unsigned long size, u8 dst_key, u8 src_key) { - struct kvm_memory_slot *slot; - bool writable; - gfn_t gfn; - hva_t hva; - int rc; + union oac spec = { + .oac1.key = dst_key, + .oac1.k = !!dst_key, + .oac2.key = src_key, + .oac2.k = !!src_key, + }; + int exception = PGM_PROTECTION; + + asm_inline volatile( + " lr %%r0,%[spec]\n" + "0: mvcos %[to],%[from],%[size]\n" + "1: lhi %[exc],0\n" + "2:\n" + EX_TABLE(0b, 2b) + EX_TABLE(1b, 2b) + : [size] "+d" (size), [to] "=Q" (*(char *)to), [exc] "+d" (exception) + : [spec] "d" (spec.val), [from] "Q" (*(const char *)from) + : "memory", "cc", "0"); + return exception; +} - gfn = gpa >> PAGE_SHIFT; - slot = gfn_to_memslot(kvm, gfn); - hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); +struct acc_page_key_context { + void *data; + int exception; + unsigned short offset; + unsigned short len; + bool store; + u8 access_key; +}; - if (kvm_is_error_hva(hva)) - return PGM_ADDRESSING; - /* - * Check if it's a ro memslot, even tho that can't occur (they're unsupported). - * Don't try to actually handle that case. - */ - if (!writable && mode == GACC_STORE) - return -EOPNOTSUPP; - hva += offset_in_page(gpa); - if (mode == GACC_STORE) - rc = copy_to_user_key((void __user *)hva, data, len, access_key); +static void _access_guest_page_with_key_gpa(struct guest_fault *f) +{ + struct acc_page_key_context *context = f->priv; + void *ptr; + int r; + + ptr = __va(PFN_PHYS(f->pfn) | context->offset); + + if (context->store) + r = mvcos_key(ptr, context->data, context->len, context->access_key, 0); else - rc = copy_from_user_key(data, (void __user *)hva, len, access_key); + r = mvcos_key(context->data, ptr, context->len, 0, context->access_key); + + context->exception = r; +} + +static int access_guest_page_with_key_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len, u8 acc) +{ + struct acc_page_key_context context = { + .offset = offset_in_page(gpa), + .len = len, + .data = data, + .access_key = acc, + .store = mode == GACC_STORE, + }; + struct guest_fault fault = { + .gfn = gpa_to_gfn(gpa), + .priv = &context, + .write_attempt = mode == GACC_STORE, + .callback = _access_guest_page_with_key_gpa, + }; + int rc; + + if (KVM_BUG_ON((len + context.offset) > PAGE_SIZE, kvm)) + return -EINVAL; + + rc = kvm_s390_faultin_gfn(NULL, kvm, &fault); if (rc) - return PGM_PROTECTION; - if (mode == GACC_STORE) - mark_page_dirty_in_slot(kvm, slot, gfn); - return 0; + return rc; + return context.exception; } int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, @@ -867,7 +915,7 @@ int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, while (min(PAGE_SIZE - offset, len) > 0) { fragment_len = min(PAGE_SIZE - offset, len); - rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key); + rc = access_guest_page_with_key_gpa(kvm, mode, gpa, data, fragment_len, access_key); if (rc) return rc; offset = 0; @@ -927,15 +975,14 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, for (idx = 0; idx < nr_pages; idx++) { fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len); if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) { - rc = access_guest_page(vcpu->kvm, mode, gpas[idx], - data, fragment_len); + rc = access_guest_page_gpa(vcpu->kvm, mode, gpas[idx], data, fragment_len); } else { - rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], - data, fragment_len, access_key); + rc = access_guest_page_with_key_gpa(vcpu->kvm, mode, gpas[idx], + data, fragment_len, access_key); } if (rc == PGM_PROTECTION && try_storage_prot_override) - rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], - data, fragment_len, PAGE_SPO_ACC); + rc = access_guest_page_with_key_gpa(vcpu->kvm, mode, gpas[idx], + data, fragment_len, PAGE_SPO_ACC); if (rc) break; len -= fragment_len; @@ -969,7 +1016,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, while (len && !rc) { gpa = kvm_s390_real_to_abs(vcpu, gra); fragment_len = min(PAGE_SIZE - offset_in_page(gpa), len); - rc = access_guest_page(vcpu->kvm, mode, gpa, data, fragment_len); + rc = access_guest_page_gpa(vcpu->kvm, mode, gpa, data, fragment_len); len -= fragment_len; gra += fragment_len; data += fragment_len; @@ -980,17 +1027,101 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, } /** + * __cmpxchg_with_key() - Perform cmpxchg, honoring storage keys. + * @ptr: Address of value to compare to *@old and exchange with + * @new. Must be aligned to @size. + * @old: Old value. Compared to the content pointed to by @ptr in order to + * determine if the exchange occurs. The old value read from *@ptr is + * written here. + * @new: New value to place at *@ptr. + * @size: Size of the operation in bytes, may only be a power of two up to 16. + * @access_key: Access key to use for checking storage key protection. + * + * Perform a cmpxchg on guest memory, honoring storage key protection. + * @access_key alone determines how key checking is performed, neither + * storage-protection-override nor fetch-protection-override apply. + * In case of an exception *@uval is set to zero. + * + * Return: + * * %0: cmpxchg executed successfully + * * %1: cmpxchg executed unsuccessfully + * * %PGM_PROTECTION: an exception happened when trying to access *@ptr + * * %-EAGAIN: maxed out number of retries (byte and short only) + * * %-EINVAL: invalid value for @size + */ +static int __cmpxchg_with_key(union kvm_s390_quad *ptr, union kvm_s390_quad *old, + union kvm_s390_quad new, int size, u8 access_key) +{ + union kvm_s390_quad tmp = { .sixteen = 0 }; + int rc; + + /* + * The cmpxchg_key macro depends on the type of "old", so we need + * a case for each valid length and get some code duplication as long + * as we don't introduce a new macro. + */ + switch (size) { + case 1: + rc = __cmpxchg_key1(&ptr->one, &tmp.one, old->one, new.one, access_key); + break; + case 2: + rc = __cmpxchg_key2(&ptr->two, &tmp.two, old->two, new.two, access_key); + break; + case 4: + rc = __cmpxchg_key4(&ptr->four, &tmp.four, old->four, new.four, access_key); + break; + case 8: + rc = __cmpxchg_key8(&ptr->eight, &tmp.eight, old->eight, new.eight, access_key); + break; + case 16: + rc = __cmpxchg_key16(&ptr->sixteen, &tmp.sixteen, old->sixteen, new.sixteen, + access_key); + break; + default: + return -EINVAL; + } + if (!rc && memcmp(&tmp, old, size)) + rc = 1; + *old = tmp; + /* + * Assume that the fault is caused by protection, either key protection + * or user page write protection. + */ + if (rc == -EFAULT) + rc = PGM_PROTECTION; + return rc; +} + +struct cmpxchg_key_context { + union kvm_s390_quad new; + union kvm_s390_quad *old; + int exception; + unsigned short offset; + u8 access_key; + u8 len; +}; + +static void _cmpxchg_guest_abs_with_key(struct guest_fault *f) +{ + struct cmpxchg_key_context *context = f->priv; + + context->exception = __cmpxchg_with_key(__va(PFN_PHYS(f->pfn) | context->offset), + context->old, context->new, context->len, + context->access_key); +} + +/** * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address. * @kvm: Virtual machine instance. * @gpa: Absolute guest address of the location to be changed. * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a * non power of two will result in failure. - * @old_addr: Pointer to old value. If the location at @gpa contains this value, - * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() - * *@old_addr contains the value at @gpa before the attempt to - * exchange the value. + * @old: Pointer to old value. If the location at @gpa contains this value, + * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() + * *@old contains the value at @gpa before the attempt to + * exchange the value. * @new: The value to place at @gpa. - * @access_key: The access key to use for the guest access. + * @acc: The access key to use for the guest access. * @success: output value indicating if an exchange occurred. * * Atomically exchange the value at @gpa by @new, if it contains *@old. @@ -1003,89 +1134,36 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * * -EAGAIN: transient failure (len 1 or 2) * * -EOPNOTSUPP: read-only memslot (should never occur) */ -int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, - __uint128_t *old_addr, __uint128_t new, - u8 access_key, bool *success) +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old, + union kvm_s390_quad new, u8 acc, bool *success) { - gfn_t gfn = gpa_to_gfn(gpa); - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - bool writable; - hva_t hva; - int ret; - - if (!IS_ALIGNED(gpa, len)) - return -EINVAL; - - hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); - if (kvm_is_error_hva(hva)) - return PGM_ADDRESSING; - /* - * Check if it's a read-only memslot, even though that cannot occur - * since those are unsupported. - * Don't try to actually handle that case. - */ - if (!writable) - return -EOPNOTSUPP; - - hva += offset_in_page(gpa); - /* - * The cmpxchg_user_key macro depends on the type of "old", so we need - * a case for each valid length and get some code duplication as long - * as we don't introduce a new macro. - */ - switch (len) { - case 1: { - u8 old; - - ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - case 2: { - u16 old; - - ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - case 4: { - u32 old; - - ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - case 8: { - u64 old; + struct cmpxchg_key_context context = { + .old = old, + .new = new, + .offset = offset_in_page(gpa), + .len = len, + .access_key = acc, + }; + struct guest_fault fault = { + .gfn = gpa_to_gfn(gpa), + .priv = &context, + .write_attempt = true, + .callback = _cmpxchg_guest_abs_with_key, + }; + int rc; - ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - case 16: { - __uint128_t old; + lockdep_assert_held(&kvm->srcu); - ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key); - *success = !ret && old == *old_addr; - *old_addr = old; - break; - } - default: + if (len > 16 || !IS_ALIGNED(gpa, len)) return -EINVAL; - } - if (*success) - mark_page_dirty_in_slot(kvm, slot, gfn); - /* - * Assume that the fault is caused by protection, either key protection - * or user page write protection. - */ - if (ret == -EFAULT) - ret = PGM_PROTECTION; - return ret; + + rc = kvm_s390_faultin_gfn(NULL, kvm, &fault); + if (rc) + return rc; + *success = !context.exception; + if (context.exception == 1) + return 0; + return context.exception; } /** @@ -1160,7 +1238,7 @@ int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, while (length && !rc) { fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length); - rc = vm_check_access_key(kvm, access_key, mode, gpa); + rc = vm_check_access_key_gpa(kvm, access_key, mode, gpa); length -= fragment_len; gpa += fragment_len; } @@ -1187,304 +1265,375 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) } /** - * kvm_s390_shadow_tables - walk the guest page table and create shadow tables - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @pgt: pointer to the beginning of the page table for the given address if - * successful (return value 0), or to the first invalid DAT entry in - * case of exceptions (return value > 0) - * @dat_protection: referenced memory is write protected - * @fake: pgt references contiguous guest memory block, not a pgtable + * walk_guest_tables() - Walk the guest page table and pin the dat tables. + * @sg: Pointer to the shadow guest address space structure. + * @saddr: Faulting address in the shadow gmap. + * @w: Will be filled with information on the pinned pages. + * @wr: Wndicates a write access if true. + * + * Return: + * * %0 in case of success, + * * a PIC code > 0 in case the address translation fails + * * an error code < 0 if other errors happen in the host */ -static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, - int *fake) +static int walk_guest_tables(struct gmap *sg, unsigned long saddr, struct pgtwalk *w, bool wr) { - struct kvm *kvm; - struct gmap *parent; - union asce asce; + struct gmap *parent = sg->parent; + struct guest_fault *entries; + union dat_table_entry table; union vaddress vaddr; unsigned long ptr; + struct kvm *kvm; + union asce asce; int rc; - *fake = 0; - *dat_protection = 0; - kvm = sg->private; - parent = sg->parent; + if (!parent) + return -EAGAIN; + kvm = parent->kvm; + WARN_ON(!kvm); + asce = sg->guest_asce; + entries = get_entries(w); + + w->level = LEVEL_MEM; + w->last_addr = saddr; + if (asce.r) + return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, gpa_to_gfn(saddr), false); + vaddr.addr = saddr; - asce.val = sg->orig_asce; ptr = asce.rsto * PAGE_SIZE; - if (asce.r) { - *fake = 1; - ptr = 0; - asce.dt = ASCE_TYPE_REGION1; - } + + if (!asce_contains_gfn(asce, gpa_to_gfn(saddr))) + return PGM_ASCE_TYPE; switch (asce.dt) { case ASCE_TYPE_REGION1: - if (vaddr.rfx01 > asce.tl && !*fake) + if (vaddr.rfx01 > asce.tl) return PGM_REGION_FIRST_TRANS; break; case ASCE_TYPE_REGION2: - if (vaddr.rfx) - return PGM_ASCE_TYPE; if (vaddr.rsx01 > asce.tl) return PGM_REGION_SECOND_TRANS; break; case ASCE_TYPE_REGION3: - if (vaddr.rfx || vaddr.rsx) - return PGM_ASCE_TYPE; if (vaddr.rtx01 > asce.tl) return PGM_REGION_THIRD_TRANS; break; case ASCE_TYPE_SEGMENT: - if (vaddr.rfx || vaddr.rsx || vaddr.rtx) - return PGM_ASCE_TYPE; if (vaddr.sx01 > asce.tl) return PGM_SEGMENT_TRANSLATION; break; } + w->level = asce.dt; switch (asce.dt) { - case ASCE_TYPE_REGION1: { - union region1_table_entry rfte; - - if (*fake) { - ptr += vaddr.rfx * _REGION1_SIZE; - rfte.val = ptr; - goto shadow_r2t; - } - *pgt = ptr + vaddr.rfx * 8; - rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); + case ASCE_TYPE_REGION1: + w->last_addr = ptr + vaddr.rfx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (rfte.i) + if (table.pgd.i) return PGM_REGION_FIRST_TRANS; - if (rfte.tt != TABLE_TYPE_REGION1) + if (table.pgd.tt != TABLE_TYPE_REGION1) return PGM_TRANSLATION_SPEC; - if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) + if (vaddr.rsx01 < table.pgd.tf || vaddr.rsx01 > table.pgd.tl) return PGM_REGION_SECOND_TRANS; if (sg->edat_level >= 1) - *dat_protection |= rfte.p; - ptr = rfte.rto * PAGE_SIZE; -shadow_r2t: - rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); - if (rc) - return rc; - kvm->stat.gmap_shadow_r1_entry++; - } + w->p |= table.pgd.p; + ptr = table.pgd.rto * PAGE_SIZE; + w->level--; fallthrough; - case ASCE_TYPE_REGION2: { - union region2_table_entry rste; - - if (*fake) { - ptr += vaddr.rsx * _REGION2_SIZE; - rste.val = ptr; - goto shadow_r3t; - } - *pgt = ptr + vaddr.rsx * 8; - rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); + case ASCE_TYPE_REGION2: + w->last_addr = ptr + vaddr.rsx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (rste.i) + if (table.p4d.i) return PGM_REGION_SECOND_TRANS; - if (rste.tt != TABLE_TYPE_REGION2) + if (table.p4d.tt != TABLE_TYPE_REGION2) return PGM_TRANSLATION_SPEC; - if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) + if (vaddr.rtx01 < table.p4d.tf || vaddr.rtx01 > table.p4d.tl) return PGM_REGION_THIRD_TRANS; if (sg->edat_level >= 1) - *dat_protection |= rste.p; - ptr = rste.rto * PAGE_SIZE; -shadow_r3t: - rste.p |= *dat_protection; - rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); - if (rc) - return rc; - kvm->stat.gmap_shadow_r2_entry++; - } + w->p |= table.p4d.p; + ptr = table.p4d.rto * PAGE_SIZE; + w->level--; fallthrough; - case ASCE_TYPE_REGION3: { - union region3_table_entry rtte; - - if (*fake) { - ptr += vaddr.rtx * _REGION3_SIZE; - rtte.val = ptr; - goto shadow_sgt; - } - *pgt = ptr + vaddr.rtx * 8; - rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); + case ASCE_TYPE_REGION3: + w->last_addr = ptr + vaddr.rtx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (rtte.i) + if (table.pud.i) return PGM_REGION_THIRD_TRANS; - if (rtte.tt != TABLE_TYPE_REGION3) + if (table.pud.tt != TABLE_TYPE_REGION3) return PGM_TRANSLATION_SPEC; - if (rtte.cr && asce.p && sg->edat_level >= 2) + if (table.pud.cr && asce.p && sg->edat_level >= 2) return PGM_TRANSLATION_SPEC; - if (rtte.fc && sg->edat_level >= 2) { - *dat_protection |= rtte.fc0.p; - *fake = 1; - ptr = rtte.fc1.rfaa * _REGION3_SIZE; - rtte.val = ptr; - goto shadow_sgt; + if (sg->edat_level >= 1) + w->p |= table.pud.p; + if (table.pud.fc && sg->edat_level >= 2) { + table.val = u64_replace_bits(table.val, saddr, ~_REGION3_MASK); + goto edat_applies; } - if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) + if (vaddr.sx01 < table.pud.fc0.tf || vaddr.sx01 > table.pud.fc0.tl) return PGM_SEGMENT_TRANSLATION; - if (sg->edat_level >= 1) - *dat_protection |= rtte.fc0.p; - ptr = rtte.fc0.sto * PAGE_SIZE; -shadow_sgt: - rtte.fc0.p |= *dat_protection; - rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); - if (rc) - return rc; - kvm->stat.gmap_shadow_r3_entry++; - } + ptr = table.pud.fc0.sto * PAGE_SIZE; + w->level--; fallthrough; - case ASCE_TYPE_SEGMENT: { - union segment_table_entry ste; - - if (*fake) { - ptr += vaddr.sx * _SEGMENT_SIZE; - ste.val = ptr; - goto shadow_pgt; - } - *pgt = ptr + vaddr.sx * 8; - rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); + case ASCE_TYPE_SEGMENT: + w->last_addr = ptr + vaddr.sx * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); if (rc) return rc; - if (ste.i) + if (table.pmd.i) return PGM_SEGMENT_TRANSLATION; - if (ste.tt != TABLE_TYPE_SEGMENT) + if (table.pmd.tt != TABLE_TYPE_SEGMENT) return PGM_TRANSLATION_SPEC; - if (ste.cs && asce.p) + if (table.pmd.cs && asce.p) return PGM_TRANSLATION_SPEC; - *dat_protection |= ste.fc0.p; - if (ste.fc && sg->edat_level >= 1) { - *fake = 1; - ptr = ste.fc1.sfaa * _SEGMENT_SIZE; - ste.val = ptr; - goto shadow_pgt; + w->p |= table.pmd.p; + if (table.pmd.fc && sg->edat_level >= 1) { + table.val = u64_replace_bits(table.val, saddr, ~_SEGMENT_MASK); + goto edat_applies; } - ptr = ste.fc0.pto * (PAGE_SIZE / 2); -shadow_pgt: - ste.fc0.p |= *dat_protection; - rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); + ptr = table.pmd.fc0.pto * (PAGE_SIZE / 2); + w->level--; + } + w->last_addr = ptr + vaddr.px * 8; + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, + w->last_addr, &table.val); + if (rc) + return rc; + if (table.pte.i) + return PGM_PAGE_TRANSLATION; + if (table.pte.z) + return PGM_TRANSLATION_SPEC; + w->p |= table.pte.p; +edat_applies: + if (wr && w->p) + return PGM_PROTECTION; + + return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, table.pte.pfra, wr); +} + +static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union pte *ptep, + struct guest_fault *f, bool p) +{ + union pgste pgste; + union pte newpte; + int rc; + + lockdep_assert_held(&sg->kvm->mmu_lock); + lockdep_assert_held(&sg->parent->children_lock); + + scoped_guard(spinlock, &sg->host_to_rmap_lock) + rc = gmap_insert_rmap(sg, f->gfn, gpa_to_gfn(raddr), TABLE_TYPE_PAGE_TABLE); + if (rc) + return rc; + + pgste = pgste_get_lock(ptep_h); + newpte = _pte(f->pfn, f->writable, !p, 0); + newpte.s.d |= ptep->s.d; + newpte.s.sd |= ptep->s.sd; + newpte.h.p &= ptep->h.p; + pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false); + pgste.vsie_notif = 1; + pgste_set_unlock(ptep_h, pgste); + + newpte = _pte(f->pfn, 0, !p, 0); + pgste = pgste_get_lock(ptep); + pgste = __dat_ptep_xchg(ptep, pgste, newpte, gpa_to_gfn(raddr), sg->asce, uses_skeys(sg)); + pgste_set_unlock(ptep, pgste); + + return 0; +} + +static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table, + struct guest_fault *f, bool p) +{ + union crste newcrste; + gfn_t gfn; + int rc; + + lockdep_assert_held(&sg->kvm->mmu_lock); + lockdep_assert_held(&sg->parent->children_lock); + + gfn = f->gfn & gpa_to_gfn(is_pmd(*table) ? _SEGMENT_MASK : _REGION3_MASK); + scoped_guard(spinlock, &sg->host_to_rmap_lock) + rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt); + if (rc) + return rc; + + newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p); + newcrste.s.fc1.d |= host->s.fc1.d; + newcrste.s.fc1.sd |= host->s.fc1.sd; + newcrste.h.p &= host->h.p; + newcrste.s.fc1.vsie_notif = 1; + newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif; + _gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false); + + newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p); + dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce); + return 0; +} + +static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg, + unsigned long saddr, struct pgtwalk *w) +{ + struct guest_fault *entries; + int flags, i, hl, gl, l, rc; + union crste *table, *host; + union pte *ptep, *ptep_h; + + lockdep_assert_held(&sg->kvm->mmu_lock); + lockdep_assert_held(&sg->parent->children_lock); + + entries = get_entries(w); + ptep_h = NULL; + ptep = NULL; + + rc = dat_entry_walk(NULL, gpa_to_gfn(saddr), sg->asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, + &table, &ptep); + if (rc) + return rc; + + /* A race occourred. The shadow mapping is already valid, nothing to do */ + if ((ptep && !ptep->h.i) || (!ptep && crste_leaf(*table))) + return 0; + + gl = get_level(table, ptep); + + /* + * Skip levels that are already protected. For each level, protect + * only the page containing the entry, not the whole table. + */ + for (i = gl ; i >= w->level; i--) { + rc = gmap_protect_rmap(mc, sg, entries[i - 1].gfn, gpa_to_gfn(saddr), + entries[i - 1].pfn, i, entries[i - 1].writable); if (rc) return rc; - kvm->stat.gmap_shadow_sg_entry++; } + + rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF, + TABLE_TYPE_PAGE_TABLE, &host, &ptep_h); + if (rc) + return rc; + + hl = get_level(host, ptep_h); + /* Get the smallest granularity */ + l = min3(gl, hl, w->level); + + flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0); + /* If necessary, create the shadow mapping */ + if (l < gl) { + rc = dat_entry_walk(mc, gpa_to_gfn(saddr), sg->asce, flags, l, &table, &ptep); + if (rc) + return rc; } - /* Return the parent address of the page table */ - *pgt = ptr; - return 0; + if (l < hl) { + rc = dat_entry_walk(mc, entries[LEVEL_MEM].gfn, sg->parent->asce, + flags, l, &host, &ptep_h); + if (rc) + return rc; + } + + if (KVM_BUG_ON(l > TABLE_TYPE_REGION3, sg->kvm)) + return -EFAULT; + if (l == TABLE_TYPE_PAGE_TABLE) + return _do_shadow_pte(sg, saddr, ptep_h, ptep, entries + LEVEL_MEM, w->p); + return _do_shadow_crste(sg, saddr, host, table, entries + LEVEL_MEM, w->p); } -/** - * shadow_pgt_lookup() - find a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: the address in the shadow aguest address space - * @pgt: parent gmap address of the page table to get shadowed - * @dat_protection: if the pgtable is marked as protected by dat - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if the shadow page table was found and -EAGAIN if the page - * table was not found. - * - * Called with sg->mm->mmap_lock in read. - */ -static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, - int *dat_protection, int *fake) +static inline int _gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + unsigned long seq, struct pgtwalk *walk) { - unsigned long pt_index; - unsigned long *table; - struct page *page; + struct gmap *parent; int rc; - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ - if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { - /* Shadow page tables are full pages (pte+pgste) */ - page = pfn_to_page(*table >> PAGE_SHIFT); - pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page)); - *pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE; - *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); - *fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE); - rc = 0; - } else { - rc = -EAGAIN; + if (kvm_s390_array_needs_retry_unsafe(vcpu->kvm, seq, walk->raw_entries)) + return -EAGAIN; +again: + rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc); + if (rc) + return rc; + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { + if (kvm_s390_array_needs_retry_safe(vcpu->kvm, seq, walk->raw_entries)) + return -EAGAIN; + parent = READ_ONCE(sg->parent); + if (!parent) + return -EAGAIN; + scoped_guard(spinlock, &parent->children_lock) { + if (READ_ONCE(sg->parent) != parent) + return -EAGAIN; + rc = _gaccess_do_shadow(vcpu->arch.mc, sg, saddr, walk); + } + if (rc == -ENOMEM) + goto again; + if (!rc) + kvm_s390_release_faultin_array(vcpu->kvm, walk->raw_entries, false); } - spin_unlock(&sg->guest_table_lock); return rc; } /** - * kvm_s390_shadow_fault - handle fault on a shadow page table - * @vcpu: virtual cpu - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @datptr: will contain the address of the faulting DAT table entry, or of - * the valid leaf, plus some flags + * __gaccess_shadow_fault() - Handle fault on a shadow page table. + * @vcpu: Virtual cpu that triggered the action. + * @sg: The shadow guest address space structure. + * @saddr: Faulting address in the shadow gmap. + * @datptr: Will contain the address of the faulting DAT table entry, or of + * the valid leaf, plus some flags. + * @wr: Whether this is a write access. * - * Returns: - 0 if the shadow fault was successfully resolved - * - > 0 (pgm exception code) on exceptions while faulting - * - -EAGAIN if the caller can retry immediately - * - -EFAULT when accessing invalid guest addresses - * - -ENOMEM if out of memory + * Return: + * * %0 if the shadow fault was successfully resolved + * * > 0 (pgm exception code) on exceptions while faulting + * * %-EAGAIN if the caller can retry immediately + * * %-EFAULT when accessing invalid guest addresses + * * %-ENOMEM if out of memory */ -int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, - unsigned long saddr, unsigned long *datptr) +static int __gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + union mvpg_pei *datptr, bool wr) { - union vaddress vaddr; - union page_table_entry pte; - unsigned long pgt = 0; - int dat_protection, fake; + struct pgtwalk walk = { .p = false, }; + unsigned long seq; int rc; - if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm)) - return -EFAULT; + seq = vcpu->kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); - mmap_read_lock(sg->mm); - /* - * We don't want any guest-2 tables to change - so the parent - * tables/pointers we read stay valid - unshadowing is however - * always possible - only guest_table_lock protects us. - */ - ipte_lock(vcpu->kvm); - - rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); + rc = walk_guest_tables(sg, saddr, &walk, wr); + if (datptr) { + datptr->val = walk.last_addr; + datptr->dat_prot = wr && walk.p; + datptr->not_pte = walk.level > TABLE_TYPE_PAGE_TABLE; + datptr->real = sg->guest_asce.r; + } + if (!rc) + rc = _gaccess_shadow_fault(vcpu, sg, saddr, seq, &walk); if (rc) - rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, - &fake); + kvm_s390_release_faultin_array(vcpu->kvm, walk.raw_entries, true); + return rc; +} - vaddr.addr = saddr; - if (fake) { - pte.val = pgt + vaddr.px * PAGE_SIZE; - goto shadow_page; - } +int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + union mvpg_pei *datptr, bool wr) +{ + int rc; - switch (rc) { - case PGM_SEGMENT_TRANSLATION: - case PGM_REGION_THIRD_TRANS: - case PGM_REGION_SECOND_TRANS: - case PGM_REGION_FIRST_TRANS: - pgt |= PEI_NOT_PTE; - break; - case 0: - pgt += vaddr.px * 8; - rc = gmap_read_table(sg->parent, pgt, &pte.val); - } - if (datptr) - *datptr = pgt | dat_protection * PEI_DAT_PROT; - if (!rc && pte.i) - rc = PGM_PAGE_TRANSLATION; - if (!rc && pte.z) - rc = PGM_TRANSLATION_SPEC; -shadow_page: - pte.p |= dat_protection; - if (!rc) - rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); - vcpu->kvm->stat.gmap_shadow_pg_entry++; + if (KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &sg->flags), vcpu->kvm)) + return -EFAULT; + + rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc); + if (rc) + return rc; + + ipte_lock(vcpu->kvm); + rc = __gaccess_shadow_fault(vcpu, sg, saddr, datptr, wr || sg->guest_asce.r); ipte_unlock(vcpu->kvm); - mmap_read_unlock(sg->mm); + return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 3fde45a151f2..b5385cec60f4 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -206,8 +206,8 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode); -int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old, - __uint128_t new, u8 access_key, bool *success); +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old, + union kvm_s390_quad new, u8 access_key, bool *success); /** * write_guest_with_key - copy data from kernel space to guest space @@ -450,11 +450,17 @@ void ipte_unlock(struct kvm *kvm); int ipte_lock_held(struct kvm *kvm); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); -/* MVPG PEI indication bits */ -#define PEI_DAT_PROT 2 -#define PEI_NOT_PTE 4 +union mvpg_pei { + unsigned long val; + struct { + unsigned long addr : 61; + unsigned long not_pte : 1; + unsigned long dat_prot: 1; + unsigned long real : 1; + }; +}; -int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, - unsigned long saddr, unsigned long *datptr); +int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, + union mvpg_pei *datptr, bool wr); #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c deleted file mode 100644 index 56ef153eb8fe..000000000000 --- a/arch/s390/kvm/gmap-vsie.c +++ /dev/null @@ -1,141 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Guest memory management for KVM/s390 nested VMs. - * - * Copyright IBM Corp. 2008, 2020, 2024 - * - * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> - * Martin Schwidefsky <schwidefsky@de.ibm.com> - * David Hildenbrand <david@redhat.com> - * Janosch Frank <frankja@linux.vnet.ibm.com> - */ - -#include <linux/compiler.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/pgtable.h> -#include <linux/pagemap.h> -#include <linux/mman.h> - -#include <asm/lowcore.h> -#include <asm/gmap.h> -#include <asm/uv.h> - -#include "kvm-s390.h" - -/** - * gmap_find_shadow - find a specific asce in the list of shadow tables - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * Returns the pointer to a gmap if a shadow table with the given asce is - * already available, ERR_PTR(-EAGAIN) if another one is just being created, - * otherwise NULL - * - * Context: Called with parent->shadow_lock held - */ -static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level) -{ - struct gmap *sg; - - lockdep_assert_held(&parent->shadow_lock); - list_for_each_entry(sg, &parent->children, list) { - if (!gmap_shadow_valid(sg, asce, edat_level)) - continue; - if (!sg->initialized) - return ERR_PTR(-EAGAIN); - refcount_inc(&sg->ref_count); - return sg; - } - return NULL; -} - -/** - * gmap_shadow - create/find a shadow guest address space - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * The pages of the top level page table referred by the asce parameter - * will be set to read-only and marked in the PGSTEs of the kvm process. - * The shadow table will be removed automatically on any change to the - * PTE mapping for the source table. - * - * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, - * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the - * parent gmap table could not be protected. - */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level) -{ - struct gmap *sg, *new; - unsigned long limit; - int rc; - - if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) || - KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private)) - return ERR_PTR(-EFAULT); - spin_lock(&parent->shadow_lock); - sg = gmap_find_shadow(parent, asce, edat_level); - spin_unlock(&parent->shadow_lock); - if (sg) - return sg; - /* Create a new shadow gmap */ - limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); - if (asce & _ASCE_REAL_SPACE) - limit = -1UL; - new = gmap_alloc(limit); - if (!new) - return ERR_PTR(-ENOMEM); - new->mm = parent->mm; - new->parent = gmap_get(parent); - new->private = parent->private; - new->orig_asce = asce; - new->edat_level = edat_level; - new->initialized = false; - spin_lock(&parent->shadow_lock); - /* Recheck if another CPU created the same shadow */ - sg = gmap_find_shadow(parent, asce, edat_level); - if (sg) { - spin_unlock(&parent->shadow_lock); - gmap_free(new); - return sg; - } - if (asce & _ASCE_REAL_SPACE) { - /* only allow one real-space gmap shadow */ - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce & _ASCE_REAL_SPACE) { - spin_lock(&sg->guest_table_lock); - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - break; - } - } - } - refcount_set(&new->ref_count, 2); - list_add(&new->list, &parent->children); - if (asce & _ASCE_REAL_SPACE) { - /* nothing to protect, return right away */ - new->initialized = true; - spin_unlock(&parent->shadow_lock); - return new; - } - spin_unlock(&parent->shadow_lock); - /* protect after insertion, so it will get properly invalidated */ - mmap_read_lock(parent->mm); - rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1), - PROT_READ, GMAP_NOTIFY_SHADOW); - mmap_read_unlock(parent->mm); - spin_lock(&parent->shadow_lock); - new->initialized = true; - if (rc) { - list_del(&new->list); - gmap_free(new); - new = ERR_PTR(rc); - } - spin_unlock(&parent->shadow_lock); - return new; -} diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c new file mode 100644 index 000000000000..26cd2b208b6f --- /dev/null +++ b/arch/s390/kvm/gmap.c @@ -0,0 +1,1244 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Guest memory management for KVM/s390 + * + * Copyright IBM Corp. 2008, 2020, 2024 + * + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> + * Martin Schwidefsky <schwidefsky@de.ibm.com> + * David Hildenbrand <david@redhat.com> + * Janosch Frank <frankja@linux.ibm.com> + */ + +#include <linux/compiler.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/pgtable.h> +#include <linux/pagemap.h> +#include <asm/lowcore.h> +#include <asm/uv.h> +#include <asm/gmap_helpers.h> + +#include "dat.h" +#include "gmap.h" +#include "kvm-s390.h" +#include "faultin.h" + +static inline bool kvm_s390_is_in_sie(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.sie_block->prog0c & PROG_IN_SIE; +} + +static int gmap_limit_to_type(gfn_t limit) +{ + if (!limit) + return TABLE_TYPE_REGION1; + if (limit <= _REGION3_SIZE >> PAGE_SHIFT) + return TABLE_TYPE_SEGMENT; + if (limit <= _REGION2_SIZE >> PAGE_SHIFT) + return TABLE_TYPE_REGION3; + if (limit <= _REGION1_SIZE >> PAGE_SHIFT) + return TABLE_TYPE_REGION2; + return TABLE_TYPE_REGION1; +} + +/** + * gmap_new() - Allocate and initialize a guest address space. + * @kvm: The kvm owning the guest. + * @limit: Maximum address of the gmap address space. + * + * Return: A guest address space structure. + */ +struct gmap *gmap_new(struct kvm *kvm, gfn_t limit) +{ + struct crst_table *table; + struct gmap *gmap; + int type; + + type = gmap_limit_to_type(limit); + + gmap = kzalloc(sizeof(*gmap), GFP_KERNEL_ACCOUNT); + if (!gmap) + return NULL; + INIT_LIST_HEAD(&gmap->children); + INIT_LIST_HEAD(&gmap->list); + INIT_LIST_HEAD(&gmap->scb_users); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_KVM_S390_MMU_CACHE); + spin_lock_init(&gmap->children_lock); + spin_lock_init(&gmap->host_to_rmap_lock); + refcount_set(&gmap->refcount, 1); + + table = dat_alloc_crst_sleepable(_CRSTE_EMPTY(type).val); + if (!table) { + kfree(gmap); + return NULL; + } + + gmap->asce.val = __pa(table); + gmap->asce.dt = type; + gmap->asce.tl = _ASCE_TABLE_LENGTH; + gmap->asce.x = 1; + gmap->asce.p = 1; + gmap->asce.s = 1; + gmap->kvm = kvm; + set_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags); + + return gmap; +} + +static void gmap_add_child(struct gmap *parent, struct gmap *child) +{ + KVM_BUG_ON(is_ucontrol(parent) && parent->parent, parent->kvm); + KVM_BUG_ON(is_ucontrol(parent) && !owns_page_tables(parent), parent->kvm); + KVM_BUG_ON(!refcount_read(&child->refcount), parent->kvm); + lockdep_assert_held(&parent->children_lock); + + child->parent = parent; + + if (is_ucontrol(parent)) + set_bit(GMAP_FLAG_IS_UCONTROL, &child->flags); + else + clear_bit(GMAP_FLAG_IS_UCONTROL, &child->flags); + + if (test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &parent->flags)) + set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags); + else + clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &child->flags); + + if (kvm_is_ucontrol(parent->kvm)) + clear_bit(GMAP_FLAG_OWNS_PAGETABLES, &child->flags); + list_add(&child->list, &parent->children); +} + +struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit) +{ + struct gmap *res; + + lockdep_assert_not_held(&parent->children_lock); + res = gmap_new(parent->kvm, limit); + if (res) { + scoped_guard(spinlock, &parent->children_lock) + gmap_add_child(parent, res); + } + return res; +} + +int gmap_set_limit(struct gmap *gmap, gfn_t limit) +{ + struct kvm_s390_mmu_cache *mc; + int rc, type; + + type = gmap_limit_to_type(limit); + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + do { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + scoped_guard(write_lock, &gmap->kvm->mmu_lock) + rc = dat_set_asce_limit(mc, &gmap->asce, type); + } while (rc == -ENOMEM); + + kvm_s390_free_mmu_cache(mc); + return 0; +} + +static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) +{ + struct vsie_rmap *rmap, *rnext, *head; + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void __rcu **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + head = radix_tree_delete(root, index); + gmap_for_each_rmap_safe(rmap, rnext, head) + kfree(rmap); + } + } while (nr > 0); +} + +void gmap_remove_child(struct gmap *child) +{ + if (KVM_BUG_ON(!child->parent, child->kvm)) + return; + lockdep_assert_held(&child->parent->children_lock); + + list_del(&child->list); + child->parent = NULL; +} + +/** + * gmap_dispose() - Remove and free a guest address space and its children. + * @gmap: Pointer to the guest address space structure. + */ +void gmap_dispose(struct gmap *gmap) +{ + /* The gmap must have been removed from the parent beforehands */ + KVM_BUG_ON(gmap->parent, gmap->kvm); + /* All children of this gmap must have been removed beforehands */ + KVM_BUG_ON(!list_empty(&gmap->children), gmap->kvm); + /* No VSIE shadow block is allowed to use this gmap */ + KVM_BUG_ON(!list_empty(&gmap->scb_users), gmap->kvm); + /* The ASCE must be valid */ + KVM_BUG_ON(!gmap->asce.val, gmap->kvm); + /* The refcount must be 0 */ + KVM_BUG_ON(refcount_read(&gmap->refcount), gmap->kvm); + + /* Flush tlb of all gmaps */ + asce_flush_tlb(gmap->asce); + + /* Free all DAT tables. */ + dat_free_level(dereference_asce(gmap->asce), owns_page_tables(gmap)); + + /* Free additional data for a shadow gmap */ + if (is_shadow(gmap)) + gmap_rmap_radix_tree_free(&gmap->host_to_rmap); + + kfree(gmap); +} + +/** + * s390_replace_asce() - Try to replace the current ASCE of a gmap with a copy. + * @gmap: The gmap whose ASCE needs to be replaced. + * + * If the ASCE is a SEGMENT type then this function will return -EINVAL, + * otherwise the pointers in the host_to_guest radix tree will keep pointing + * to the wrong pages, causing use-after-free and memory corruption. + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. + * + * Return: 0 in case of success, -EINVAL if the ASCE is segment type ASCE, + * -ENOMEM if runinng out of memory. + */ +int s390_replace_asce(struct gmap *gmap) +{ + struct crst_table *table; + union asce asce; + + /* Replacing segment type ASCEs would cause serious issues */ + if (gmap->asce.dt == ASCE_TYPE_SEGMENT) + return -EINVAL; + + table = dat_alloc_crst_sleepable(0); + if (!table) + return -ENOMEM; + memcpy(table, dereference_asce(gmap->asce), sizeof(*table)); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = gmap->asce; + asce.rsto = virt_to_pfn(table); + WRITE_ONCE(gmap->asce, asce); + + return 0; +} + +bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint) +{ + struct kvm *kvm = gmap->kvm; + struct kvm_vcpu *vcpu; + gfn_t prefix_gfn; + unsigned long i; + + if (is_shadow(gmap)) + return false; + kvm_for_each_vcpu(i, vcpu, kvm) { + /* Match against both prefix pages */ + prefix_gfn = gpa_to_gfn(kvm_s390_get_prefix(vcpu)); + if (prefix_gfn < end && gfn <= prefix_gfn + 1) { + if (hint && kvm_s390_is_in_sie(vcpu)) + return false; + VCPU_EVENT(vcpu, 2, "gmap notifier for %llx-%llx", + gfn_to_gpa(gfn), gfn_to_gpa(end)); + kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); + } + } + return true; +} + +struct clear_young_pte_priv { + struct gmap *gmap; + bool young; +}; + +static long gmap_clear_young_pte(union pte *ptep, gfn_t gfn, gfn_t end, struct dat_walk *walk) +{ + struct clear_young_pte_priv *p = walk->priv; + union pgste pgste; + union pte pte, new; + + pte = READ_ONCE(*ptep); + + if (!pte.s.pr || (!pte.s.y && pte.h.i)) + return 0; + + pgste = pgste_get_lock(ptep); + if (!pgste.prefix_notif || gmap_mkold_prefix(p->gmap, gfn, end)) { + new = pte; + new.h.i = 1; + new.s.y = 0; + if ((new.s.d || !new.h.p) && !new.s.s) + folio_set_dirty(pfn_folio(pte.h.pfra)); + new.s.d = 0; + new.h.p = 1; + + pgste.prefix_notif = 0; + pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, walk->asce, uses_skeys(p->gmap)); + } + p->young = 1; + pgste_set_unlock(ptep, pgste); + return 0; +} + +static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, struct dat_walk *walk) +{ + struct clear_young_pte_priv *priv = walk->priv; + union crste crste, new; + + crste = READ_ONCE(*crstep); + + if (!crste.h.fc) + return 0; + if (!crste.s.fc1.y && crste.h.i) + return 0; + if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) { + new = crste; + new.h.i = 1; + new.s.fc1.y = 0; + new.s.fc1.prefix_notif = 0; + if (new.s.fc1.d || !new.h.p) + folio_set_dirty(phys_to_folio(crste_origin_large(crste))); + new.s.fc1.d = 0; + new.h.p = 1; + dat_crstep_xchg(crstep, new, gfn, walk->asce); + } + priv->young = 1; + return 0; +} + +/** + * gmap_age_gfn() - Clear young. + * @gmap: The guest gmap. + * @start: The first gfn to test. + * @end: The gfn after the last one to test. + * + * Context: Called with the kvm mmu write lock held. + * Return: 1 if any page in the given range was young, otherwise 0. + */ +bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end) +{ + const struct dat_walk_ops ops = { + .pte_entry = gmap_clear_young_pte, + .pmd_entry = gmap_clear_young_crste, + .pud_entry = gmap_clear_young_crste, + }; + struct clear_young_pte_priv priv = { + .gmap = gmap, + .young = false, + }; + + _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); + + return priv.young; +} + +struct gmap_unmap_priv { + struct gmap *gmap; + struct kvm_memory_slot *slot; +}; + +static long _gmap_unmap_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *w) +{ + struct gmap_unmap_priv *priv = w->priv; + struct folio *folio = NULL; + unsigned long vmaddr; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + if (ptep->s.pr && pgste.usage == PGSTE_GPS_USAGE_UNUSED) { + vmaddr = __gfn_to_hva_memslot(priv->slot, gfn); + gmap_helper_try_set_pte_unused(priv->gmap->kvm->mm, vmaddr); + } + if (ptep->s.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) + folio = pfn_folio(ptep->h.pfra); + pgste = gmap_ptep_xchg(priv->gmap, ptep, _PTE_EMPTY, pgste, gfn); + pgste_set_unlock(ptep, pgste); + if (folio) + uv_convert_from_secure_folio(folio); + + return 0; +} + +static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct gmap_unmap_priv *priv = walk->priv; + struct folio *folio = NULL; + + if (crstep->h.fc) { + if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags)) + folio = phys_to_folio(crste_origin_large(*crstep)); + gmap_crstep_xchg(priv->gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn); + if (folio) + uv_convert_from_secure_folio(folio); + } + + return 0; +} + +/** + * gmap_unmap_gfn_range() - Unmap a range of guest addresses. + * @gmap: The gmap to act on. + * @slot: The memslot in which the range is located. + * @start: The first gfn to unmap. + * @end: The gfn after the last one to unmap. + * + * Context: Called with the kvm mmu write lock held. + * Return: false + */ +bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end) +{ + const struct dat_walk_ops ops = { + .pte_entry = _gmap_unmap_pte, + .pmd_entry = _gmap_unmap_crste, + .pud_entry = _gmap_unmap_crste, + }; + struct gmap_unmap_priv priv = { + .gmap = gmap, + .slot = slot, + }; + + lockdep_assert_held_write(&gmap->kvm->mmu_lock); + + _dat_walk_gfn_range(start, end, gmap->asce, &ops, 0, &priv); + return false; +} + +static union pgste __pte_test_and_clear_softdirty(union pte *ptep, union pgste pgste, gfn_t gfn, + struct gmap *gmap) +{ + union pte pte = READ_ONCE(*ptep); + + if (!pte.s.pr || (pte.h.p && !pte.s.sd)) + return pgste; + + /* + * If this page contains one or more prefixes of vCPUS that are currently + * running, do not reset the protection, leave it marked as dirty. + */ + if (!pgste.prefix_notif || gmap_mkold_prefix(gmap, gfn, gfn + 1)) { + pte.h.p = 1; + pte.s.sd = 0; + pgste = gmap_ptep_xchg(gmap, ptep, pte, pgste, gfn); + } + + mark_page_dirty(gmap->kvm, gfn); + + return pgste; +} + +static long _pte_test_and_clear_softdirty(union pte *ptep, gfn_t gfn, gfn_t end, + struct dat_walk *walk) +{ + struct gmap *gmap = walk->priv; + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste = __pte_test_and_clear_softdirty(ptep, pgste, gfn, gmap); + pgste_set_unlock(ptep, pgste); + return 0; +} + +static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t end, + struct dat_walk *walk) +{ + struct gmap *gmap = walk->priv; + union crste crste, new; + + if (fatal_signal_pending(current)) + return 1; + crste = READ_ONCE(*table); + if (!crste.h.fc) + return 0; + if (crste.h.p && !crste.s.fc1.sd) + return 0; + + /* + * If this large page contains one or more prefixes of vCPUs that are + * currently running, do not reset the protection, leave it marked as + * dirty. + */ + if (!crste.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) { + new = crste; + new.h.p = 1; + new.s.fc1.sd = 0; + gmap_crstep_xchg(gmap, table, new, gfn); + } + + for ( ; gfn < end; gfn++) + mark_page_dirty(gmap->kvm, gfn); + + return 0; +} + +void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end) +{ + const struct dat_walk_ops walk_ops = { + .pte_entry = _pte_test_and_clear_softdirty, + .pmd_entry = _crste_test_and_clear_softdirty, + .pud_entry = _crste_test_and_clear_softdirty, + }; + + lockdep_assert_held(&gmap->kvm->mmu_lock); + + _dat_walk_gfn_range(start, end, gmap->asce, &walk_ops, 0, gmap); +} + +static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f) +{ + union crste newcrste, oldcrste = READ_ONCE(*f->crstep); + + /* Somehow the crste is not large anymore, let the slow path deal with it. */ + if (!oldcrste.h.fc) + return 1; + + f->pfn = PHYS_PFN(large_crste_to_phys(oldcrste, f->gfn)); + f->writable = oldcrste.s.fc1.w; + + /* Appropriate permissions already (race with another handler), nothing to do. */ + if (!oldcrste.h.i && !(f->write_attempt && oldcrste.h.p)) + return 0; + + if (!f->write_attempt || oldcrste.s.fc1.w) { + f->write_attempt |= oldcrste.s.fc1.w && oldcrste.s.fc1.d; + newcrste = oldcrste; + newcrste.h.i = 0; + newcrste.s.fc1.y = 1; + if (f->write_attempt) { + newcrste.h.p = 0; + newcrste.s.fc1.d = 1; + newcrste.s.fc1.sd = 1; + } + if (!oldcrste.s.fc1.d && newcrste.s.fc1.d) + SetPageDirty(phys_to_page(crste_origin_large(newcrste))); + /* In case of races, let the slow path deal with it. */ + return !dat_crstep_xchg_atomic(f->crstep, oldcrste, newcrste, f->gfn, asce); + } + /* Trying to write on a read-only page, let the slow path deal with it. */ + return 1; +} + +static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste, + struct guest_fault *f) +{ + union pte newpte, oldpte = READ_ONCE(*f->ptep); + + f->pfn = oldpte.h.pfra; + f->writable = oldpte.s.w; + + /* Appropriate permissions already (race with another handler), nothing to do. */ + if (!oldpte.h.i && !(f->write_attempt && oldpte.h.p)) + return 0; + /* Trying to write on a read-only page, let the slow path deal with it. */ + if (!oldpte.s.pr || (f->write_attempt && !oldpte.s.w)) + return 1; + + newpte = oldpte; + newpte.h.i = 0; + newpte.s.y = 1; + if (f->write_attempt) { + newpte.h.p = 0; + newpte.s.d = 1; + newpte.s.sd = 1; + } + if (!oldpte.s.d && newpte.s.d) + SetPageDirty(pfn_to_page(newpte.h.pfra)); + *pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn); + + return 0; +} + +/** + * gmap_try_fixup_minor() -- Try to fixup a minor gmap fault. + * @gmap: The gmap whose fault needs to be resolved. + * @fault: Describes the fault that is being resolved. + * + * A minor fault is a fault that can be resolved quickly within gmap. + * The page is already mapped, the fault is only due to dirty/young tracking. + * + * Return: 0 in case of success, < 0 in case of error, > 0 if the fault could + * not be resolved and needs to go through the slow path. + */ +int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault) +{ + union pgste pgste; + int rc; + + lockdep_assert_held(&gmap->kvm->mmu_lock); + + rc = dat_entry_walk(NULL, fault->gfn, gmap->asce, DAT_WALK_LEAF, TABLE_TYPE_PAGE_TABLE, + &fault->crstep, &fault->ptep); + /* If a PTE or a leaf CRSTE could not be reached, slow path. */ + if (rc) + return 1; + + if (fault->ptep) { + pgste = pgste_get_lock(fault->ptep); + rc = _gmap_handle_minor_pte_fault(gmap, &pgste, fault); + if (!rc && fault->callback) + fault->callback(fault); + pgste_set_unlock(fault->ptep, pgste); + } else { + rc = gmap_handle_minor_crste_fault(gmap->asce, fault); + if (!rc && fault->callback) + fault->callback(fault); + } + return rc; +} + +static inline bool gmap_2g_allowed(struct gmap *gmap, gfn_t gfn) +{ + return false; +} + +static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn) +{ + return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags); +} + +int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f) +{ + unsigned int order; + int rc, level; + + lockdep_assert_held(&gmap->kvm->mmu_lock); + + level = TABLE_TYPE_PAGE_TABLE; + if (f->page) { + order = folio_order(page_folio(f->page)); + if (order >= get_order(_REGION3_SIZE) && gmap_2g_allowed(gmap, f->gfn)) + level = TABLE_TYPE_REGION3; + else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn)) + level = TABLE_TYPE_SEGMENT; + } + rc = dat_link(mc, gmap->asce, level, uses_skeys(gmap), f); + KVM_BUG_ON(rc == -EINVAL, gmap->kvm); + return rc; +} + +static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, + gfn_t p_gfn, gfn_t c_gfn, bool force_alloc) +{ + struct page_table *pt; + union crste newcrste; + union crste *crstep; + union pte *ptep; + int rc; + + if (force_alloc) + rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC, + TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + else + rc = dat_entry_walk(mc, p_gfn, gmap->parent->asce, DAT_WALK_ALLOC_CONTINUE, + TABLE_TYPE_SEGMENT, &crstep, &ptep); + if (rc) + return rc; + if (!ptep) { + newcrste = _crste_fc0(p_gfn, TABLE_TYPE_SEGMENT); + newcrste.h.i = 1; + newcrste.h.fc0.tl = 1; + } else { + pt = pte_table_start(ptep); + dat_set_ptval(pt, PTVAL_VMADDR, p_gfn >> (_SEGMENT_SHIFT - PAGE_SHIFT)); + newcrste = _crste_fc0(virt_to_pfn(pt), TABLE_TYPE_SEGMENT); + } + rc = dat_entry_walk(mc, c_gfn, gmap->asce, DAT_WALK_ALLOC, TABLE_TYPE_SEGMENT, + &crstep, &ptep); + if (rc) + return rc; + dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce); + return 0; +} + +static int gmap_ucas_translate_simple(struct gmap *gmap, gpa_t *gaddr, union crste **crstepp) +{ + union pte *ptep; + int rc; + + rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), gmap->asce, DAT_WALK_CONTINUE, + TABLE_TYPE_SEGMENT, crstepp, &ptep); + if (rc || (!ptep && !crste_is_ucas(**crstepp))) + return -EREMOTE; + if (!ptep) + return 1; + *gaddr &= ~_SEGMENT_MASK; + *gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT; + return 0; +} + +/** + * gmap_ucas_translate() - Translate a vcpu address into a host gmap address + * @mc: The memory cache to be used for allocations. + * @gmap: The per-cpu gmap. + * @gaddr: Pointer to the address to be translated, will get overwritten with + * the translated address in case of success. + * Translates the per-vCPU guest address into a fake guest address, which can + * then be used with the fake memslots that are identity mapping userspace. + * This allows ucontrol VMs to use the normal fault resolution path, like + * normal VMs. + * + * Return: %0 in case of success, otherwise %-EREMOTE. + */ +int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr) +{ + gpa_t translated_address; + union crste *crstep; + gfn_t gfn; + int rc; + + gfn = gpa_to_gfn(*gaddr); + + scoped_guard(read_lock, &gmap->kvm->mmu_lock) { + rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); + if (rc <= 0) + return rc; + } + do { + scoped_guard(write_lock, &gmap->kvm->mmu_lock) { + rc = gmap_ucas_translate_simple(gmap, gaddr, &crstep); + if (rc <= 0) + return rc; + translated_address = (*gaddr & ~_SEGMENT_MASK) | + (crstep->val & _SEGMENT_MASK); + rc = gmap_ucas_map_one(mc, gmap, gpa_to_gfn(translated_address), gfn, true); + } + if (!rc) { + *gaddr = translated_address; + return 0; + } + if (rc != -ENOMEM) + return -EREMOTE; + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + } while (1); + return 0; +} + +int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count) +{ + struct kvm_s390_mmu_cache *mc; + int rc; + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + while (count) { + scoped_guard(write_lock, &gmap->kvm->mmu_lock) + rc = gmap_ucas_map_one(mc, gmap, p_gfn, c_gfn, false); + if (rc == -ENOMEM) { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + continue; + } + if (rc) + return rc; + + count--; + c_gfn += _PAGE_ENTRIES; + p_gfn += _PAGE_ENTRIES; + } + return rc; +} + +static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn) +{ + union crste *crstep; + union pte *ptep; + int rc; + + rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep); + if (!rc) + dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce); +} + +void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count) +{ + guard(read_lock)(&gmap->kvm->mmu_lock); + + for ( ; count; count--, c_gfn += _PAGE_ENTRIES) + gmap_ucas_unmap_one(gmap, c_gfn); +} + +static long _gmap_split_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + struct gmap *gmap = walk->priv; + union crste crste, newcrste; + + crste = READ_ONCE(*crstep); + newcrste = _CRSTE_EMPTY(crste.h.tt); + + while (crste_leaf(crste)) { + if (crste_prefix(crste)) + gmap_unmap_prefix(gmap, gfn, next); + if (crste.s.fc1.vsie_notif) + gmap_handle_vsie_unshadow_event(gmap, gfn); + if (dat_crstep_xchg_atomic(crstep, crste, newcrste, gfn, walk->asce)) + break; + crste = READ_ONCE(*crstep); + } + + if (need_resched()) + return next; + + return 0; +} + +void gmap_split_huge_pages(struct gmap *gmap) +{ + const struct dat_walk_ops ops = { + .pmd_entry = _gmap_split_crste, + .pud_entry = _gmap_split_crste, + }; + gfn_t start = 0; + + do { + scoped_guard(read_lock, &gmap->kvm->mmu_lock) + start = _dat_walk_gfn_range(start, asce_end(gmap->asce), gmap->asce, + &ops, DAT_WALK_IGN_HOLES, gmap); + cond_resched(); + } while (start); +} + +static int _gmap_enable_skeys(struct gmap *gmap) +{ + gfn_t start = 0; + int rc; + + if (uses_skeys(gmap)) + return 0; + + set_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); + rc = gmap_helper_disable_cow_sharing(); + if (rc) { + clear_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); + return rc; + } + + do { + scoped_guard(write_lock, &gmap->kvm->mmu_lock) + start = dat_reset_skeys(gmap->asce, start); + cond_resched(); + } while (start); + return 0; +} + +int gmap_enable_skeys(struct gmap *gmap) +{ + int rc; + + mmap_write_lock(gmap->kvm->mm); + rc = _gmap_enable_skeys(gmap); + mmap_write_unlock(gmap->kvm->mm); + return rc; +} + +static long _destroy_pages_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + if (!ptep->s.pr) + return 0; + __kvm_s390_pv_destroy_page(phys_to_page(pte_origin(*ptep))); + if (need_resched()) + return next; + return 0; +} + +static long _destroy_pages_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + phys_addr_t origin, cur, end; + + if (!crstep->h.fc || !crstep->s.fc1.pr) + return 0; + + origin = crste_origin_large(*crstep); + cur = ((max(gfn, walk->start) - gfn) << PAGE_SHIFT) + origin; + end = ((min(next, walk->end) - gfn) << PAGE_SHIFT) + origin; + for ( ; cur < end; cur += PAGE_SIZE) + __kvm_s390_pv_destroy_page(phys_to_page(cur)); + if (need_resched()) + return next; + return 0; +} + +int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible) +{ + const struct dat_walk_ops ops = { + .pte_entry = _destroy_pages_pte, + .pmd_entry = _destroy_pages_crste, + .pud_entry = _destroy_pages_crste, + }; + + do { + scoped_guard(read_lock, &gmap->kvm->mmu_lock) + start = _dat_walk_gfn_range(start, end, gmap->asce, &ops, + DAT_WALK_IGN_HOLES, NULL); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + cond_resched(); + } while (start && start < end); + return 0; +} + +int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level) +{ + struct vsie_rmap *rmap __free(kvfree) = NULL; + struct vsie_rmap *temp; + void __rcu **slot; + int rc = 0; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + lockdep_assert_held(&sg->host_to_rmap_lock); + + rmap = kzalloc(sizeof(*rmap), GFP_ATOMIC); + if (!rmap) + return -ENOMEM; + + rmap->r_gfn = r_gfn; + rmap->level = level; + slot = radix_tree_lookup_slot(&sg->host_to_rmap, p_gfn); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, &sg->host_to_rmap_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->val == rmap->val) + return 0; + } + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); + } else { + rmap->next = NULL; + rc = radix_tree_insert(&sg->host_to_rmap, p_gfn, rmap); + if (rc) + return rc; + } + rmap = NULL; + + return 0; +} + +int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, + kvm_pfn_t pfn, int level, bool wr) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + union pte pte; + int flags, rc; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + lockdep_assert_held(&sg->parent->children_lock); + + flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0); + rc = dat_entry_walk(mc, p_gfn, sg->parent->asce, flags, + TABLE_TYPE_PAGE_TABLE, &crstep, &ptep); + if (rc) + return rc; + if (level <= TABLE_TYPE_REGION1) { + scoped_guard(spinlock, &sg->host_to_rmap_lock) + rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level); + } + if (rc) + return rc; + + if (!pgste_get_trylock(ptep, &pgste)) + return -EAGAIN; + pte = ptep->s.pr ? *ptep : _pte(pfn, wr, false, false); + pte.h.p = 1; + pgste = _gmap_ptep_xchg(sg->parent, ptep, pte, pgste, p_gfn, false); + pgste.vsie_notif = 1; + pgste_set_unlock(ptep, pgste); + + return 0; +} + +static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + __atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val); + if (need_resched()) + return next; + return 0; +} + +void gmap_set_cmma_all_dirty(struct gmap *gmap) +{ + const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, }; + gfn_t gfn = 0; + + do { + scoped_guard(read_lock, &gmap->kvm->mmu_lock) + gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops, + DAT_WALK_IGN_HOLES, NULL); + cond_resched(); + } while (gfn); +} + +static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level) +{ + unsigned long align = PAGE_SIZE; + gpa_t gaddr = gfn_to_gpa(r_gfn); + union crste *crstep; + union crste crste; + union pte *ptep; + + if (level > TABLE_TYPE_PAGE_TABLE) + align = 1UL << (11 * level + _SEGMENT_SHIFT); + kvm_s390_vsie_gmap_notifier(sg, ALIGN_DOWN(gaddr, align), ALIGN(gaddr + 1, align)); + if (dat_entry_walk(NULL, r_gfn, sg->asce, 0, level, &crstep, &ptep)) + return; + if (ptep) { + if (READ_ONCE(*ptep).val != _PTE_EMPTY.val) + dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg)); + return; + } + crste = READ_ONCE(*crstep); + dat_crstep_clear(crstep, r_gfn, sg->asce); + if (crste_leaf(crste) || crste.h.i) + return; + if (is_pmd(crste)) + dat_free_pt(dereference_pmd(crste.pmd)); + else + dat_free_level(dereference_crste(crste), true); +} + +static void gmap_unshadow(struct gmap *sg) +{ + struct gmap_cache *gmap_cache, *next; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + KVM_BUG_ON(!sg->parent, sg->kvm); + + lockdep_assert_held(&sg->parent->children_lock); + + gmap_remove_child(sg); + kvm_s390_vsie_gmap_notifier(sg, 0, -1UL); + + list_for_each_entry_safe(gmap_cache, next, &sg->scb_users, list) { + gmap_cache->gmap = NULL; + list_del(&gmap_cache->list); + } + + gmap_put(sg); +} + +void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn) +{ + struct vsie_rmap *rmap, *rnext, *head; + struct gmap *sg, *next; + gfn_t start, end; + + list_for_each_entry_safe(sg, next, &parent->children, list) { + start = sg->guest_asce.rsto; + end = start + sg->guest_asce.tl + 1; + if (!sg->guest_asce.r && gfn >= start && gfn < end) { + gmap_unshadow(sg); + continue; + } + scoped_guard(spinlock, &sg->host_to_rmap_lock) + head = radix_tree_delete(&sg->host_to_rmap, gfn); + gmap_for_each_rmap_safe(rmap, rnext, head) + gmap_unshadow_level(sg, rmap->r_gfn, rmap->level); + } +} + +/** + * gmap_find_shadow() - Find a specific ASCE in the list of shadow tables. + * @parent: Pointer to the parent gmap. + * @asce: ASCE for which the shadow table is created. + * @edat_level: Edat level to be used for the shadow translation. + * + * Context: Called with parent->children_lock held. + * + * Return: The pointer to a gmap if a shadow table with the given asce is + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL. + */ +static struct gmap *gmap_find_shadow(struct gmap *parent, union asce asce, int edat_level) +{ + struct gmap *sg; + + lockdep_assert_held(&parent->children_lock); + list_for_each_entry(sg, &parent->children, list) { + if (!gmap_is_shadow_valid(sg, asce, edat_level)) + continue; + return sg; + } + return NULL; +} + +#define CRST_TABLE_PAGES (_CRST_TABLE_SIZE / PAGE_SIZE) +struct gmap_protect_asce_top_level { + unsigned long seq; + struct guest_fault f[CRST_TABLE_PAGES]; +}; + +static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, + struct gmap_protect_asce_top_level *context) +{ + int rc, i; + + guard(write_lock)(&sg->kvm->mmu_lock); + + if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f)) + return -EAGAIN; + + scoped_guard(spinlock, &sg->parent->children_lock) { + for (i = 0; i < CRST_TABLE_PAGES; i++) { + if (!context->f[i].valid) + continue; + rc = gmap_protect_rmap(mc, sg, context->f[i].gfn, 0, context->f[i].pfn, + TABLE_TYPE_REGION1 + 1, context->f[i].writable); + if (rc) + return rc; + } + gmap_add_child(sg->parent, sg); + } + + kvm_s390_release_faultin_array(sg->kvm, context->f, false); + return 0; +} + +static inline int _gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg, + struct gmap_protect_asce_top_level *context) +{ + int rc; + + if (kvm_s390_array_needs_retry_unsafe(sg->kvm, context->seq, context->f)) + return -EAGAIN; + do { + rc = kvm_s390_mmu_cache_topup(mc); + if (rc) + return rc; + rc = radix_tree_preload(GFP_KERNEL); + if (rc) + return rc; + rc = __gmap_protect_asce_top_level(mc, sg, context); + radix_tree_preload_end(); + } while (rc == -ENOMEM); + + return rc; +} + +static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg) +{ + struct gmap_protect_asce_top_level context = {}; + union asce asce = sg->guest_asce; + int rc; + + KVM_BUG_ON(!is_shadow(sg), sg->kvm); + + context.seq = sg->kvm->mmu_invalidate_seq; + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ + smp_rmb(); + + rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false); + if (rc > 0) + rc = -EFAULT; + if (!rc) + rc = _gmap_protect_asce_top_level(mc, sg, &context); + if (rc) + kvm_s390_release_faultin_array(sg->kvm, context.f, true); + return rc; +} + +/** + * gmap_create_shadow() - Create/find a shadow guest address space. + * @mc: The cache to use to allocate dat tables. + * @parent: Pointer to the parent gmap. + * @asce: ASCE for which the shadow table is created. + * @edat_level: Edat level to be used for the shadow translation. + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * The returned shadow gmap will be returned with one extra reference. + * + * Return: A guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. + */ +struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *parent, + union asce asce, int edat_level) +{ + struct gmap *sg, *new; + int rc; + + scoped_guard(spinlock, &parent->children_lock) { + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + gmap_get(sg); + return sg; + } + } + /* Create a new shadow gmap. */ + new = gmap_new(parent->kvm, asce.r ? 1UL << (64 - PAGE_SHIFT) : asce_end(asce)); + if (!new) + return ERR_PTR(-ENOMEM); + new->guest_asce = asce; + new->edat_level = edat_level; + set_bit(GMAP_FLAG_SHADOW, &new->flags); + + scoped_guard(spinlock, &parent->children_lock) { + /* Recheck if another CPU created the same shadow. */ + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + gmap_put(new); + gmap_get(sg); + return sg; + } + if (asce.r) { + /* Only allow one real-space gmap shadow. */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->guest_asce.r) { + scoped_guard(write_lock, &parent->kvm->mmu_lock) + gmap_unshadow(sg); + break; + } + } + gmap_add_child(parent, new); + /* Nothing to protect, return right away. */ + gmap_get(new); + return new; + } + } + + gmap_get(new); + new->parent = parent; + /* Protect while inserting, protects against invalidation races. */ + rc = gmap_protect_asce_top_level(mc, new); + if (rc) { + new->parent = NULL; + gmap_put(new); + gmap_put(new); + return ERR_PTR(rc); + } + return new; +} diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h new file mode 100644 index 000000000000..ccb5cd751e31 --- /dev/null +++ b/arch/s390/kvm/gmap.h @@ -0,0 +1,244 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2016, 2025 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * Claudio Imbrenda <imbrenda@linux.ibm.com> + */ + +#ifndef ARCH_KVM_S390_GMAP_H +#define ARCH_KVM_S390_GMAP_H + +#include "dat.h" + +/** + * enum gmap_flags - Flags of a gmap. + * + * @GMAP_FLAG_SHADOW: The gmap is a vsie shadow gmap. + * @GMAP_FLAG_OWNS_PAGETABLES: The gmap owns all dat levels; normally 1, is 0 + * only for ucontrol per-cpu gmaps, since they + * share the page tables with the main gmap. + * @GMAP_FLAG_IS_UCONTROL: The gmap is ucontrol (main gmap or per-cpu gmap). + * @GMAP_FLAG_ALLOW_HPAGE_1M: 1M hugepages are allowed for this gmap, + * independently of the page size used by userspace. + * @GMAP_FLAG_ALLOW_HPAGE_2G: 2G hugepages are allowed for this gmap, + * independently of the page size used by userspace. + * @GMAP_FLAG_PFAULT_ENABLED: Pfault is enabled for the gmap. + * @GMAP_FLAG_USES_SKEYS: If the guest uses storage keys. + * @GMAP_FLAG_USES_CMM: Whether the guest uses CMMA. + * @GMAP_FLAG_EXPORT_ON_UNMAP: Whether to export guest pages when unmapping. + */ +enum gmap_flags { + GMAP_FLAG_SHADOW = 0, + GMAP_FLAG_OWNS_PAGETABLES, + GMAP_FLAG_IS_UCONTROL, + GMAP_FLAG_ALLOW_HPAGE_1M, + GMAP_FLAG_ALLOW_HPAGE_2G, + GMAP_FLAG_PFAULT_ENABLED, + GMAP_FLAG_USES_SKEYS, + GMAP_FLAG_USES_CMM, + GMAP_FLAG_EXPORT_ON_UNMAP, +}; + +/** + * struct gmap_struct - Guest address space. + * + * @flags: GMAP_FLAG_* flags. + * @edat_level: The edat level of this shadow gmap. + * @kvm: The vm. + * @asce: The ASCE used by this gmap. + * @list: List head used in children gmaps for the children gmap list. + * @children_lock: Protects children and scb_users. + * @children: List of child gmaps of this gmap. + * @scb_users: List of vsie_scb that use this shadow gmap. + * @parent: Parent gmap of a child gmap. + * @guest_asce: Original ASCE of this shadow gmap. + * @host_to_rmap_lock: Protects host_to_rmap. + * @host_to_rmap: Radix tree mapping host addresses to guest addresses. + */ +struct gmap { + unsigned long flags; + unsigned char edat_level; + struct kvm *kvm; + union asce asce; + struct list_head list; + spinlock_t children_lock; /* Protects: children, scb_users */ + struct list_head children; + struct list_head scb_users; + struct gmap *parent; + union asce guest_asce; + spinlock_t host_to_rmap_lock; /* Protects host_to_rmap */ + struct radix_tree_root host_to_rmap; + refcount_t refcount; +}; + +struct gmap_cache { + struct list_head list; + struct gmap *gmap; +}; + +#define gmap_for_each_rmap_safe(pos, n, head) \ + for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n) + +int s390_replace_asce(struct gmap *gmap); +bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint); +bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end); +bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end); +int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault); +struct gmap *gmap_new(struct kvm *kvm, gfn_t limit); +struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit); +void gmap_remove_child(struct gmap *child); +void gmap_dispose(struct gmap *gmap); +int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *fault); +void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end); +int gmap_set_limit(struct gmap *gmap, gfn_t limit); +int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr); +int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count); +void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count); +int gmap_enable_skeys(struct gmap *gmap); +int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible); +int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level); +int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, + kvm_pfn_t pfn, int level, bool wr); +void gmap_set_cmma_all_dirty(struct gmap *gmap); +void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn); +struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, + union asce asce, int edat_level); +void gmap_split_huge_pages(struct gmap *gmap); + +static inline bool uses_skeys(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags); +} + +static inline bool uses_cmm(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_USES_CMM, &gmap->flags); +} + +static inline bool pfault_enabled(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_PFAULT_ENABLED, &gmap->flags); +} + +static inline bool is_ucontrol(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_IS_UCONTROL, &gmap->flags); +} + +static inline bool is_shadow(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_SHADOW, &gmap->flags); +} + +static inline bool owns_page_tables(struct gmap *gmap) +{ + return test_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags); +} + +static inline struct gmap *gmap_put(struct gmap *gmap) +{ + if (refcount_dec_and_test(&gmap->refcount)) + gmap_dispose(gmap); + return NULL; +} + +static inline void gmap_get(struct gmap *gmap) +{ + WARN_ON_ONCE(unlikely(!refcount_inc_not_zero(&gmap->refcount))); +} + +static inline void gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn) +{ + scoped_guard(spinlock, &parent->children_lock) + _gmap_handle_vsie_unshadow_event(parent, gfn); +} + +static inline bool gmap_mkold_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end) +{ + return _gmap_unmap_prefix(gmap, gfn, end, true); +} + +static inline bool gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end) +{ + return _gmap_unmap_prefix(gmap, gfn, end, false); +} + +static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte, + union pgste pgste, gfn_t gfn, bool needs_lock) +{ + lockdep_assert_held(&gmap->kvm->mmu_lock); + if (!needs_lock) + lockdep_assert_held(&gmap->children_lock); + else + lockdep_assert_not_held(&gmap->children_lock); + + if (pgste.prefix_notif && (newpte.h.p || newpte.h.i)) { + pgste.prefix_notif = 0; + gmap_unmap_prefix(gmap, gfn, gfn + 1); + } + if (pgste.vsie_notif && (ptep->h.p != newpte.h.p || newpte.h.i)) { + pgste.vsie_notif = 0; + if (needs_lock) + gmap_handle_vsie_unshadow_event(gmap, gfn); + else + _gmap_handle_vsie_unshadow_event(gmap, gfn); + } + return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap)); +} + +static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte, + union pgste pgste, gfn_t gfn) +{ + return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true); +} + +static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne, + gfn_t gfn, bool needs_lock) +{ + unsigned long align = 8 + (is_pmd(*crstep) ? 0 : 11); + + lockdep_assert_held(&gmap->kvm->mmu_lock); + if (!needs_lock) + lockdep_assert_held(&gmap->children_lock); + + gfn = ALIGN_DOWN(gfn, align); + if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) { + ne.s.fc1.prefix_notif = 0; + gmap_unmap_prefix(gmap, gfn, gfn + align); + } + if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif && + (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) { + ne.s.fc1.vsie_notif = 0; + if (needs_lock) + gmap_handle_vsie_unshadow_event(gmap, gfn); + else + _gmap_handle_vsie_unshadow_event(gmap, gfn); + } + dat_crstep_xchg(crstep, ne, gfn, gmap->asce); +} + +static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne, + gfn_t gfn) +{ + return _gmap_crstep_xchg(gmap, crstep, ne, gfn, true); +} + +/** + * gmap_is_shadow_valid() - check if a shadow guest address space matches the + * given properties and is still valid. + * @sg: Pointer to the shadow guest address space structure. + * @asce: ASCE for which the shadow table is requested. + * @edat_level: Edat level to be used for the shadow translation. + * + * Return: true if the gmap shadow is still valid and matches the given + * properties and the caller can continue using it; false otherwise, the + * caller has to request a new shadow gmap in this case. + */ +static inline bool gmap_is_shadow_valid(struct gmap *sg, union asce asce, int edat_level) +{ + return sg->guest_asce.val == asce.val && sg->edat_level == edat_level; +} + +#endif /* ARCH_KVM_S390_GMAP_H */ diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 420ae62977e2..39aff324203e 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -21,6 +21,7 @@ #include "gaccess.h" #include "trace.h" #include "trace-s390.h" +#include "faultin.h" u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu) { @@ -367,8 +368,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg2, &srcaddr, GACC_FETCH, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0); - if (rc != 0) + + do { + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(srcaddr), false); + } while (rc == -EAGAIN); + if (rc) return rc; /* Ensure that the source is paged-in, no actual access -> no key checking */ @@ -376,8 +380,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg1, &dstaddr, GACC_STORE, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE); - if (rc != 0) + + do { + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(dstaddr), true); + } while (rc == -EAGAIN); + if (rc) return rc; kvm_s390_retry_instr(vcpu); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 249cdc822ec5..1c2bb5cd7e12 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -26,7 +26,6 @@ #include <linux/uaccess.h> #include <asm/sclp.h> #include <asm/isc.h> -#include <asm/gmap.h> #include <asm/nmi.h> #include <asm/airq.h> #include <asm/tpi.h> @@ -34,6 +33,7 @@ #include "gaccess.h" #include "trace-s390.h" #include "pci.h" +#include "gmap.h" #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 @@ -2632,12 +2632,12 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case KVM_DEV_FLIC_APF_ENABLE: if (kvm_is_ucontrol(dev->kvm)) return -EINVAL; - dev->kvm->arch.gmap->pfault_enabled = 1; + set_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags); break; case KVM_DEV_FLIC_APF_DISABLE_WAIT: if (kvm_is_ucontrol(dev->kvm)) return -EINVAL; - dev->kvm->arch.gmap->pfault_enabled = 0; + clear_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags); /* * Make sure no async faults are in transition when * clearing the queues. So we don't need to worry @@ -2768,13 +2768,13 @@ static int adapter_indicators_set(struct kvm *kvm, bit = get_ind_bit(adapter_int->ind_addr, adapter_int->ind_offset, adapter->swap); set_bit(bit, map); - mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); + mark_page_dirty(kvm, adapter_int->ind_gaddr >> PAGE_SHIFT); set_page_dirty_lock(ind_page); map = page_address(summary_page); bit = get_ind_bit(adapter_int->summary_addr, adapter_int->summary_offset, adapter->swap); summary_set = test_and_set_bit(bit, map); - mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); + mark_page_dirty(kvm, adapter_int->summary_gaddr >> PAGE_SHIFT); set_page_dirty_lock(summary_page); srcu_read_unlock(&kvm->srcu, idx); @@ -2870,7 +2870,9 @@ int kvm_set_routing_entry(struct kvm *kvm, if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i)) return -EFAULT; e->adapter.summary_addr = uaddr_s; + e->adapter.summary_gaddr = ue->u.adapter.summary_addr; e->adapter.ind_addr = uaddr_i; + e->adapter.ind_gaddr = ue->u.adapter.ind_addr; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 56a50524b3ee..de645025db0f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -40,7 +40,6 @@ #include <asm/lowcore.h> #include <asm/machine.h> #include <asm/stp.h> -#include <asm/gmap.h> #include <asm/gmap_helpers.h> #include <asm/nmi.h> #include <asm/isc.h> @@ -53,6 +52,8 @@ #include <asm/uv.h> #include "kvm-s390.h" #include "gaccess.h" +#include "gmap.h" +#include "faultin.h" #include "pci.h" #define CREATE_TRACE_POINTS @@ -264,16 +265,11 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS) /* available subfunctions indicated via query / "test bit" */ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; -static struct gmap_notifier gmap_notifier; -static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ /* forward declarations */ -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end); - static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) { u8 delta_idx = 0; @@ -529,10 +525,6 @@ static int __init __kvm_s390_init(void) if (rc) goto err_gib; - gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_pte_notifier(&gmap_notifier); - vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; - gmap_register_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_register(&s390_epoch_delta_notifier, &kvm_clock_notifier); @@ -552,8 +544,6 @@ err_kvm_uv: static void __kvm_s390_exit(void) { - gmap_unregister_pte_notifier(&gmap_notifier); - gmap_unregister_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, &kvm_clock_notifier); @@ -564,12 +554,43 @@ static void __kvm_s390_exit(void) debug_unregister(kvm_s390_dbf_uv); } +static int kvm_s390_keyop(struct kvm_s390_mmu_cache *mc, struct kvm *kvm, int op, + unsigned long addr, union skey skey) +{ + union asce asce = kvm->arch.gmap->asce; + gfn_t gfn = gpa_to_gfn(addr); + int r; + + guard(read_lock)(&kvm->mmu_lock); + + switch (op) { + case KVM_S390_KEYOP_SSKE: + r = dat_cond_set_storage_key(mc, asce, gfn, skey, &skey, 0, 0, 0); + if (r >= 0) + return skey.skey; + break; + case KVM_S390_KEYOP_ISKE: + r = dat_get_storage_key(asce, gfn, &skey); + if (!r) + return skey.skey; + break; + case KVM_S390_KEYOP_RRBE: + r = dat_reset_reference_bit(asce, gfn); + if (r > 0) + return r << 1; + break; + default: + return -EINVAL; + } + return r; +} + /* Section: device related */ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { if (ioctl == KVM_S390_ENABLE_SIE) - return s390_enable_sie(); + return 0; return -EINVAL; } @@ -608,6 +629,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_DIAG318: case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_S390_USER_OPEREXEC: + case KVM_CAP_S390_KEYOP: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -698,32 +720,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - int i; - gfn_t cur_gfn, last_gfn; - unsigned long gaddr, vmaddr; - struct gmap *gmap = kvm->arch.gmap; - DECLARE_BITMAP(bitmap, _PAGE_ENTRIES); - - /* Loop over all guest segments */ - cur_gfn = memslot->base_gfn; - last_gfn = memslot->base_gfn + memslot->npages; - for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) { - gaddr = gfn_to_gpa(cur_gfn); - vmaddr = gfn_to_hva_memslot(memslot, cur_gfn); - if (kvm_is_error_hva(vmaddr)) - continue; - - bitmap_zero(bitmap, _PAGE_ENTRIES); - gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr); - for (i = 0; i < _PAGE_ENTRIES; i++) { - if (test_bit(i, bitmap)) - mark_page_dirty(kvm, cur_gfn + i); - } + gfn_t last_gfn = memslot->base_gfn + memslot->npages; - if (fatal_signal_pending(current)) - return; - cond_resched(); - } + scoped_guard(read_lock, &kvm->mmu_lock) + gmap_sync_dirty_log(kvm->arch.gmap, memslot->base_gfn, last_gfn); } /* Section: vm related */ @@ -883,9 +883,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = -EINVAL; else { r = 0; - mmap_write_lock(kvm->mm); - kvm->mm->context.allow_gmap_hpage_1m = 1; - mmap_write_unlock(kvm->mm); + set_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags); /* * We might have to create fake 4k page * tables. To avoid that the hardware works on @@ -958,7 +956,7 @@ static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *att static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; - unsigned int idx; + switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: ret = -ENXIO; @@ -969,8 +967,6 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att mutex_lock(&kvm->lock); if (kvm->created_vcpus) ret = -EBUSY; - else if (kvm->mm->context.allow_gmap_hpage_1m) - ret = -EINVAL; else { kvm->arch.use_cmma = 1; /* Not compatible with cmma. */ @@ -979,7 +975,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att } mutex_unlock(&kvm->lock); break; - case KVM_S390_VM_MEM_CLR_CMMA: + case KVM_S390_VM_MEM_CLR_CMMA: { + gfn_t start_gfn = 0; + ret = -ENXIO; if (!sclp.has_cmma) break; @@ -988,13 +986,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att break; VM_EVENT(kvm, 3, "%s", "RESET: CMMA states"); - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - s390_reset_cmma(kvm->arch.gmap->mm); - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); + do { + start_gfn = dat_reset_cmma(kvm->arch.gmap->asce, start_gfn); + cond_resched(); + } while (start_gfn); ret = 0; break; + } case KVM_S390_VM_MEM_LIMIT_SIZE: { unsigned long new_limit; @@ -1011,29 +1009,12 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - /* gmap_create takes last usable address */ - if (new_limit != KVM_S390_NO_MEM_LIMIT) - new_limit -= 1; - ret = -EBUSY; - mutex_lock(&kvm->lock); - if (!kvm->created_vcpus) { - /* gmap_create will round the limit up */ - struct gmap *new = gmap_create(current->mm, new_limit); - - if (!new) { - ret = -ENOMEM; - } else { - gmap_remove(kvm->arch.gmap); - new->private = kvm; - kvm->arch.gmap = new; - ret = 0; - } - } - mutex_unlock(&kvm->lock); + if (!kvm->created_vcpus) + ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit); VM_EVENT(kvm, 3, "New guest asce: 0x%p", - (void *) kvm->arch.gmap->asce); + (void *)kvm->arch.gmap->asce.val); break; } default: @@ -1198,19 +1179,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) kvm->arch.migration_mode = 1; return 0; } - /* mark all the pages in active slots as dirty */ kvm_for_each_memslot(ms, bkt, slots) { if (!ms->dirty_bitmap) return -EINVAL; - /* - * The second half of the bitmap is only used on x86, - * and would be wasted otherwise, so we put it to good - * use here to keep track of the state of the storage - * attributes. - */ - memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms)); ram_pages += ms->npages; } + /* mark all the pages as dirty */ + gmap_set_cmma_all_dirty(kvm->arch.gmap); atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages); kvm->arch.migration_mode = 1; kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); @@ -2116,40 +2091,32 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { - uint8_t *keys; - uint64_t hva; - int srcu_idx, i, r = 0; + union skey *keys; + int i, r = 0; if (args->flags != 0) return -EINVAL; /* Is this guest using storage keys? */ - if (!mm_uses_skeys(current->mm)) + if (!uses_skeys(kvm->arch.gmap)) return KVM_S390_GET_SKEYS_NONE; /* Enforce sane limit on memory allocation */ if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); + keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; - mmap_read_lock(current->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < args->count; i++) { - hva = gfn_to_hva(kvm, args->start_gfn + i); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; - break; + scoped_guard(read_lock, &kvm->mmu_lock) { + for (i = 0; i < args->count; i++) { + r = dat_get_storage_key(kvm->arch.gmap->asce, + args->start_gfn + i, keys + i); + if (r) + break; } - - r = get_guest_storage_key(current->mm, hva, &keys[i]); - if (r) - break; } - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(current->mm); if (!r) { r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, @@ -2164,10 +2131,9 @@ static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { - uint8_t *keys; - uint64_t hva; - int srcu_idx, i, r = 0; - bool unlocked; + struct kvm_s390_mmu_cache *mc; + union skey *keys; + int i, r = 0; if (args->flags != 0) return -EINVAL; @@ -2176,7 +2142,7 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); + keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; @@ -2188,159 +2154,41 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) } /* Enable storage key handling for the guest */ - r = s390_enable_skey(); + r = gmap_enable_skeys(kvm->arch.gmap); if (r) goto out; - i = 0; - mmap_read_lock(current->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - while (i < args->count) { - unlocked = false; - hva = gfn_to_hva(kvm, args->start_gfn + i); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; - break; - } - + r = -EINVAL; + for (i = 0; i < args->count; i++) { /* Lowest order bit is reserved */ - if (keys[i] & 0x01) { - r = -EINVAL; - break; - } - - r = set_guest_storage_key(current->mm, hva, keys[i], 0); - if (r) { - r = fixup_user_fault(current->mm, hva, - FAULT_FLAG_WRITE, &unlocked); - if (r) - break; - } - if (!r) - i++; - } - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(current->mm); -out: - kvfree(keys); - return r; -} - -/* - * Base address and length must be sent at the start of each block, therefore - * it's cheaper to send some clean data, as long as it's less than the size of - * two longs. - */ -#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) -/* for consistency */ -#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) - -static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, - u8 *res, unsigned long bufsize) -{ - unsigned long pgstev, hva, cur_gfn = args->start_gfn; - - args->count = 0; - while (args->count < bufsize) { - hva = gfn_to_hva(kvm, cur_gfn); - /* - * We return an error if the first value was invalid, but we - * return successfully if at least one value was copied. - */ - if (kvm_is_error_hva(hva)) - return args->count ? 0 : -EFAULT; - if (get_pgste(kvm->mm, hva, &pgstev) < 0) - pgstev = 0; - res[args->count++] = (pgstev >> 24) & 0x43; - cur_gfn++; - } - - return 0; -} - -static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots, - gfn_t gfn) -{ - return ____gfn_to_memslot(slots, gfn, true); -} - -static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, - unsigned long cur_gfn) -{ - struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn); - unsigned long ofs = cur_gfn - ms->base_gfn; - struct rb_node *mnode = &ms->gfn_node[slots->node_idx]; - - if (ms->base_gfn + ms->npages <= cur_gfn) { - mnode = rb_next(mnode); - /* If we are above the highest slot, wrap around */ - if (!mnode) - mnode = rb_first(&slots->gfn_tree); - - ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); - ofs = 0; + if (keys[i].zero) + goto out; } - if (cur_gfn < ms->base_gfn) - ofs = 0; - - ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); - while (ofs >= ms->npages && (mnode = rb_next(mnode))) { - ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); - ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages); + mc = kvm_s390_new_mmu_cache(); + if (!mc) { + r = -ENOMEM; + goto out; } - return ms->base_gfn + ofs; -} - -static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, - u8 *res, unsigned long bufsize) -{ - unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev; - struct kvm_memslots *slots = kvm_memslots(kvm); - struct kvm_memory_slot *ms; - if (unlikely(kvm_memslots_empty(slots))) - return 0; - - cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); - ms = gfn_to_memslot(kvm, cur_gfn); - args->count = 0; - args->start_gfn = cur_gfn; - if (!ms) - return 0; - next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); - mem_end = kvm_s390_get_gfn_end(slots); - - while (args->count < bufsize) { - hva = gfn_to_hva(kvm, cur_gfn); - if (kvm_is_error_hva(hva)) - return 0; - /* Decrement only if we actually flipped the bit to 0 */ - if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) - atomic64_dec(&kvm->arch.cmma_dirty_pages); - if (get_pgste(kvm->mm, hva, &pgstev) < 0) - pgstev = 0; - /* Save the value */ - res[args->count++] = (pgstev >> 24) & 0x43; - /* If the next bit is too far away, stop. */ - if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE) - return 0; - /* If we reached the previous "next", find the next one */ - if (cur_gfn == next_gfn) - next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); - /* Reached the end of memory or of the buffer, stop */ - if ((next_gfn >= mem_end) || - (next_gfn - args->start_gfn >= bufsize)) - return 0; - cur_gfn++; - /* Reached the end of the current memslot, take the next one. */ - if (cur_gfn - ms->base_gfn >= ms->npages) { - ms = gfn_to_memslot(kvm, cur_gfn); - if (!ms) - return 0; + r = 0; + do { + r = kvm_s390_mmu_cache_topup(mc); + if (r == -ENOMEM) + break; + scoped_guard(read_lock, &kvm->mmu_lock) { + for (i = 0 ; i < args->count; i++) { + r = dat_set_storage_key(mc, kvm->arch.gmap->asce, + args->start_gfn + i, keys[i], 0); + if (r) + break; + } } - } - return 0; + } while (r == -ENOMEM); + kvm_s390_free_mmu_cache(mc); +out: + kvfree(keys); + return r; } /* @@ -2354,8 +2202,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, static int kvm_s390_get_cmma_bits(struct kvm *kvm, struct kvm_s390_cmma_log *args) { - unsigned long bufsize; - int srcu_idx, peek, ret; + int peek, ret; u8 *values; if (!kvm->arch.use_cmma) @@ -2368,8 +2215,8 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, if (!peek && !kvm->arch.migration_mode) return -EINVAL; /* CMMA is disabled or was not used, or the buffer has length zero */ - bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); - if (!bufsize || !kvm->mm->context.uses_cmm) { + args->count = min(args->count, KVM_S390_CMMA_SIZE_MAX); + if (!args->count || !uses_cmm(kvm->arch.gmap)) { memset(args, 0, sizeof(*args)); return 0; } @@ -2379,18 +2226,18 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, return 0; } - values = vmalloc(bufsize); + values = vmalloc(args->count); if (!values) return -ENOMEM; - mmap_read_lock(kvm->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - if (peek) - ret = kvm_s390_peek_cmma(kvm, args, values, bufsize); - else - ret = kvm_s390_get_cmma(kvm, args, values, bufsize); - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(kvm->mm); + scoped_guard(read_lock, &kvm->mmu_lock) { + if (peek) + ret = dat_peek_cmma(args->start_gfn, kvm->arch.gmap->asce, &args->count, + values); + else + ret = dat_get_cmma(kvm->arch.gmap->asce, &args->start_gfn, &args->count, + values, &kvm->arch.cmma_dirty_pages); + } if (kvm->arch.migration_mode) args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages); @@ -2412,11 +2259,9 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, static int kvm_s390_set_cmma_bits(struct kvm *kvm, const struct kvm_s390_cmma_log *args) { - unsigned long hva, mask, pgstev, i; - uint8_t *bits; - int srcu_idx, r = 0; - - mask = args->mask; + struct kvm_s390_mmu_cache *mc; + u8 *bits = NULL; + int r = 0; if (!kvm->arch.use_cmma) return -ENXIO; @@ -2430,9 +2275,12 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, if (args->count == 0) return 0; + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; bits = vmalloc(array_size(sizeof(*bits), args->count)); if (!bits) - return -ENOMEM; + goto out; r = copy_from_user(bits, (void __user *)args->values, args->count); if (r) { @@ -2440,29 +2288,19 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, goto out; } - mmap_read_lock(kvm->mm); - srcu_idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < args->count; i++) { - hva = gfn_to_hva(kvm, args->start_gfn + i); - if (kvm_is_error_hva(hva)) { - r = -EFAULT; + do { + r = kvm_s390_mmu_cache_topup(mc); + if (r) break; + scoped_guard(read_lock, &kvm->mmu_lock) { + r = dat_set_cmma_bits(mc, kvm->arch.gmap->asce, args->start_gfn, + args->count, args->mask, bits); } + } while (r == -ENOMEM); - pgstev = bits[i]; - pgstev = pgstev << 24; - mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT; - set_pgste_bits(kvm->mm, hva, mask, pgstev); - } - srcu_read_unlock(&kvm->srcu, srcu_idx); - mmap_read_unlock(kvm->mm); - - if (!kvm->mm->context.uses_cmm) { - mmap_write_lock(kvm->mm); - kvm->mm->context.uses_cmm = 1; - mmap_write_unlock(kvm->mm); - } + set_bit(GMAP_FLAG_USES_CMM, &kvm->arch.gmap->flags); out: + kvm_s390_free_mmu_cache(mc); vfree(bits); return r; } @@ -2671,6 +2509,13 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) break; mmap_write_lock(kvm->mm); + /* + * Disable creation of new THPs. Existing THPs can stay, they + * will be split when any part of them gets imported. + */ + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, kvm->mm); + mm_flags_set(MMF_DISABLE_THP_COMPLETELY, kvm->mm); + set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags); r = gmap_helper_disable_cow_sharing(); mmap_write_unlock(kvm->mm); if (r) @@ -2744,9 +2589,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) if (copy_from_user(&parms, argp, sizeof(parms))) break; - /* Currently restricted to 8KB */ + /* Currently restricted to 1MiB */ r = -EINVAL; - if (parms.length > PAGE_SIZE * 2) + if (parms.length > SZ_1M) break; r = -ENOMEM; @@ -2900,9 +2745,9 @@ static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_fla static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + void *tmpbuf __free(kvfree) = NULL; enum gacc_mode acc_mode; - void *tmpbuf = NULL; - int r, srcu_idx; + int r; r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION | KVM_S390_MEMOP_F_CHECK_ONLY); @@ -2915,52 +2760,32 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) return -ENOMEM; } - srcu_idx = srcu_read_lock(&kvm->srcu); + acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE; - if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { - r = PGM_ADDRESSING; - goto out_unlock; - } + scoped_guard(srcu, &kvm->srcu) { + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) + return check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); - acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE; - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); - goto out_unlock; - } - if (acc_mode == GACC_FETCH) { + if (acc_mode == GACC_STORE && copy_from_user(tmpbuf, uaddr, mop->size)) + return -EFAULT; r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, - mop->size, GACC_FETCH, mop->key); + mop->size, acc_mode, mop->key); if (r) - goto out_unlock; - if (copy_to_user(uaddr, tmpbuf, mop->size)) - r = -EFAULT; - } else { - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - goto out_unlock; - } - r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, - mop->size, GACC_STORE, mop->key); + return r; + if (acc_mode != GACC_STORE && copy_to_user(uaddr, tmpbuf, mop->size)) + return -EFAULT; } - -out_unlock: - srcu_read_unlock(&kvm->srcu, srcu_idx); - - vfree(tmpbuf); - return r; + return 0; } static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; void __user *old_addr = (void __user *)mop->old_addr; - union { - __uint128_t quad; - char raw[sizeof(__uint128_t)]; - } old = { .quad = 0}, new = { .quad = 0 }; - unsigned int off_in_quad = sizeof(new) - mop->size; - int r, srcu_idx; - bool success; + union kvm_s390_quad old = { .sixteen = 0 }; + union kvm_s390_quad new = { .sixteen = 0 }; + bool success = false; + int r; r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION); if (r) @@ -2972,25 +2797,18 @@ static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *m */ if (mop->size > sizeof(new)) return -EINVAL; - if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size)) + if (copy_from_user(&new, uaddr, mop->size)) return -EFAULT; - if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size)) + if (copy_from_user(&old, old_addr, mop->size)) return -EFAULT; - srcu_idx = srcu_read_lock(&kvm->srcu); + scoped_guard(srcu, &kvm->srcu) { + r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old, new, + mop->key, &success); - if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) { - r = PGM_ADDRESSING; - goto out_unlock; + if (!success && copy_to_user(old_addr, &old, mop->size)) + return -EFAULT; } - - r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad, - new.quad, mop->key, &success); - if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size)) - r = -EFAULT; - -out_unlock: - srcu_read_unlock(&kvm->srcu, srcu_idx); return r; } @@ -3145,6 +2963,32 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) r = -EFAULT; break; } + case KVM_S390_KEYOP: { + struct kvm_s390_mmu_cache *mc; + struct kvm_s390_keyop kop; + union skey skey; + + if (copy_from_user(&kop, argp, sizeof(kop))) { + r = -EFAULT; + break; + } + skey.skey = kop.key; + + mc = kvm_s390_new_mmu_cache(); + if (!mc) + return -ENOMEM; + + r = kvm_s390_keyop(mc, kvm, kop.operation, kop.guest_addr, skey); + kvm_s390_free_mmu_cache(mc); + if (r < 0) + break; + + kop.key = r; + r = 0; + if (copy_to_user(argp, &kop, sizeof(kop))) + r = -EFAULT; + break; + } case KVM_S390_ZPCI_OP: { struct kvm_s390_zpci_op args; @@ -3330,6 +3174,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) char debug_name[16]; int i, rc; + mutex_init(&kvm->arch.pv.import_lock); + rc = -EINVAL; #ifdef CONFIG_KVM_S390_UCONTROL if (type & ~KVM_VM_S390_UCONTROL) @@ -3340,11 +3186,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (type) goto out_err; #endif - - rc = s390_enable_sie(); - if (rc) - goto out_err; - rc = -ENOMEM; if (!sclp.has_64bscao) @@ -3418,6 +3259,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) debug_register_view(kvm->arch.dbf, &debug_sprintf_view); VM_EVENT(kvm, 3, "vm created with type %lu", type); + kvm->arch.mem_limit = type & KVM_VM_S390_UCONTROL ? KVM_S390_NO_MEM_LIMIT : sclp.hamax + 1; + kvm->arch.gmap = gmap_new(kvm, gpa_to_gfn(kvm->arch.mem_limit)); + if (!kvm->arch.gmap) + goto out_err; + clear_bit(GMAP_FLAG_PFAULT_ENABLED, &kvm->arch.gmap->flags); + if (type & KVM_VM_S390_UCONTROL) { struct kvm_userspace_memory_region2 fake_memslot = { .slot = KVM_S390_UCONTROL_MEMSLOT, @@ -3427,23 +3274,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) .flags = 0, }; - kvm->arch.gmap = NULL; - kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT; /* one flat fake memslot covering the whole address-space */ mutex_lock(&kvm->slots_lock); KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm); mutex_unlock(&kvm->slots_lock); + set_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags); } else { - if (sclp.hamax == U64_MAX) - kvm->arch.mem_limit = TASK_SIZE_MAX; - else - kvm->arch.mem_limit = min_t(unsigned long, TASK_SIZE_MAX, - sclp.hamax + 1); - kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1); - if (!kvm->arch.gmap) - goto out_err; - kvm->arch.gmap->private = kvm; - kvm->arch.gmap->pfault_enabled = 0; + struct crst_table *table = dereference_asce(kvm->arch.gmap->asce); + + crst_table_init((void *)table, _CRSTE_HOLE(table->crstes[0].h.tt).val); } kvm->arch.use_pfmfi = sclp.has_pfmfi; @@ -3477,8 +3316,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) sca_del_vcpu(vcpu); kvm_s390_update_topology_change_report(vcpu->kvm, 1); - if (kvm_is_ucontrol(vcpu->kvm)) - gmap_remove(vcpu->arch.gmap); + if (kvm_is_ucontrol(vcpu->kvm)) { + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) + gmap_remove_child(vcpu->arch.gmap); + vcpu->arch.gmap = gmap_put(vcpu->arch.gmap); + } if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); @@ -3486,6 +3328,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (kvm_s390_pv_cpu_get_handle(vcpu)) kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc); free_page((unsigned long)(vcpu->arch.sie_block)); + kvm_s390_free_mmu_cache(vcpu->arch.mc); } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -3512,25 +3355,14 @@ void kvm_arch_destroy_vm(struct kvm *kvm) debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); - if (!kvm_is_ucontrol(kvm)) - gmap_remove(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); kvm_s390_vsie_destroy(kvm); + kvm->arch.gmap = gmap_put(kvm->arch.gmap); KVM_EVENT(3, "vm 0x%p destroyed", kvm); } /* Section: vcpu related */ -static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) -{ - vcpu->arch.gmap = gmap_create(current->mm, -1UL); - if (!vcpu->arch.gmap) - return -ENOMEM; - vcpu->arch.gmap->private = vcpu->kvm; - - return 0; -} - static void sca_del_vcpu(struct kvm_vcpu *vcpu) { struct esca_block *sca = vcpu->kvm->arch.sca; @@ -3871,9 +3703,15 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) int rc; BUILD_BUG_ON(sizeof(struct sie_page) != 4096); + vcpu->arch.mc = kvm_s390_new_mmu_cache(); + if (!vcpu->arch.mc) + return -ENOMEM; sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!sie_page) + if (!sie_page) { + kvm_s390_free_mmu_cache(vcpu->arch.mc); + vcpu->arch.mc = NULL; return -ENOMEM; + } vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb); @@ -3915,8 +3753,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; if (kvm_is_ucontrol(vcpu->kvm)) { - rc = __kvm_ucontrol_vcpu_init(vcpu); - if (rc) + rc = -ENOMEM; + vcpu->arch.gmap = gmap_new_child(vcpu->kvm->arch.gmap, -1UL); + if (!vcpu->arch.gmap) goto out_free_sie_block; } @@ -3932,8 +3771,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return 0; out_ucontrol_uninit: - if (kvm_is_ucontrol(vcpu->kvm)) - gmap_remove(vcpu->arch.gmap); + if (kvm_is_ucontrol(vcpu->kvm)) { + gmap_remove_child(vcpu->arch.gmap); + vcpu->arch.gmap = gmap_put(vcpu->arch.gmap); + } out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); return rc; @@ -3997,32 +3838,6 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) kvm_s390_vcpu_request(vcpu); } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end) -{ - struct kvm *kvm = gmap->private; - struct kvm_vcpu *vcpu; - unsigned long prefix; - unsigned long i; - - trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap)); - - if (gmap_is_shadow(gmap)) - return; - if (start >= 1UL << 31) - /* We are only interested in prefix pages */ - return; - kvm_for_each_vcpu(i, vcpu, kvm) { - /* match against both prefix pages */ - prefix = kvm_s390_get_prefix(vcpu); - if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { - VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", - start, end); - kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); - } - } -} - bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { /* do not poll with more than halt_poll_max_steal percent of steal time */ @@ -4404,72 +4219,41 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu) return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS); } -static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags) -{ - struct kvm *kvm = gmap->private; - gfn_t gfn = gpa_to_gfn(gaddr); - bool unlocked; - hva_t vmaddr; - gpa_t tmp; - int rc; - - if (kvm_is_ucontrol(kvm)) { - tmp = __gmap_translate(gmap, gaddr); - gfn = gpa_to_gfn(tmp); - } - - vmaddr = gfn_to_hva(kvm, gfn); - rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); - if (!rc) - rc = __gmap_link(gmap, gaddr, vmaddr); - return rc; -} - -/** - * __kvm_s390_mprotect_many() - Apply specified protection to guest pages - * @gmap: the gmap of the guest - * @gpa: the starting guest address - * @npages: how many pages to protect - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: pgste notification bits to set - * - * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one() - * - * Context: kvm->srcu and gmap->mm need to be held in read mode - */ -int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, - unsigned long bits) +static int vcpu_ucontrol_translate(struct kvm_vcpu *vcpu, gpa_t *gaddr) { - unsigned int fault_flag = (prot & PROT_WRITE) ? FAULT_FLAG_WRITE : 0; - gpa_t end = gpa + npages * PAGE_SIZE; int rc; - for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) { - rc = gmap_protect_one(gmap, gpa, prot, bits); - if (rc == -EAGAIN) { - __kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag); - rc = gmap_protect_one(gmap, gpa, prot, bits); + if (kvm_is_ucontrol(vcpu->kvm)) { + rc = gmap_ucas_translate(vcpu->arch.mc, vcpu->arch.gmap, gaddr); + if (rc == -EREMOTE) { + vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; + vcpu->run->s390_ucontrol.trans_exc_code = *gaddr; + vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; } - if (rc < 0) - return rc; + return rc; } - return 0; } -static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu) +static int kvm_s390_fixup_prefix(struct kvm_vcpu *vcpu) { gpa_t gaddr = kvm_s390_get_prefix(vcpu); - int idx, rc; - - idx = srcu_read_lock(&vcpu->kvm->srcu); - mmap_read_lock(vcpu->arch.gmap->mm); + gfn_t gfn; + int rc; - rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT); + if (vcpu_ucontrol_translate(vcpu, &gaddr)) + return -EREMOTE; + gfn = gpa_to_gfn(gaddr); - mmap_read_unlock(vcpu->arch.gmap->mm); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn, true); + if (rc) + return rc; + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn + 1, true); + if (rc) + return rc; + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) + rc = dat_set_prefix_notif_bit(vcpu->kvm->arch.gmap->asce, gfn); return rc; } @@ -4489,7 +4273,7 @@ retry: if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { int rc; - rc = kvm_s390_mprotect_notify_prefix(vcpu); + rc = kvm_s390_fixup_prefix(vcpu); if (rc) { kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); return rc; @@ -4538,8 +4322,7 @@ retry: * Re-enable CMM virtualization if CMMA is available and * CMM has been used. */ - if ((vcpu->kvm->arch.use_cmma) && - (vcpu->kvm->mm->context.uses_cmm)) + if (vcpu->kvm->arch.use_cmma && uses_cmm(vcpu->arch.gmap)) vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; goto retry; } @@ -4635,7 +4418,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) return true; } -static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) { hva_t hva; struct kvm_arch_async_pf arch; @@ -4651,7 +4434,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) return false; if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) return false; - if (!vcpu->arch.gmap->pfault_enabled) + if (!pfault_enabled(vcpu->arch.gmap)) return false; hva = gfn_to_hva(vcpu->kvm, current->thread.gmap_teid.addr); @@ -4744,98 +4527,25 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu) current->thread.gmap_int_code, current->thread.gmap_teid.val); } -/* - * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu - * @vcpu: the vCPU whose gmap is to be fixed up - * @gfn: the guest frame number used for memslots (including fake memslots) - * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps - * @foll: FOLL_* flags - * - * Return: 0 on success, < 0 in case of error. - * Context: The mm lock must not be held before calling. May sleep. - */ -int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int foll) -{ - struct kvm_memory_slot *slot; - unsigned int fault_flags; - bool writable, unlocked; - unsigned long vmaddr; - struct page *page; - kvm_pfn_t pfn; +static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, gpa_t gaddr, bool wr) +{ + struct guest_fault f = { + .write_attempt = wr, + .attempt_pfault = pfault_enabled(vcpu->arch.gmap), + }; int rc; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) - return vcpu_post_run_addressing_exception(vcpu); - - fault_flags = foll & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; - if (vcpu->arch.gmap->pfault_enabled) - foll |= FOLL_NOWAIT; - vmaddr = __gfn_to_hva_memslot(slot, gfn); - -try_again: - pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page); + if (vcpu_ucontrol_translate(vcpu, &gaddr)) + return -EREMOTE; + f.gfn = gpa_to_gfn(gaddr); - /* Access outside memory, inject addressing exception */ - if (is_noslot_pfn(pfn)) + rc = kvm_s390_faultin_gfn(vcpu, NULL, &f); + if (rc <= 0) + return rc; + if (rc == PGM_ADDRESSING) return vcpu_post_run_addressing_exception(vcpu); - /* Signal pending: try again */ - if (pfn == KVM_PFN_ERR_SIGPENDING) - return -EAGAIN; - - /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */ - if (pfn == KVM_PFN_ERR_NEEDS_IO) { - trace_kvm_s390_major_guest_pfault(vcpu); - if (kvm_arch_setup_async_pf(vcpu)) - return 0; - vcpu->stat.pfault_sync++; - /* Could not setup async pfault, try again synchronously */ - foll &= ~FOLL_NOWAIT; - goto try_again; - } - /* Any other error */ - if (is_error_pfn(pfn)) - return -EFAULT; - - /* Success */ - mmap_read_lock(vcpu->arch.gmap->mm); - /* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */ - rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked); - if (!rc) - rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr); - scoped_guard(spinlock, &vcpu->kvm->mmu_lock) { - kvm_release_faultin_page(vcpu->kvm, page, false, writable); - } - mmap_read_unlock(vcpu->arch.gmap->mm); - return rc; -} - -static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int foll) -{ - unsigned long gaddr_tmp; - gfn_t gfn; - - gfn = gpa_to_gfn(gaddr); - if (kvm_is_ucontrol(vcpu->kvm)) { - /* - * This translates the per-vCPU guest address into a - * fake guest address, which can then be used with the - * fake memslots that are identity mapping userspace. - * This allows ucontrol VMs to use the normal fault - * resolution path, like normal VMs. - */ - mmap_read_lock(vcpu->arch.gmap->mm); - gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr); - mmap_read_unlock(vcpu->arch.gmap->mm); - if (gaddr_tmp == -EFAULT) { - vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; - vcpu->run->s390_ucontrol.trans_exc_code = gaddr; - vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; - return -EREMOTE; - } - gfn = gpa_to_gfn(gaddr_tmp); - } - return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, foll); + KVM_BUG_ON(rc, vcpu->kvm); + return -EINVAL; } static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) @@ -5012,7 +4722,7 @@ xfer_to_guest_mode_check: exit_reason = kvm_s390_enter_exit_sie(vcpu->arch.sie_block, vcpu->run->s.regs.gprs, - vcpu->arch.gmap->asce); + vcpu->arch.gmap->asce.val); __enable_cpu_timer_accounting(vcpu); guest_timing_exit_irqoff(); @@ -5547,8 +5257,8 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; + void *tmpbuf __free(kvfree) = NULL; enum gacc_mode acc_mode; - void *tmpbuf = NULL; int r; r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION | @@ -5570,32 +5280,21 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, acc_mode, mop->key); - goto out_inject; - } - if (acc_mode == GACC_FETCH) { + } else if (acc_mode == GACC_FETCH) { r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size, mop->key); - if (r) - goto out_inject; - if (copy_to_user(uaddr, tmpbuf, mop->size)) { - r = -EFAULT; - goto out_free; - } + if (!r && copy_to_user(uaddr, tmpbuf, mop->size)) + return -EFAULT; } else { - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - goto out_free; - } + if (copy_from_user(tmpbuf, uaddr, mop->size)) + return -EFAULT; r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size, mop->key); } -out_inject: if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); -out_free: - vfree(tmpbuf); return r; } @@ -5785,37 +5484,39 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #ifdef CONFIG_KVM_S390_UCONTROL case KVM_S390_UCAS_MAP: { - struct kvm_s390_ucas_mapping ucasmap; + struct kvm_s390_ucas_mapping ucas; - if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { - r = -EFAULT; + r = -EFAULT; + if (copy_from_user(&ucas, argp, sizeof(ucas))) break; - } - if (!kvm_is_ucontrol(vcpu->kvm)) { - r = -EINVAL; + r = -EINVAL; + if (!kvm_is_ucontrol(vcpu->kvm)) + break; + if (!IS_ALIGNED(ucas.user_addr | ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE)) break; - } - r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr, - ucasmap.vcpu_addr, ucasmap.length); + r = gmap_ucas_map(vcpu->arch.gmap, gpa_to_gfn(ucas.user_addr), + gpa_to_gfn(ucas.vcpu_addr), + ucas.length >> _SEGMENT_SHIFT); break; } case KVM_S390_UCAS_UNMAP: { - struct kvm_s390_ucas_mapping ucasmap; + struct kvm_s390_ucas_mapping ucas; - if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { - r = -EFAULT; + r = -EFAULT; + if (copy_from_user(&ucas, argp, sizeof(ucas))) break; - } - if (!kvm_is_ucontrol(vcpu->kvm)) { - r = -EINVAL; + r = -EINVAL; + if (!kvm_is_ucontrol(vcpu->kvm)) + break; + if (!IS_ALIGNED(ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE)) break; - } - r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr, - ucasmap.length); + gmap_ucas_unmap(vcpu->arch.gmap, gpa_to_gfn(ucas.vcpu_addr), + ucas.length >> _SEGMENT_SHIFT); + r = 0; break; } #endif @@ -5988,37 +5689,89 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { + struct kvm_s390_mmu_cache *mc = NULL; int rc = 0; - if (kvm_is_ucontrol(kvm)) + if (change == KVM_MR_FLAGS_ONLY) return; - switch (change) { - case KVM_MR_DELETE: - rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, - old->npages * PAGE_SIZE); - break; - case KVM_MR_MOVE: - rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, - old->npages * PAGE_SIZE); - if (rc) + mc = kvm_s390_new_mmu_cache(); + if (!mc) { + rc = -ENOMEM; + goto out; + } + + scoped_guard(write_lock, &kvm->mmu_lock) { + switch (change) { + case KVM_MR_DELETE: + rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages); break; - fallthrough; - case KVM_MR_CREATE: - rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr, - new->base_gfn * PAGE_SIZE, - new->npages * PAGE_SIZE); - break; - case KVM_MR_FLAGS_ONLY: - break; - default: - WARN(1, "Unknown KVM MR CHANGE: %d\n", change); + case KVM_MR_MOVE: + rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages); + if (rc) + break; + fallthrough; + case KVM_MR_CREATE: + rc = dat_create_slot(mc, kvm->arch.gmap->asce, new->base_gfn, new->npages); + break; + case KVM_MR_FLAGS_ONLY: + break; + default: + WARN(1, "Unknown KVM MR CHANGE: %d\n", change); + } } +out: if (rc) pr_warn("failed to commit memory region\n"); + kvm_s390_free_mmu_cache(mc); return; } +/** + * kvm_test_age_gfn() - test young + * @kvm: the kvm instance + * @range: the range of guest addresses whose young status needs to be cleared + * + * Context: called by KVM common code without holding the kvm mmu lock + * Return: true if any page in the given range is young, otherwise 0. + */ +bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + scoped_guard(read_lock, &kvm->mmu_lock) + return dat_test_age_gfn(kvm->arch.gmap->asce, range->start, range->end); +} + +/** + * kvm_age_gfn() - clear young + * @kvm: the kvm instance + * @range: the range of guest addresses whose young status needs to be cleared + * + * Context: called by KVM common code without holding the kvm mmu lock + * Return: true if any page in the given range was young, otherwise 0. + */ +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + scoped_guard(read_lock, &kvm->mmu_lock) + return gmap_age_gfn(kvm->arch.gmap, range->start, range->end); +} + +/** + * kvm_unmap_gfn_range() - Unmap a range of guest addresses + * @kvm: the kvm instance + * @range: the range of guest page frames to invalidate + * + * This function always returns false because every DAT table modification + * has to use the appropriate DAT table manipulation instructions, which will + * keep the TLB coherent, hence no additional TLB flush is ever required. + * + * Context: called by KVM common code with the kvm mmu write lock held + * Return: false + */ +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +{ + return gmap_unmap_gfn_range(kvm->arch.gmap, range->slot, range->start, range->end); +} + static inline unsigned long nonhyp_mask(int i) { unsigned int nonhyp_fai = (sclp.hmfai << i * 2) >> 30; @@ -6035,11 +5788,6 @@ static int __init kvm_s390_init(void) return -ENODEV; } - if (nested && hpage) { - pr_info("A KVM host that supports nesting cannot back its KVM guests with huge pages\n"); - return -EINVAL; - } - for (i = 0; i < 16; i++) kvm_s390_fac_base[i] |= stfle_fac_list[i] & nonhyp_mask(i); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 65c950760993..bf1d7798c1af 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -19,9 +19,19 @@ #include <asm/facility.h> #include <asm/processor.h> #include <asm/sclp.h> +#include "dat.h" +#include "gmap.h" #define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0) +union kvm_s390_quad { + __uint128_t sixteen; + unsigned long eight; + unsigned int four; + unsigned short two; + unsigned char one; +}; + static inline void kvm_s390_fpu_store(struct kvm_run *run) { fpu_stfpc(&run->s.regs.fpc); @@ -106,9 +116,7 @@ static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) static inline int kvm_is_ucontrol(struct kvm *kvm) { #ifdef CONFIG_KVM_S390_UCONTROL - if (kvm->arch.gmap) - return 0; - return 1; + return test_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags); #else return 0; #endif @@ -432,14 +440,9 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu); /* implemented in vsie.c */ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu); -void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end); +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end); void kvm_s390_vsie_init(struct kvm *kvm); void kvm_s390_vsie_destroy(struct kvm *kvm); -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); - -/* implemented in gmap-vsie.c */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); @@ -461,14 +464,10 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); -int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags); int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, unsigned long bits); -static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags) -{ - return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags); -} +bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu); /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 0b14d894f38a..a3250ad83a8e 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -21,13 +21,14 @@ #include <asm/ebcdic.h> #include <asm/sysinfo.h> #include <asm/page-states.h> -#include <asm/gmap.h> #include <asm/ptrace.h> #include <asm/sclp.h> #include <asm/ap.h> +#include <asm/gmap_helpers.h> #include "gaccess.h" #include "kvm-s390.h" #include "trace.h" +#include "gmap.h" static int handle_ri(struct kvm_vcpu *vcpu) { @@ -222,7 +223,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu) if (vcpu->arch.skey_enabled) return 0; - rc = s390_enable_skey(); + rc = gmap_enable_skeys(vcpu->arch.gmap); VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); if (rc) return rc; @@ -255,10 +256,9 @@ static int try_handle_skey(struct kvm_vcpu *vcpu) static int handle_iske(struct kvm_vcpu *vcpu) { - unsigned long gaddr, vmaddr; - unsigned char key; + unsigned long gaddr; int reg1, reg2; - bool unlocked; + union skey key; int rc; vcpu->stat.instruction_iske++; @@ -275,37 +275,21 @@ static int handle_iske(struct kvm_vcpu *vcpu) gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; gaddr = kvm_s390_logical_to_effective(vcpu, gaddr); gaddr = kvm_s390_real_to_abs(vcpu, gaddr); - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr)); - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); -retry: - unlocked = false; - mmap_read_lock(current->mm); - rc = get_guest_storage_key(current->mm, vmaddr, &key); - - if (rc) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - if (!rc) { - mmap_read_unlock(current->mm); - goto retry; - } - } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + rc = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr), &key); + if (rc > 0) + return kvm_s390_inject_program_int(vcpu, rc); if (rc < 0) return rc; vcpu->run->s.regs.gprs[reg1] &= ~0xff; - vcpu->run->s.regs.gprs[reg1] |= key; + vcpu->run->s.regs.gprs[reg1] |= key.skey; return 0; } static int handle_rrbe(struct kvm_vcpu *vcpu) { - unsigned long vmaddr, gaddr; + unsigned long gaddr; int reg1, reg2; - bool unlocked; int rc; vcpu->stat.instruction_rrbe++; @@ -322,24 +306,10 @@ static int handle_rrbe(struct kvm_vcpu *vcpu) gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; gaddr = kvm_s390_logical_to_effective(vcpu, gaddr); gaddr = kvm_s390_real_to_abs(vcpu, gaddr); - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr)); - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); -retry: - unlocked = false; - mmap_read_lock(current->mm); - rc = reset_guest_reference_bit(current->mm, vmaddr); - if (rc < 0) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - if (!rc) { - mmap_read_unlock(current->mm); - goto retry; - } - } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + rc = dat_reset_reference_bit(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr)); + if (rc > 0) + return kvm_s390_inject_program_int(vcpu, rc); if (rc < 0) return rc; kvm_s390_set_psw_cc(vcpu, rc); @@ -354,9 +324,8 @@ static int handle_sske(struct kvm_vcpu *vcpu) { unsigned char m3 = vcpu->arch.sie_block->ipb >> 28; unsigned long start, end; - unsigned char key, oldkey; + union skey key, oldkey; int reg1, reg2; - bool unlocked; int rc; vcpu->stat.instruction_sske++; @@ -377,7 +346,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) kvm_s390_get_regs_rre(vcpu, ®1, ®2); - key = vcpu->run->s.regs.gprs[reg1] & 0xfe; + key.skey = vcpu->run->s.regs.gprs[reg1] & 0xfe; start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); if (m3 & SSKE_MB) { @@ -389,27 +358,17 @@ static int handle_sske(struct kvm_vcpu *vcpu) } while (start != end) { - unsigned long vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); - unlocked = false; - - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - mmap_read_lock(current->mm); - rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey, - m3 & SSKE_NQ, m3 & SSKE_MR, - m3 & SSKE_MC); - - if (rc < 0) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - rc = !rc ? -EAGAIN : rc; + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { + rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce, + gpa_to_gfn(start), key, &oldkey, + m3 & SSKE_NQ, m3 & SSKE_MR, m3 & SSKE_MC); } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) + if (rc > 1) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (rc == -EAGAIN) + if (rc == -ENOMEM) { + kvm_s390_mmu_cache_topup(vcpu->arch.mc); continue; + } if (rc < 0) return rc; start += PAGE_SIZE; @@ -422,7 +381,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) } else { kvm_s390_set_psw_cc(vcpu, rc); vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL; - vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8; + vcpu->run->s.regs.gprs[reg1] |= (u64)oldkey.skey << 8; } } if (m3 & SSKE_MB) { @@ -1082,7 +1041,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) bool mr = false, mc = false, nq; int reg1, reg2; unsigned long start, end; - unsigned char key; + union skey key; vcpu->stat.instruction_pfmf++; @@ -1110,7 +1069,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ; - key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; + key.skey = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); @@ -1141,14 +1100,6 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } while (start != end) { - unsigned long vmaddr; - bool unlocked = false; - - /* Translate guest address to host address */ - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); - if (kvm_is_error_hva(vmaddr)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); @@ -1159,19 +1110,17 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (rc) return rc; - mmap_read_lock(current->mm); - rc = cond_set_guest_storage_key(current->mm, vmaddr, - key, NULL, nq, mr, mc); - if (rc < 0) { - rc = fixup_user_fault(current->mm, vmaddr, - FAULT_FLAG_WRITE, &unlocked); - rc = !rc ? -EAGAIN : rc; + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { + rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce, + gpa_to_gfn(start), key, + NULL, nq, mr, mc); } - mmap_read_unlock(current->mm); - if (rc == -EFAULT) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (rc == -EAGAIN) + if (rc > 1) + return kvm_s390_inject_program_int(vcpu, rc); + if (rc == -ENOMEM) { + kvm_s390_mmu_cache_topup(vcpu->arch.mc); continue; + } if (rc < 0) return rc; } @@ -1195,8 +1144,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) { int r1, r2, nappended, entries; - unsigned long gfn, hva, res, pgstev, ptev; + union essa_state state; unsigned long *cbrlo; + unsigned long gfn; + bool dirtied; /* * We don't need to set SD.FPF.SK to 1 here, because if we have a @@ -1205,33 +1156,12 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) kvm_s390_get_regs_rre(vcpu, &r1, &r2); gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT; - hva = gfn_to_hva(vcpu->kvm, gfn); entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; - if (kvm_is_error_hva(hva)) - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - - nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev); - if (nappended < 0) { - res = orc ? 0x10 : 0; - vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */ + nappended = dat_perform_essa(vcpu->arch.gmap->asce, gfn, orc, &state, &dirtied); + vcpu->run->s.regs.gprs[r1] = state.val; + if (nappended < 0) return 0; - } - res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22; - /* - * Set the block-content state part of the result. 0 means resident, so - * nothing to do if the page is valid. 2 is for preserved pages - * (non-present and non-zero), and 3 for zero pages (non-present and - * zero). - */ - if (ptev & _PAGE_INVALID) { - res |= 2; - if (pgstev & _PGSTE_GPS_ZERO) - res |= 1; - } - if (pgstev & _PGSTE_GPS_NODAT) - res |= 0x20; - vcpu->run->s.regs.gprs[r1] = res; /* * It is possible that all the normal 511 slots were full, in which case * we will now write in the 512th slot, which is reserved for host use. @@ -1243,17 +1173,34 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) cbrlo[entries] = gfn << PAGE_SHIFT; } - if (orc) { - struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn); - - /* Increment only if we are really flipping the bit */ - if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) - atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); - } + if (dirtied) + atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); return nappended; } +static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len) +{ + union crste *crstep; + union pgste pgste; + union pte *ptep; + int i; + + lockdep_assert_held(&vcpu->kvm->mmu_lock); + + for (i = 0; i < len; i++) { + if (dat_entry_walk(NULL, gpa_to_gfn(cbrl[i]), vcpu->arch.gmap->asce, + 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) + continue; + if (!ptep || ptep->s.pr) + continue; + pgste = pgste_get_lock(ptep); + if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) + gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]); + pgste_set_unlock(ptep, pgste); + } +} + static int handle_essa(struct kvm_vcpu *vcpu) { lockdep_assert_held(&vcpu->kvm->srcu); @@ -1261,11 +1208,9 @@ static int handle_essa(struct kvm_vcpu *vcpu) /* entries expected to be 1FF */ int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; unsigned long *cbrlo; - struct gmap *gmap; int i, orc; VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); - gmap = vcpu->arch.gmap; vcpu->stat.instruction_essa++; if (!vcpu->kvm->arch.use_cmma) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); @@ -1289,11 +1234,7 @@ static int handle_essa(struct kvm_vcpu *vcpu) * value really needs to be written to; if the value is * already correct, we do nothing and avoid the lock. */ - if (vcpu->kvm->mm->context.uses_cmm == 0) { - mmap_write_lock(vcpu->kvm->mm); - vcpu->kvm->mm->context.uses_cmm = 1; - mmap_write_unlock(vcpu->kvm->mm); - } + set_bit(GMAP_FLAG_USES_CMM, &vcpu->arch.gmap->flags); /* * If we are here, we are supposed to have CMMA enabled in * the SIE block. Enabling CMMA works on a per-CPU basis, @@ -1307,20 +1248,22 @@ static int handle_essa(struct kvm_vcpu *vcpu) /* Retry the ESSA instruction */ kvm_s390_retry_instr(vcpu); } else { - mmap_read_lock(vcpu->kvm->mm); - i = __do_essa(vcpu, orc); - mmap_read_unlock(vcpu->kvm->mm); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + i = __do_essa(vcpu, orc); if (i < 0) return i; /* Account for the possible extra cbrl entry */ entries += i; } - vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ + /* reset nceo */ + vcpu->arch.sie_block->cbrlo &= PAGE_MASK; cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); - mmap_read_lock(gmap->mm); - for (i = 0; i < entries; ++i) - __gmap_zap(gmap, cbrlo[i]); - mmap_read_unlock(gmap->mm); + + mmap_read_lock(vcpu->kvm->mm); + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) + _essa_clear_cbrl(vcpu, cbrlo, entries); + mmap_read_unlock(vcpu->kvm->mm); + return 0; } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 6ba5a0305e25..461b413c76a3 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -12,13 +12,16 @@ #include <linux/minmax.h> #include <linux/pagemap.h> #include <linux/sched/signal.h> -#include <asm/gmap.h> #include <asm/uv.h> #include <asm/mman.h> #include <linux/pagewalk.h> #include <linux/sched/mm.h> #include <linux/mmu_notifier.h> #include "kvm-s390.h" +#include "dat.h" +#include "gaccess.h" +#include "gmap.h" +#include "faultin.h" bool kvm_s390_pv_is_protected(struct kvm *kvm) { @@ -35,6 +38,85 @@ bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); /** + * should_export_before_import() - Determine whether an export is needed + * before an import-like operation. + * @uvcb: The Ultravisor control block of the UVC to be performed. + * @mm: The mm of the process. + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: %true if an export is needed before every import, otherwise %false. + */ +static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +{ + /* + * The misc feature indicates, among other things, that importing a + * shared page from a different protected VM will automatically also + * transfer its ownership. + */ + if (uv_has_feature(BIT_UV_FEAT_MISC)) + return false; + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) + return false; + return atomic_read(&mm->context.protected_count) > 1; +} + +struct pv_make_secure { + void *uvcb; + struct folio *folio; + int rc; + bool needs_export; +}; + +static int __kvm_s390_pv_make_secure(struct guest_fault *f, struct folio *folio) +{ + struct pv_make_secure *priv = f->priv; + int rc; + + if (priv->needs_export) + uv_convert_from_secure(folio_to_phys(folio)); + + if (folio_test_hugetlb(folio)) + return -EFAULT; + if (folio_test_large(folio)) + return -E2BIG; + + if (!f->page) + folio_get(folio); + rc = __make_folio_secure(folio, priv->uvcb); + if (!f->page) + folio_put(folio); + + return rc; +} + +static void _kvm_s390_pv_make_secure(struct guest_fault *f) +{ + struct pv_make_secure *priv = f->priv; + struct folio *folio; + + folio = pfn_folio(f->pfn); + priv->rc = -EAGAIN; + if (folio_trylock(folio)) { + priv->rc = __kvm_s390_pv_make_secure(f, folio); + if (priv->rc == -E2BIG || priv->rc == -EBUSY) { + priv->folio = folio; + folio_get(folio); + } + folio_unlock(folio); + } +} + +/** * kvm_s390_pv_make_secure() - make one guest page secure * @kvm: the guest * @gaddr: the guest address that needs to be made secure @@ -45,14 +127,34 @@ EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); */ int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb) { - unsigned long vmaddr; + struct pv_make_secure priv = { .uvcb = uvcb }; + struct guest_fault f = { + .write_attempt = true, + .gfn = gpa_to_gfn(gaddr), + .callback = _kvm_s390_pv_make_secure, + .priv = &priv, + }; + int rc; lockdep_assert_held(&kvm->srcu); - vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr)); - if (kvm_is_error_hva(vmaddr)) - return -EFAULT; - return make_hva_secure(kvm->mm, vmaddr, uvcb); + priv.needs_export = should_export_before_import(uvcb, kvm->mm); + + scoped_guard(mutex, &kvm->arch.pv.import_lock) { + rc = kvm_s390_faultin_gfn(NULL, kvm, &f); + + if (!rc) { + rc = priv.rc; + if (priv.folio) { + rc = s390_wiggle_split_folio(kvm->mm, priv.folio); + if (!rc) + rc = -EAGAIN; + } + } + } + if (priv.folio) + folio_put(priv.folio); + return rc; } int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr) @@ -299,35 +401,6 @@ done_fast: return 0; } -/** - * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. - * @kvm: the VM whose memory is to be cleared. - * - * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. - * The CPUs of the protected VM need to be destroyed beforehand. - */ -static void kvm_s390_destroy_lower_2g(struct kvm *kvm) -{ - const unsigned long pages_2g = SZ_2G / PAGE_SIZE; - struct kvm_memory_slot *slot; - unsigned long len; - int srcu_idx; - - srcu_idx = srcu_read_lock(&kvm->srcu); - - /* Take the memslot containing guest absolute address 0 */ - slot = gfn_to_memslot(kvm, 0); - /* Clear all slots or parts thereof that are below 2GB */ - while (slot && slot->base_gfn < pages_2g) { - len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; - s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); - /* Take the next memslot */ - slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); - } - - srcu_read_unlock(&kvm->srcu, srcu_idx); -} - static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) { struct uv_cb_destroy_fast uvcb = { @@ -342,7 +415,6 @@ static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) *rc = uvcb.header.rc; if (rrc) *rrc = uvcb.header.rrc; - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x", uvcb.header.rc, uvcb.header.rrc); WARN_ONCE(cc && uvcb.header.rc != 0x104, @@ -391,7 +463,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) return -EINVAL; /* Guest with segment type ASCE, refuse to destroy asynchronously */ - if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + if (kvm->arch.gmap->asce.dt == TABLE_TYPE_SEGMENT) return -EINVAL; priv = kzalloc(sizeof(*priv), GFP_KERNEL); @@ -404,8 +476,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) priv->stor_var = kvm->arch.pv.stor_var; priv->stor_base = kvm->arch.pv.stor_base; priv->handle = kvm_s390_pv_get_handle(kvm); - priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + priv->old_gmap_table = (unsigned long)dereference_asce(kvm->arch.gmap->asce); if (s390_replace_asce(kvm->arch.gmap)) res = -ENOMEM; } @@ -415,7 +486,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) return res; } - kvm_s390_destroy_lower_2g(kvm); + gmap_pv_destroy_range(kvm->arch.gmap, 0, gpa_to_gfn(SZ_2G), false); kvm_s390_clear_pv_state(kvm); kvm->arch.pv.set_aside = priv; @@ -449,7 +520,6 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), UVC_CMD_DESTROY_SEC_CONF, rc, rrc); - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); if (!cc) { atomic_dec(&kvm->mm->context.protected_count); kvm_s390_pv_dealloc_vm(kvm); @@ -532,7 +602,7 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc) * cleanup has been performed. */ if (need_zap && mmget_not_zero(kvm->mm)) { - s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), false); mmput(kvm->mm); } @@ -570,7 +640,7 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc) return -EINVAL; /* When a fatal signal is received, stop immediately */ - if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) + if (gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), true)) goto done; if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) ret = -EIO; @@ -609,6 +679,7 @@ static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm)) kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy); + set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags); } static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { @@ -642,7 +713,7 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) /* Inputs */ uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ uvcb.guest_stor_len = kvm->arch.pv.guest_len; - uvcb.guest_asce = kvm->arch.gmap->asce; + uvcb.guest_asce = kvm->arch.gmap->asce.val; uvcb.guest_sca = virt_to_phys(kvm->arch.sca); uvcb.conf_base_stor_origin = virt_to_phys((void *)kvm->arch.pv.stor_base); @@ -650,6 +721,9 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap; uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr; + clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags); + gmap_split_huge_pages(kvm->arch.gmap); + cc = uv_call_sched(0, (u64)&uvcb); *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; @@ -669,7 +743,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) } return -EIO; } - kvm->arch.gmap->guest_handle = uvcb.guest_handle; return 0; } @@ -704,26 +777,14 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, .tweak[1] = offset, }; int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb); - unsigned long vmaddr; - bool unlocked; *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; if (ret == -ENXIO) { - mmap_read_lock(kvm->mm); - vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr)); - if (kvm_is_error_hva(vmaddr)) { - ret = -EFAULT; - } else { - ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); - if (!ret) - ret = __gmap_link(kvm->arch.gmap, addr, vmaddr); - } - mmap_read_unlock(kvm->mm); + ret = kvm_s390_faultin_gfn_simple(NULL, kvm, gpa_to_gfn(addr), true); if (!ret) return -EAGAIN; - return ret; } if (ret && ret != -EAGAIN) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index b526621d2a1b..d249b10044eb 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -15,7 +15,6 @@ #include <linux/io.h> #include <linux/mman.h> -#include <asm/gmap.h> #include <asm/mmu_context.h> #include <asm/sclp.h> #include <asm/nmi.h> @@ -23,6 +22,7 @@ #include <asm/facility.h> #include "kvm-s390.h" #include "gaccess.h" +#include "gmap.h" enum vsie_page_flags { VSIE_PAGE_IN_USE = 0, @@ -41,8 +41,11 @@ struct vsie_page { * are reused conditionally, should be accessed via READ_ONCE. */ struct kvm_s390_sie_block *scb_o; /* 0x0218 */ - /* the shadow gmap in use by the vsie_page */ - struct gmap *gmap; /* 0x0220 */ + /* + * Flags: must be set/cleared atomically after the vsie page can be + * looked up by other CPUs. + */ + unsigned long flags; /* 0x0220 */ /* address of the last reported fault to guest2 */ unsigned long fault_addr; /* 0x0228 */ /* calculated guest addresses of satellite control blocks */ @@ -57,33 +60,14 @@ struct vsie_page { * radix tree. */ gpa_t scb_gpa; /* 0x0258 */ - /* - * Flags: must be set/cleared atomically after the vsie page can be - * looked up by other CPUs. - */ - unsigned long flags; /* 0x0260 */ - __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */ + /* the shadow gmap in use by the vsie_page */ + struct gmap_cache gmap_cache; /* 0x0260 */ + __u8 reserved[0x0700 - 0x0278]; /* 0x0278 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ }; -/** - * gmap_shadow_valid() - check if a shadow guest address space matches the - * given properties and is still valid - * @sg: pointer to the shadow guest address space structure - * @asce: ASCE for which the shadow table is requested - * @edat_level: edat level to be used for the shadow translation - * - * Returns 1 if the gmap shadow is still valid and matches the given - * properties, the caller can continue using it. Returns 0 otherwise; the - * caller has to request a new shadow gmap in this case. - */ -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) -{ - if (sg->removed) - return 0; - return sg->orig_asce == asce && sg->edat_level == edat_level; -} +static_assert(sizeof(struct vsie_page) == PAGE_SIZE); /* trigger a validity icpt for the given scb */ static int set_validity_icpt(struct kvm_s390_sie_block *scb, @@ -612,26 +596,17 @@ out: return rc; } -void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, - unsigned long end) +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end) { - struct kvm *kvm = gmap->private; - struct vsie_page *cur; + struct vsie_page *cur, *next; unsigned long prefix; - int i; - if (!gmap_is_shadow(gmap)) - return; + KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &gmap->flags), gmap->kvm); /* * Only new shadow blocks are added to the list during runtime, * therefore we can safely reference them all the time. */ - for (i = 0; i < kvm->arch.vsie.page_count; i++) { - cur = READ_ONCE(kvm->arch.vsie.pages[i]); - if (!cur) - continue; - if (READ_ONCE(cur->gmap) != gmap) - continue; + list_for_each_entry_safe(cur, next, &gmap->scb_users, gmap_cache.list) { prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; /* with mso/msl, the prefix lies at an offset */ prefix += cur->scb_s.mso; @@ -652,7 +627,7 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, * - -EAGAIN if the caller can retry immediately * - -ENOMEM if out of memory */ -static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; @@ -667,10 +642,9 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* with mso/msl, the prefix lies at offset *mso* */ prefix += scb_s->mso; - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL); + rc = gaccess_shadow_fault(vcpu, sg, prefix, NULL, true); if (!rc && (scb_s->ecb & ECB_TE)) - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - prefix + PAGE_SIZE, NULL); + rc = gaccess_shadow_fault(vcpu, sg, prefix + PAGE_SIZE, NULL, true); /* * We don't have to mprotect, we will be called for all unshadows. * SIE will detect if protection applies and trigger a validity. @@ -951,8 +925,9 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, * - > 0 if control has to be given to guest 2 * - < 0 if an error occurred */ -static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { + bool wr = kvm_s390_cur_gmap_fault_is_write(); int rc; if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION) @@ -960,12 +935,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return inject_fault(vcpu, PGM_PROTECTION, current->thread.gmap_teid.addr * PAGE_SIZE, 1); - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - current->thread.gmap_teid.addr * PAGE_SIZE, NULL); + rc = gaccess_shadow_fault(vcpu, sg, current->thread.gmap_teid.addr * PAGE_SIZE, NULL, wr); if (rc > 0) { rc = inject_fault(vcpu, rc, - current->thread.gmap_teid.addr * PAGE_SIZE, - kvm_s390_cur_gmap_fault_is_write()); + current->thread.gmap_teid.addr * PAGE_SIZE, wr); if (rc >= 0) vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE; } @@ -978,12 +951,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * * Will ignore any errors. The next SIE fault will do proper fault handling. */ -static void handle_last_fault(struct kvm_vcpu *vcpu, - struct vsie_page *vsie_page) +static void handle_last_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { if (vsie_page->fault_addr) - kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - vsie_page->fault_addr, NULL); + gaccess_shadow_fault(vcpu, sg, vsie_page->fault_addr, NULL, true); vsie_page->fault_addr = 0; } @@ -1065,11 +1036,12 @@ static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, } } -static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - unsigned long pei_dest, pei_src, src, dest, mask, prefix; + unsigned long src, dest, mask, prefix; u64 *pei_block = &vsie_page->scb_o->mcic; + union mvpg_pei pei_dest, pei_src; int edat, rc_dest, rc_src; union ctlreg0 cr0; @@ -1083,8 +1055,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask; src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso; - rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest); - rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src); + rc_dest = gaccess_shadow_fault(vcpu, sg, dest, &pei_dest, true); + rc_src = gaccess_shadow_fault(vcpu, sg, src, &pei_src, false); /* * Either everything went well, or something non-critical went wrong * e.g. because of a race. In either case, simply retry. @@ -1119,8 +1091,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0; } if (!rc_dest && !rc_src) { - pei_block[0] = pei_dest; - pei_block[1] = pei_src; + pei_block[0] = pei_dest.val; + pei_block[1] = pei_src.val; return 1; } @@ -1144,7 +1116,7 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * - > 0 if control has to be given to guest 2 * - < 0 if an error occurred */ -static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) __releases(vcpu->kvm->srcu) __acquires(vcpu->kvm->srcu) { @@ -1153,7 +1125,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) int guest_bp_isolation; int rc = 0; - handle_last_fault(vcpu, vsie_page); + handle_last_fault(vcpu, vsie_page, sg); kvm_vcpu_srcu_read_unlock(vcpu); @@ -1191,7 +1163,7 @@ xfer_to_guest_mode_check: goto xfer_to_guest_mode_check; } guest_timing_enter_irqoff(); - rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce); + rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, sg->asce.val); guest_timing_exit_irqoff(); local_irq_enable(); } @@ -1215,7 +1187,7 @@ skip_sie: if (rc > 0) rc = 0; /* we could still have an icpt */ else if (current->thread.gmap_int_code) - return handle_fault(vcpu, vsie_page); + return handle_fault(vcpu, vsie_page, sg); switch (scb_s->icptcode) { case ICPT_INST: @@ -1233,7 +1205,7 @@ skip_sie: break; case ICPT_PARTEXEC: if (scb_s->ipa == 0xb254) - rc = vsie_handle_mvpg(vcpu, vsie_page); + rc = vsie_handle_mvpg(vcpu, vsie_page, sg); break; } return rc; @@ -1241,43 +1213,67 @@ skip_sie: static void release_gmap_shadow(struct vsie_page *vsie_page) { - if (vsie_page->gmap) - gmap_put(vsie_page->gmap); - WRITE_ONCE(vsie_page->gmap, NULL); + struct gmap *gmap = vsie_page->gmap_cache.gmap; + + lockdep_assert_held(&gmap->kvm->arch.gmap->children_lock); + + list_del(&vsie_page->gmap_cache.list); + vsie_page->gmap_cache.gmap = NULL; prefix_unmapped(vsie_page); + + if (list_empty(&gmap->scb_users)) { + gmap_remove_child(gmap); + gmap_put(gmap); + } } -static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, - struct vsie_page *vsie_page) +static struct gmap *acquire_gmap_shadow(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { - unsigned long asce; union ctlreg0 cr0; struct gmap *gmap; + union asce asce; int edat; - asce = vcpu->arch.sie_block->gcr[1]; + asce.val = vcpu->arch.sie_block->gcr[1]; cr0.val = vcpu->arch.sie_block->gcr[0]; edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); edat += edat && test_kvm_facility(vcpu->kvm, 78); - /* - * ASCE or EDAT could have changed since last icpt, or the gmap - * we're holding has been unshadowed. If the gmap is still valid, - * we can safely reuse it. - */ - if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) { - vcpu->kvm->stat.gmap_shadow_reuse++; - return 0; + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) { + gmap = vsie_page->gmap_cache.gmap; + if (gmap) { + /* + * ASCE or EDAT could have changed since last icpt, or the gmap + * we're holding has been unshadowed. If the gmap is still valid, + * we can safely reuse it. + */ + if (gmap_is_shadow_valid(gmap, asce, edat)) { + vcpu->kvm->stat.gmap_shadow_reuse++; + gmap_get(gmap); + return gmap; + } + /* release the old shadow and mark the prefix as unmapped */ + release_gmap_shadow(vsie_page); + } } - - /* release the old shadow - if any, and mark the prefix as unmapped */ - release_gmap_shadow(vsie_page); - gmap = gmap_shadow(vcpu->arch.gmap, asce, edat); +again: + gmap = gmap_create_shadow(vcpu->arch.mc, vcpu->kvm->arch.gmap, asce, edat); if (IS_ERR(gmap)) - return PTR_ERR(gmap); - vcpu->kvm->stat.gmap_shadow_create++; - WRITE_ONCE(vsie_page->gmap, gmap); - return 0; + return gmap; + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) { + /* unlikely race condition, remove the previous shadow */ + if (vsie_page->gmap_cache.gmap) + release_gmap_shadow(vsie_page); + if (!gmap->parent) { + gmap_put(gmap); + goto again; + } + vcpu->kvm->stat.gmap_shadow_create++; + list_add(&vsie_page->gmap_cache.list, &gmap->scb_users); + vsie_page->gmap_cache.gmap = gmap; + prefix_unmapped(vsie_page); + } + return gmap; } /* @@ -1330,15 +1326,20 @@ static void unregister_shadow_scb(struct kvm_vcpu *vcpu) static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct gmap *sg; int rc = 0; while (1) { - rc = acquire_gmap_shadow(vcpu, vsie_page); + sg = acquire_gmap_shadow(vcpu, vsie_page); + if (IS_ERR(sg)) { + rc = PTR_ERR(sg); + sg = NULL; + } if (!rc) - rc = map_prefix(vcpu, vsie_page); + rc = map_prefix(vcpu, vsie_page, sg); if (!rc) { update_intervention_requests(vsie_page); - rc = do_vsie_run(vcpu, vsie_page); + rc = do_vsie_run(vcpu, vsie_page, sg); } atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20); @@ -1361,6 +1362,9 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) kvm_s390_rewind_psw(vcpu, 4); break; } + if (sg) + sg = gmap_put(sg); + cond_resched(); } if (rc == -EFAULT) { @@ -1457,8 +1461,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) vsie_page->scb_gpa = ULONG_MAX; /* Double use of the same address or allocation failure. */ - if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, - vsie_page)) { + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, vsie_page)) { put_vsie_page(vsie_page); mutex_unlock(&kvm->arch.vsie.mutex); return NULL; @@ -1467,7 +1470,12 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) mutex_unlock(&kvm->arch.vsie.mutex); memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); - release_gmap_shadow(vsie_page); + if (vsie_page->gmap_cache.gmap) { + scoped_guard(spinlock, &kvm->arch.gmap->children_lock) + if (vsie_page->gmap_cache.gmap) + release_gmap_shadow(vsie_page); + } + prefix_unmapped(vsie_page); vsie_page->fault_addr = 0; vsie_page->scb_s.ihcpu = 0xffffU; return vsie_page; @@ -1498,11 +1506,13 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) } vsie_page = get_vsie_page(vcpu->kvm, scb_addr); - if (IS_ERR(vsie_page)) + if (IS_ERR(vsie_page)) { return PTR_ERR(vsie_page); - else if (!vsie_page) + } else if (!vsie_page) { /* double use of sie control block - simply do nothing */ + kvm_s390_rewind_psw(vcpu, 4); return 0; + } rc = pin_scb(vcpu, vsie_page, scb_addr); if (rc) @@ -1543,8 +1553,10 @@ void kvm_s390_vsie_destroy(struct kvm *kvm) mutex_lock(&kvm->arch.vsie.mutex); for (i = 0; i < kvm->arch.vsie.page_count; i++) { vsie_page = kvm->arch.vsie.pages[i]; + scoped_guard(spinlock, &kvm->arch.gmap->children_lock) + if (vsie_page->gmap_cache.gmap) + release_gmap_shadow(vsie_page); kvm->arch.vsie.pages[i] = NULL; - release_gmap_shadow(vsie_page); /* free the radix tree entry */ if (vsie_page->scb_gpa != ULONG_MAX) radix_tree_delete(&kvm->arch.vsie.addr_to_page, diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 1a6ba105e071..0ac2f3998b14 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -34,136 +34,19 @@ void debug_user_asce(int exit) } #endif /*CONFIG_DEBUG_ENTRY */ -union oac { - unsigned int val; - struct { - struct { - unsigned short key : 4; - unsigned short : 4; - unsigned short as : 2; - unsigned short : 4; - unsigned short k : 1; - unsigned short a : 1; - } oac1; - struct { - unsigned short key : 4; - unsigned short : 4; - unsigned short as : 2; - unsigned short : 4; - unsigned short k : 1; - unsigned short a : 1; - } oac2; - }; -}; - -static uaccess_kmsan_or_inline __must_check unsigned long -raw_copy_from_user_key(void *to, const void __user *from, unsigned long size, unsigned long key) -{ - unsigned long osize; - union oac spec = { - .oac2.key = key, - .oac2.as = PSW_BITS_AS_SECONDARY, - .oac2.k = 1, - .oac2.a = 1, - }; - int cc; - - while (1) { - osize = size; - asm_inline volatile( - " lr %%r0,%[spec]\n" - "0: mvcos %[to],%[from],%[size]\n" - "1: nopr %%r7\n" - CC_IPM(cc) - EX_TABLE_UA_MVCOS_FROM(0b, 0b) - EX_TABLE_UA_MVCOS_FROM(1b, 0b) - : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char *)to) - : [spec] "d" (spec.val), [from] "Q" (*(const char __user *)from) - : CC_CLOBBER_LIST("memory", "0")); - if (CC_TRANSFORM(cc) == 0) - return osize - size; - size -= 4096; - to += 4096; - from += 4096; - } -} - -unsigned long _copy_from_user_key(void *to, const void __user *from, - unsigned long n, unsigned long key) -{ - unsigned long res = n; - - might_fault(); - if (!should_fail_usercopy()) { - instrument_copy_from_user_before(to, from, n); - res = raw_copy_from_user_key(to, from, n, key); - instrument_copy_from_user_after(to, from, n, res); - } - if (unlikely(res)) - memset(to + (n - res), 0, res); - return res; -} -EXPORT_SYMBOL(_copy_from_user_key); - -static uaccess_kmsan_or_inline __must_check unsigned long -raw_copy_to_user_key(void __user *to, const void *from, unsigned long size, unsigned long key) -{ - unsigned long osize; - union oac spec = { - .oac1.key = key, - .oac1.as = PSW_BITS_AS_SECONDARY, - .oac1.k = 1, - .oac1.a = 1, - }; - int cc; - - while (1) { - osize = size; - asm_inline volatile( - " lr %%r0,%[spec]\n" - "0: mvcos %[to],%[from],%[size]\n" - "1: nopr %%r7\n" - CC_IPM(cc) - EX_TABLE_UA_MVCOS_TO(0b, 0b) - EX_TABLE_UA_MVCOS_TO(1b, 0b) - : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char __user *)to) - : [spec] "d" (spec.val), [from] "Q" (*(const char *)from) - : CC_CLOBBER_LIST("memory", "0")); - if (CC_TRANSFORM(cc) == 0) - return osize - size; - size -= 4096; - to += 4096; - from += 4096; - } -} - -unsigned long _copy_to_user_key(void __user *to, const void *from, - unsigned long n, unsigned long key) -{ - might_fault(); - if (should_fail_usercopy()) - return n; - instrument_copy_to_user(to, from, n); - return raw_copy_to_user_key(to, from, n, key); -} -EXPORT_SYMBOL(_copy_to_user_key); - #define CMPXCHG_USER_KEY_MAX_LOOPS 128 -static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsigned int *uval, - unsigned int old, unsigned int new, - unsigned int mask, unsigned long key) +static nokprobe_inline int __cmpxchg_key_small(void *address, unsigned int *uval, + unsigned int old, unsigned int new, + unsigned int mask, unsigned long key) { unsigned long count; unsigned int prev; - bool sacf_flag; int rc = 0; skey_regions_initialize(); - sacf_flag = enable_sacf_uaccess(); asm_inline volatile( "20: spka 0(%[key])\n" - " sacf 256\n" " llill %[count],%[max_loops]\n" "0: l %[prev],%[address]\n" "1: nr %[prev],%[mask]\n" @@ -178,8 +61,7 @@ static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsig " nr %[tmp],%[mask]\n" " jnz 5f\n" " brct %[count],2b\n" - "5: sacf 768\n" - " spka %[default_key]\n" + "5: spka %[default_key]\n" "21:\n" EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) @@ -197,16 +79,16 @@ static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsig [default_key] "J" (PAGE_DEFAULT_KEY), [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) : "memory", "cc"); - disable_sacf_uaccess(sacf_flag); *uval = prev; if (!count) rc = -EAGAIN; return rc; } -int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval, - unsigned char old, unsigned char new, unsigned long key) +int __kprobes __cmpxchg_key1(void *addr, unsigned char *uval, unsigned char old, + unsigned char new, unsigned long key) { + unsigned long address = (unsigned long)addr; unsigned int prev, shift, mask, _old, _new; int rc; @@ -215,15 +97,16 @@ int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval, _old = (unsigned int)old << shift; _new = (unsigned int)new << shift; mask = ~(0xff << shift); - rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key); + rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key); *uval = prev >> shift; return rc; } -EXPORT_SYMBOL(__cmpxchg_user_key1); +EXPORT_SYMBOL(__cmpxchg_key1); -int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval, - unsigned short old, unsigned short new, unsigned long key) +int __kprobes __cmpxchg_key2(void *addr, unsigned short *uval, unsigned short old, + unsigned short new, unsigned long key) { + unsigned long address = (unsigned long)addr; unsigned int prev, shift, mask, _old, _new; int rc; @@ -232,27 +115,23 @@ int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval, _old = (unsigned int)old << shift; _new = (unsigned int)new << shift; mask = ~(0xffff << shift); - rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key); + rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key); *uval = prev >> shift; return rc; } -EXPORT_SYMBOL(__cmpxchg_user_key2); +EXPORT_SYMBOL(__cmpxchg_key2); -int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval, - unsigned int old, unsigned int new, unsigned long key) +int __kprobes __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old, + unsigned int new, unsigned long key) { unsigned int prev = old; - bool sacf_flag; int rc = 0; skey_regions_initialize(); - sacf_flag = enable_sacf_uaccess(); asm_inline volatile( "20: spka 0(%[key])\n" - " sacf 256\n" "0: cs %[prev],%[new],%[address]\n" - "1: sacf 768\n" - " spka %[default_key]\n" + "1: spka %[default_key]\n" "21:\n" EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) @@ -264,27 +143,22 @@ int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval, [key] "a" (key << 4), [default_key] "J" (PAGE_DEFAULT_KEY) : "memory", "cc"); - disable_sacf_uaccess(sacf_flag); *uval = prev; return rc; } -EXPORT_SYMBOL(__cmpxchg_user_key4); +EXPORT_SYMBOL(__cmpxchg_key4); -int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval, - unsigned long old, unsigned long new, unsigned long key) +int __kprobes __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old, + unsigned long new, unsigned long key) { unsigned long prev = old; - bool sacf_flag; int rc = 0; skey_regions_initialize(); - sacf_flag = enable_sacf_uaccess(); asm_inline volatile( "20: spka 0(%[key])\n" - " sacf 256\n" "0: csg %[prev],%[new],%[address]\n" - "1: sacf 768\n" - " spka %[default_key]\n" + "1: spka %[default_key]\n" "21:\n" EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) @@ -296,27 +170,22 @@ int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval, [key] "a" (key << 4), [default_key] "J" (PAGE_DEFAULT_KEY) : "memory", "cc"); - disable_sacf_uaccess(sacf_flag); *uval = prev; return rc; } -EXPORT_SYMBOL(__cmpxchg_user_key8); +EXPORT_SYMBOL(__cmpxchg_key8); -int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval, - __uint128_t old, __uint128_t new, unsigned long key) +int __kprobes __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old, + __uint128_t new, unsigned long key) { __uint128_t prev = old; - bool sacf_flag; int rc = 0; skey_regions_initialize(); - sacf_flag = enable_sacf_uaccess(); asm_inline volatile( "20: spka 0(%[key])\n" - " sacf 256\n" "0: cdsg %[prev],%[new],%[address]\n" - "1: sacf 768\n" - " spka %[default_key]\n" + "1: spka %[default_key]\n" "21:\n" EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev]) EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev]) @@ -328,8 +197,7 @@ int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval, [key] "a" (key << 4), [default_key] "J" (PAGE_DEFAULT_KEY) : "memory", "cc"); - disable_sacf_uaccess(sacf_flag); *uval = prev; return rc; } -EXPORT_SYMBOL(__cmpxchg_user_key16); +EXPORT_SYMBOL(__cmpxchg_key16); diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index bd0401cc7ca5..193899c39ca7 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -10,7 +10,6 @@ obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP) += dump_pagetables.o -obj-$(CONFIG_PGSTE) += gmap.o obj-$(CONFIG_PFAULT) += pfault.o obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e2e13778c36a..a52aa7a99b6b 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -403,7 +403,7 @@ void do_dat_exception(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_dat_exception); -#if IS_ENABLED(CONFIG_PGSTE) +#if IS_ENABLED(CONFIG_KVM) void do_secure_storage_access(struct pt_regs *regs) { @@ -470,4 +470,4 @@ void do_secure_storage_access(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_secure_storage_access); -#endif /* CONFIG_PGSTE */ +#endif /* CONFIG_KVM */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c deleted file mode 100644 index dd85bcca817d..000000000000 --- a/arch/s390/mm/gmap.c +++ /dev/null @@ -1,2436 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * KVM guest address space mapping code - * - * Copyright IBM Corp. 2007, 2020 - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - * David Hildenbrand <david@redhat.com> - * Janosch Frank <frankja@linux.vnet.ibm.com> - */ - -#include <linux/cpufeature.h> -#include <linux/export.h> -#include <linux/kernel.h> -#include <linux/pagewalk.h> -#include <linux/swap.h> -#include <linux/smp.h> -#include <linux/spinlock.h> -#include <linux/slab.h> -#include <linux/swapops.h> -#include <linux/ksm.h> -#include <linux/mman.h> -#include <linux/pgtable.h> -#include <asm/page-states.h> -#include <asm/pgalloc.h> -#include <asm/machine.h> -#include <asm/gmap_helpers.h> -#include <asm/gmap.h> -#include <asm/page.h> - -/* - * The address is saved in a radix tree directly; NULL would be ambiguous, - * since 0 is a valid address, and NULL is returned when nothing was found. - * The lower bits are ignored by all users of the macro, so it can be used - * to distinguish a valid address 0 from a NULL. - */ -#define VALID_GADDR_FLAG 1 -#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) -#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) - -#define GMAP_SHADOW_FAKE_TABLE 1ULL - -static struct page *gmap_alloc_crst(void) -{ - struct page *page; - - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); - if (!page) - return NULL; - __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); - return page; -} - -/** - * gmap_alloc - allocate and initialize a guest address space - * @limit: maximum address of the gmap address space - * - * Returns a guest address space structure. - */ -struct gmap *gmap_alloc(unsigned long limit) -{ - struct gmap *gmap; - struct page *page; - unsigned long *table; - unsigned long etype, atype; - - if (limit < _REGION3_SIZE) { - limit = _REGION3_SIZE - 1; - atype = _ASCE_TYPE_SEGMENT; - etype = _SEGMENT_ENTRY_EMPTY; - } else if (limit < _REGION2_SIZE) { - limit = _REGION2_SIZE - 1; - atype = _ASCE_TYPE_REGION3; - etype = _REGION3_ENTRY_EMPTY; - } else if (limit < _REGION1_SIZE) { - limit = _REGION1_SIZE - 1; - atype = _ASCE_TYPE_REGION2; - etype = _REGION2_ENTRY_EMPTY; - } else { - limit = -1UL; - atype = _ASCE_TYPE_REGION1; - etype = _REGION1_ENTRY_EMPTY; - } - gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); - if (!gmap) - goto out; - INIT_LIST_HEAD(&gmap->children); - INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); - INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); - INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); - spin_lock_init(&gmap->guest_table_lock); - spin_lock_init(&gmap->shadow_lock); - refcount_set(&gmap->ref_count, 1); - page = gmap_alloc_crst(); - if (!page) - goto out_free; - table = page_to_virt(page); - crst_table_init(table, etype); - gmap->table = table; - gmap->asce = atype | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | __pa(table); - gmap->asce_end = limit; - return gmap; - -out_free: - kfree(gmap); -out: - return NULL; -} -EXPORT_SYMBOL_GPL(gmap_alloc); - -/** - * gmap_create - create a guest address space - * @mm: pointer to the parent mm_struct - * @limit: maximum size of the gmap address space - * - * Returns a guest address space structure. - */ -struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) -{ - struct gmap *gmap; - unsigned long gmap_asce; - - gmap = gmap_alloc(limit); - if (!gmap) - return NULL; - gmap->mm = mm; - spin_lock(&mm->context.lock); - list_add_rcu(&gmap->list, &mm->context.gmap_list); - if (list_is_singular(&mm->context.gmap_list)) - gmap_asce = gmap->asce; - else - gmap_asce = -1UL; - WRITE_ONCE(mm->context.gmap_asce, gmap_asce); - spin_unlock(&mm->context.lock); - return gmap; -} -EXPORT_SYMBOL_GPL(gmap_create); - -static void gmap_flush_tlb(struct gmap *gmap) -{ - __tlb_flush_idte(gmap->asce); -} - -static void gmap_radix_tree_free(struct radix_tree_root *root) -{ - struct radix_tree_iter iter; - unsigned long indices[16]; - unsigned long index; - void __rcu **slot; - int i, nr; - - /* A radix tree is freed by deleting all of its entries */ - index = 0; - do { - nr = 0; - radix_tree_for_each_slot(slot, root, &iter, index) { - indices[nr] = iter.index; - if (++nr == 16) - break; - } - for (i = 0; i < nr; i++) { - index = indices[i]; - radix_tree_delete(root, index); - } - } while (nr > 0); -} - -static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) -{ - struct gmap_rmap *rmap, *rnext, *head; - struct radix_tree_iter iter; - unsigned long indices[16]; - unsigned long index; - void __rcu **slot; - int i, nr; - - /* A radix tree is freed by deleting all of its entries */ - index = 0; - do { - nr = 0; - radix_tree_for_each_slot(slot, root, &iter, index) { - indices[nr] = iter.index; - if (++nr == 16) - break; - } - for (i = 0; i < nr; i++) { - index = indices[i]; - head = radix_tree_delete(root, index); - gmap_for_each_rmap_safe(rmap, rnext, head) - kfree(rmap); - } - } while (nr > 0); -} - -static void gmap_free_crst(unsigned long *table, bool free_ptes) -{ - bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; - int i; - - if (is_segment) { - if (!free_ptes) - goto out; - for (i = 0; i < _CRST_ENTRIES; i++) - if (!(table[i] & _SEGMENT_ENTRY_INVALID)) - page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); - } else { - for (i = 0; i < _CRST_ENTRIES; i++) - if (!(table[i] & _REGION_ENTRY_INVALID)) - gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); - } - -out: - free_pages((unsigned long)table, CRST_ALLOC_ORDER); -} - -/** - * gmap_free - free a guest address space - * @gmap: pointer to the guest address space structure - * - * No locks required. There are no references to this gmap anymore. - */ -void gmap_free(struct gmap *gmap) -{ - /* Flush tlb of all gmaps (if not already done for shadows) */ - if (!(gmap_is_shadow(gmap) && gmap->removed)) - gmap_flush_tlb(gmap); - /* Free all segment & region tables. */ - gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); - - gmap_radix_tree_free(&gmap->guest_to_host); - gmap_radix_tree_free(&gmap->host_to_guest); - - /* Free additional data for a shadow gmap */ - if (gmap_is_shadow(gmap)) { - gmap_rmap_radix_tree_free(&gmap->host_to_rmap); - /* Release reference to the parent */ - gmap_put(gmap->parent); - } - - kfree(gmap); -} -EXPORT_SYMBOL_GPL(gmap_free); - -/** - * gmap_get - increase reference counter for guest address space - * @gmap: pointer to the guest address space structure - * - * Returns the gmap pointer - */ -struct gmap *gmap_get(struct gmap *gmap) -{ - refcount_inc(&gmap->ref_count); - return gmap; -} -EXPORT_SYMBOL_GPL(gmap_get); - -/** - * gmap_put - decrease reference counter for guest address space - * @gmap: pointer to the guest address space structure - * - * If the reference counter reaches zero the guest address space is freed. - */ -void gmap_put(struct gmap *gmap) -{ - if (refcount_dec_and_test(&gmap->ref_count)) - gmap_free(gmap); -} -EXPORT_SYMBOL_GPL(gmap_put); - -/** - * gmap_remove - remove a guest address space but do not free it yet - * @gmap: pointer to the guest address space structure - */ -void gmap_remove(struct gmap *gmap) -{ - struct gmap *sg, *next; - unsigned long gmap_asce; - - /* Remove all shadow gmaps linked to this gmap */ - if (!list_empty(&gmap->children)) { - spin_lock(&gmap->shadow_lock); - list_for_each_entry_safe(sg, next, &gmap->children, list) { - list_del(&sg->list); - gmap_put(sg); - } - spin_unlock(&gmap->shadow_lock); - } - /* Remove gmap from the pre-mm list */ - spin_lock(&gmap->mm->context.lock); - list_del_rcu(&gmap->list); - if (list_empty(&gmap->mm->context.gmap_list)) - gmap_asce = 0; - else if (list_is_singular(&gmap->mm->context.gmap_list)) - gmap_asce = list_first_entry(&gmap->mm->context.gmap_list, - struct gmap, list)->asce; - else - gmap_asce = -1UL; - WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); - spin_unlock(&gmap->mm->context.lock); - synchronize_rcu(); - /* Put reference */ - gmap_put(gmap); -} -EXPORT_SYMBOL_GPL(gmap_remove); - -/* - * gmap_alloc_table is assumed to be called with mmap_lock held - */ -static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, - unsigned long init, unsigned long gaddr) -{ - struct page *page; - unsigned long *new; - - /* since we dont free the gmap table until gmap_free we can unlock */ - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - new = page_to_virt(page); - crst_table_init(new, init); - spin_lock(&gmap->guest_table_lock); - if (*table & _REGION_ENTRY_INVALID) { - *table = __pa(new) | _REGION_ENTRY_LENGTH | - (*table & _REGION_ENTRY_TYPE_MASK); - page = NULL; - } - spin_unlock(&gmap->guest_table_lock); - if (page) - __free_pages(page, CRST_ALLOC_ORDER); - return 0; -} - -static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) -{ - return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); -} - -static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) -{ - return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); -} - -static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, - unsigned long *gaddr) -{ - *gaddr = host_to_guest_delete(gmap, vmaddr); - if (IS_GADDR_VALID(*gaddr)) - return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); - return NULL; -} - -/** - * __gmap_unlink_by_vmaddr - unlink a single segment via a host address - * @gmap: pointer to the guest address space structure - * @vmaddr: address in the host process address space - * - * Returns 1 if a TLB flush is required - */ -static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) -{ - unsigned long gaddr; - int flush = 0; - pmd_t *pmdp; - - BUG_ON(gmap_is_shadow(gmap)); - spin_lock(&gmap->guest_table_lock); - - pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); - if (pmdp) { - flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); - *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); - } - - spin_unlock(&gmap->guest_table_lock); - return flush; -} - -/** - * __gmap_unmap_by_gaddr - unmap a single segment via a guest address - * @gmap: pointer to the guest address space structure - * @gaddr: address in the guest address space - * - * Returns 1 if a TLB flush is required - */ -static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr; - - vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0; -} - -/** - * gmap_unmap_segment - unmap segment from the guest address space - * @gmap: pointer to the guest address space structure - * @to: address in the guest address space - * @len: length of the memory area to unmap - * - * Returns 0 if the unmap succeeded, -EINVAL if not. - */ -int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) -{ - unsigned long off; - int flush; - - BUG_ON(gmap_is_shadow(gmap)); - if ((to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || to + len < to) - return -EINVAL; - - flush = 0; - mmap_write_lock(gmap->mm); - for (off = 0; off < len; off += PMD_SIZE) - flush |= __gmap_unmap_by_gaddr(gmap, to + off); - mmap_write_unlock(gmap->mm); - if (flush) - gmap_flush_tlb(gmap); - return 0; -} -EXPORT_SYMBOL_GPL(gmap_unmap_segment); - -/** - * gmap_map_segment - map a segment to the guest address space - * @gmap: pointer to the guest address space structure - * @from: source address in the parent address space - * @to: target address in the guest address space - * @len: length of the memory area to map - * - * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not. - */ -int gmap_map_segment(struct gmap *gmap, unsigned long from, - unsigned long to, unsigned long len) -{ - unsigned long off; - int flush; - - BUG_ON(gmap_is_shadow(gmap)); - if ((from | to | len) & (PMD_SIZE - 1)) - return -EINVAL; - if (len == 0 || from + len < from || to + len < to || - from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end) - return -EINVAL; - - flush = 0; - mmap_write_lock(gmap->mm); - for (off = 0; off < len; off += PMD_SIZE) { - /* Remove old translation */ - flush |= __gmap_unmap_by_gaddr(gmap, to + off); - /* Store new translation */ - if (radix_tree_insert(&gmap->guest_to_host, - (to + off) >> PMD_SHIFT, - (void *) from + off)) - break; - } - mmap_write_unlock(gmap->mm); - if (flush) - gmap_flush_tlb(gmap); - if (off >= len) - return 0; - gmap_unmap_segment(gmap, to, len); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(gmap_map_segment); - -/** - * __gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - * The mmap_lock of the mm that belongs to the address space must be held - * when this function gets called. - * - * Note: Can also be called for shadow gmaps. - */ -unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr; - - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); - /* Note: guest_to_host is empty for a shadow gmap */ - return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; -} -EXPORT_SYMBOL_GPL(__gmap_translate); - -/** - * gmap_unlink - disconnect a page table from the gmap shadow tables - * @mm: pointer to the parent mm_struct - * @table: pointer to the host page table - * @vmaddr: vm address associated with the host page table - */ -void gmap_unlink(struct mm_struct *mm, unsigned long *table, - unsigned long vmaddr) -{ - struct gmap *gmap; - int flush; - - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); - if (flush) - gmap_flush_tlb(gmap); - } - rcu_read_unlock(); -} - -static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, - unsigned long gaddr); - -/** - * __gmap_link - set up shadow page tables to connect a host to a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @vmaddr: vm address - * - * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT - * if the vm address is already mapped to a different guest segment. - * The mmap_lock of the mm that belongs to the address space must be held - * when this function gets called. - */ -int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) -{ - struct mm_struct *mm; - unsigned long *table; - spinlock_t *ptl; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - u64 unprot; - int rc; - - BUG_ON(gmap_is_shadow(gmap)); - /* Create higher level tables in the gmap page table */ - table = gmap->table; - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { - table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, - gaddr & _REGION1_MASK)) - return -ENOMEM; - table = __va(*table & _REGION_ENTRY_ORIGIN); - } - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { - table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, - gaddr & _REGION2_MASK)) - return -ENOMEM; - table = __va(*table & _REGION_ENTRY_ORIGIN); - } - if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { - table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; - if ((*table & _REGION_ENTRY_INVALID) && - gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, - gaddr & _REGION3_MASK)) - return -ENOMEM; - table = __va(*table & _REGION_ENTRY_ORIGIN); - } - table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; - /* Walk the parent mm page table */ - mm = gmap->mm; - pgd = pgd_offset(mm, vmaddr); - VM_BUG_ON(pgd_none(*pgd)); - p4d = p4d_offset(pgd, vmaddr); - VM_BUG_ON(p4d_none(*p4d)); - pud = pud_offset(p4d, vmaddr); - VM_BUG_ON(pud_none(*pud)); - /* large puds cannot yet be handled */ - if (pud_leaf(*pud)) - return -EFAULT; - pmd = pmd_offset(pud, vmaddr); - VM_BUG_ON(pmd_none(*pmd)); - /* Are we allowed to use huge pages? */ - if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) - return -EFAULT; - /* Link gmap segment table entry location to page table. */ - rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); - if (rc) - return rc; - ptl = pmd_lock(mm, pmd); - spin_lock(&gmap->guest_table_lock); - if (*table == _SEGMENT_ENTRY_EMPTY) { - rc = radix_tree_insert(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT, - (void *)MAKE_VALID_GADDR(gaddr)); - if (!rc) { - if (pmd_leaf(*pmd)) { - *table = (pmd_val(*pmd) & - _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) - | _SEGMENT_ENTRY_GMAP_UC - | _SEGMENT_ENTRY; - } else - *table = (pmd_val(*pmd) & - _SEGMENT_ENTRY_HARDWARE_BITS) - | _SEGMENT_ENTRY; - } - } else if (*table & _SEGMENT_ENTRY_PROTECT && - !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) { - unprot = (u64)*table; - unprot &= ~_SEGMENT_ENTRY_PROTECT; - unprot |= _SEGMENT_ENTRY_GMAP_UC; - gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr); - } - spin_unlock(&gmap->guest_table_lock); - spin_unlock(ptl); - radix_tree_preload_end(); - return rc; -} -EXPORT_SYMBOL(__gmap_link); - -/* - * this function is assumed to be called with mmap_lock held - */ -void __gmap_zap(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long vmaddr; - - mmap_assert_locked(gmap->mm); - - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (vmaddr) { - vmaddr |= gaddr & ~PMD_MASK; - gmap_helper_zap_one_page(gmap->mm, vmaddr); - } -} -EXPORT_SYMBOL_GPL(__gmap_zap); - -static LIST_HEAD(gmap_notifier_list); -static DEFINE_SPINLOCK(gmap_notifier_lock); - -/** - * gmap_register_pte_notifier - register a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_register_pte_notifier(struct gmap_notifier *nb) -{ - spin_lock(&gmap_notifier_lock); - list_add_rcu(&nb->list, &gmap_notifier_list); - spin_unlock(&gmap_notifier_lock); -} -EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); - -/** - * gmap_unregister_pte_notifier - remove a pte invalidation callback - * @nb: pointer to the gmap notifier block - */ -void gmap_unregister_pte_notifier(struct gmap_notifier *nb) -{ - spin_lock(&gmap_notifier_lock); - list_del_rcu(&nb->list); - spin_unlock(&gmap_notifier_lock); - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); - -/** - * gmap_call_notifier - call all registered invalidation callbacks - * @gmap: pointer to guest mapping meta data structure - * @start: start virtual address in the guest address space - * @end: end virtual address in the guest address space - */ -static void gmap_call_notifier(struct gmap *gmap, unsigned long start, - unsigned long end) -{ - struct gmap_notifier *nb; - - list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(gmap, start, end); -} - -/** - * gmap_table_walk - walk the gmap page tables - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @level: page table level to stop at - * - * Returns a table entry pointer for the given guest address and @level - * @level=0 : returns a pointer to a page table table entry (or NULL) - * @level=1 : returns a pointer to a segment table entry (or NULL) - * @level=2 : returns a pointer to a region-3 table entry (or NULL) - * @level=3 : returns a pointer to a region-2 table entry (or NULL) - * @level=4 : returns a pointer to a region-1 table entry (or NULL) - * - * Returns NULL if the gmap page tables could not be walked to the - * requested level. - * - * Note: Can also be called for shadow gmaps. - */ -unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) -{ - const int asce_type = gmap->asce & _ASCE_TYPE_MASK; - unsigned long *table = gmap->table; - - if (gmap_is_shadow(gmap) && gmap->removed) - return NULL; - - if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) - return NULL; - - if (asce_type != _ASCE_TYPE_REGION1 && - gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) - return NULL; - - switch (asce_type) { - case _ASCE_TYPE_REGION1: - table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; - if (level == 4) - break; - if (*table & _REGION_ENTRY_INVALID) - return NULL; - table = __va(*table & _REGION_ENTRY_ORIGIN); - fallthrough; - case _ASCE_TYPE_REGION2: - table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; - if (level == 3) - break; - if (*table & _REGION_ENTRY_INVALID) - return NULL; - table = __va(*table & _REGION_ENTRY_ORIGIN); - fallthrough; - case _ASCE_TYPE_REGION3: - table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; - if (level == 2) - break; - if (*table & _REGION_ENTRY_INVALID) - return NULL; - table = __va(*table & _REGION_ENTRY_ORIGIN); - fallthrough; - case _ASCE_TYPE_SEGMENT: - table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; - if (level == 1) - break; - if (*table & _REGION_ENTRY_INVALID) - return NULL; - table = __va(*table & _SEGMENT_ENTRY_ORIGIN); - table += (gaddr & _PAGE_INDEX) >> PAGE_SHIFT; - } - return table; -} -EXPORT_SYMBOL(gmap_table_walk); - -/** - * gmap_pte_op_walk - walk the gmap page table, get the page table lock - * and return the pte pointer - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @ptl: pointer to the spinlock pointer - * - * Returns a pointer to the locked pte for a guest address, or NULL - */ -static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, - spinlock_t **ptl) -{ - unsigned long *table; - - BUG_ON(gmap_is_shadow(gmap)); - /* Walk the gmap page table, lock and get pte pointer */ - table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ - if (!table || *table & _SEGMENT_ENTRY_INVALID) - return NULL; - return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); -} - -/** - * gmap_pte_op_fixup - force a page in and connect the gmap page table - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @vmaddr: address in the host process address space - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * - * Returns 0 if the caller can retry __gmap_translate (might fail again), - * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing - * up or connecting the gmap page table. - */ -static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, - unsigned long vmaddr, int prot) -{ - struct mm_struct *mm = gmap->mm; - unsigned int fault_flags; - bool unlocked = false; - - BUG_ON(gmap_is_shadow(gmap)); - fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; - if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) - return -EFAULT; - if (unlocked) - /* lost mmap_lock, caller has to retry __gmap_translate */ - return 0; - /* Connect the page tables */ - return __gmap_link(gmap, gaddr, vmaddr); -} - -/** - * gmap_pte_op_end - release the page table lock - * @ptep: pointer to the locked pte - * @ptl: pointer to the page table spinlock - */ -static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) -{ - pte_unmap_unlock(ptep, ptl); -} - -/** - * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock - * and return the pmd pointer - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * - * Returns a pointer to the pmd for a guest address, or NULL - */ -static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) -{ - pmd_t *pmdp; - - BUG_ON(gmap_is_shadow(gmap)); - pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); - if (!pmdp) - return NULL; - - /* without huge pages, there is no need to take the table lock */ - if (!gmap->mm->context.allow_gmap_hpage_1m) - return pmd_none(*pmdp) ? NULL : pmdp; - - spin_lock(&gmap->guest_table_lock); - if (pmd_none(*pmdp)) { - spin_unlock(&gmap->guest_table_lock); - return NULL; - } - - /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ - if (!pmd_leaf(*pmdp)) - spin_unlock(&gmap->guest_table_lock); - return pmdp; -} - -/** - * gmap_pmd_op_end - release the guest_table_lock if needed - * @gmap: pointer to the guest mapping meta data structure - * @pmdp: pointer to the pmd - */ -static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) -{ - if (pmd_leaf(*pmdp)) - spin_unlock(&gmap->guest_table_lock); -} - -/* - * gmap_protect_pmd - remove access rights to memory and set pmd notification bits - * @pmdp: pointer to the pmd to be protected - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: notification bits to set - * - * Returns: - * 0 if successfully protected - * -EAGAIN if a fixup is needed - * -EINVAL if unsupported notifier bits have been specified - * - * Expected to be called with sg->mm->mmap_lock in read and - * guest_table_lock held. - */ -static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, - pmd_t *pmdp, int prot, unsigned long bits) -{ - int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID; - int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT; - pmd_t new = *pmdp; - - /* Fixup needed */ - if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE))) - return -EAGAIN; - - if (prot == PROT_NONE && !pmd_i) { - new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); - gmap_pmdp_xchg(gmap, pmdp, new, gaddr); - } - - if (prot == PROT_READ && !pmd_p) { - new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); - new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); - gmap_pmdp_xchg(gmap, pmdp, new, gaddr); - } - - if (bits & GMAP_NOTIFY_MPROT) - set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); - - /* Shadow GMAP protection needs split PMDs */ - if (bits & GMAP_NOTIFY_SHADOW) - return -EINVAL; - - return 0; -} - -/* - * gmap_protect_pte - remove access rights to memory and set pgste bits - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @pmdp: pointer to the pmd associated with the pte - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: notification bits to set - * - * Returns 0 if successfully protected, -ENOMEM if out of memory and - * -EAGAIN if a fixup is needed. - * - * Expected to be called with sg->mm->mmap_lock in read - */ -static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, - pmd_t *pmdp, int prot, unsigned long bits) -{ - int rc; - pte_t *ptep; - spinlock_t *ptl; - unsigned long pbits = 0; - - if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) - return -EAGAIN; - - ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl); - if (!ptep) - return -ENOMEM; - - pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0; - pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; - /* Protect and unlock. */ - rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); - gmap_pte_op_end(ptep, ptl); - return rc; -} - -/* - * gmap_protect_range - remove access rights to memory and set pgste bits - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @len: size of area - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bits: pgste notification bits to set - * - * Returns: - * PAGE_SIZE if a small page was successfully protected; - * HPAGE_SIZE if a large page was successfully protected; - * -ENOMEM if out of memory; - * -EFAULT if gaddr is invalid (or mapping for shadows is missing); - * -EAGAIN if the guest mapping is missing and should be fixed by the caller. - * - * Context: Called with sg->mm->mmap_lock in read. - */ -int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) -{ - pmd_t *pmdp; - int rc = 0; - - BUG_ON(gmap_is_shadow(gmap)); - - pmdp = gmap_pmd_op_walk(gmap, gaddr); - if (!pmdp) - return -EAGAIN; - - if (!pmd_leaf(*pmdp)) { - rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); - if (!rc) - rc = PAGE_SIZE; - } else { - rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); - if (!rc) - rc = HPAGE_SIZE; - } - gmap_pmd_op_end(gmap, pmdp); - - return rc; -} -EXPORT_SYMBOL_GPL(gmap_protect_one); - -/** - * gmap_read_table - get an unsigned long value from a guest page table using - * absolute addressing, without marking the page referenced. - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @val: pointer to the unsigned long value to return - * - * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT - * if reading using the virtual address failed. -EINVAL if called on a gmap - * shadow. - * - * Called with gmap->mm->mmap_lock in read. - */ -int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) -{ - unsigned long address, vmaddr; - spinlock_t *ptl; - pte_t *ptep, pte; - int rc; - - if (gmap_is_shadow(gmap)) - return -EINVAL; - - while (1) { - rc = -EAGAIN; - ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); - if (ptep) { - pte = *ptep; - if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { - address = pte_val(pte) & PAGE_MASK; - address += gaddr & ~PAGE_MASK; - *val = *(unsigned long *)__va(address); - set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); - /* Do *NOT* clear the _PAGE_INVALID bit! */ - rc = 0; - } - gmap_pte_op_end(ptep, ptl); - } - if (!rc) - break; - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - break; - } - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); - if (rc) - break; - } - return rc; -} -EXPORT_SYMBOL_GPL(gmap_read_table); - -/** - * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree - * @sg: pointer to the shadow guest address space structure - * @vmaddr: vm address associated with the rmap - * @rmap: pointer to the rmap structure - * - * Called with the sg->guest_table_lock - */ -static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, - struct gmap_rmap *rmap) -{ - struct gmap_rmap *temp; - void __rcu **slot; - - BUG_ON(!gmap_is_shadow(sg)); - slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); - if (slot) { - rmap->next = radix_tree_deref_slot_protected(slot, - &sg->guest_table_lock); - for (temp = rmap->next; temp; temp = temp->next) { - if (temp->raddr == rmap->raddr) { - kfree(rmap); - return; - } - } - radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); - } else { - rmap->next = NULL; - radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, - rmap); - } -} - -/** - * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow gmap - * @paddr: address in the parent guest address space - * @len: length of the memory area to protect - * - * Returns 0 if successfully protected and the rmap was created, -ENOMEM - * if out of memory and -EFAULT if paddr is invalid. - */ -static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, - unsigned long paddr, unsigned long len) -{ - struct gmap *parent; - struct gmap_rmap *rmap; - unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - parent = sg->parent; - while (len) { - vmaddr = __gmap_translate(parent, paddr); - if (IS_ERR_VALUE(vmaddr)) - return vmaddr; - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); - if (!rmap) - return -ENOMEM; - rmap->raddr = raddr; - rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); - if (rc) { - kfree(rmap); - return rc; - } - rc = -EAGAIN; - ptep = gmap_pte_op_walk(parent, paddr, &ptl); - if (ptep) { - spin_lock(&sg->guest_table_lock); - rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ, - PGSTE_VSIE_BIT); - if (!rc) - gmap_insert_rmap(sg, vmaddr, rmap); - spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptep, ptl); - } - radix_tree_preload_end(); - if (rc) { - kfree(rmap); - rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ); - if (rc) - return rc; - continue; - } - paddr += PAGE_SIZE; - len -= PAGE_SIZE; - } - return 0; -} - -#define _SHADOW_RMAP_MASK 0x7 -#define _SHADOW_RMAP_REGION1 0x5 -#define _SHADOW_RMAP_REGION2 0x4 -#define _SHADOW_RMAP_REGION3 0x3 -#define _SHADOW_RMAP_SEGMENT 0x2 -#define _SHADOW_RMAP_PGTABLE 0x1 - -/** - * gmap_idte_one - invalidate a single region or segment table entry - * @asce: region or segment table *origin* + table-type bits - * @vaddr: virtual address to identify the table entry to flush - * - * The invalid bit of a single region or segment table entry is set - * and the associated TLB entries depending on the entry are flushed. - * The table-type of the @asce identifies the portion of the @vaddr - * that is used as the invalidation index. - */ -static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) -{ - asm volatile( - " idte %0,0,%1" - : : "a" (asce), "a" (vaddr) : "cc", "memory"); -} - -/** - * gmap_unshadow_page - remove a page from a shadow page table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * - * Called with the sg->guest_table_lock - */ -static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) -{ - unsigned long *table; - - BUG_ON(!gmap_is_shadow(sg)); - table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ - if (!table || *table & _PAGE_INVALID) - return; - gmap_call_notifier(sg, raddr, raddr + PAGE_SIZE - 1); - ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); -} - -/** - * __gmap_unshadow_pgt - remove all entries from a shadow page table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * @pgt: pointer to the start of a shadow page table - * - * Called with the sg->guest_table_lock - */ -static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, - unsigned long *pgt) -{ - int i; - - BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _PAGE_ENTRIES; i++, raddr += PAGE_SIZE) - pgt[i] = _PAGE_INVALID; -} - -/** - * gmap_unshadow_pgt - remove a shadow page table from a segment entry - * @sg: pointer to the shadow guest address space structure - * @raddr: address in the shadow guest address space - * - * Called with the sg->guest_table_lock - */ -static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) -{ - unsigned long *ste; - phys_addr_t sto, pgt; - struct ptdesc *ptdesc; - - BUG_ON(!gmap_is_shadow(sg)); - ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ - if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) - return; - gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); - sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); - gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); - pgt = *ste & _SEGMENT_ENTRY_ORIGIN; - *ste = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, __va(pgt)); - /* Free page table */ - ptdesc = page_ptdesc(phys_to_page(pgt)); - page_table_free_pgste(ptdesc); -} - -/** - * __gmap_unshadow_sgt - remove all entries from a shadow segment table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * @sgt: pointer to the start of a shadow segment table - * - * Called with the sg->guest_table_lock - */ -static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, - unsigned long *sgt) -{ - struct ptdesc *ptdesc; - phys_addr_t pgt; - int i; - - BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { - if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) - continue; - pgt = sgt[i] & _REGION_ENTRY_ORIGIN; - sgt[i] = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, __va(pgt)); - /* Free page table */ - ptdesc = page_ptdesc(phys_to_page(pgt)); - page_table_free_pgste(ptdesc); - } -} - -/** - * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * - * Called with the shadow->guest_table_lock - */ -static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) -{ - unsigned long r3o, *r3e; - phys_addr_t sgt; - struct page *page; - - BUG_ON(!gmap_is_shadow(sg)); - r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ - if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) - return; - gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); - r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); - gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); - sgt = *r3e & _REGION_ENTRY_ORIGIN; - *r3e = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, __va(sgt)); - /* Free segment table */ - page = phys_to_page(sgt); - __free_pages(page, CRST_ALLOC_ORDER); -} - -/** - * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table - * @sg: pointer to the shadow guest address space structure - * @raddr: address in the shadow guest address space - * @r3t: pointer to the start of a shadow region-3 table - * - * Called with the sg->guest_table_lock - */ -static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, - unsigned long *r3t) -{ - struct page *page; - phys_addr_t sgt; - int i; - - BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { - if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) - continue; - sgt = r3t[i] & _REGION_ENTRY_ORIGIN; - r3t[i] = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, __va(sgt)); - /* Free segment table */ - page = phys_to_page(sgt); - __free_pages(page, CRST_ALLOC_ORDER); - } -} - -/** - * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * - * Called with the sg->guest_table_lock - */ -static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) -{ - unsigned long r2o, *r2e; - phys_addr_t r3t; - struct page *page; - - BUG_ON(!gmap_is_shadow(sg)); - r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ - if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) - return; - gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); - r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); - gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); - r3t = *r2e & _REGION_ENTRY_ORIGIN; - *r2e = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, __va(r3t)); - /* Free region 3 table */ - page = phys_to_page(r3t); - __free_pages(page, CRST_ALLOC_ORDER); -} - -/** - * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * @r2t: pointer to the start of a shadow region-2 table - * - * Called with the sg->guest_table_lock - */ -static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, - unsigned long *r2t) -{ - phys_addr_t r3t; - struct page *page; - int i; - - BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { - if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) - continue; - r3t = r2t[i] & _REGION_ENTRY_ORIGIN; - r2t[i] = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, __va(r3t)); - /* Free region 3 table */ - page = phys_to_page(r3t); - __free_pages(page, CRST_ALLOC_ORDER); - } -} - -/** - * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * - * Called with the sg->guest_table_lock - */ -static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) -{ - unsigned long r1o, *r1e; - struct page *page; - phys_addr_t r2t; - - BUG_ON(!gmap_is_shadow(sg)); - r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ - if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) - return; - gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); - r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); - gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); - r2t = *r1e & _REGION_ENTRY_ORIGIN; - *r1e = _REGION1_ENTRY_EMPTY; - __gmap_unshadow_r2t(sg, raddr, __va(r2t)); - /* Free region 2 table */ - page = phys_to_page(r2t); - __free_pages(page, CRST_ALLOC_ORDER); -} - -/** - * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table - * @sg: pointer to the shadow guest address space structure - * @raddr: rmap address in the shadow guest address space - * @r1t: pointer to the start of a shadow region-1 table - * - * Called with the shadow->guest_table_lock - */ -static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, - unsigned long *r1t) -{ - unsigned long asce; - struct page *page; - phys_addr_t r2t; - int i; - - BUG_ON(!gmap_is_shadow(sg)); - asce = __pa(r1t) | _ASCE_TYPE_REGION1; - for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { - if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) - continue; - r2t = r1t[i] & _REGION_ENTRY_ORIGIN; - __gmap_unshadow_r2t(sg, raddr, __va(r2t)); - /* Clear entry and flush translation r1t -> r2t */ - gmap_idte_one(asce, raddr); - r1t[i] = _REGION1_ENTRY_EMPTY; - /* Free region 2 table */ - page = phys_to_page(r2t); - __free_pages(page, CRST_ALLOC_ORDER); - } -} - -/** - * gmap_unshadow - remove a shadow page table completely - * @sg: pointer to the shadow guest address space structure - * - * Called with sg->guest_table_lock - */ -void gmap_unshadow(struct gmap *sg) -{ - unsigned long *table; - - BUG_ON(!gmap_is_shadow(sg)); - if (sg->removed) - return; - sg->removed = 1; - gmap_call_notifier(sg, 0, -1UL); - gmap_flush_tlb(sg); - table = __va(sg->asce & _ASCE_ORIGIN); - switch (sg->asce & _ASCE_TYPE_MASK) { - case _ASCE_TYPE_REGION1: - __gmap_unshadow_r1t(sg, 0, table); - break; - case _ASCE_TYPE_REGION2: - __gmap_unshadow_r2t(sg, 0, table); - break; - case _ASCE_TYPE_REGION3: - __gmap_unshadow_r3t(sg, 0, table); - break; - case _ASCE_TYPE_SEGMENT: - __gmap_unshadow_sgt(sg, 0, table); - break; - } -} -EXPORT_SYMBOL(gmap_unshadow); - -/** - * gmap_shadow_r2t - create an empty shadow region 2 table - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @r2t: parent gmap address of the region 2 table to get shadowed - * @fake: r2t references contiguous guest memory block, not a r2t - * - * The r2t parameter specifies the address of the source table. The - * four pages of the source table are made read-only in the parent gmap - * address space. A write to the source table area @r2t will automatically - * remove the shadow r2 table and all of its descendants. - * - * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory and - * -EFAULT if an address in the parent gmap could not be resolved. - * - * Called with sg->mm->mmap_lock in read. - */ -int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, - int fake) -{ - unsigned long raddr, origin, offset, len; - unsigned long *table; - phys_addr_t s_r2t; - struct page *page; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - /* Allocate a shadow region second table */ - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - s_r2t = page_to_phys(page); - /* Install shadow region second table */ - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ - if (!table) { - rc = -EAGAIN; /* Race with unshadow */ - goto out_free; - } - if (!(*table & _REGION_ENTRY_INVALID)) { - rc = 0; /* Already established */ - goto out_free; - } else if (*table & _REGION_ENTRY_ORIGIN) { - rc = -EAGAIN; /* Race with shadow */ - goto out_free; - } - crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); - /* mark as invalid as long as the parent table is not protected */ - *table = s_r2t | _REGION_ENTRY_LENGTH | - _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; - if (sg->edat_level >= 1) - *table |= (r2t & _REGION_ENTRY_PROTECT); - if (fake) { - /* nothing to protect for fake tables */ - *table &= ~_REGION_ENTRY_INVALID; - spin_unlock(&sg->guest_table_lock); - return 0; - } - spin_unlock(&sg->guest_table_lock); - /* Make r2t read-only in parent gmap page table */ - raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; - origin = r2t & _REGION_ENTRY_ORIGIN; - offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; - len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len); - spin_lock(&sg->guest_table_lock); - if (!rc) { - table = gmap_table_walk(sg, saddr, 4); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) - rc = -EAGAIN; /* Race with unshadow */ - else - *table &= ~_REGION_ENTRY_INVALID; - } else { - gmap_unshadow_r2t(sg, raddr); - } - spin_unlock(&sg->guest_table_lock); - return rc; -out_free: - spin_unlock(&sg->guest_table_lock); - __free_pages(page, CRST_ALLOC_ORDER); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_shadow_r2t); - -/** - * gmap_shadow_r3t - create a shadow region 3 table - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @r3t: parent gmap address of the region 3 table to get shadowed - * @fake: r3t references contiguous guest memory block, not a r3t - * - * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory and - * -EFAULT if an address in the parent gmap could not be resolved. - * - * Called with sg->mm->mmap_lock in read. - */ -int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, - int fake) -{ - unsigned long raddr, origin, offset, len; - unsigned long *table; - phys_addr_t s_r3t; - struct page *page; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - /* Allocate a shadow region second table */ - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - s_r3t = page_to_phys(page); - /* Install shadow region second table */ - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ - if (!table) { - rc = -EAGAIN; /* Race with unshadow */ - goto out_free; - } - if (!(*table & _REGION_ENTRY_INVALID)) { - rc = 0; /* Already established */ - goto out_free; - } else if (*table & _REGION_ENTRY_ORIGIN) { - rc = -EAGAIN; /* Race with shadow */ - goto out_free; - } - crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); - /* mark as invalid as long as the parent table is not protected */ - *table = s_r3t | _REGION_ENTRY_LENGTH | - _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; - if (sg->edat_level >= 1) - *table |= (r3t & _REGION_ENTRY_PROTECT); - if (fake) { - /* nothing to protect for fake tables */ - *table &= ~_REGION_ENTRY_INVALID; - spin_unlock(&sg->guest_table_lock); - return 0; - } - spin_unlock(&sg->guest_table_lock); - /* Make r3t read-only in parent gmap page table */ - raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; - origin = r3t & _REGION_ENTRY_ORIGIN; - offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; - len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len); - spin_lock(&sg->guest_table_lock); - if (!rc) { - table = gmap_table_walk(sg, saddr, 3); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) - rc = -EAGAIN; /* Race with unshadow */ - else - *table &= ~_REGION_ENTRY_INVALID; - } else { - gmap_unshadow_r3t(sg, raddr); - } - spin_unlock(&sg->guest_table_lock); - return rc; -out_free: - spin_unlock(&sg->guest_table_lock); - __free_pages(page, CRST_ALLOC_ORDER); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_shadow_r3t); - -/** - * gmap_shadow_sgt - create a shadow segment table - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @sgt: parent gmap address of the segment table to get shadowed - * @fake: sgt references contiguous guest memory block, not a sgt - * - * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory and - * -EFAULT if an address in the parent gmap could not be resolved. - * - * Called with sg->mm->mmap_lock in read. - */ -int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, - int fake) -{ - unsigned long raddr, origin, offset, len; - unsigned long *table; - phys_addr_t s_sgt; - struct page *page; - int rc; - - BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); - /* Allocate a shadow segment table */ - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - s_sgt = page_to_phys(page); - /* Install shadow region second table */ - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ - if (!table) { - rc = -EAGAIN; /* Race with unshadow */ - goto out_free; - } - if (!(*table & _REGION_ENTRY_INVALID)) { - rc = 0; /* Already established */ - goto out_free; - } else if (*table & _REGION_ENTRY_ORIGIN) { - rc = -EAGAIN; /* Race with shadow */ - goto out_free; - } - crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); - /* mark as invalid as long as the parent table is not protected */ - *table = s_sgt | _REGION_ENTRY_LENGTH | - _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; - if (sg->edat_level >= 1) - *table |= sgt & _REGION_ENTRY_PROTECT; - if (fake) { - /* nothing to protect for fake tables */ - *table &= ~_REGION_ENTRY_INVALID; - spin_unlock(&sg->guest_table_lock); - return 0; - } - spin_unlock(&sg->guest_table_lock); - /* Make sgt read-only in parent gmap page table */ - raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; - origin = sgt & _REGION_ENTRY_ORIGIN; - offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; - len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; - rc = gmap_protect_rmap(sg, raddr, origin + offset, len); - spin_lock(&sg->guest_table_lock); - if (!rc) { - table = gmap_table_walk(sg, saddr, 2); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) - rc = -EAGAIN; /* Race with unshadow */ - else - *table &= ~_REGION_ENTRY_INVALID; - } else { - gmap_unshadow_sgt(sg, raddr); - } - spin_unlock(&sg->guest_table_lock); - return rc; -out_free: - spin_unlock(&sg->guest_table_lock); - __free_pages(page, CRST_ALLOC_ORDER); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_shadow_sgt); - -static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr) -{ - unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc)); - - pgstes += _PAGE_ENTRIES; - - pgstes[0] &= ~PGSTE_ST2_MASK; - pgstes[1] &= ~PGSTE_ST2_MASK; - pgstes[2] &= ~PGSTE_ST2_MASK; - pgstes[3] &= ~PGSTE_ST2_MASK; - - pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK; - pgstes[1] |= pgt_addr & PGSTE_ST2_MASK; - pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK; - pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK; -} - -/** - * gmap_shadow_pgt - instantiate a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @pgt: parent gmap address of the page table to get shadowed - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory, - * -EFAULT if an address in the parent gmap could not be resolved and - * - * Called with gmap->mm->mmap_lock in read - */ -int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, - int fake) -{ - unsigned long raddr, origin; - unsigned long *table; - struct ptdesc *ptdesc; - phys_addr_t s_pgt; - int rc; - - BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); - /* Allocate a shadow page table */ - ptdesc = page_table_alloc_pgste(sg->mm); - if (!ptdesc) - return -ENOMEM; - origin = pgt & _SEGMENT_ENTRY_ORIGIN; - if (fake) - origin |= GMAP_SHADOW_FAKE_TABLE; - gmap_pgste_set_pgt_addr(ptdesc, origin); - s_pgt = page_to_phys(ptdesc_page(ptdesc)); - /* Install shadow page table */ - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ - if (!table) { - rc = -EAGAIN; /* Race with unshadow */ - goto out_free; - } - if (!(*table & _SEGMENT_ENTRY_INVALID)) { - rc = 0; /* Already established */ - goto out_free; - } else if (*table & _SEGMENT_ENTRY_ORIGIN) { - rc = -EAGAIN; /* Race with shadow */ - goto out_free; - } - /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | - (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; - if (fake) { - /* nothing to protect for fake tables */ - *table &= ~_SEGMENT_ENTRY_INVALID; - spin_unlock(&sg->guest_table_lock); - return 0; - } - spin_unlock(&sg->guest_table_lock); - /* Make pgt read-only in parent gmap page table (not the pgste) */ - raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; - origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; - rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE); - spin_lock(&sg->guest_table_lock); - if (!rc) { - table = gmap_table_walk(sg, saddr, 1); - if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) - rc = -EAGAIN; /* Race with unshadow */ - else - *table &= ~_SEGMENT_ENTRY_INVALID; - } else { - gmap_unshadow_pgt(sg, raddr); - } - spin_unlock(&sg->guest_table_lock); - return rc; -out_free: - spin_unlock(&sg->guest_table_lock); - page_table_free_pgste(ptdesc); - return rc; - -} -EXPORT_SYMBOL_GPL(gmap_shadow_pgt); - -/** - * gmap_shadow_page - create a shadow page mapping - * @sg: pointer to the shadow guest address space structure - * @saddr: faulting address in the shadow gmap - * @pte: pte in parent gmap address space to get shadowed - * - * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the - * shadow table structure is incomplete, -ENOMEM if out of memory and - * -EFAULT if an address in the parent gmap could not be resolved. - * - * Called with sg->mm->mmap_lock in read. - */ -int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) -{ - struct gmap *parent; - struct gmap_rmap *rmap; - unsigned long vmaddr, paddr; - spinlock_t *ptl; - pte_t *sptep, *tptep; - int prot; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - parent = sg->parent; - prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; - - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); - if (!rmap) - return -ENOMEM; - rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; - - while (1) { - paddr = pte_val(pte) & PAGE_MASK; - vmaddr = __gmap_translate(parent, paddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - break; - } - rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); - if (rc) - break; - rc = -EAGAIN; - sptep = gmap_pte_op_walk(parent, paddr, &ptl); - if (sptep) { - spin_lock(&sg->guest_table_lock); - /* Get page table pointer */ - tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); - if (!tptep) { - spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(sptep, ptl); - radix_tree_preload_end(); - break; - } - rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); - if (rc > 0) { - /* Success and a new mapping */ - gmap_insert_rmap(sg, vmaddr, rmap); - rmap = NULL; - rc = 0; - } - gmap_pte_op_end(sptep, ptl); - spin_unlock(&sg->guest_table_lock); - } - radix_tree_preload_end(); - if (!rc) - break; - rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); - if (rc) - break; - } - kfree(rmap); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_shadow_page); - -/* - * gmap_shadow_notify - handle notifications for shadow gmap - * - * Called with sg->parent->shadow_lock. - */ -static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, - unsigned long gaddr) -{ - struct gmap_rmap *rmap, *rnext, *head; - unsigned long start, end, bits, raddr; - - BUG_ON(!gmap_is_shadow(sg)); - - spin_lock(&sg->guest_table_lock); - if (sg->removed) { - spin_unlock(&sg->guest_table_lock); - return; - } - /* Check for top level table */ - start = sg->orig_asce & _ASCE_ORIGIN; - end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; - if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && - gaddr < end) { - /* The complete shadow table has to go */ - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - return; - } - /* Remove the page table tree from on specific entry */ - head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); - gmap_for_each_rmap_safe(rmap, rnext, head) { - bits = rmap->raddr & _SHADOW_RMAP_MASK; - raddr = rmap->raddr ^ bits; - switch (bits) { - case _SHADOW_RMAP_REGION1: - gmap_unshadow_r2t(sg, raddr); - break; - case _SHADOW_RMAP_REGION2: - gmap_unshadow_r3t(sg, raddr); - break; - case _SHADOW_RMAP_REGION3: - gmap_unshadow_sgt(sg, raddr); - break; - case _SHADOW_RMAP_SEGMENT: - gmap_unshadow_pgt(sg, raddr); - break; - case _SHADOW_RMAP_PGTABLE: - gmap_unshadow_page(sg, raddr); - break; - } - kfree(rmap); - } - spin_unlock(&sg->guest_table_lock); -} - -/** - * ptep_notify - call all invalidation callbacks for a specific pte. - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - * @pte: pointer to the page table entry - * @bits: bits from the pgste that caused the notify call - * - * This function is assumed to be called with the page table lock held - * for the pte to notify. - */ -void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, - pte_t *pte, unsigned long bits) -{ - unsigned long offset, gaddr = 0; - struct gmap *gmap, *sg, *next; - - offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - offset = offset * (PAGE_SIZE / sizeof(pte_t)); - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - spin_lock(&gmap->guest_table_lock); - gaddr = host_to_guest_lookup(gmap, vmaddr) + offset; - spin_unlock(&gmap->guest_table_lock); - if (!IS_GADDR_VALID(gaddr)) - continue; - - if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { - spin_lock(&gmap->shadow_lock); - list_for_each_entry_safe(sg, next, - &gmap->children, list) - gmap_shadow_notify(sg, vmaddr, gaddr); - spin_unlock(&gmap->shadow_lock); - } - if (bits & PGSTE_IN_BIT) - gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(ptep_notify); - -static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, - unsigned long gaddr) -{ - set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); - gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); -} - -/** - * gmap_pmdp_xchg - exchange a gmap pmd with another - * @gmap: pointer to the guest address space structure - * @pmdp: pointer to the pmd entry - * @new: replacement entry - * @gaddr: the affected guest address - * - * This function is assumed to be called with the guest_table_lock - * held. - */ -static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, - unsigned long gaddr) -{ - gaddr &= HPAGE_MASK; - pmdp_notify_gmap(gmap, pmdp, gaddr); - new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); - if (machine_has_tlb_guest()) - __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, - IDTE_GLOBAL); - else - __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); - set_pmd(pmdp, new); -} - -static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, - int purge) -{ - pmd_t *pmdp; - struct gmap *gmap; - unsigned long gaddr; - - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - spin_lock(&gmap->guest_table_lock); - pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); - if (pmdp) { - pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); - if (purge) - __pmdp_cspg(pmdp); - set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); - } - spin_unlock(&gmap->guest_table_lock); - } - rcu_read_unlock(); -} - -/** - * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without - * flushing - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr) -{ - gmap_pmdp_clear(mm, vmaddr, 0); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate); - -/** - * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) -{ - unsigned long gaddr; - struct gmap *gmap; - pmd_t *pmdp; - - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - spin_lock(&gmap->guest_table_lock); - pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); - if (pmdp) { - pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); - if (machine_has_tlb_guest()) - __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, - gmap->asce, IDTE_LOCAL); - else - __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); - *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); - } - spin_unlock(&gmap->guest_table_lock); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); - -/** - * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry - * @mm: pointer to the process mm_struct - * @vmaddr: virtual address in the process address space - */ -void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) -{ - unsigned long gaddr; - struct gmap *gmap; - pmd_t *pmdp; - - rcu_read_lock(); - list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { - spin_lock(&gmap->guest_table_lock); - pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); - if (pmdp) { - pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); - if (machine_has_tlb_guest()) - __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, - gmap->asce, IDTE_GLOBAL); - else - __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); - *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); - } - spin_unlock(&gmap->guest_table_lock); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global); - -/** - * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status - * @gmap: pointer to guest address space - * @pmdp: pointer to the pmd to be tested - * @gaddr: virtual address in the guest address space - * - * This function is assumed to be called with the guest_table_lock - * held. - */ -static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, - unsigned long gaddr) -{ - if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) - return false; - - /* Already protected memory, which did not change is clean */ - if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT && - !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC)) - return false; - - /* Clear UC indication and reset protection */ - set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); - gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); - return true; -} - -/** - * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment - * @gmap: pointer to guest address space - * @bitmap: dirty bitmap for this pmd - * @gaddr: virtual address in the guest address space - * @vmaddr: virtual address in the host address space - * - * This function is assumed to be called with the guest_table_lock - * held. - */ -void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], - unsigned long gaddr, unsigned long vmaddr) -{ - int i; - pmd_t *pmdp; - pte_t *ptep; - spinlock_t *ptl; - - pmdp = gmap_pmd_op_walk(gmap, gaddr); - if (!pmdp) - return; - - if (pmd_leaf(*pmdp)) { - if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) - bitmap_fill(bitmap, _PAGE_ENTRIES); - } else { - for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) { - ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl); - if (!ptep) - continue; - if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) - set_bit(i, bitmap); - pte_unmap_unlock(ptep, ptl); - } - } - gmap_pmd_op_end(gmap, pmdp); -} -EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct vm_area_struct *vma = walk->vma; - - split_huge_pmd(vma, pmd, addr); - return 0; -} - -static const struct mm_walk_ops thp_split_walk_ops = { - .pmd_entry = thp_split_walk_pmd_entry, - .walk_lock = PGWALK_WRLOCK_VERIFY, -}; - -static inline void thp_split_mm(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - for_each_vma(vmi, vma) { - vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); - walk_page_vma(vma, &thp_split_walk_ops, NULL); - } - mm->def_flags |= VM_NOHUGEPAGE; -} -#else -static inline void thp_split_mm(struct mm_struct *mm) -{ -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -/* - * switch on pgstes for its userspace process (for kvm) - */ -int s390_enable_sie(void) -{ - struct mm_struct *mm = current->mm; - - /* Do we have pgstes? if yes, we are done */ - if (mm_has_pgste(mm)) - return 0; - mmap_write_lock(mm); - mm->context.has_pgste = 1; - /* split thp mappings and disable thp for future mappings */ - thp_split_mm(mm); - mmap_write_unlock(mm); - return 0; -} -EXPORT_SYMBOL_GPL(s390_enable_sie); - -/* - * Enable storage key handling from now on and initialize the storage - * keys with the default key. - */ -static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - /* Clear storage key */ - ptep_zap_key(walk->mm, addr, pte); - return 0; -} - -/* - * Give a chance to schedule after setting a key to 256 pages. - * We only hold the mm lock, which is a rwsem and the kvm srcu. - * Both can sleep. - */ -static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - cond_resched(); - return 0; -} - -static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, - unsigned long hmask, unsigned long next, - struct mm_walk *walk) -{ - pmd_t *pmd = (pmd_t *)pte; - unsigned long start, end; - struct folio *folio = page_folio(pmd_page(*pmd)); - - /* - * The write check makes sure we do not set a key on shared - * memory. This is needed as the walker does not differentiate - * between actual guest memory and the process executable or - * shared libraries. - */ - if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID || - !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE)) - return 0; - - start = pmd_val(*pmd) & HPAGE_MASK; - end = start + HPAGE_SIZE; - __storage_key_init_range(start, end); - set_bit(PG_arch_1, &folio->flags.f); - cond_resched(); - return 0; -} - -static const struct mm_walk_ops enable_skey_walk_ops = { - .hugetlb_entry = __s390_enable_skey_hugetlb, - .pte_entry = __s390_enable_skey_pte, - .pmd_entry = __s390_enable_skey_pmd, - .walk_lock = PGWALK_WRLOCK, -}; - -int s390_enable_skey(void) -{ - struct mm_struct *mm = current->mm; - int rc = 0; - - mmap_write_lock(mm); - if (mm_uses_skeys(mm)) - goto out_up; - - mm->context.uses_skeys = 1; - rc = gmap_helper_disable_cow_sharing(); - if (rc) { - mm->context.uses_skeys = 0; - goto out_up; - } - walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); - -out_up: - mmap_write_unlock(mm); - return rc; -} -EXPORT_SYMBOL_GPL(s390_enable_skey); - -/* - * Reset CMMA state, make all pages stable again. - */ -static int __s390_reset_cmma(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - ptep_zap_unused(walk->mm, addr, pte, 1); - return 0; -} - -static const struct mm_walk_ops reset_cmma_walk_ops = { - .pte_entry = __s390_reset_cmma, - .walk_lock = PGWALK_WRLOCK, -}; - -void s390_reset_cmma(struct mm_struct *mm) -{ - mmap_write_lock(mm); - walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); - mmap_write_unlock(mm); -} -EXPORT_SYMBOL_GPL(s390_reset_cmma); - -#define GATHER_GET_PAGES 32 - -struct reset_walk_state { - unsigned long next; - unsigned long count; - unsigned long pfns[GATHER_GET_PAGES]; -}; - -static int s390_gather_pages(pte_t *ptep, unsigned long addr, - unsigned long next, struct mm_walk *walk) -{ - struct reset_walk_state *p = walk->private; - pte_t pte = READ_ONCE(*ptep); - - if (pte_present(pte)) { - /* we have a reference from the mapping, take an extra one */ - get_page(phys_to_page(pte_val(pte))); - p->pfns[p->count] = phys_to_pfn(pte_val(pte)); - p->next = next; - p->count++; - } - return p->count >= GATHER_GET_PAGES; -} - -static const struct mm_walk_ops gather_pages_ops = { - .pte_entry = s390_gather_pages, - .walk_lock = PGWALK_RDLOCK, -}; - -/* - * Call the Destroy secure page UVC on each page in the given array of PFNs. - * Each page needs to have an extra reference, which will be released here. - */ -void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) -{ - struct folio *folio; - unsigned long i; - - for (i = 0; i < count; i++) { - folio = pfn_folio(pfns[i]); - /* we always have an extra reference */ - uv_destroy_folio(folio); - /* get rid of the extra reference */ - folio_put(folio); - cond_resched(); - } -} -EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); - -/** - * __s390_uv_destroy_range - Call the destroy secure page UVC on each page - * in the given range of the given address space. - * @mm: the mm to operate on - * @start: the start of the range - * @end: the end of the range - * @interruptible: if not 0, stop when a fatal signal is received - * - * Walk the given range of the given address space and call the destroy - * secure page UVC on each page. Optionally exit early if a fatal signal is - * pending. - * - * Return: 0 on success, -EINTR if the function stopped before completing - */ -int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, - unsigned long end, bool interruptible) -{ - struct reset_walk_state state = { .next = start }; - int r = 1; - - while (r > 0) { - state.count = 0; - mmap_read_lock(mm); - r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); - mmap_read_unlock(mm); - cond_resched(); - s390_uv_destroy_pfns(state.count, state.pfns); - if (interruptible && fatal_signal_pending(current)) - return -EINTR; - } - return 0; -} -EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); - -/** - * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy - * @gmap: the gmap whose ASCE needs to be replaced - * - * If the ASCE is a SEGMENT type then this function will return -EINVAL, - * otherwise the pointers in the host_to_guest radix tree will keep pointing - * to the wrong pages, causing use-after-free and memory corruption. - * If the allocation of the new top level page table fails, the ASCE is not - * replaced. - * In any case, the old ASCE is always removed from the gmap CRST list. - * Therefore the caller has to make sure to save a pointer to it - * beforehand, unless a leak is actually intended. - */ -int s390_replace_asce(struct gmap *gmap) -{ - unsigned long asce; - struct page *page; - void *table; - - /* Replacing segment type ASCEs would cause serious issues */ - if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) - return -EINVAL; - - page = gmap_alloc_crst(); - if (!page) - return -ENOMEM; - table = page_to_virt(page); - memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); - - /* Set new table origin while preserving existing ASCE control bits */ - asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); - WRITE_ONCE(gmap->asce, asce); - WRITE_ONCE(gmap->mm->context.gmap_asce, asce); - WRITE_ONCE(gmap->table, table); - - return 0; -} -EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index d41b19925a5a..d653c64b869a 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -15,7 +15,6 @@ #include <linux/pagewalk.h> #include <linux/ksm.h> #include <asm/gmap_helpers.h> -#include <asm/pgtable.h> /** * ptep_zap_softleaf_entry() - discard a software leaf entry. @@ -47,9 +46,7 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) { struct vm_area_struct *vma; - unsigned long pgstev; spinlock_t *ptl; - pgste_t pgste; pte_t *ptep; mmap_assert_locked(mm); @@ -64,18 +61,8 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) if (unlikely(!ptep)) return; if (pte_swap(*ptep)) { - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgstev = pgste_val(pgste); - - if ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || - (pgstev & _PGSTE_GPS_ZERO)) { - ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); - pte_clear(mm, vmaddr, ptep); - } - - pgste_set_unlock(ptep, pgste); - preempt_enable(); + ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); + pte_clear(mm, vmaddr, ptep); } pte_unmap_unlock(ptep, ptl); } @@ -108,6 +95,85 @@ void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned lo } EXPORT_SYMBOL_GPL(gmap_helper_discard); +/** + * gmap_helper_try_set_pte_unused() - mark a pte entry as unused + * @mm: the mm + * @vmaddr: the userspace address whose pte is to be marked + * + * Mark the pte corresponding the given address as unused. This will cause + * core mm code to just drop this page instead of swapping it. + * + * This function needs to be called with interrupts disabled (for example + * while holding a spinlock), or while holding the mmap lock. Normally this + * function is called as a result of an unmap operation, and thus KVM common + * code will already hold kvm->mmu_lock in write mode. + * + * Context: Needs to be called while holding the mmap lock or with interrupts + * disabled. + */ +void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr) +{ + pmd_t *pmdp, pmd, pmdval; + pud_t *pudp, pud; + p4d_t *p4dp, p4d; + pgd_t *pgdp, pgd; + spinlock_t *ptl; /* Lock for the host (userspace) page table */ + pte_t *ptep; + + pgdp = pgd_offset(mm, vmaddr); + pgd = pgdp_get(pgdp); + if (pgd_none(pgd) || !pgd_present(pgd)) + return; + + p4dp = p4d_offset(pgdp, vmaddr); + p4d = p4dp_get(p4dp); + if (p4d_none(p4d) || !p4d_present(p4d)) + return; + + pudp = pud_offset(p4dp, vmaddr); + pud = pudp_get(pudp); + if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud)) + return; + + pmdp = pmd_offset(pudp, vmaddr); + pmd = pmdp_get_lockless(pmdp); + if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd)) + return; + + ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl); + if (!ptep) + return; + + /* + * Several paths exists that takes the ptl lock and then call the + * mmu_notifier, which takes the mmu_lock. The unmap path, instead, + * takes the mmu_lock in write mode first, and then potentially + * calls this function, which takes the ptl lock. This can lead to a + * deadlock. + * The unused page mechanism is only an optimization, if the + * _PAGE_UNUSED bit is not set, the unused page is swapped as normal + * instead of being discarded. + * If the lock is contended the bit is not set and the deadlock is + * avoided. + */ + if (spin_trylock(ptl)) { + /* + * Make sure the pte we are touching is still the correct + * one. In theory this check should not be needed, but + * better safe than sorry. + * Disabling interrupts or holding the mmap lock is enough to + * guarantee that no concurrent updates to the page tables + * are possible. + */ + if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp)))) + __atomic64_or(_PAGE_UNUSED, (long *)ptep); + spin_unlock(ptl); + } + + pte_unmap(ptep); +} +EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused); + static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d42e61c7594e..35a898e15b1c 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -135,29 +135,6 @@ static inline pte_t __rste_to_pte(unsigned long rste) return __pte(pteval); } -static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) -{ - struct folio *folio; - unsigned long size, paddr; - - if (!mm_uses_skeys(mm) || - rste & _SEGMENT_ENTRY_INVALID) - return; - - if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { - folio = page_folio(pud_page(__pud(rste))); - size = PUD_SIZE; - paddr = rste & PUD_MASK; - } else { - folio = page_folio(pmd_page(__pmd(rste))); - size = PMD_SIZE; - paddr = rste & PMD_MASK; - } - - if (!test_and_set_bit(PG_arch_1, &folio->flags.f)) - __storage_key_init_range(paddr, paddr + size); -} - void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -173,7 +150,6 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, } else if (likely(pte_present(pte))) rste |= _SEGMENT_ENTRY_LARGE; - clear_huge_pte_skeys(mm, rste); set_pte(ptep, __pte(rste)); } diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 01f9b39e65f5..5bee173db72e 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -13,6 +13,7 @@ #include <asm/page.h> int __bootdata_preserved(cmma_flag); +EXPORT_SYMBOL(cmma_flag); void arch_free_page(struct page *page, int order) { diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index d3ce04a4b248..bb29c38ae624 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -16,13 +16,6 @@ #include <asm/asm.h> #include <asm/set_memory.h> -static inline unsigned long sske_frame(unsigned long addr, unsigned char skey) -{ - asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0" - : [addr] "+a" (addr) : [skey] "d" (skey)); - return addr; -} - void __storage_key_init_range(unsigned long start, unsigned long end) { unsigned long boundary, size; diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 7df23528c01b..7ac44543e051 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -114,30 +114,6 @@ err_p4d: return -ENOMEM; } -#ifdef CONFIG_PGSTE - -struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm) -{ - struct ptdesc *ptdesc; - u64 *table; - - ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0); - if (ptdesc) { - table = (u64 *)ptdesc_address(ptdesc); - __arch_set_page_dat(table, 1); - memset64(table, _PAGE_INVALID, PTRS_PER_PTE); - memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); - } - return ptdesc; -} - -void page_table_free_pgste(struct ptdesc *ptdesc) -{ - pagetable_free(ptdesc); -} - -#endif /* CONFIG_PGSTE */ - unsigned long *page_table_alloc_noprof(struct mm_struct *mm) { gfp_t gfp = GFP_KERNEL_ACCOUNT; diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 666adcd681ab..4acd8b140c4b 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -24,7 +24,6 @@ #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/page-states.h> -#include <asm/pgtable.h> #include <asm/machine.h> pgprot_t pgprot_writecombine(pgprot_t prot) @@ -116,149 +115,14 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, return old; } -static inline pgste_t pgste_get(pte_t *ptep) -{ - unsigned long pgste = 0; -#ifdef CONFIG_PGSTE - pgste = *(unsigned long *)(ptep + PTRS_PER_PTE); -#endif - return __pgste(pgste); -} - -static inline void pgste_set(pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - *(pgste_t *)(ptep + PTRS_PER_PTE) = pgste; -#endif -} - -static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste, - struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - unsigned long address, bits, skey; - - if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID) - return pgste; - address = pte_val(pte) & PAGE_MASK; - skey = (unsigned long) page_get_storage_key(address); - bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); - /* Transfer page changed & referenced bit to guest bits in pgste */ - pgste = set_pgste_bit(pgste, bits << 48); /* GR bit & GC bit */ - /* Copy page access key and fetch protection bit to pgste */ - pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); -#endif - return pgste; - -} - -static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry, - struct mm_struct *mm) -{ -#ifdef CONFIG_PGSTE - unsigned long address; - unsigned long nkey; - - if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID) - return; - VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID)); - address = pte_val(entry) & PAGE_MASK; - /* - * Set page access key and fetch protection bit from pgste. - * The guest C/R information is still in the PGSTE, set real - * key C/R to 0. - */ - nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; - nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; - page_set_storage_key(address, nkey, 0); -#endif -} - -static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) -{ -#ifdef CONFIG_PGSTE - if ((pte_val(entry) & _PAGE_PRESENT) && - (pte_val(entry) & _PAGE_WRITE) && - !(pte_val(entry) & _PAGE_INVALID)) { - if (!machine_has_esop()) { - /* - * Without enhanced suppression-on-protection force - * the dirty bit on for all writable ptes. - */ - entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY)); - entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT)); - } - if (!(pte_val(entry) & _PAGE_PROTECT)) - /* This pte allows write access, set user-dirty */ - pgste = set_pgste_bit(pgste, PGSTE_UC_BIT); - } -#endif - set_pte(ptep, entry); - return pgste; -} - -static inline pgste_t pgste_pte_notify(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, pgste_t pgste) -{ -#ifdef CONFIG_PGSTE - unsigned long bits; - - bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); - if (bits) { - pgste = __pgste(pgste_val(pgste) ^ bits); - ptep_notify(mm, addr, ptep, bits); - } -#endif - return pgste; -} - -static inline pgste_t ptep_xchg_start(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - pgste_t pgste = __pgste(0); - - if (mm_has_pgste(mm)) { - pgste = pgste_get_lock(ptep); - pgste = pgste_pte_notify(mm, addr, ptep, pgste); - } - return pgste; -} - -static inline pte_t ptep_xchg_commit(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, - pgste_t pgste, pte_t old, pte_t new) -{ - if (mm_has_pgste(mm)) { - if (pte_val(old) & _PAGE_INVALID) - pgste_set_key(ptep, pgste, new, mm); - if (pte_val(new) & _PAGE_INVALID) { - pgste = pgste_update_all(old, pgste, mm); - if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == - _PGSTE_GPS_USAGE_UNUSED) - old = set_pte_bit(old, __pgprot(_PAGE_UNUSED)); - } - pgste = pgste_set_pte(ptep, pgste, new); - pgste_set_unlock(ptep, pgste); - } else { - set_pte(ptep, new); - } - return old; -} - pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { - pgste_t pgste; pte_t old; - int nodat; preempt_disable(); - pgste = ptep_xchg_start(mm, addr, ptep); - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - old = ptep_flush_direct(mm, addr, ptep, nodat); - old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + old = ptep_flush_direct(mm, addr, ptep, 1); + set_pte(ptep, new); preempt_enable(); return old; } @@ -292,15 +156,11 @@ EXPORT_SYMBOL(ptep_reset_dat_prot); pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { - pgste_t pgste; pte_t old; - int nodat; preempt_disable(); - pgste = ptep_xchg_start(mm, addr, ptep); - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - old = ptep_flush_lazy(mm, addr, ptep, nodat); - old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); + old = ptep_flush_lazy(mm, addr, ptep, 1); + set_pte(ptep, new); preempt_enable(); return old; } @@ -309,47 +169,22 @@ EXPORT_SYMBOL(ptep_xchg_lazy); pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - pgste_t pgste; - pte_t old; - int nodat; - struct mm_struct *mm = vma->vm_mm; - - pgste = ptep_xchg_start(mm, addr, ptep); - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - old = ptep_flush_lazy(mm, addr, ptep, nodat); - if (mm_has_pgste(mm)) { - pgste = pgste_update_all(old, pgste, mm); - pgste_set(ptep, pgste); - } - return old; + return ptep_flush_lazy(vma->vm_mm, addr, ptep, 1); } void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { - pgste_t pgste; - struct mm_struct *mm = vma->vm_mm; - - if (mm_has_pgste(mm)) { - pgste = pgste_get(ptep); - pgste_set_key(ptep, pgste, pte, mm); - pgste = pgste_set_pte(ptep, pgste, pte); - pgste_set_unlock(ptep, pgste); - } else { - set_pte(ptep, pte); - } + set_pte(ptep, pte); } static inline void pmdp_idte_local(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { if (machine_has_tlb_guest()) - __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, - mm->context.asce, IDTE_LOCAL); + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_LOCAL); else __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_idte_local(mm, addr); } static inline void pmdp_idte_global(struct mm_struct *mm, @@ -358,12 +193,8 @@ static inline void pmdp_idte_global(struct mm_struct *mm, if (machine_has_tlb_guest()) { __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_GLOBAL); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_idte_global(mm, addr); } else { __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); - if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m) - gmap_pmdp_idte_global(mm, addr); } } @@ -398,8 +229,6 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, cpumask_of(smp_processor_id()))) { set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID))); mm->context.flush_mm = 1; - if (mm_has_pgste(mm)) - gmap_pmdp_invalidate(mm, addr); } else { pmdp_idte_global(mm, addr, pmdp); } @@ -407,40 +236,6 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, return old; } -#ifdef CONFIG_PGSTE -static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) -{ - struct vm_area_struct *vma; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - - /* We need a valid VMA, otherwise this is clearly a fault. */ - vma = vma_lookup(mm, addr); - if (!vma) - return -EFAULT; - - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - return -ENOENT; - - p4d = p4d_offset(pgd, addr); - if (!p4d_present(*p4d)) - return -ENOENT; - - pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) - return -ENOENT; - - /* Large PUDs are not supported yet. */ - if (pud_leaf(*pud)) - return -EFAULT; - - *pmdp = pmd_offset(pud, addr); - return 0; -} -#endif - pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t new) { @@ -558,598 +353,3 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) return pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -#ifdef CONFIG_PGSTE -void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) -{ - pgste_t pgste; - - /* the mm_has_pgste() check is done in set_pte_at() */ - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO); - pgste_set_key(ptep, pgste, entry, mm); - pgste = pgste_set_pte(ptep, pgste, entry); - pgste_set_unlock(ptep, pgste); - preempt_enable(); -} - -void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - pgste_t pgste; - - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgste = set_pgste_bit(pgste, PGSTE_IN_BIT); - pgste_set_unlock(ptep, pgste); - preempt_enable(); -} - -/** - * ptep_force_prot - change access rights of a locked pte - * @mm: pointer to the process mm_struct - * @addr: virtual address in the guest address space - * @ptep: pointer to the page table entry - * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE - * @bit: pgste bit to set (e.g. for notification) - * - * Returns 0 if the access rights were changed and -EAGAIN if the current - * and requested access rights are incompatible. - */ -int ptep_force_prot(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, int prot, unsigned long bit) -{ - pte_t entry; - pgste_t pgste; - int pte_i, pte_p, nodat; - - pgste = pgste_get_lock(ptep); - entry = *ptep; - /* Check pte entry after all locks have been acquired */ - pte_i = pte_val(entry) & _PAGE_INVALID; - pte_p = pte_val(entry) & _PAGE_PROTECT; - if ((pte_i && (prot != PROT_NONE)) || - (pte_p && (prot & PROT_WRITE))) { - pgste_set_unlock(ptep, pgste); - return -EAGAIN; - } - /* Change access rights and set pgste bit */ - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - if (prot == PROT_NONE && !pte_i) { - ptep_flush_direct(mm, addr, ptep, nodat); - pgste = pgste_update_all(entry, pgste, mm); - entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID)); - } - if (prot == PROT_READ && !pte_p) { - ptep_flush_direct(mm, addr, ptep, nodat); - entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); - entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); - } - pgste = set_pgste_bit(pgste, bit); - pgste = pgste_set_pte(ptep, pgste, entry); - pgste_set_unlock(ptep, pgste); - return 0; -} - -int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, - pte_t *sptep, pte_t *tptep, pte_t pte) -{ - pgste_t spgste, tpgste; - pte_t spte, tpte; - int rc = -EAGAIN; - - if (!(pte_val(*tptep) & _PAGE_INVALID)) - return 0; /* already shadowed */ - spgste = pgste_get_lock(sptep); - spte = *sptep; - if (!(pte_val(spte) & _PAGE_INVALID) && - !((pte_val(spte) & _PAGE_PROTECT) && - !(pte_val(pte) & _PAGE_PROTECT))) { - spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT); - tpgste = pgste_get_lock(tptep); - tpte = __pte((pte_val(spte) & PAGE_MASK) | - (pte_val(pte) & _PAGE_PROTECT)); - /* don't touch the storage key - it belongs to parent pgste */ - tpgste = pgste_set_pte(tptep, tpgste, tpte); - pgste_set_unlock(tptep, tpgste); - rc = 1; - } - pgste_set_unlock(sptep, spgste); - return rc; -} - -void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) -{ - pgste_t pgste; - int nodat; - - pgste = pgste_get_lock(ptep); - /* notifier is called by the caller */ - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - ptep_flush_direct(mm, saddr, ptep, nodat); - /* don't touch the storage key - it belongs to parent pgste */ - pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); - pgste_set_unlock(ptep, pgste); -} - -static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) -{ - if (softleaf_is_swap(entry)) - dec_mm_counter(mm, MM_SWAPENTS); - else if (softleaf_is_migration(entry)) { - struct folio *folio = softleaf_to_folio(entry); - - dec_mm_counter(mm, mm_counter(folio)); - } - free_swap_and_cache(entry); -} - -void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, int reset) -{ - unsigned long pgstev; - pgste_t pgste; - pte_t pte; - - /* Zap unused and logically-zero pages */ - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgstev = pgste_val(pgste); - pte = *ptep; - if (!reset && pte_swap(pte) && - ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || - (pgstev & _PGSTE_GPS_ZERO))) { - ptep_zap_softleaf_entry(mm, softleaf_from_pte(pte)); - pte_clear(mm, addr, ptep); - } - if (reset) - pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); - pgste_set_unlock(ptep, pgste); - preempt_enable(); -} - -void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - unsigned long ptev; - pgste_t pgste; - - /* Clear storage key ACC and F, but set R/C */ - preempt_disable(); - pgste = pgste_get_lock(ptep); - pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT); - pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT); - ptev = pte_val(*ptep); - if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) - page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); - pgste_set_unlock(ptep, pgste); - preempt_enable(); -} - -/* - * Test and reset if a guest page is dirty - */ -bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - pgste_t pgste; - pte_t pte; - bool dirty; - int nodat; - - pgste = pgste_get_lock(ptep); - dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT); - pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT); - pte = *ptep; - if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { - pgste = pgste_pte_notify(mm, addr, ptep, pgste); - nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); - ptep_ipte_global(mm, addr, ptep, nodat); - if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE)) - pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); - else - pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); - set_pte(ptep, pte); - } - pgste_set_unlock(ptep, pgste); - return dirty; -} -EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc); - -int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char key, bool nq) -{ - unsigned long keyul, paddr; - spinlock_t *ptl; - pgste_t old, new; - pmd_t *pmdp; - pte_t *ptep; - - /* - * If we don't have a PTE table and if there is no huge page mapped, - * we can ignore attempts to set the key to 0, because it already is 0. - */ - switch (pmd_lookup(mm, addr, &pmdp)) { - case -ENOENT: - return key ? -EFAULT : 0; - case 0: - break; - default: - return -EFAULT; - } -again: - ptl = pmd_lock(mm, pmdp); - if (!pmd_present(*pmdp)) { - spin_unlock(ptl); - return key ? -EFAULT : 0; - } - - if (pmd_leaf(*pmdp)) { - paddr = pmd_val(*pmdp) & HPAGE_MASK; - paddr |= addr & ~HPAGE_MASK; - /* - * Huge pmds need quiescing operations, they are - * always mapped. - */ - page_set_storage_key(paddr, key, 1); - spin_unlock(ptl); - return 0; - } - spin_unlock(ptl); - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - if (!ptep) - goto again; - new = old = pgste_get_lock(ptep); - new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT | - PGSTE_ACC_BITS | PGSTE_FP_BIT); - keyul = (unsigned long) key; - new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48); - new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56); - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - unsigned long bits, skey; - - paddr = pte_val(*ptep) & PAGE_MASK; - skey = (unsigned long) page_get_storage_key(paddr); - bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED); - skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT); - /* Set storage key ACC and FP */ - page_set_storage_key(paddr, skey, !nq); - /* Merge host changed & referenced into pgste */ - new = set_pgste_bit(new, bits << 52); - } - /* changing the guest storage key is considered a change of the page */ - if ((pgste_val(new) ^ pgste_val(old)) & - (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT)) - new = set_pgste_bit(new, PGSTE_UC_BIT); - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(set_guest_storage_key); - -/* - * Conditionally set a guest storage key (handling csske). - * oldkey will be updated when either mr or mc is set and a pointer is given. - * - * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest - * storage key was updated and -EFAULT on access errors. - */ -int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char key, unsigned char *oldkey, - bool nq, bool mr, bool mc) -{ - unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT; - int rc; - - /* we can drop the pgste lock between getting and setting the key */ - if (mr | mc) { - rc = get_guest_storage_key(current->mm, addr, &tmp); - if (rc) - return rc; - if (oldkey) - *oldkey = tmp; - if (!mr) - mask |= _PAGE_REFERENCED; - if (!mc) - mask |= _PAGE_CHANGED; - if (!((tmp ^ key) & mask)) - return 0; - } - rc = set_guest_storage_key(current->mm, addr, key, nq); - return rc < 0 ? rc : 1; -} -EXPORT_SYMBOL(cond_set_guest_storage_key); - -/* - * Reset a guest reference bit (rrbe), returning the reference and changed bit. - * - * Returns < 0 in case of error, otherwise the cc to be reported to the guest. - */ -int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) -{ - spinlock_t *ptl; - unsigned long paddr; - pgste_t old, new; - pmd_t *pmdp; - pte_t *ptep; - int cc = 0; - - /* - * If we don't have a PTE table and if there is no huge page mapped, - * the storage key is 0 and there is nothing for us to do. - */ - switch (pmd_lookup(mm, addr, &pmdp)) { - case -ENOENT: - return 0; - case 0: - break; - default: - return -EFAULT; - } -again: - ptl = pmd_lock(mm, pmdp); - if (!pmd_present(*pmdp)) { - spin_unlock(ptl); - return 0; - } - - if (pmd_leaf(*pmdp)) { - paddr = pmd_val(*pmdp) & HPAGE_MASK; - paddr |= addr & ~HPAGE_MASK; - cc = page_reset_referenced(paddr); - spin_unlock(ptl); - return cc; - } - spin_unlock(ptl); - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - if (!ptep) - goto again; - new = old = pgste_get_lock(ptep); - /* Reset guest reference bit only */ - new = clear_pgste_bit(new, PGSTE_GR_BIT); - - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - paddr = pte_val(*ptep) & PAGE_MASK; - cc = page_reset_referenced(paddr); - /* Merge real referenced bit into host-set */ - new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT); - } - /* Reflect guest's logical view, not physical */ - cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; - /* Changing the guest storage key is considered a change of the page */ - if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) - new = set_pgste_bit(new, PGSTE_UC_BIT); - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(ptep, ptl); - return cc; -} -EXPORT_SYMBOL(reset_guest_reference_bit); - -int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, - unsigned char *key) -{ - unsigned long paddr; - spinlock_t *ptl; - pgste_t pgste; - pmd_t *pmdp; - pte_t *ptep; - - /* - * If we don't have a PTE table and if there is no huge page mapped, - * the storage key is 0. - */ - *key = 0; - - switch (pmd_lookup(mm, addr, &pmdp)) { - case -ENOENT: - return 0; - case 0: - break; - default: - return -EFAULT; - } -again: - ptl = pmd_lock(mm, pmdp); - if (!pmd_present(*pmdp)) { - spin_unlock(ptl); - return 0; - } - - if (pmd_leaf(*pmdp)) { - paddr = pmd_val(*pmdp) & HPAGE_MASK; - paddr |= addr & ~HPAGE_MASK; - *key = page_get_storage_key(paddr); - spin_unlock(ptl); - return 0; - } - spin_unlock(ptl); - - ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); - if (!ptep) - goto again; - pgste = pgste_get_lock(ptep); - *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; - paddr = pte_val(*ptep) & PAGE_MASK; - if (!(pte_val(*ptep) & _PAGE_INVALID)) - *key = page_get_storage_key(paddr); - /* Reflect guest's logical view, not physical */ - *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; - pgste_set_unlock(ptep, pgste); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(get_guest_storage_key); - -/** - * pgste_perform_essa - perform ESSA actions on the PGSTE. - * @mm: the memory context. It must have PGSTEs, no check is performed here! - * @hva: the host virtual address of the page whose PGSTE is to be processed - * @orc: the specific action to perform, see the ESSA_SET_* macros. - * @oldpte: the PTE will be saved there if the pointer is not NULL. - * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL. - * - * Return: 1 if the page is to be added to the CBRL, otherwise 0, - * or < 0 in case of error. -EINVAL is returned for invalid values - * of orc, -EFAULT for invalid addresses. - */ -int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, - unsigned long *oldpte, unsigned long *oldpgste) -{ - struct vm_area_struct *vma; - unsigned long pgstev; - spinlock_t *ptl; - pgste_t pgste; - pte_t *ptep; - int res = 0; - - WARN_ON_ONCE(orc > ESSA_MAX); - if (unlikely(orc > ESSA_MAX)) - return -EINVAL; - - vma = vma_lookup(mm, hva); - if (!vma || is_vm_hugetlb_page(vma)) - return -EFAULT; - ptep = get_locked_pte(mm, hva, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - pgste = pgste_get_lock(ptep); - pgstev = pgste_val(pgste); - if (oldpte) - *oldpte = pte_val(*ptep); - if (oldpgste) - *oldpgste = pgstev; - - switch (orc) { - case ESSA_GET_STATE: - break; - case ESSA_SET_STABLE: - pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); - pgstev |= _PGSTE_GPS_USAGE_STABLE; - break; - case ESSA_SET_UNUSED: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_UNUSED; - if (pte_val(*ptep) & _PAGE_INVALID) - res = 1; - break; - case ESSA_SET_VOLATILE: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_VOLATILE; - if (pte_val(*ptep) & _PAGE_INVALID) - res = 1; - break; - case ESSA_SET_POT_VOLATILE: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE; - break; - } - if (pgstev & _PGSTE_GPS_ZERO) { - pgstev |= _PGSTE_GPS_USAGE_VOLATILE; - break; - } - if (!(pgstev & PGSTE_GC_BIT)) { - pgstev |= _PGSTE_GPS_USAGE_VOLATILE; - res = 1; - break; - } - break; - case ESSA_SET_STABLE_RESIDENT: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_STABLE; - /* - * Since the resident state can go away any time after this - * call, we will not make this page resident. We can revisit - * this decision if a guest will ever start using this. - */ - break; - case ESSA_SET_STABLE_IF_RESIDENT: - if (!(pte_val(*ptep) & _PAGE_INVALID)) { - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_STABLE; - } - break; - case ESSA_SET_STABLE_NODAT: - pgstev &= ~_PGSTE_GPS_USAGE_MASK; - pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT; - break; - default: - /* we should never get here! */ - break; - } - /* If we are discarding a page, set it to logical zero */ - if (res) - pgstev |= _PGSTE_GPS_ZERO; - - pgste = __pgste(pgstev); - pgste_set_unlock(ptep, pgste); - pte_unmap_unlock(ptep, ptl); - return res; -} -EXPORT_SYMBOL(pgste_perform_essa); - -/** - * set_pgste_bits - set specific PGSTE bits. - * @mm: the memory context. It must have PGSTEs, no check is performed here! - * @hva: the host virtual address of the page whose PGSTE is to be processed - * @bits: a bitmask representing the bits that will be touched - * @value: the values of the bits to be written. Only the bits in the mask - * will be written. - * - * Return: 0 on success, < 0 in case of error. - */ -int set_pgste_bits(struct mm_struct *mm, unsigned long hva, - unsigned long bits, unsigned long value) -{ - struct vm_area_struct *vma; - spinlock_t *ptl; - pgste_t new; - pte_t *ptep; - - vma = vma_lookup(mm, hva); - if (!vma || is_vm_hugetlb_page(vma)) - return -EFAULT; - ptep = get_locked_pte(mm, hva, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - new = pgste_get_lock(ptep); - - new = clear_pgste_bit(new, bits); - new = set_pgste_bit(new, value & bits); - - pgste_set_unlock(ptep, new); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(set_pgste_bits); - -/** - * get_pgste - get the current PGSTE for the given address. - * @mm: the memory context. It must have PGSTEs, no check is performed here! - * @hva: the host virtual address of the page whose PGSTE is to be processed - * @pgstep: will be written with the current PGSTE for the given address. - * - * Return: 0 on success, < 0 in case of error. - */ -int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) -{ - struct vm_area_struct *vma; - spinlock_t *ptl; - pte_t *ptep; - - vma = vma_lookup(mm, hva); - if (!vma || is_vm_hugetlb_page(vma)) - return -EFAULT; - ptep = get_locked_pte(mm, hva, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - *pgstep = pgste_val(pgste_get(ptep)); - pte_unmap_unlock(ptep, ptl); - return 0; -} -EXPORT_SYMBOL(get_pgste); -#endif |
