| field     | value                                    | date                      |
|-----------|------------------------------------------|---------------------------|
| author    | Marc Zyngier <maz@kernel.org>            | 2026-04-08 12:23:45 +0100 |
| committer | Marc Zyngier <maz@kernel.org>            | 2026-04-08 12:23:45 +0100 |
| commit    | 64f2fa630d7f7e1b87018a3623a75d11e718db94 |                           |
| tree      | 7f9c109036672b07a2a760b984c446d9d1a4babd |                           |
| parent    | b693940e81318d5cf4432afc97be5e22e541e2fd |                           |
| parent    | e9550374d13a4bfd0b8a711733f5d423c2e56b96 |                           |
Merge branch kvm-arm64/user_mem_abort-rework into kvmarm-master/next
* kvm-arm64/user_mem_abort-rework: (30 commits)
: .
: user_mem_abort() has become an absolute pain to maintain,
: to the point that every single fix is likely to introduce
: *two* new bugs.
:
: Deconstruct the whole thing into logical units, reducing
: the amount of visible and/or mutable state between functions,
: and finally making the code a bit more maintainable (a
: condensed sketch of the resulting structure follows the
: diffstat below).
: .
KVM: arm64: Convert gmem_abort() to struct kvm_s2_fault_desc
KVM: arm64: Simplify integration of adjust_nested_*_perms()
KVM: arm64: Directly expose mapping prot and kill kvm_s2_fault
KVM: arm64: Move device mapping management into kvm_s2_fault_pin_pfn()
KVM: arm64: Replace force_pte with a max_map_size attribute
KVM: arm64: Move kvm_s2_fault.{pfn,page} to kvm_s2_vma_info
KVM: arm64: Restrict the scope of the 'writable' attribute
KVM: arm64: Kill logging_active from kvm_s2_fault
KVM: arm64: Move VMA-related information to kvm_s2_fault_vma_info
KVM: arm64: Kill topup_memcache from kvm_s2_fault
KVM: arm64: Kill exec_fault from kvm_s2_fault
KVM: arm64: Kill write_fault from kvm_s2_fault
KVM: arm64: Constrain fault_granule to kvm_s2_fault_map()
KVM: arm64: Replace fault_is_perm with a helper
KVM: arm64: Move fault context to const structure
KVM: arm64: Make fault_ipa immutable
KVM: arm64: Kill fault->ipa
KVM: arm64: Clean up control flow in kvm_s2_fault_map()
KVM: arm64: Hoist MTE validation check out of MMU lock path
KVM: arm64: Optimize early exit checks in kvm_s2_fault_pin_pfn()
...
Signed-off-by: Marc Zyngier <maz@kernel.org>
| file mode  | path                 | lines changed |
|------------|----------------------|---------------|
| -rw-r--r-- | arch/arm64/kvm/mmu.c | 530           |

1 file changed, 310 insertions, 220 deletions
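For orientation before reading the diff: the series turns the old monolithic user_mem_abort() into a short driver that passes a read-only fault descriptor through three helpers (pin the pfn, compute the stage-2 protection, install the mapping). The sketch below is condensed from the diff that follows; it is not a verbatim excerpt, and it elides the conditional memcache top-up and the error/release paths that the real code handles.

```c
/* Condensed sketch of the reworked fault path (see the full diff below). */
struct kvm_s2_fault_desc {			/* immutable description of the fault */
	struct kvm_vcpu *vcpu;
	phys_addr_t fault_ipa;
	struct kvm_s2_trans *nested;
	struct kvm_memory_slot *memslot;
	unsigned long hva;
};

static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd)
{
	struct kvm_s2_fault_vma_info s2vi = {};	/* mutable per-fault state */
	enum kvm_pgtable_prot prot;
	void *memcache = get_mmu_memcache(s2fd->vcpu);
	int ret;

	/* 1. Look up the VMA and pin the backing pfn. */
	ret = kvm_s2_fault_pin_pfn(s2fd, &s2vi);
	if (ret != 1)		/* 1 means "pinned, keep going" */
		return ret;

	/* 2. Derive the stage-2 permissions from the fault and VMA info. */
	ret = kvm_s2_fault_compute_prot(s2fd, &s2vi, &prot);
	if (ret)
		return ret;	/* the real code releases the page here */

	/* 3. Install or relax the mapping under the MMU lock. */
	return kvm_s2_fault_map(s2fd, &s2vi, prot, memcache);
}
```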
```diff
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 17d64a1e11e5..03e1f389339c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1400,10 +1400,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
  */
 static long transparent_hugepage_adjust(struct kvm *kvm,
 					struct kvm_memory_slot *memslot,
-					unsigned long hva, kvm_pfn_t *pfnp,
-					phys_addr_t *ipap)
+					unsigned long hva, kvm_pfn_t *pfnp, gfn_t *gfnp)
 {
 	kvm_pfn_t pfn = *pfnp;
+	gfn_t gfn = *gfnp;
 
 	/*
 	 * Make sure the adjustment is done only for THP pages. Also make
@@ -1419,7 +1419,8 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	if (sz < PMD_SIZE)
 		return PAGE_SIZE;
 
-	*ipap &= PMD_MASK;
+	gfn &= ~(PTRS_PER_PMD - 1);
+	*gfnp = gfn;
 	pfn &= ~(PTRS_PER_PMD - 1);
 	*pfnp = pfn;
 
@@ -1512,25 +1513,22 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
 	}
 }
 
-static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
-				void **memcache)
+static void *get_mmu_memcache(struct kvm_vcpu *vcpu)
 {
-	int min_pages;
-
 	if (!is_protected_kvm_enabled())
-		*memcache = &vcpu->arch.mmu_page_cache;
+		return &vcpu->arch.mmu_page_cache;
 	else
-		*memcache = &vcpu->arch.pkvm_memcache;
-
-	if (!topup_memcache)
-		return 0;
+		return &vcpu->arch.pkvm_memcache;
+}
 
-	min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
+static int topup_mmu_memcache(struct kvm_vcpu *vcpu, void *memcache)
+{
+	int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
 
 	if (!is_protected_kvm_enabled())
-		return kvm_mmu_topup_memory_cache(*memcache, min_pages);
+		return kvm_mmu_topup_memory_cache(memcache, min_pages);
 
-	return topup_hyp_memcache(*memcache, min_pages);
+	return topup_hyp_memcache(memcache, min_pages);
 }
 
 /*
@@ -1543,54 +1541,63 @@ static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
  * TLB invalidation from the guest and used to limit the invalidation scope if a
  * TTL hint or a range isn't provided.
  */
-static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
-				      enum kvm_pgtable_prot *prot,
-				      bool *writable)
+static enum kvm_pgtable_prot adjust_nested_fault_perms(struct kvm_s2_trans *nested,
+						       enum kvm_pgtable_prot prot)
 {
-	*writable &= kvm_s2_trans_writable(nested);
+	if (!kvm_s2_trans_writable(nested))
+		prot &= ~KVM_PGTABLE_PROT_W;
+
 	if (!kvm_s2_trans_readable(nested))
-		*prot &= ~KVM_PGTABLE_PROT_R;
+		prot &= ~KVM_PGTABLE_PROT_R;
 
-	*prot |= kvm_encode_nested_level(nested);
+	return prot | kvm_encode_nested_level(nested);
 }
 
-static void adjust_nested_exec_perms(struct kvm *kvm,
-				     struct kvm_s2_trans *nested,
-				     enum kvm_pgtable_prot *prot)
+static enum kvm_pgtable_prot adjust_nested_exec_perms(struct kvm *kvm,
+						      struct kvm_s2_trans *nested,
+						      enum kvm_pgtable_prot prot)
 {
 	if (!kvm_s2_trans_exec_el0(kvm, nested))
-		*prot &= ~KVM_PGTABLE_PROT_UX;
+		prot &= ~KVM_PGTABLE_PROT_UX;
 	if (!kvm_s2_trans_exec_el1(kvm, nested))
-		*prot &= ~KVM_PGTABLE_PROT_PX;
+		prot &= ~KVM_PGTABLE_PROT_PX;
+
+	return prot;
 }
 
-static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-		      struct kvm_s2_trans *nested,
-		      struct kvm_memory_slot *memslot, bool is_perm)
+struct kvm_s2_fault_desc {
+	struct kvm_vcpu *vcpu;
+	phys_addr_t fault_ipa;
+	struct kvm_s2_trans *nested;
+	struct kvm_memory_slot *memslot;
+	unsigned long hva;
+};
+
+static int gmem_abort(const struct kvm_s2_fault_desc *s2fd)
 {
-	bool write_fault, exec_fault, writable;
+	bool write_fault, exec_fault;
 	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
-	struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+	struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt;
 	unsigned long mmu_seq;
 	struct page *page;
-	struct kvm *kvm = vcpu->kvm;
+	struct kvm *kvm = s2fd->vcpu->kvm;
 	void *memcache;
 	kvm_pfn_t pfn;
 	gfn_t gfn;
 	int ret;
 
-	ret = prepare_mmu_memcache(vcpu, true, &memcache);
+	memcache = get_mmu_memcache(s2fd->vcpu);
+	ret = topup_mmu_memcache(s2fd->vcpu, memcache);
 	if (ret)
 		return ret;
 
-	if (nested)
-		gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
+	if (s2fd->nested)
+		gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT;
 	else
-		gfn = fault_ipa >> PAGE_SHIFT;
+		gfn = s2fd->fault_ipa >> PAGE_SHIFT;
 
-	write_fault = kvm_is_write_fault(vcpu);
-	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
+	write_fault = kvm_is_write_fault(s2fd->vcpu);
+	exec_fault = kvm_vcpu_trap_is_exec_fault(s2fd->vcpu);
 
 	VM_WARN_ON_ONCE(write_fault && exec_fault);
@@ -1598,26 +1605,24 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
 	smp_rmb();
 
-	ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
+	ret = kvm_gmem_get_pfn(kvm, s2fd->memslot, gfn, &pfn, &page, NULL);
 	if (ret) {
-		kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
+		kvm_prepare_memory_fault_exit(s2fd->vcpu, s2fd->fault_ipa, PAGE_SIZE,
 					      write_fault, exec_fault, false);
 		return ret;
 	}
 
-	writable = !(memslot->flags & KVM_MEM_READONLY);
-
-	if (nested)
-		adjust_nested_fault_perms(nested, &prot, &writable);
-
-	if (writable)
+	if (!(s2fd->memslot->flags & KVM_MEM_READONLY))
 		prot |= KVM_PGTABLE_PROT_W;
 
+	if (s2fd->nested)
+		prot = adjust_nested_fault_perms(s2fd->nested, prot);
+
 	if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
 		prot |= KVM_PGTABLE_PROT_X;
 
-	if (nested)
-		adjust_nested_exec_perms(kvm, nested, &prot);
+	if (s2fd->nested)
+		prot = adjust_nested_exec_perms(kvm, s2fd->nested, prot);
 
 	kvm_fault_lock(kvm);
 	if (mmu_invalidate_retry(kvm, mmu_seq)) {
@@ -1625,85 +1630,53 @@
 		goto out_unlock;
 	}
 
-	ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
+	ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE,
 						 __pfn_to_phys(pfn), prot,
 						 memcache, flags);
 
 out_unlock:
-	kvm_release_faultin_page(kvm, page, !!ret, writable);
+	kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W);
 	kvm_fault_unlock(kvm);
 
-	if (writable && !ret)
-		mark_page_dirty_in_slot(kvm, memslot, gfn);
+	if ((prot & KVM_PGTABLE_PROT_W) && !ret)
+		mark_page_dirty_in_slot(kvm, s2fd->memslot, gfn);
 
 	return ret != -EAGAIN ? ret : 0;
 }
 
-static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-			  struct kvm_s2_trans *nested,
-			  struct kvm_memory_slot *memslot, unsigned long hva,
-			  bool fault_is_perm)
+struct kvm_s2_fault_vma_info {
+	unsigned long mmu_seq;
+	long vma_pagesize;
+	vm_flags_t vm_flags;
+	unsigned long max_map_size;
+	struct page *page;
+	kvm_pfn_t pfn;
+	gfn_t gfn;
+	bool device;
+	bool mte_allowed;
+	bool is_vma_cacheable;
+	bool map_writable;
+	bool map_non_cacheable;
+};
+
+static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd,
+				     struct kvm_s2_fault_vma_info *s2vi,
+				     struct vm_area_struct *vma)
 {
-	int ret = 0;
-	bool topup_memcache;
-	bool write_fault, writable;
-	bool exec_fault, mte_allowed, is_vma_cacheable;
-	bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
-	unsigned long mmu_seq;
-	phys_addr_t ipa = fault_ipa;
-	struct kvm *kvm = vcpu->kvm;
-	struct vm_area_struct *vma;
 	short vma_shift;
-	void *memcache;
-	gfn_t gfn;
-	kvm_pfn_t pfn;
-	bool logging_active = memslot_is_logging(memslot);
-	bool force_pte = logging_active;
-	long vma_pagesize, fault_granule;
-	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
-	struct kvm_pgtable *pgt;
-	struct page *page;
-	vm_flags_t vm_flags;
-	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
-
-	if (fault_is_perm)
-		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
-	write_fault = kvm_is_write_fault(vcpu);
-	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
-	VM_WARN_ON_ONCE(write_fault && exec_fault);
-
-	/*
-	 * Permission faults just need to update the existing leaf entry,
-	 * and so normally don't require allocations from the memcache. The
-	 * only exception to this is when dirty logging is enabled at runtime
-	 * and a write fault needs to collapse a block entry into a table.
-	 */
-	topup_memcache = !fault_is_perm || (logging_active && write_fault);
-	ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
-	if (ret)
-		return ret;
 
-	/*
-	 * Let's check if we will get back a huge page backed by hugetlbfs, or
-	 * get block mapping for device MMIO region.
-	 */
-	mmap_read_lock(current->mm);
-	vma = vma_lookup(current->mm, hva);
-	if (unlikely(!vma)) {
-		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
-		mmap_read_unlock(current->mm);
-		return -EFAULT;
-	}
-
-	if (force_pte)
+	if (memslot_is_logging(s2fd->memslot)) {
+		s2vi->max_map_size = PAGE_SIZE;
 		vma_shift = PAGE_SHIFT;
-	else
-		vma_shift = get_vma_page_shift(vma, hva);
+	} else {
+		s2vi->max_map_size = PUD_SIZE;
+		vma_shift = get_vma_page_shift(vma, s2fd->hva);
+	}
 
 	switch (vma_shift) {
 #ifndef __PAGETABLE_PMD_FOLDED
 	case PUD_SHIFT:
-		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+		if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PUD_SIZE))
 			break;
 		fallthrough;
 #endif
@@ -1711,12 +1684,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		vma_shift = PMD_SHIFT;
 		fallthrough;
 	case PMD_SHIFT:
-		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
+		if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PMD_SIZE))
 			break;
 		fallthrough;
 	case CONT_PTE_SHIFT:
 		vma_shift = PAGE_SHIFT;
-		force_pte = true;
+		s2vi->max_map_size = PAGE_SIZE;
 		fallthrough;
 	case PAGE_SHIFT:
 		break;
@@ -1724,21 +1697,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
 	}
 
-	vma_pagesize = 1UL << vma_shift;
-
-	if (nested) {
+	if (s2fd->nested) {
 		unsigned long max_map_size;
 
-		max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;
-
-		ipa = kvm_s2_trans_output(nested);
+		max_map_size = min(s2vi->max_map_size, PUD_SIZE);
 
 		/*
 		 * If we're about to create a shadow stage 2 entry, then we
 		 * can only create a block mapping if the guest stage 2 page
 		 * table uses at least as big a mapping.
 		 */
-		max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
+		max_map_size = min(kvm_s2_trans_size(s2fd->nested), max_map_size);
 
 		/*
 		 * Be careful that if the mapping size falls between
@@ -1749,30 +1718,46 @@
 		else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
 			max_map_size = PAGE_SIZE;
 
-		force_pte = (max_map_size == PAGE_SIZE);
-		vma_pagesize = min_t(long, vma_pagesize, max_map_size);
-		vma_shift = __ffs(vma_pagesize);
+		s2vi->max_map_size = max_map_size;
+		vma_shift = min_t(short, vma_shift, __ffs(max_map_size));
+	}
+
+	return vma_shift;
+}
+
+static bool kvm_s2_fault_is_perm(const struct kvm_s2_fault_desc *s2fd)
+{
+	return kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
+}
+
+static int kvm_s2_fault_get_vma_info(const struct kvm_s2_fault_desc *s2fd,
+				     struct kvm_s2_fault_vma_info *s2vi)
+{
+	struct vm_area_struct *vma;
+	struct kvm *kvm = s2fd->vcpu->kvm;
+
+	mmap_read_lock(current->mm);
+	vma = vma_lookup(current->mm, s2fd->hva);
+	if (unlikely(!vma)) {
+		kvm_err("Failed to find VMA for hva 0x%lx\n", s2fd->hva);
+		mmap_read_unlock(current->mm);
+		return -EFAULT;
 	}
 
+	s2vi->vma_pagesize = BIT(kvm_s2_resolve_vma_size(s2fd, s2vi, vma));
+
 	/*
 	 * Both the canonical IPA and fault IPA must be aligned to the
 	 * mapping size to ensure we find the right PFN and lay down the
 	 * mapping in the right place.
 	 */
-	fault_ipa = ALIGN_DOWN(fault_ipa, vma_pagesize);
-	ipa = ALIGN_DOWN(ipa, vma_pagesize);
-
-	gfn = ipa >> PAGE_SHIFT;
-	mte_allowed = kvm_vma_mte_allowed(vma);
-
-	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
+	s2vi->gfn = ALIGN_DOWN(s2fd->fault_ipa, s2vi->vma_pagesize) >> PAGE_SHIFT;
 
-	vm_flags = vma->vm_flags;
+	s2vi->mte_allowed = kvm_vma_mte_allowed(vma);
 
-	is_vma_cacheable = kvm_vma_is_cacheable(vma);
+	s2vi->vm_flags = vma->vm_flags;
 
-	/* Don't use the VMA after the unlock -- it may have vanished */
-	vma = NULL;
+	s2vi->is_vma_cacheable = kvm_vma_is_cacheable(vma);
 
 	/*
 	 * Read mmu_invalidate_seq so that KVM can detect if the results of
@@ -1782,24 +1767,50 @@
 	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
 	 * with the smp_wmb() in kvm_mmu_invalidate_end().
 	 */
-	mmu_seq = kvm->mmu_invalidate_seq;
+	s2vi->mmu_seq = kvm->mmu_invalidate_seq;
 	mmap_read_unlock(current->mm);
 
-	pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
-				&writable, &page);
-	if (pfn == KVM_PFN_ERR_HWPOISON) {
-		kvm_send_hwpoison_signal(hva, vma_shift);
-		return 0;
-	}
-	if (is_error_noslot_pfn(pfn))
+	return 0;
+}
+
+static gfn_t get_canonical_gfn(const struct kvm_s2_fault_desc *s2fd,
+			       const struct kvm_s2_fault_vma_info *s2vi)
+{
+	phys_addr_t ipa;
+
+	if (!s2fd->nested)
+		return s2vi->gfn;
+
+	ipa = kvm_s2_trans_output(s2fd->nested);
+	return ALIGN_DOWN(ipa, s2vi->vma_pagesize) >> PAGE_SHIFT;
+}
+
+static int kvm_s2_fault_pin_pfn(const struct kvm_s2_fault_desc *s2fd,
+				struct kvm_s2_fault_vma_info *s2vi)
+{
+	int ret;
+
+	ret = kvm_s2_fault_get_vma_info(s2fd, s2vi);
+	if (ret)
+		return ret;
+
+	s2vi->pfn = __kvm_faultin_pfn(s2fd->memslot, get_canonical_gfn(s2fd, s2vi),
+				      kvm_is_write_fault(s2fd->vcpu) ? FOLL_WRITE : 0,
+				      &s2vi->map_writable, &s2vi->page);
+	if (unlikely(is_error_noslot_pfn(s2vi->pfn))) {
+		if (s2vi->pfn == KVM_PFN_ERR_HWPOISON) {
+			kvm_send_hwpoison_signal(s2fd->hva, __ffs(s2vi->vma_pagesize));
+			return 0;
+		}
 		return -EFAULT;
+	}
 
 	/*
 	 * Check if this is non-struct page memory PFN, and cannot support
 	 * CMOs. It could potentially be unsafe to access as cacheable.
 	 */
-	if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) {
-		if (is_vma_cacheable) {
+	if (s2vi->vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(s2vi->pfn)) {
+		if (s2vi->is_vma_cacheable) {
 			/*
 			 * Whilst the VMA owner expects cacheable mapping to this
 			 * PFN, hardware also has to support the FWB and CACHE DIC
@@ -1812,8 +1823,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			 * S2FWB and CACHE DIC are mandatory to avoid the need for
 			 * cache maintenance.
 			 */
-			if (!kvm_supports_cacheable_pfnmap())
-				ret = -EFAULT;
+			if (!kvm_supports_cacheable_pfnmap()) {
+				kvm_release_faultin_page(s2fd->vcpu->kvm, s2vi->page, true, false);
+				return -EFAULT;
+			}
 		} else {
 			/*
 			 * If the page was identified as device early by looking at
@@ -1825,21 +1838,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			 * In both cases, we don't let transparent_hugepage_adjust()
 			 * change things at the last minute.
 			 */
-			s2_force_noncacheable = true;
+			s2vi->map_non_cacheable = true;
 		}
-	} else if (logging_active && !write_fault) {
-		/*
-		 * Only actually map the page as writable if this was a write
-		 * fault.
-		 */
-		writable = false;
+
+		s2vi->device = true;
 	}
 
-	if (exec_fault && s2_force_noncacheable)
-		ret = -ENOEXEC;
+	return 1;
+}
 
-	if (ret)
-		goto out_put_page;
+static int kvm_s2_fault_compute_prot(const struct kvm_s2_fault_desc *s2fd,
+				     const struct kvm_s2_fault_vma_info *s2vi,
+				     enum kvm_pgtable_prot *prot)
+{
+	struct kvm *kvm = s2fd->vcpu->kvm;
+
+	if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu) && s2vi->map_non_cacheable)
+		return -ENOEXEC;
 
 	/*
 	 * Guest performs atomic/exclusive operations on memory with unsupported
@@ -1847,99 +1862,167 @@
 	 * and trigger the exception here. Since the memslot is valid, inject
 	 * the fault back to the guest.
 	 */
-	if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(vcpu))) {
-		kvm_inject_dabt_excl_atomic(vcpu, kvm_vcpu_get_hfar(vcpu));
-		ret = 1;
-		goto out_put_page;
+	if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(s2fd->vcpu))) {
+		kvm_inject_dabt_excl_atomic(s2fd->vcpu, kvm_vcpu_get_hfar(s2fd->vcpu));
+		return 1;
+	}
+
+	*prot = KVM_PGTABLE_PROT_R;
+
+	if (s2vi->map_writable && (s2vi->device ||
+				   !memslot_is_logging(s2fd->memslot) ||
+				   kvm_is_write_fault(s2fd->vcpu)))
+		*prot |= KVM_PGTABLE_PROT_W;
+
+	if (s2fd->nested)
+		*prot = adjust_nested_fault_perms(s2fd->nested, *prot);
+
+	if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu))
+		*prot |= KVM_PGTABLE_PROT_X;
+
+	if (s2vi->map_non_cacheable)
+		*prot |= (s2vi->vm_flags & VM_ALLOW_ANY_UNCACHED) ?
+			 KVM_PGTABLE_PROT_NORMAL_NC : KVM_PGTABLE_PROT_DEVICE;
+	else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
+		*prot |= KVM_PGTABLE_PROT_X;
+
+	if (s2fd->nested)
+		*prot = adjust_nested_exec_perms(kvm, s2fd->nested, *prot);
+
+	if (!kvm_s2_fault_is_perm(s2fd) && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) {
+		/* Check the VMM hasn't introduced a new disallowed VMA */
+		if (!s2vi->mte_allowed)
+			return -EFAULT;
 	}
 
-	if (nested)
-		adjust_nested_fault_perms(nested, &prot, &writable);
+	return 0;
+}
+
+static int kvm_s2_fault_map(const struct kvm_s2_fault_desc *s2fd,
+			    const struct kvm_s2_fault_vma_info *s2vi,
+			    enum kvm_pgtable_prot prot,
+			    void *memcache)
+{
+	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
+	bool writable = prot & KVM_PGTABLE_PROT_W;
+	struct kvm *kvm = s2fd->vcpu->kvm;
+	struct kvm_pgtable *pgt;
+	long perm_fault_granule;
+	long mapping_size;
+	kvm_pfn_t pfn;
+	gfn_t gfn;
+	int ret;
 
 	kvm_fault_lock(kvm);
-	pgt = vcpu->arch.hw_mmu->pgt;
-	if (mmu_invalidate_retry(kvm, mmu_seq)) {
-		ret = -EAGAIN;
+	pgt = s2fd->vcpu->arch.hw_mmu->pgt;
+	ret = -EAGAIN;
+	if (mmu_invalidate_retry(kvm, s2vi->mmu_seq))
 		goto out_unlock;
-	}
+
+	perm_fault_granule = (kvm_s2_fault_is_perm(s2fd) ?
+			      kvm_vcpu_trap_get_perm_fault_granule(s2fd->vcpu) : 0);
+	mapping_size = s2vi->vma_pagesize;
+	pfn = s2vi->pfn;
+	gfn = s2vi->gfn;
 
 	/*
 	 * If we are not forced to use page mapping, check if we are
 	 * backed by a THP and thus use block mapping if possible.
 	 */
-	if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) {
-		if (fault_is_perm && fault_granule > PAGE_SIZE)
-			vma_pagesize = fault_granule;
-		else
-			vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
-								   hva, &pfn,
-								   &fault_ipa);
-
-		if (vma_pagesize < 0) {
-			ret = vma_pagesize;
-			goto out_unlock;
-		}
-	}
-
-	if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
-		/* Check the VMM hasn't introduced a new disallowed VMA */
-		if (mte_allowed) {
-			sanitise_mte_tags(kvm, pfn, vma_pagesize);
+	if (mapping_size == PAGE_SIZE &&
+	    !(s2vi->max_map_size == PAGE_SIZE || s2vi->map_non_cacheable)) {
+		if (perm_fault_granule > PAGE_SIZE) {
+			mapping_size = perm_fault_granule;
 		} else {
-			ret = -EFAULT;
-			goto out_unlock;
+			mapping_size = transparent_hugepage_adjust(kvm, s2fd->memslot,
+								   s2fd->hva, &pfn,
+								   &gfn);
+			if (mapping_size < 0) {
+				ret = mapping_size;
+				goto out_unlock;
+			}
 		}
 	}
 
-	if (writable)
-		prot |= KVM_PGTABLE_PROT_W;
-
-	if (exec_fault)
-		prot |= KVM_PGTABLE_PROT_X;
-
-	if (s2_force_noncacheable) {
-		if (vfio_allow_any_uc)
-			prot |= KVM_PGTABLE_PROT_NORMAL_NC;
-		else
-			prot |= KVM_PGTABLE_PROT_DEVICE;
-	} else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
-		prot |= KVM_PGTABLE_PROT_X;
-	}
-
-	if (nested)
-		adjust_nested_exec_perms(kvm, nested, &prot);
+	if (!perm_fault_granule && !s2vi->map_non_cacheable && kvm_has_mte(kvm))
+		sanitise_mte_tags(kvm, pfn, mapping_size);
 
 	/*
 	 * Under the premise of getting a FSC_PERM fault, we just need to relax
-	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
+	 * permissions only if mapping_size equals perm_fault_granule. Otherwise,
 	 * kvm_pgtable_stage2_map() should be called to change block size.
 	 */
-	if (fault_is_perm && vma_pagesize == fault_granule) {
+	if (mapping_size == perm_fault_granule) {
 		/*
 		 * Drop the SW bits in favour of those stored in the
 		 * PTE, which will be preserved.
 		 */
 		prot &= ~KVM_NV_GUEST_MAP_SZ;
-		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
+		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, gfn_to_gpa(gfn),
+								 prot, flags);
 	} else {
-		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
-							 __pfn_to_phys(pfn), prot,
-							 memcache, flags);
+		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, gfn_to_gpa(gfn), mapping_size,
+							 __pfn_to_phys(pfn), prot,
+							 memcache, flags);
 	}
 
 out_unlock:
-	kvm_release_faultin_page(kvm, page, !!ret, writable);
+	kvm_release_faultin_page(kvm, s2vi->page, !!ret, writable);
 	kvm_fault_unlock(kvm);
 
-	/* Mark the page dirty only if the fault is handled successfully */
-	if (writable && !ret)
-		mark_page_dirty_in_slot(kvm, memslot, gfn);
+	/*
+	 * Mark the page dirty only if the fault is handled successfully,
	 * making sure we adjust the canonical IPA if the mapping size has
+	 * been updated (via a THP upgrade, for example).
+	 */
+	if (writable && !ret) {
+		phys_addr_t ipa = gfn_to_gpa(get_canonical_gfn(s2fd, s2vi));
+
+		ipa &= ~(mapping_size - 1);
+		mark_page_dirty_in_slot(kvm, s2fd->memslot, gpa_to_gfn(ipa));
+	}
+
+	if (ret != -EAGAIN)
+		return ret;
+	return 0;
+}
 
-	return ret != -EAGAIN ? ret : 0;
+static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd)
+{
+	bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
+	struct kvm_s2_fault_vma_info s2vi = {};
+	enum kvm_pgtable_prot prot;
+	void *memcache;
+	int ret;
 
-out_put_page:
-	kvm_release_page_unused(page);
-	return ret;
+	/*
+	 * Permission faults just need to update the existing leaf entry,
+	 * and so normally don't require allocations from the memcache. The
+	 * only exception to this is when dirty logging is enabled at runtime
+	 * and a write fault needs to collapse a block entry into a table.
+	 */
+	memcache = get_mmu_memcache(s2fd->vcpu);
+	if (!perm_fault || (memslot_is_logging(s2fd->memslot) &&
+			    kvm_is_write_fault(s2fd->vcpu))) {
+		ret = topup_mmu_memcache(s2fd->vcpu, memcache);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Let's check if we will get back a huge page backed by hugetlbfs, or
+	 * get block mapping for device MMIO region.
+	 */
+	ret = kvm_s2_fault_pin_pfn(s2fd, &s2vi);
+	if (ret != 1)
+		return ret;
+
+	ret = kvm_s2_fault_compute_prot(s2fd, &s2vi, &prot);
+	if (ret) {
+		kvm_release_page_unused(s2vi.page);
+		return ret;
+	}
+
+	return kvm_s2_fault_map(s2fd, &s2vi, prot, memcache);
 }
 
 /* Resolve the access fault by making the page young again. */
@@ -2205,12 +2288,19 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 	VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
 			!write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
 
+	const struct kvm_s2_fault_desc s2fd = {
+		.vcpu = vcpu,
+		.fault_ipa = fault_ipa,
+		.nested = nested,
+		.memslot = memslot,
+		.hva = hva,
+	};
+
 	if (kvm_slot_has_gmem(memslot))
-		ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
-				 esr_fsc_is_permission_fault(esr));
+		ret = gmem_abort(&s2fd);
 	else
-		ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
-				     esr_fsc_is_permission_fault(esr));
+		ret = user_mem_abort(&s2fd);
+
 	if (ret == 0)
 		ret = 1;
 out:
```
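One design note worth calling out (cf. "KVM: arm64: Replace force_pte with a max_map_size attribute" above): instead of a boolean that every later sizing decision must remember to consult, callers now narrow s2vi->max_map_size, and the final mapping size falls out of a min()-and-round-down chain. Below is a minimal, standalone illustration of that pattern — hypothetical helper and size macros, not kernel code:

```c
#include <stdio.h>

/* Stand-ins for a 4KiB-granule host's PAGE_SIZE/PMD_SIZE/PUD_SIZE. */
#define SZ_4K	(4UL << 10)
#define SZ_2M	(2UL << 20)
#define SZ_1G	(1UL << 30)

/*
 * Hypothetical equivalent of the diff's sizing logic: clamp the
 * VMA-derived size to max_map_size, then round down to a supported
 * block size so we never map "in between" granules.
 */
static unsigned long resolve_map_size(unsigned long vma_size,
				      unsigned long max_map_size)
{
	unsigned long size = vma_size < max_map_size ? vma_size : max_map_size;

	if (size >= SZ_2M && size < SZ_1G)
		size = SZ_2M;
	else if (size < SZ_2M)
		size = SZ_4K;
	return size;
}

int main(void)
{
	/* Dirty logging forces 4K by clamping, not by a separate flag. */
	printf("%lu\n", resolve_map_size(SZ_1G, SZ_4K));	/* 4096 */
	/* A nested guest using 2M mappings caps a 1G THP at 2M. */
	printf("%lu\n", resolve_map_size(SZ_1G, SZ_2M));	/* 2097152 */
	return 0;
}
```

The payoff is the same as for the value-returning adjust_nested_*_perms() conversion: state flows in one direction, so a later refactor cannot silently skip the clamp the way it could skip a forgotten force_pte check.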
