author	Marc Zyngier <maz@kernel.org>	2026-04-08 12:23:45 +0100
committer	Marc Zyngier <maz@kernel.org>	2026-04-08 12:23:45 +0100
commit	64f2fa630d7f7e1b87018a3623a75d11e718db94 (patch)
tree	7f9c109036672b07a2a760b984c446d9d1a4babd
parent	b693940e81318d5cf4432afc97be5e22e541e2fd (diff)
parent	e9550374d13a4bfd0b8a711733f5d423c2e56b96 (diff)
Merge branch kvm-arm64/user_mem_abort-rework into kvmarm-master/next
* kvm-arm64/user_mem_abort-rework: (30 commits)
  : .
  : user_mem_abort() has become an absolute pain to maintain,
  : to the point that each single fix is likely to introduce
  : *two* new bugs.
  :
  : Deconstruct the whole thing in logical units, reducing
  : the amount of visible and/or mutable state between functions,
  : and finally making the code a bit more maintainable.
  : .
  KVM: arm64: Convert gmem_abort() to struct kvm_s2_fault_desc
  KVM: arm64: Simplify integration of adjust_nested_*_perms()
  KVM: arm64: Directly expose mapping prot and kill kvm_s2_fault
  KVM: arm64: Move device mapping management into kvm_s2_fault_pin_pfn()
  KVM: arm64: Replace force_pte with a max_map_size attribute
  KVM: arm64: Move kvm_s2_fault.{pfn,page} to kvm_s2_vma_info
  KVM: arm64: Restrict the scope of the 'writable' attribute
  KVM: arm64: Kill logging_active from kvm_s2_fault
  KVM: arm64: Move VMA-related information to kvm_s2_fault_vma_info
  KVM: arm64: Kill topup_memcache from kvm_s2_fault
  KVM: arm64: Kill exec_fault from kvm_s2_fault
  KVM: arm64: Kill write_fault from kvm_s2_fault
  KVM: arm64: Constrain fault_granule to kvm_s2_fault_map()
  KVM: arm64: Replace fault_is_perm with a helper
  KVM: arm64: Move fault context to const structure
  KVM: arm64: Make fault_ipa immutable
  KVM: arm64: Kill fault->ipa
  KVM: arm64: Clean up control flow in kvm_s2_fault_map()
  KVM: arm64: Hoist MTE validation check out of MMU lock path
  KVM: arm64: Optimize early exit checks in kvm_s2_fault_pin_pfn()
  ...

Signed-off-by: Marc Zyngier <maz@kernel.org>
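The shape of the rework is easier to see outside the diff. Below is a
minimal, self-contained userspace sketch (not kernel code) of the
pattern the series applies: the per-fault context that never changes is
packaged into a const descriptor, the state the handler derives lives
in a separate struct, and the handler becomes a short pipeline of
helpers. All names here (fault_desc, fault_state, pin_pfn, map_fault)
are simplified stand-ins, not the real kvm_s2_* structures and helpers
in arch/arm64/kvm/mmu.c.

/* build: cc -Wall sketch.c && ./a.out */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Immutable fault context: filled once, then only passed by const *. */
struct fault_desc {
	uint64_t fault_ipa;	/* guest IPA that faulted */
	uint64_t hva;		/* host VA backing the fault */
	bool	 write_fault;
};

/* Mutable resolution state, scoped to the pipeline below. */
struct fault_state {
	uint64_t pfn;
	long	 map_size;
	bool	 writable;
};

static int pin_pfn(const struct fault_desc *fd, struct fault_state *st)
{
	/* Stand-in for the real pinning/translation work. */
	st->pfn = fd->hva >> 12;
	st->map_size = 4096;
	st->writable = fd->write_fault;
	return 0;
}

static int map_fault(const struct fault_desc *fd,
		     const struct fault_state *st)
{
	printf("map ipa=0x%llx pfn=0x%llx size=%ld %s\n",
	       (unsigned long long)fd->fault_ipa,
	       (unsigned long long)st->pfn, st->map_size,
	       st->writable ? "rw" : "ro");
	return 0;
}

int main(void)
{
	const struct fault_desc fd = {
		.fault_ipa = 0x80000000ULL,
		.hva = 0x7f0000001000ULL,
		.write_fault = true,
	};
	struct fault_state st = { 0 };

	/* The handler is now a pipeline over well-scoped state. */
	if (pin_pfn(&fd, &st))
		return 1;
	return map_fault(&fd, &st);
}

Because neither helper can grow hidden side effects on the descriptor,
each step of the series (killing write_fault, exec_fault and
logging_active as struct members, and so on) only has to reason about
one function at a time.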
-rw-r--r--	arch/arm64/kvm/mmu.c	530
1 file changed, 310 insertions, 220 deletions
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 17d64a1e11e5..03e1f389339c 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1400,10 +1400,10 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
*/
static long
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
- unsigned long hva, kvm_pfn_t *pfnp,
- phys_addr_t *ipap)
+ unsigned long hva, kvm_pfn_t *pfnp, gfn_t *gfnp)
{
kvm_pfn_t pfn = *pfnp;
+ gfn_t gfn = *gfnp;
/*
* Make sure the adjustment is done only for THP pages. Also make
@@ -1419,7 +1419,8 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (sz < PMD_SIZE)
return PAGE_SIZE;
- *ipap &= PMD_MASK;
+ gfn &= ~(PTRS_PER_PMD - 1);
+ *gfnp = gfn;
pfn &= ~(PTRS_PER_PMD - 1);
*pfnp = pfn;
@@ -1512,25 +1513,22 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
}
}
-static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
- void **memcache)
+static void *get_mmu_memcache(struct kvm_vcpu *vcpu)
{
- int min_pages;
-
if (!is_protected_kvm_enabled())
- *memcache = &vcpu->arch.mmu_page_cache;
+ return &vcpu->arch.mmu_page_cache;
else
- *memcache = &vcpu->arch.pkvm_memcache;
-
- if (!topup_memcache)
- return 0;
+ return &vcpu->arch.pkvm_memcache;
+}
- min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
+static int topup_mmu_memcache(struct kvm_vcpu *vcpu, void *memcache)
+{
+ int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
if (!is_protected_kvm_enabled())
- return kvm_mmu_topup_memory_cache(*memcache, min_pages);
+ return kvm_mmu_topup_memory_cache(memcache, min_pages);
- return topup_hyp_memcache(*memcache, min_pages);
+ return topup_hyp_memcache(memcache, min_pages);
}
/*
@@ -1543,54 +1541,63 @@ static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
* TLB invalidation from the guest and used to limit the invalidation scope if a
* TTL hint or a range isn't provided.
*/
-static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
- enum kvm_pgtable_prot *prot,
- bool *writable)
+static enum kvm_pgtable_prot adjust_nested_fault_perms(struct kvm_s2_trans *nested,
+ enum kvm_pgtable_prot prot)
{
- *writable &= kvm_s2_trans_writable(nested);
+ if (!kvm_s2_trans_writable(nested))
+ prot &= ~KVM_PGTABLE_PROT_W;
if (!kvm_s2_trans_readable(nested))
- *prot &= ~KVM_PGTABLE_PROT_R;
+ prot &= ~KVM_PGTABLE_PROT_R;
- *prot |= kvm_encode_nested_level(nested);
+ return prot | kvm_encode_nested_level(nested);
}
-static void adjust_nested_exec_perms(struct kvm *kvm,
- struct kvm_s2_trans *nested,
- enum kvm_pgtable_prot *prot)
+static enum kvm_pgtable_prot adjust_nested_exec_perms(struct kvm *kvm,
+ struct kvm_s2_trans *nested,
+ enum kvm_pgtable_prot prot)
{
if (!kvm_s2_trans_exec_el0(kvm, nested))
- *prot &= ~KVM_PGTABLE_PROT_UX;
+ prot &= ~KVM_PGTABLE_PROT_UX;
if (!kvm_s2_trans_exec_el1(kvm, nested))
- *prot &= ~KVM_PGTABLE_PROT_PX;
+ prot &= ~KVM_PGTABLE_PROT_PX;
+
+ return prot;
}
-static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
- struct kvm_s2_trans *nested,
- struct kvm_memory_slot *memslot, bool is_perm)
+struct kvm_s2_fault_desc {
+ struct kvm_vcpu *vcpu;
+ phys_addr_t fault_ipa;
+ struct kvm_s2_trans *nested;
+ struct kvm_memory_slot *memslot;
+ unsigned long hva;
+};
+
+static int gmem_abort(const struct kvm_s2_fault_desc *s2fd)
{
- bool write_fault, exec_fault, writable;
+ bool write_fault, exec_fault;
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
- struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+ struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt;
unsigned long mmu_seq;
struct page *page;
- struct kvm *kvm = vcpu->kvm;
+ struct kvm *kvm = s2fd->vcpu->kvm;
void *memcache;
kvm_pfn_t pfn;
gfn_t gfn;
int ret;
- ret = prepare_mmu_memcache(vcpu, true, &memcache);
+ memcache = get_mmu_memcache(s2fd->vcpu);
+ ret = topup_mmu_memcache(s2fd->vcpu, memcache);
if (ret)
return ret;
- if (nested)
- gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
+ if (s2fd->nested)
+ gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT;
else
- gfn = fault_ipa >> PAGE_SHIFT;
+ gfn = s2fd->fault_ipa >> PAGE_SHIFT;
- write_fault = kvm_is_write_fault(vcpu);
- exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
+ write_fault = kvm_is_write_fault(s2fd->vcpu);
+ exec_fault = kvm_vcpu_trap_is_exec_fault(s2fd->vcpu);
VM_WARN_ON_ONCE(write_fault && exec_fault);
@@ -1598,26 +1605,24 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
smp_rmb();
- ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
+ ret = kvm_gmem_get_pfn(kvm, s2fd->memslot, gfn, &pfn, &page, NULL);
if (ret) {
- kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
+ kvm_prepare_memory_fault_exit(s2fd->vcpu, s2fd->fault_ipa, PAGE_SIZE,
write_fault, exec_fault, false);
return ret;
}
- writable = !(memslot->flags & KVM_MEM_READONLY);
-
- if (nested)
- adjust_nested_fault_perms(nested, &prot, &writable);
-
- if (writable)
+ if (!(s2fd->memslot->flags & KVM_MEM_READONLY))
prot |= KVM_PGTABLE_PROT_W;
+ if (s2fd->nested)
+ prot = adjust_nested_fault_perms(s2fd->nested, prot);
+
if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
- if (nested)
- adjust_nested_exec_perms(kvm, nested, &prot);
+ if (s2fd->nested)
+ prot = adjust_nested_exec_perms(kvm, s2fd->nested, prot);
kvm_fault_lock(kvm);
if (mmu_invalidate_retry(kvm, mmu_seq)) {
@@ -1625,85 +1630,53 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
goto out_unlock;
}
- ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE,
__pfn_to_phys(pfn), prot,
memcache, flags);
out_unlock:
- kvm_release_faultin_page(kvm, page, !!ret, writable);
+ kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W);
kvm_fault_unlock(kvm);
- if (writable && !ret)
- mark_page_dirty_in_slot(kvm, memslot, gfn);
+ if ((prot & KVM_PGTABLE_PROT_W) && !ret)
+ mark_page_dirty_in_slot(kvm, s2fd->memslot, gfn);
return ret != -EAGAIN ? ret : 0;
}
-static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
- struct kvm_s2_trans *nested,
- struct kvm_memory_slot *memslot, unsigned long hva,
- bool fault_is_perm)
+struct kvm_s2_fault_vma_info {
+ unsigned long mmu_seq;
+ long vma_pagesize;
+ vm_flags_t vm_flags;
+ unsigned long max_map_size;
+ struct page *page;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+ bool device;
+ bool mte_allowed;
+ bool is_vma_cacheable;
+ bool map_writable;
+ bool map_non_cacheable;
+};
+
+static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd,
+ struct kvm_s2_fault_vma_info *s2vi,
+ struct vm_area_struct *vma)
{
- int ret = 0;
- bool topup_memcache;
- bool write_fault, writable;
- bool exec_fault, mte_allowed, is_vma_cacheable;
- bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
- unsigned long mmu_seq;
- phys_addr_t ipa = fault_ipa;
- struct kvm *kvm = vcpu->kvm;
- struct vm_area_struct *vma;
short vma_shift;
- void *memcache;
- gfn_t gfn;
- kvm_pfn_t pfn;
- bool logging_active = memslot_is_logging(memslot);
- bool force_pte = logging_active;
- long vma_pagesize, fault_granule;
- enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
- struct kvm_pgtable *pgt;
- struct page *page;
- vm_flags_t vm_flags;
- enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
-
- if (fault_is_perm)
- fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
- write_fault = kvm_is_write_fault(vcpu);
- exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
- VM_WARN_ON_ONCE(write_fault && exec_fault);
-
- /*
- * Permission faults just need to update the existing leaf entry,
- * and so normally don't require allocations from the memcache. The
- * only exception to this is when dirty logging is enabled at runtime
- * and a write fault needs to collapse a block entry into a table.
- */
- topup_memcache = !fault_is_perm || (logging_active && write_fault);
- ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
- if (ret)
- return ret;
- /*
- * Let's check if we will get back a huge page backed by hugetlbfs, or
- * get block mapping for device MMIO region.
- */
- mmap_read_lock(current->mm);
- vma = vma_lookup(current->mm, hva);
- if (unlikely(!vma)) {
- kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
- mmap_read_unlock(current->mm);
- return -EFAULT;
- }
-
- if (force_pte)
+ if (memslot_is_logging(s2fd->memslot)) {
+ s2vi->max_map_size = PAGE_SIZE;
vma_shift = PAGE_SHIFT;
- else
- vma_shift = get_vma_page_shift(vma, hva);
+ } else {
+ s2vi->max_map_size = PUD_SIZE;
+ vma_shift = get_vma_page_shift(vma, s2fd->hva);
+ }
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SHIFT:
- if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+ if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PUD_SIZE))
break;
fallthrough;
#endif
@@ -1711,12 +1684,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vma_shift = PMD_SHIFT;
fallthrough;
case PMD_SHIFT:
- if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
+ if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PMD_SIZE))
break;
fallthrough;
case CONT_PTE_SHIFT:
vma_shift = PAGE_SHIFT;
- force_pte = true;
+ s2vi->max_map_size = PAGE_SIZE;
fallthrough;
case PAGE_SHIFT:
break;
@@ -1724,21 +1697,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
}
- vma_pagesize = 1UL << vma_shift;
-
- if (nested) {
+ if (s2fd->nested) {
unsigned long max_map_size;
- max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;
-
- ipa = kvm_s2_trans_output(nested);
+ max_map_size = min(s2vi->max_map_size, PUD_SIZE);
/*
* If we're about to create a shadow stage 2 entry, then we
* can only create a block mapping if the guest stage 2 page
* table uses at least as big a mapping.
*/
- max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
+ max_map_size = min(kvm_s2_trans_size(s2fd->nested), max_map_size);
/*
* Be careful that if the mapping size falls between
@@ -1749,30 +1718,46 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
max_map_size = PAGE_SIZE;
- force_pte = (max_map_size == PAGE_SIZE);
- vma_pagesize = min_t(long, vma_pagesize, max_map_size);
- vma_shift = __ffs(vma_pagesize);
+ s2vi->max_map_size = max_map_size;
+ vma_shift = min_t(short, vma_shift, __ffs(max_map_size));
+ }
+
+ return vma_shift;
+}
+
+static bool kvm_s2_fault_is_perm(const struct kvm_s2_fault_desc *s2fd)
+{
+ return kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
+}
+
+static int kvm_s2_fault_get_vma_info(const struct kvm_s2_fault_desc *s2fd,
+ struct kvm_s2_fault_vma_info *s2vi)
+{
+ struct vm_area_struct *vma;
+ struct kvm *kvm = s2fd->vcpu->kvm;
+
+ mmap_read_lock(current->mm);
+ vma = vma_lookup(current->mm, s2fd->hva);
+ if (unlikely(!vma)) {
+ kvm_err("Failed to find VMA for hva 0x%lx\n", s2fd->hva);
+ mmap_read_unlock(current->mm);
+ return -EFAULT;
}
+ s2vi->vma_pagesize = BIT(kvm_s2_resolve_vma_size(s2fd, s2vi, vma));
+
/*
* Both the canonical IPA and fault IPA must be aligned to the
* mapping size to ensure we find the right PFN and lay down the
* mapping in the right place.
*/
- fault_ipa = ALIGN_DOWN(fault_ipa, vma_pagesize);
- ipa = ALIGN_DOWN(ipa, vma_pagesize);
-
- gfn = ipa >> PAGE_SHIFT;
- mte_allowed = kvm_vma_mte_allowed(vma);
-
- vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
+ s2vi->gfn = ALIGN_DOWN(s2fd->fault_ipa, s2vi->vma_pagesize) >> PAGE_SHIFT;
- vm_flags = vma->vm_flags;
+ s2vi->mte_allowed = kvm_vma_mte_allowed(vma);
- is_vma_cacheable = kvm_vma_is_cacheable(vma);
+ s2vi->vm_flags = vma->vm_flags;
- /* Don't use the VMA after the unlock -- it may have vanished */
- vma = NULL;
+ s2vi->is_vma_cacheable = kvm_vma_is_cacheable(vma);
/*
* Read mmu_invalidate_seq so that KVM can detect if the results of
@@ -1782,24 +1767,50 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
* with the smp_wmb() in kvm_mmu_invalidate_end().
*/
- mmu_seq = kvm->mmu_invalidate_seq;
+ s2vi->mmu_seq = kvm->mmu_invalidate_seq;
mmap_read_unlock(current->mm);
- pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
- &writable, &page);
- if (pfn == KVM_PFN_ERR_HWPOISON) {
- kvm_send_hwpoison_signal(hva, vma_shift);
- return 0;
- }
- if (is_error_noslot_pfn(pfn))
+ return 0;
+}
+
+static gfn_t get_canonical_gfn(const struct kvm_s2_fault_desc *s2fd,
+ const struct kvm_s2_fault_vma_info *s2vi)
+{
+ phys_addr_t ipa;
+
+ if (!s2fd->nested)
+ return s2vi->gfn;
+
+ ipa = kvm_s2_trans_output(s2fd->nested);
+ return ALIGN_DOWN(ipa, s2vi->vma_pagesize) >> PAGE_SHIFT;
+}
+
+static int kvm_s2_fault_pin_pfn(const struct kvm_s2_fault_desc *s2fd,
+ struct kvm_s2_fault_vma_info *s2vi)
+{
+ int ret;
+
+ ret = kvm_s2_fault_get_vma_info(s2fd, s2vi);
+ if (ret)
+ return ret;
+
+ s2vi->pfn = __kvm_faultin_pfn(s2fd->memslot, get_canonical_gfn(s2fd, s2vi),
+ kvm_is_write_fault(s2fd->vcpu) ? FOLL_WRITE : 0,
+ &s2vi->map_writable, &s2vi->page);
+ if (unlikely(is_error_noslot_pfn(s2vi->pfn))) {
+ if (s2vi->pfn == KVM_PFN_ERR_HWPOISON) {
+ kvm_send_hwpoison_signal(s2fd->hva, __ffs(s2vi->vma_pagesize));
+ return 0;
+ }
return -EFAULT;
+ }
/*
* Check if this is non-struct page memory PFN, and cannot support
* CMOs. It could potentially be unsafe to access as cacheable.
*/
- if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) {
- if (is_vma_cacheable) {
+ if (s2vi->vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(s2vi->pfn)) {
+ if (s2vi->is_vma_cacheable) {
/*
* Whilst the VMA owner expects cacheable mapping to this
* PFN, hardware also has to support the FWB and CACHE DIC
@@ -1812,8 +1823,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* S2FWB and CACHE DIC are mandatory to avoid the need for
* cache maintenance.
*/
- if (!kvm_supports_cacheable_pfnmap())
- ret = -EFAULT;
+ if (!kvm_supports_cacheable_pfnmap()) {
+ kvm_release_faultin_page(s2fd->vcpu->kvm, s2vi->page, true, false);
+ return -EFAULT;
+ }
} else {
/*
* If the page was identified as device early by looking at
@@ -1825,21 +1838,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* In both cases, we don't let transparent_hugepage_adjust()
* change things at the last minute.
*/
- s2_force_noncacheable = true;
+ s2vi->map_non_cacheable = true;
}
- } else if (logging_active && !write_fault) {
- /*
- * Only actually map the page as writable if this was a write
- * fault.
- */
- writable = false;
+
+ s2vi->device = true;
}
- if (exec_fault && s2_force_noncacheable)
- ret = -ENOEXEC;
+ return 1;
+}
- if (ret)
- goto out_put_page;
+static int kvm_s2_fault_compute_prot(const struct kvm_s2_fault_desc *s2fd,
+ const struct kvm_s2_fault_vma_info *s2vi,
+ enum kvm_pgtable_prot *prot)
+{
+ struct kvm *kvm = s2fd->vcpu->kvm;
+
+ if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu) && s2vi->map_non_cacheable)
+ return -ENOEXEC;
/*
* Guest performs atomic/exclusive operations on memory with unsupported
@@ -1847,99 +1862,167 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* and trigger the exception here. Since the memslot is valid, inject
* the fault back to the guest.
*/
- if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(vcpu))) {
- kvm_inject_dabt_excl_atomic(vcpu, kvm_vcpu_get_hfar(vcpu));
- ret = 1;
- goto out_put_page;
+ if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(s2fd->vcpu))) {
+ kvm_inject_dabt_excl_atomic(s2fd->vcpu, kvm_vcpu_get_hfar(s2fd->vcpu));
+ return 1;
+ }
+
+ *prot = KVM_PGTABLE_PROT_R;
+
+ if (s2vi->map_writable && (s2vi->device ||
+ !memslot_is_logging(s2fd->memslot) ||
+ kvm_is_write_fault(s2fd->vcpu)))
+ *prot |= KVM_PGTABLE_PROT_W;
+
+ if (s2fd->nested)
+ *prot = adjust_nested_fault_perms(s2fd->nested, *prot);
+
+ if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu))
+ *prot |= KVM_PGTABLE_PROT_X;
+
+ if (s2vi->map_non_cacheable)
+ *prot |= (s2vi->vm_flags & VM_ALLOW_ANY_UNCACHED) ?
+ KVM_PGTABLE_PROT_NORMAL_NC : KVM_PGTABLE_PROT_DEVICE;
+ else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
+ *prot |= KVM_PGTABLE_PROT_X;
+
+ if (s2fd->nested)
+ *prot = adjust_nested_exec_perms(kvm, s2fd->nested, *prot);
+
+ if (!kvm_s2_fault_is_perm(s2fd) && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) {
+ /* Check the VMM hasn't introduced a new disallowed VMA */
+ if (!s2vi->mte_allowed)
+ return -EFAULT;
}
- if (nested)
- adjust_nested_fault_perms(nested, &prot, &writable);
+ return 0;
+}
+
+static int kvm_s2_fault_map(const struct kvm_s2_fault_desc *s2fd,
+ const struct kvm_s2_fault_vma_info *s2vi,
+ enum kvm_pgtable_prot prot,
+ void *memcache)
+{
+ enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
+ bool writable = prot & KVM_PGTABLE_PROT_W;
+ struct kvm *kvm = s2fd->vcpu->kvm;
+ struct kvm_pgtable *pgt;
+ long perm_fault_granule;
+ long mapping_size;
+ kvm_pfn_t pfn;
+ gfn_t gfn;
+ int ret;
kvm_fault_lock(kvm);
- pgt = vcpu->arch.hw_mmu->pgt;
- if (mmu_invalidate_retry(kvm, mmu_seq)) {
- ret = -EAGAIN;
+ pgt = s2fd->vcpu->arch.hw_mmu->pgt;
+ ret = -EAGAIN;
+ if (mmu_invalidate_retry(kvm, s2vi->mmu_seq))
goto out_unlock;
- }
+
+ perm_fault_granule = (kvm_s2_fault_is_perm(s2fd) ?
+ kvm_vcpu_trap_get_perm_fault_granule(s2fd->vcpu) : 0);
+ mapping_size = s2vi->vma_pagesize;
+ pfn = s2vi->pfn;
+ gfn = s2vi->gfn;
/*
* If we are not forced to use page mapping, check if we are
* backed by a THP and thus use block mapping if possible.
*/
- if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) {
- if (fault_is_perm && fault_granule > PAGE_SIZE)
- vma_pagesize = fault_granule;
- else
- vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
- hva, &pfn,
- &fault_ipa);
-
- if (vma_pagesize < 0) {
- ret = vma_pagesize;
- goto out_unlock;
- }
- }
-
- if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
- /* Check the VMM hasn't introduced a new disallowed VMA */
- if (mte_allowed) {
- sanitise_mte_tags(kvm, pfn, vma_pagesize);
+ if (mapping_size == PAGE_SIZE &&
+ !(s2vi->max_map_size == PAGE_SIZE || s2vi->map_non_cacheable)) {
+ if (perm_fault_granule > PAGE_SIZE) {
+ mapping_size = perm_fault_granule;
} else {
- ret = -EFAULT;
- goto out_unlock;
+ mapping_size = transparent_hugepage_adjust(kvm, s2fd->memslot,
+ s2fd->hva, &pfn,
+ &gfn);
+ if (mapping_size < 0) {
+ ret = mapping_size;
+ goto out_unlock;
+ }
}
}
- if (writable)
- prot |= KVM_PGTABLE_PROT_W;
-
- if (exec_fault)
- prot |= KVM_PGTABLE_PROT_X;
-
- if (s2_force_noncacheable) {
- if (vfio_allow_any_uc)
- prot |= KVM_PGTABLE_PROT_NORMAL_NC;
- else
- prot |= KVM_PGTABLE_PROT_DEVICE;
- } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
- prot |= KVM_PGTABLE_PROT_X;
- }
-
- if (nested)
- adjust_nested_exec_perms(kvm, nested, &prot);
+ if (!perm_fault_granule && !s2vi->map_non_cacheable && kvm_has_mte(kvm))
+ sanitise_mte_tags(kvm, pfn, mapping_size);
/*
* Under the premise of getting a FSC_PERM fault, we just need to relax
- * permissions only if vma_pagesize equals fault_granule. Otherwise,
+ * permissions only if mapping_size equals perm_fault_granule. Otherwise,
* kvm_pgtable_stage2_map() should be called to change block size.
*/
- if (fault_is_perm && vma_pagesize == fault_granule) {
+ if (mapping_size == perm_fault_granule) {
/*
* Drop the SW bits in favour of those stored in the
* PTE, which will be preserved.
*/
prot &= ~KVM_NV_GUEST_MAP_SZ;
- ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, gfn_to_gpa(gfn),
+ prot, flags);
} else {
- ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
- __pfn_to_phys(pfn), prot,
- memcache, flags);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, gfn_to_gpa(gfn), mapping_size,
+ __pfn_to_phys(pfn), prot,
+ memcache, flags);
}
out_unlock:
- kvm_release_faultin_page(kvm, page, !!ret, writable);
+ kvm_release_faultin_page(kvm, s2vi->page, !!ret, writable);
kvm_fault_unlock(kvm);
- /* Mark the page dirty only if the fault is handled successfully */
- if (writable && !ret)
- mark_page_dirty_in_slot(kvm, memslot, gfn);
+ /*
+ * Mark the page dirty only if the fault is handled successfully,
+ * making sure we adjust the canonical IPA if the mapping size has
+ * been updated (via a THP upgrade, for example).
+ */
+ if (writable && !ret) {
+ phys_addr_t ipa = gfn_to_gpa(get_canonical_gfn(s2fd, s2vi));
+ ipa &= ~(mapping_size - 1);
+ mark_page_dirty_in_slot(kvm, s2fd->memslot, gpa_to_gfn(ipa));
+ }
+
+ if (ret != -EAGAIN)
+ return ret;
+ return 0;
+}
- return ret != -EAGAIN ? ret : 0;
+static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd)
+{
+ bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu);
+ struct kvm_s2_fault_vma_info s2vi = {};
+ enum kvm_pgtable_prot prot;
+ void *memcache;
+ int ret;
-out_put_page:
- kvm_release_page_unused(page);
- return ret;
+ /*
+ * Permission faults just need to update the existing leaf entry,
+ * and so normally don't require allocations from the memcache. The
+ * only exception to this is when dirty logging is enabled at runtime
+ * and a write fault needs to collapse a block entry into a table.
+ */
+ memcache = get_mmu_memcache(s2fd->vcpu);
+ if (!perm_fault || (memslot_is_logging(s2fd->memslot) &&
+ kvm_is_write_fault(s2fd->vcpu))) {
+ ret = topup_mmu_memcache(s2fd->vcpu, memcache);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Let's check if we will get back a huge page backed by hugetlbfs, or
+ * get block mapping for device MMIO region.
+ */
+ ret = kvm_s2_fault_pin_pfn(s2fd, &s2vi);
+ if (ret != 1)
+ return ret;
+
+ ret = kvm_s2_fault_compute_prot(s2fd, &s2vi, &prot);
+ if (ret) {
+ kvm_release_page_unused(s2vi.page);
+ return ret;
+ }
+
+ return kvm_s2_fault_map(s2fd, &s2vi, prot, memcache);
}
/* Resolve the access fault by making the page young again. */
@@ -2205,12 +2288,19 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
!write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
+ const struct kvm_s2_fault_desc s2fd = {
+ .vcpu = vcpu,
+ .fault_ipa = fault_ipa,
+ .nested = nested,
+ .memslot = memslot,
+ .hva = hva,
+ };
+
if (kvm_slot_has_gmem(memslot))
- ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
- esr_fsc_is_permission_fault(esr));
+ ret = gmem_abort(&s2fd);
else
- ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
- esr_fsc_is_permission_fault(esr));
+ ret = user_mem_abort(&s2fd);
+
if (ret == 0)
ret = 1;
out: