diff options
Diffstat (limited to 'virt/kvm/arm/mmu.c')
-rw-r--r-- | virt/kvm/arm/mmu.c | 125 |
1 files changed, 73 insertions, 52 deletions
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index ffd7acdceac7..27c958306449 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -102,8 +102,7 @@ static bool kvm_is_device_pfn(unsigned long pfn) * @addr: IPA * @pmd: pmd pointer for IPA * - * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all - * pages in the range dirty. + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. */ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) { @@ -121,8 +120,7 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) * @addr: IPA * @pud: pud pointer for IPA * - * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all - * pages in the range dirty. + * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. */ static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) { @@ -899,9 +897,8 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. * @kvm: The KVM struct pointer for the VM. * - * Allocates only the stage-2 HW PGD level table(s) (can support either full - * 40-bit input addresses or limited to 32-bit input addresses). Clears the - * allocated pages. + * Allocates only the stage-2 HW PGD level table(s) of size defined by + * stage2_pgd_size(kvm). * * Note we don't need locking here as this is only called when the VM is * created, which can only be done once. @@ -1067,25 +1064,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache { pmd_t *pmd, old_pmd; +retry: pmd = stage2_get_pmd(kvm, cache, addr); VM_BUG_ON(!pmd); old_pmd = *pmd; + /* + * Multiple vcpus faulting on the same PMD entry, can + * lead to them sequentially updating the PMD with the + * same value. Following the break-before-make + * (pmd_clear() followed by tlb_flush()) process can + * hinder forward progress due to refaults generated + * on missing translations. + * + * Skip updating the page table if the entry is + * unchanged. + */ + if (pmd_val(old_pmd) == pmd_val(*new_pmd)) + return 0; + if (pmd_present(old_pmd)) { /* - * Multiple vcpus faulting on the same PMD entry, can - * lead to them sequentially updating the PMD with the - * same value. Following the break-before-make - * (pmd_clear() followed by tlb_flush()) process can - * hinder forward progress due to refaults generated - * on missing translations. + * If we already have PTE level mapping for this block, + * we must unmap it to avoid inconsistent TLB state and + * leaking the table page. We could end up in this situation + * if the memory slot was marked for dirty logging and was + * reverted, leaving PTE level mappings for the pages accessed + * during the period. So, unmap the PTE level mapping for this + * block and retry, as we could have released the upper level + * table in the process. * - * Skip updating the page table if the entry is - * unchanged. + * Normal THP split/merge follows mmu_notifier callbacks and do + * get handled accordingly. */ - if (pmd_val(old_pmd) == pmd_val(*new_pmd)) - return 0; - + if (!pmd_thp_or_huge(old_pmd)) { + unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE); + goto retry; + } /* * Mapping in huge pages should only happen through a * fault. If a page is merged into a transparent huge @@ -1097,8 +1112,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache * should become splitting first, unmapped, merged, * and mapped back in on-demand. */ - VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); - + WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); } else { @@ -1114,6 +1128,7 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac { pud_t *pudp, old_pud; +retry: pudp = stage2_get_pud(kvm, cache, addr); VM_BUG_ON(!pudp); @@ -1121,14 +1136,23 @@ static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cac /* * A large number of vcpus faulting on the same stage 2 entry, - * can lead to a refault due to the - * stage2_pud_clear()/tlb_flush(). Skip updating the page - * tables if there is no change. + * can lead to a refault due to the stage2_pud_clear()/tlb_flush(). + * Skip updating the page tables if there is no change. */ if (pud_val(old_pud) == pud_val(*new_pudp)) return 0; if (stage2_pud_present(kvm, old_pud)) { + /* + * If we already have table level mapping for this block, unmap + * the range for this block and retry. + */ + if (!stage2_pud_huge(kvm, old_pud)) { + unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE); + goto retry; + } + + WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp)); stage2_pud_clear(kvm, pudp); kvm_tlb_flush_vmid_ipa(kvm, addr); } else { @@ -1451,13 +1475,11 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, } /** - * stage2_wp_puds - write protect PGD range - * @pgd: pointer to pgd entry - * @addr: range start address - * @end: range end address - * - * Process PUD entries, for a huge PUD we cause a panic. - */ + * stage2_wp_puds - write protect PGD range + * @pgd: pointer to pgd entry + * @addr: range start address + * @end: range end address + */ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr, phys_addr_t end) { @@ -1594,8 +1616,9 @@ static void kvm_send_hwpoison_signal(unsigned long address, send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); } -static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, - unsigned long hva) +static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, + unsigned long hva, + unsigned long map_size) { gpa_t gpa_start; hva_t uaddr_start, uaddr_end; @@ -1610,34 +1633,34 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, /* * Pages belonging to memslots that don't have the same alignment - * within a PMD for userspace and IPA cannot be mapped with stage-2 - * PMD entries, because we'll end up mapping the wrong pages. + * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 + * PMD/PUD entries, because we'll end up mapping the wrong pages. * * Consider a layout like the following: * * memslot->userspace_addr: * +-----+--------------------+--------------------+---+ - * |abcde|fgh Stage-1 PMD | Stage-1 PMD tv|xyz| + * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| * +-----+--------------------+--------------------+---+ * * memslot->base_gfn << PAGE_SIZE: * +---+--------------------+--------------------+-----+ - * |abc|def Stage-2 PMD | Stage-2 PMD |tvxyz| + * |abc|def Stage-2 block | Stage-2 block |tvxyz| * +---+--------------------+--------------------+-----+ * - * If we create those stage-2 PMDs, we'll end up with this incorrect + * If we create those stage-2 blocks, we'll end up with this incorrect * mapping: * d -> f * e -> g * f -> h */ - if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK)) + if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) return false; /* * Next, let's make sure we're not trying to map anything not covered - * by the memslot. This means we have to prohibit PMD size mappings - * for the beginning and end of a non-PMD aligned and non-PMD sized + * by the memslot. This means we have to prohibit block size mappings + * for the beginning and end of a non-block aligned and non-block sized * memory slot (illustrated by the head and tail parts of the * userspace view above containing pages 'abcde' and 'xyz', * respectively). @@ -1646,8 +1669,8 @@ static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot, * userspace_addr or the base_gfn, as both are equally aligned (per * the check above) and equally sized. */ - return (hva & S2_PMD_MASK) >= uaddr_start && - (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end; + return (hva & ~(map_size - 1)) >= uaddr_start && + (hva & ~(map_size - 1)) + map_size <= uaddr_end; } static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, @@ -1676,12 +1699,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - if (!fault_supports_stage2_pmd_mappings(memslot, hva)) - force_pte = true; - - if (logging_active) - force_pte = true; - /* Let's check if we will get back a huge page backed by hugetlbfs */ down_read(¤t->mm->mmap_sem); vma = find_vma_intersection(current->mm, hva, hva + 1); @@ -1692,6 +1709,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } vma_pagesize = vma_kernel_pagesize(vma); + if (logging_active || + !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) { + force_pte = true; + vma_pagesize = PAGE_SIZE; + } + /* * The stage2 has a minimum of 2 level table (For arm64 see * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can @@ -1699,11 +1722,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * As for PUD huge maps, we must make sure that we have at least * 3 levels, i.e, PMD is not folded. */ - if ((vma_pagesize == PMD_SIZE || - (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) && - !force_pte) { + if (vma_pagesize == PMD_SIZE || + (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; - } up_read(¤t->mm->mmap_sem); /* We need minimum second+third level pages */ |