diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 43 | ||||
-rw-r--r-- | mm/gup.c | 46 | ||||
-rw-r--r-- | mm/huge_memory.c | 9 | ||||
-rw-r--r-- | mm/hugetlb.c | 29 | ||||
-rw-r--r-- | mm/kasan/kasan_init.c | 44 | ||||
-rw-r--r-- | mm/kasan/quarantine.c | 51 | ||||
-rw-r--r-- | mm/madvise.c | 44 | ||||
-rw-r--r-- | mm/memblock.c | 5 | ||||
-rw-r--r-- | mm/memcontrol.c | 18 | ||||
-rw-r--r-- | mm/memory.c | 230 | ||||
-rw-r--r-- | mm/mlock.c | 10 | ||||
-rw-r--r-- | mm/mprotect.c | 26 | ||||
-rw-r--r-- | mm/mremap.c | 13 | ||||
-rw-r--r-- | mm/page_alloc.c | 3 | ||||
-rw-r--r-- | mm/page_vma_mapped.c | 6 | ||||
-rw-r--r-- | mm/pagewalk.c | 32 | ||||
-rw-r--r-- | mm/percpu-vm.c | 7 | ||||
-rw-r--r-- | mm/percpu.c | 5 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 6 | ||||
-rw-r--r-- | mm/rmap.c | 20 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 22 | ||||
-rw-r--r-- | mm/swapfile.c | 26 | ||||
-rw-r--r-- | mm/userfaultfd.c | 23 | ||||
-rw-r--r-- | mm/vmalloc.c | 81 | ||||
-rw-r--r-- | mm/vmstat.c | 3 |
25 files changed, 649 insertions, 153 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 6d861d090e9f..c6f2a37028c2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -683,33 +683,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { struct radix_tree_iter iter; - struct rb_node *rbn; void **slot; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); - radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); - - while ((rbn = rb_first(&bdi->cgwb_congested_tree))) { - struct bdi_writeback_congested *congested = - rb_entry(rbn, struct bdi_writeback_congested, rb_node); - - rb_erase(rbn, &bdi->cgwb_congested_tree); - congested->bdi = NULL; /* mark @congested unlinked */ - } - spin_unlock_irq(&cgwb_lock); /* - * All cgwb's and their congested states must be shutdown and - * released before returning. Drain the usage counter to wait for - * all cgwb's and cgwb_congested's ever created on @bdi. + * All cgwb's must be shutdown and released before returning. Drain + * the usage counter to wait for all cgwb's ever created on @bdi. */ atomic_dec(&bdi->usage_cnt); wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); + /* + * Grab back our reference so that we hold it when @bdi gets + * re-registered. + */ + atomic_inc(&bdi->usage_cnt); } /** @@ -749,6 +742,21 @@ void wb_blkcg_offline(struct blkcg *blkcg) spin_unlock_irq(&cgwb_lock); } +static void cgwb_bdi_exit(struct backing_dev_info *bdi) +{ + struct rb_node *rbn; + + spin_lock_irq(&cgwb_lock); + while ((rbn = rb_first(&bdi->cgwb_congested_tree))) { + struct bdi_writeback_congested *congested = + rb_entry(rbn, struct bdi_writeback_congested, rb_node); + + rb_erase(rbn, &bdi->cgwb_congested_tree); + congested->bdi = NULL; /* mark @congested unlinked */ + } + spin_unlock_irq(&cgwb_lock); +} + #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) @@ -769,7 +777,9 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) return 0; } -static void cgwb_bdi_destroy(struct backing_dev_info *bdi) +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } + +static void cgwb_bdi_exit(struct backing_dev_info *bdi) { wb_congested_put(bdi->wb_congested); } @@ -857,6 +867,8 @@ int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) MINOR(owner->devt)); if (rc) return rc; + /* Leaking owner reference... */ + WARN_ON(bdi->owner); bdi->owner = owner; get_device(owner); return 0; @@ -898,6 +910,7 @@ static void bdi_exit(struct backing_dev_info *bdi) { WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); + cgwb_bdi_exit(bdi); } static void release_bdi(struct kref *ref) @@ -226,6 +226,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma, unsigned int *page_mask) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; spinlock_t *ptl; @@ -243,8 +244,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma, pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); - - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d)) + return no_page_table(vma, flags); + BUILD_BUG_ON(p4d_huge(*p4d)); + if (unlikely(p4d_bad(*p4d))) + return no_page_table(vma, flags); + pud = pud_offset(p4d, address); if (pud_none(*pud)) return no_page_table(vma, flags); if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { @@ -325,6 +331,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, struct page **page) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -338,7 +345,9 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, else pgd = pgd_offset_gate(mm, address); BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + BUG_ON(p4d_none(*p4d)); + pud = pud_offset(p4d, address); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) @@ -1400,13 +1409,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, return 1; } -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, +static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; - pudp = pud_offset(&pgd, addr); + pudp = pud_offset(&p4d, addr); do { pud_t pud = READ_ONCE(*pudp); @@ -1428,6 +1437,31 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + p4d_t *p4dp; + + p4dp = p4d_offset(&pgd, addr); + do { + p4d_t p4d = READ_ONCE(*p4dp); + + next = p4d_addr_end(addr, end); + if (p4d_none(p4d)) + return 0; + BUILD_BUG_ON(p4d_huge(p4d)); + if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { + if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, + P4D_SHIFT, next, write, pages, nr)) + return 0; + } else if (!gup_pud_range(p4d, addr, next, write, pages, nr)) + return 0; + } while (p4dp++, addr = next, addr != end); + + return 1; +} + /* * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. It will only return non-negative values. @@ -1478,7 +1512,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, PGDIR_SHIFT, next, write, pages, &nr)) break; - } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + } else if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d36b2af4d1bf..1ebc93e179f3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1828,7 +1828,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); - count_vm_event(THP_SPLIT_PMD); + count_vm_event(THP_SPLIT_PUD); pudp_huge_clear_flush_notify(vma, haddr, pud); } @@ -2048,6 +2048,7 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -2055,7 +2056,11 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, if (!pgd_present(*pgd)) return; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return; + + pud = pud_offset(p4d, address); if (!pud_present(*pud)) return; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a7aa811b7d14..3d0aab9ee80d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4555,7 +4555,8 @@ out: int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { pgd_t *pgd = pgd_offset(mm, *addr); - pud_t *pud = pud_offset(pgd, *addr); + p4d_t *p4d = p4d_offset(pgd, *addr); + pud_t *pud = pud_offset(p4d, *addr); BUG_ON(page_count(virt_to_page(ptep)) == 0); if (page_count(virt_to_page(ptep)) == 1) @@ -4586,11 +4587,13 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); + p4d = p4d_offset(pgd, addr); + pud = pud_alloc(mm, p4d, addr); if (pud) { if (sz == PUD_SIZE) { pte = (pte_t *)pud; @@ -4610,18 +4613,22 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; - pmd_t *pmd = NULL; + pmd_t *pmd; pgd = pgd_offset(mm, addr); - if (pgd_present(*pgd)) { - pud = pud_offset(pgd, addr); - if (pud_present(*pud)) { - if (pud_huge(*pud)) - return (pte_t *)pud; - pmd = pmd_offset(pud, addr); - } - } + if (!pgd_present(*pgd)) + return NULL; + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return NULL; + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return NULL; + if (pud_huge(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, addr); return (pte_t *) pmd; } diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index 31238dad85fb..b96a5f773d88 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -30,6 +30,9 @@ */ unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; +#if CONFIG_PGTABLE_LEVELS > 4 +p4d_t kasan_zero_p4d[PTRS_PER_P4D] __page_aligned_bss; +#endif #if CONFIG_PGTABLE_LEVELS > 3 pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; #endif @@ -82,10 +85,10 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, } while (pmd++, addr = next, addr != end); } -static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr, +static void __init zero_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end) { - pud_t *pud = pud_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); unsigned long next; do { @@ -107,6 +110,23 @@ static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr, } while (pud++, addr = next, addr != end); } +static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, + unsigned long end) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + unsigned long next; + + do { + next = p4d_addr_end(addr, end); + + if (p4d_none(*p4d)) { + p4d_populate(&init_mm, p4d, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + zero_pud_populate(p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + /** * kasan_populate_zero_shadow - populate shadow memory region with * kasan_zero_page @@ -125,6 +145,7 @@ void __init kasan_populate_zero_shadow(const void *shadow_start, next = pgd_addr_end(addr, end); if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -135,9 +156,22 @@ void __init kasan_populate_zero_shadow(const void *shadow_start, * 3,2 - level page tables where we don't have * puds,pmds, so pgd_populate(), pud_populate() * is noops. + * + * The ifndef is required to avoid build breakage. + * + * With 5level-fixup.h, pgd_populate() is not nop and + * we reference kasan_zero_p4d. It's not defined + * unless 5-level paging enabled. + * + * The ifndef can be dropped once all KASAN-enabled + * architectures will switch to pgtable-nop4d.h. */ - pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_pud)); - pud = pud_offset(pgd, addr); +#ifndef __ARCH_HAS_5LEVEL_HACK + pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_p4d)); +#endif + p4d = p4d_offset(pgd, addr); + p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); + pud = pud_offset(p4d, addr); pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); pmd = pmd_offset(pud, addr); pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); @@ -148,6 +182,6 @@ void __init kasan_populate_zero_shadow(const void *shadow_start, pgd_populate(&init_mm, pgd, early_alloc(PAGE_SIZE, NUMA_NO_NODE)); } - zero_pud_populate(pgd, addr, next); + zero_p4d_populate(pgd, addr, next); } while (pgd++, addr = next, addr != end); } diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 6f1ed1630873..3a8ddf8baf7d 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -25,6 +25,7 @@ #include <linux/printk.h> #include <linux/shrinker.h> #include <linux/slab.h> +#include <linux/srcu.h> #include <linux/string.h> #include <linux/types.h> @@ -103,6 +104,7 @@ static int quarantine_tail; /* Total size of all objects in global_quarantine across all batches. */ static unsigned long quarantine_size; static DEFINE_SPINLOCK(quarantine_lock); +DEFINE_STATIC_SRCU(remove_cache_srcu); /* Maximum size of the global queue. */ static unsigned long quarantine_max_size; @@ -173,17 +175,22 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) struct qlist_head *q; struct qlist_head temp = QLIST_INIT; + /* + * Note: irq must be disabled until after we move the batch to the + * global quarantine. Otherwise quarantine_remove_cache() can miss + * some objects belonging to the cache if they are in our local temp + * list. quarantine_remove_cache() executes on_each_cpu() at the + * beginning which ensures that it either sees the objects in per-cpu + * lists or in the global quarantine. + */ local_irq_save(flags); q = this_cpu_ptr(&cpu_quarantine); qlist_put(q, &info->quarantine_link, cache->size); - if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) + if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { qlist_move_all(q, &temp); - local_irq_restore(flags); - - if (unlikely(!qlist_empty(&temp))) { - spin_lock_irqsave(&quarantine_lock, flags); + spin_lock(&quarantine_lock); WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); qlist_move_all(&temp, &global_quarantine[quarantine_tail]); if (global_quarantine[quarantine_tail].bytes >= @@ -196,20 +203,33 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (new_tail != quarantine_head) quarantine_tail = new_tail; } - spin_unlock_irqrestore(&quarantine_lock, flags); + spin_unlock(&quarantine_lock); } + + local_irq_restore(flags); } void quarantine_reduce(void) { size_t total_size, new_quarantine_size, percpu_quarantines; unsigned long flags; + int srcu_idx; struct qlist_head to_free = QLIST_INIT; if (likely(READ_ONCE(quarantine_size) <= READ_ONCE(quarantine_max_size))) return; + /* + * srcu critical section ensures that quarantine_remove_cache() + * will not miss objects belonging to the cache while they are in our + * local to_free list. srcu is chosen because (1) it gives us private + * grace period domain that does not interfere with anything else, + * and (2) it allows synchronize_srcu() to return without waiting + * if there are no pending read critical sections (which is the + * expected case). + */ + srcu_idx = srcu_read_lock(&remove_cache_srcu); spin_lock_irqsave(&quarantine_lock, flags); /* @@ -237,6 +257,7 @@ void quarantine_reduce(void) spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, NULL); + srcu_read_unlock(&remove_cache_srcu, srcu_idx); } static void qlist_move_cache(struct qlist_head *from, @@ -280,12 +301,28 @@ void quarantine_remove_cache(struct kmem_cache *cache) unsigned long flags, i; struct qlist_head to_free = QLIST_INIT; + /* + * Must be careful to not miss any objects that are being moved from + * per-cpu list to the global quarantine in quarantine_put(), + * nor objects being freed in quarantine_reduce(). on_each_cpu() + * achieves the first goal, while synchronize_srcu() achieves the + * second. + */ on_each_cpu(per_cpu_remove_cache, cache, 1); spin_lock_irqsave(&quarantine_lock, flags); - for (i = 0; i < QUARANTINE_BATCHES; i++) + for (i = 0; i < QUARANTINE_BATCHES; i++) { + if (qlist_empty(&global_quarantine[i])) + continue; qlist_move_cache(&global_quarantine[i], &to_free, cache); + /* Scanning whole quarantine can take a while. */ + spin_unlock_irqrestore(&quarantine_lock, flags); + cond_resched(); + spin_lock_irqsave(&quarantine_lock, flags); + } spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, cache); + + synchronize_srcu(&remove_cache_srcu); } diff --git a/mm/madvise.c b/mm/madvise.c index dc5927c812d3..7a2abf0127ae 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -513,7 +513,43 @@ static long madvise_dontneed(struct vm_area_struct *vma, if (!can_madv_dontneed_vma(vma)) return -EINVAL; - userfaultfd_remove(vma, prev, start, end); + if (!userfaultfd_remove(vma, start, end)) { + *prev = NULL; /* mmap_sem has been dropped, prev is stale */ + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, start); + if (!vma) + return -ENOMEM; + if (start < vma->vm_start) { + /* + * This "vma" under revalidation is the one + * with the lowest vma->vm_start where start + * is also < vma->vm_end. If start < + * vma->vm_start it means an hole materialized + * in the user address space within the + * virtual range passed to MADV_DONTNEED. + */ + return -ENOMEM; + } + if (!can_madv_dontneed_vma(vma)) + return -EINVAL; + if (end > vma->vm_end) { + /* + * Don't fail if end > vma->vm_end. If the old + * vma was splitted while the mmap_sem was + * released the effect of the concurrent + * operation may not cause MADV_DONTNEED to + * have an undefined result. There may be an + * adjacent next vma that we'll walk + * next. userfaultfd_remove() will generate an + * UFFD_EVENT_REMOVE repetition on the + * end-vma->vm_end range, but the manager can + * handle a repetition fine. + */ + end = vma->vm_end; + } + VM_WARN_ON(start >= end); + } zap_page_range(vma, start, end - start); return 0; } @@ -554,8 +590,10 @@ static long madvise_remove(struct vm_area_struct *vma, * mmap_sem. */ get_file(f); - userfaultfd_remove(vma, prev, start, end); - up_read(¤t->mm->mmap_sem); + if (userfaultfd_remove(vma, start, end)) { + /* mmap_sem was not released by userfaultfd_remove() */ + up_read(¤t->mm->mmap_sem); + } error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); diff --git a/mm/memblock.c b/mm/memblock.c index b64b47803e52..696f06d17c4e 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1118,7 +1118,10 @@ unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, } } while (left < right); - return min(PHYS_PFN(type->regions[right].base), max_pfn); + if (right == type->cnt) + return max_pfn; + else + return min(PHYS_PFN(type->regions[right].base), max_pfn); } /** diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c52ec893e241..2bd7541d7c11 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -466,6 +466,8 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) struct mem_cgroup_tree_per_node *mctz; mctz = soft_limit_tree_from_page(page); + if (!mctz) + return; /* * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. @@ -503,7 +505,8 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) for_each_node(nid) { mz = mem_cgroup_nodeinfo(memcg, nid); mctz = soft_limit_tree_node(nid); - mem_cgroup_remove_exceeded(mz, mctz); + if (mctz) + mem_cgroup_remove_exceeded(mz, mctz); } } @@ -2558,7 +2561,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, * is empty. Do it lockless to prevent lock bouncing. Races * are acceptable as soft limit is best effort anyway. */ - if (RB_EMPTY_ROOT(&mctz->rb_root)) + if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) return 0; /* @@ -4135,17 +4138,22 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) kfree(memcg->nodeinfo[node]); } -static void mem_cgroup_free(struct mem_cgroup *memcg) +static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - memcg_wb_domain_exit(memcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->stat); kfree(memcg); } +static void mem_cgroup_free(struct mem_cgroup *memcg) +{ + memcg_wb_domain_exit(memcg); + __mem_cgroup_free(memcg); +} + static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; @@ -4196,7 +4204,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) fail: if (memcg->id.id > 0) idr_remove(&mem_cgroup_idr, memcg->id.id); - mem_cgroup_free(memcg); + __mem_cgroup_free(memcg); return NULL; } diff --git a/mm/memory.c b/mm/memory.c index a97a4cec2e1f..235ba51b2fbf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -445,7 +445,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, mm_dec_nr_pmds(tlb->mm); } -static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, +static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -454,7 +454,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long start; start = addr; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) @@ -462,6 +462,39 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); + start &= P4D_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= P4D_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pud = pud_offset(p4d, start); + p4d_clear(p4d); + pud_free_tlb(tlb, pud, start); +} + +static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + p4d_t *p4d; + unsigned long next; + unsigned long start; + + start = addr; + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) + continue; + free_pud_range(tlb, p4d, addr, next, floor, ceiling); + } while (p4d++, addr = next, addr != end); + start &= PGDIR_MASK; if (start < floor) return; @@ -473,9 +506,9 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, if (end - 1 > ceiling - 1) return; - pud = pud_offset(pgd, start); + p4d = p4d_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud, start); + p4d_free_tlb(tlb, p4d, start); } /* @@ -539,7 +572,7 @@ void free_pgd_range(struct mmu_gather *tlb, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - free_pud_range(tlb, pgd, addr, next, floor, ceiling); + free_p4d_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } @@ -658,7 +691,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pte_t pte, struct page *page) { pgd_t *pgd = pgd_offset(vma->vm_mm, addr); - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; @@ -1023,16 +1057,16 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src } static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; unsigned long next; - dst_pud = pud_alloc(dst_mm, dst_pgd, addr); + dst_pud = pud_alloc(dst_mm, dst_p4d, addr); if (!dst_pud) return -ENOMEM; - src_pud = pud_offset(src_pgd, addr); + src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { @@ -1056,6 +1090,28 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src return 0; } +static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + p4d_t *src_p4d, *dst_p4d; + unsigned long next; + + dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr); + if (!dst_p4d) + return -ENOMEM; + src_p4d = p4d_offset(src_pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(src_p4d)) + continue; + if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d, + vma, addr, next)) + return -ENOMEM; + } while (dst_p4d++, src_p4d++, addr = next, addr != end); + return 0; +} + int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct vm_area_struct *vma) { @@ -1111,7 +1167,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; - if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd, vma, addr, next))) { ret = -ENOMEM; break; @@ -1267,14 +1323,14 @@ next: } static inline unsigned long zap_pud_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, pgd_t *pgd, + struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, struct zap_details *details) { pud_t *pud; unsigned long next; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*pud) || pud_devmap(*pud)) { @@ -1295,6 +1351,25 @@ next: return addr; } +static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) + continue; + next = zap_pud_range(tlb, vma, p4d, addr, next, details); + } while (p4d++, addr = next, addr != end); + + return addr; +} + void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, @@ -1310,7 +1385,7 @@ void unmap_page_range(struct mmu_gather *tlb, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - next = zap_pud_range(tlb, vma, pgd, addr, next, details); + next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } @@ -1465,16 +1540,24 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes); pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { - pgd_t *pgd = pgd_offset(mm, addr); - pud_t *pud = pud_alloc(mm, pgd, addr); - if (pud) { - pmd_t *pmd = pmd_alloc(mm, pud, addr); - if (pmd) { - VM_BUG_ON(pmd_trans_huge(*pmd)); - return pte_alloc_map_lock(mm, pmd, addr, ptl); - } - } - return NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return NULL; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return NULL; + + VM_BUG_ON(pmd_trans_huge(*pmd)); + return pte_alloc_map_lock(mm, pmd, addr, ptl); } /* @@ -1740,7 +1823,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, return 0; } -static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, +static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { @@ -1748,7 +1831,7 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, unsigned long next; pfn -= addr >> PAGE_SHIFT; - pud = pud_alloc(mm, pgd, addr); + pud = pud_alloc(mm, p4d, addr); if (!pud) return -ENOMEM; do { @@ -1760,6 +1843,26 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, return 0; } +static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + p4d_t *p4d; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + if (remap_pud_range(mm, p4d, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (p4d++, addr = next, addr != end); + return 0; +} + /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to @@ -1816,7 +1919,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, flush_cache_range(vma, addr, end); do { next = pgd_addr_end(addr, end); - err = remap_pud_range(mm, pgd, addr, next, + err = remap_p4d_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) break; @@ -1932,7 +2035,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, return err; } -static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, +static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) { @@ -1940,7 +2043,7 @@ static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, unsigned long next; int err; - pud = pud_alloc(mm, pgd, addr); + pud = pud_alloc(mm, p4d, addr); if (!pud) return -ENOMEM; do { @@ -1952,6 +2055,26 @@ static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, return err; } +static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + p4d_t *p4d; + unsigned long next; + int err; + + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + err = apply_to_pud_range(mm, p4d, addr, next, fn, data); + if (err) + break; + } while (p4d++, addr = next, addr != end); + return err; +} + /* * Scan a region of virtual memory, filling in page tables as necessary * and calling a provided function on each leaf page table. @@ -1970,7 +2093,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); - err = apply_to_pud_range(mm, pgd, addr, next, fn, data); + err = apply_to_p4d_range(mm, pgd, addr, next, fn, data); if (err) break; } while (pgd++, addr = next, addr != end); @@ -3653,11 +3776,15 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, }; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; + p4d_t *p4d; int ret; pgd = pgd_offset(mm, address); + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) + return VM_FAULT_OOM; - vmf.pud = pud_alloc(mm, pgd, address); + vmf.pud = pud_alloc(mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { @@ -3779,12 +3906,35 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(handle_mm_fault); +#ifndef __PAGETABLE_P4D_FOLDED +/* + * Allocate p4d page table. + * We've already handled the fast-path in-line. + */ +int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + p4d_t *new = p4d_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&mm->page_table_lock); + if (pgd_present(*pgd)) /* Another has populated it */ + p4d_free(mm, new); + else + pgd_populate(mm, pgd, new); + spin_unlock(&mm->page_table_lock); + return 0; +} +#endif /* __PAGETABLE_P4D_FOLDED */ + #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. * We've already handled the fast-path in-line. */ -int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { pud_t *new = pud_alloc_one(mm, address); if (!new) @@ -3793,10 +3943,17 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&mm->page_table_lock); - if (pgd_present(*pgd)) /* Another has populated it */ +#ifndef __ARCH_HAS_5LEVEL_HACK + if (p4d_present(*p4d)) /* Another has populated it */ pud_free(mm, new); else - pgd_populate(mm, pgd, new); + p4d_populate(mm, p4d, new); +#else + if (pgd_present(*p4d)) /* Another has populated it */ + pud_free(mm, new); + else + pgd_populate(mm, p4d, new); +#endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; } @@ -3839,6 +3996,7 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *ptep; @@ -3847,7 +4005,11 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) goto out; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) + goto out; + + pud = pud_offset(p4d, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) goto out; diff --git a/mm/mlock.c b/mm/mlock.c index 1050511f8b2b..0dd9ca18e19e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -380,6 +380,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, pte = get_locked_pte(vma->vm_mm, start, &ptl); /* Make sure we do not cross the page table boundary */ end = pgd_addr_end(start, end); + end = p4d_addr_end(start, end); end = pud_addr_end(start, end); end = pmd_addr_end(start, end); @@ -442,7 +443,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, while (start < end) { struct page *page; - unsigned int page_mask; + unsigned int page_mask = 0; unsigned long page_increm; struct pagevec pvec; struct zone *zone; @@ -456,8 +457,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * suits munlock very well (and if somehow an abnormal page * has sneaked into the range, we won't oops here: great). */ - page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, - &page_mask); + page = follow_page(vma, start, FOLL_GET | FOLL_DUMP); if (page && !IS_ERR(page)) { if (PageTransTail(page)) { @@ -468,8 +468,8 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, /* * Any THP page found by follow_page_mask() may * have gotten split before reaching - * munlock_vma_page(), so we need to recompute - * the page_mask here. + * munlock_vma_page(), so we need to compute + * the page_mask here instead. */ page_mask = munlock_vma_page(page); unlock_page(page); diff --git a/mm/mprotect.c b/mm/mprotect.c index 848e946b08e5..8edd0d576254 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -193,14 +193,14 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, } static inline unsigned long change_pud_range(struct vm_area_struct *vma, - pgd_t *pgd, unsigned long addr, unsigned long end, + p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; unsigned long pages = 0; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) @@ -212,6 +212,26 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, return pages; } +static inline unsigned long change_p4d_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) +{ + p4d_t *p4d; + unsigned long next; + unsigned long pages = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) + continue; + pages += change_pud_range(vma, p4d, addr, next, newprot, + dirty_accountable, prot_numa); + } while (p4d++, addr = next, addr != end); + + return pages; +} + static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -230,7 +250,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - pages += change_pud_range(vma, pgd, addr, next, newprot, + pages += change_p4d_range(vma, pgd, addr, next, newprot, dirty_accountable, prot_numa); } while (pgd++, addr = next, addr != end); diff --git a/mm/mremap.c b/mm/mremap.c index 8233b0105c82..cd8a1b199ef9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -32,6 +32,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -39,7 +40,11 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) if (pgd_none_or_clear_bad(pgd)) return NULL; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d_none_or_clear_bad(p4d)) + return NULL; + + pud = pud_offset(p4d, addr); if (pud_none_or_clear_bad(pud)) return NULL; @@ -54,11 +59,15 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, addr); if (!pud) return NULL; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa64d2ffdc5..6cbde310abed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -873,7 +873,8 @@ done_merging: higher_page = page + (combined_pfn - pfn); buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); higher_buddy = higher_page + (buddy_pfn - combined_pfn); - if (page_is_buddy(higher_page, higher_buddy, order + 1)) { + if (pfn_valid_within(buddy_pfn) && + page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); goto out; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index a23001a22c15..c4c9def8ffea 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -104,6 +104,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) struct mm_struct *mm = pvmw->vma->vm_mm; struct page *page = pvmw->page; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; /* The only possible pmd mapping has been handled on last iteration */ @@ -133,7 +134,10 @@ restart: pgd = pgd_offset(mm, pvmw->address); if (!pgd_present(*pgd)) return false; - pud = pud_offset(pgd, pvmw->address); + p4d = p4d_offset(pgd, pvmw->address); + if (!p4d_present(*p4d)) + return false; + pud = pud_offset(p4d, pvmw->address); if (!pud_present(*pud)) return false; pvmw->pmd = pmd_offset(pud, pvmw->address); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 03761577ae86..60f7856e508f 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -69,14 +69,14 @@ again: return err; } -static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, +static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, struct mm_walk *walk) { pud_t *pud; unsigned long next; int err = 0; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { again: next = pud_addr_end(addr, end); @@ -113,6 +113,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, return err; } +static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + if (walk->pmd_entry || walk->pte_entry) + err = walk_pud_range(p4d, addr, next, walk); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + static int walk_pgd_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -131,7 +157,7 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, continue; } if (walk->pmd_entry || walk->pte_entry) - err = walk_pud_range(pgd, addr, next, walk); + err = walk_p4d_range(pgd, addr, next, walk); if (err) break; } while (pgd++, addr = next, addr != end); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 538998a137d2..9ac639499bd1 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -21,7 +21,6 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, /** * pcpu_get_pages - get temp pages array - * @chunk: chunk of interest * * Returns pointer to array of pointers to struct page which can be indexed * with pcpu_page_idx(). Note that there is only one array and accesses @@ -30,7 +29,7 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, * RETURNS: * Pointer to temp pages array on success. */ -static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) +static struct page **pcpu_get_pages(void) { static struct page **pages; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); @@ -275,7 +274,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, { struct page **pages; - pages = pcpu_get_pages(chunk); + pages = pcpu_get_pages(); if (!pages) return -ENOMEM; @@ -313,7 +312,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, * successful population attempt so the temp pages array must * be available now. */ - pages = pcpu_get_pages(chunk); + pages = pcpu_get_pages(); BUG_ON(!pages); /* unmap and free */ diff --git a/mm/percpu.c b/mm/percpu.c index 5696039b5c07..60a6488e9e6d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1011,8 +1011,11 @@ area_found: mutex_unlock(&pcpu_alloc_mutex); } - if (chunk != pcpu_reserved_chunk) + if (chunk != pcpu_reserved_chunk) { + spin_lock_irqsave(&pcpu_lock, flags); pcpu_nr_empty_pop_pages -= occ_pages; + spin_unlock_irqrestore(&pcpu_lock, flags); + } if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4ed5908c65b0..c99d9512a45b 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -22,6 +22,12 @@ void pgd_clear_bad(pgd_t *pgd) pgd_clear(pgd); } +void p4d_clear_bad(p4d_t *p4d) +{ + p4d_ERROR(*p4d); + p4d_clear(p4d); +} + void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); diff --git a/mm/rmap.c b/mm/rmap.c index 2da487d6cea8..49ed681ccc7b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -684,6 +684,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd = NULL; pmd_t pmde; @@ -692,7 +693,11 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) if (!pgd_present(*pgd)) goto out; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + goto out; + + pud = pud_offset(p4d, address); if (!pud_present(*pud)) goto out; @@ -1316,12 +1321,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } while (page_vma_mapped_walk(&pvmw)) { - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); - address = pvmw.address; - - /* Unexpected PMD-mapped THP? */ - VM_BUG_ON_PAGE(!pvmw.pte, page); - /* * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced @@ -1345,6 +1344,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, continue; } + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_PAGE(!pvmw.pte, page); + + subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); + address = pvmw.address; + + if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 574c67b663fe..a56c3989f773 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -196,9 +196,9 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) return pmd; } -pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node) +pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { - pud_t *pud = pud_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); if (pud_none(*pud)) { void *p = vmemmap_alloc_block(PAGE_SIZE, node); if (!p) @@ -208,6 +208,18 @@ pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node) return pud; } +p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return NULL; + p4d_populate(&init_mm, p4d, p); + } + return p4d; +} + pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) { pgd_t *pgd = pgd_offset_k(addr); @@ -225,6 +237,7 @@ int __meminit vmemmap_populate_basepages(unsigned long start, { unsigned long addr = start; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -233,7 +246,10 @@ int __meminit vmemmap_populate_basepages(unsigned long start, pgd = vmemmap_pgd_populate(addr, node); if (!pgd) return -ENOMEM; - pud = vmemmap_pud_populate(pgd, addr, node); + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + pud = vmemmap_pud_populate(p4d, addr, node); if (!pud) return -ENOMEM; pmd = vmemmap_pmd_populate(pud, addr, node); diff --git a/mm/swapfile.c b/mm/swapfile.c index 521ef9b6064f..178130880b90 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1517,7 +1517,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, return 0; } -static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, swp_entry_t entry, struct page *page) { @@ -1525,7 +1525,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long next; int ret; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) @@ -1537,6 +1537,26 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, return 0; } +static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + swp_entry_t entry, struct page *page) +{ + p4d_t *p4d; + unsigned long next; + int ret; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) + continue; + ret = unuse_pud_range(vma, p4d, addr, next, entry, page); + if (ret) + return ret; + } while (p4d++, addr = next, addr != end); + return 0; +} + static int unuse_vma(struct vm_area_struct *vma, swp_entry_t entry, struct page *page) { @@ -1560,7 +1580,7 @@ static int unuse_vma(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - ret = unuse_pud_range(vma, pgd, addr, next, entry, page); + ret = unuse_p4d_range(vma, pgd, addr, next, entry, page); if (ret) return ret; } while (pgd++, addr = next, addr != end); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 479e631d43c2..8bcb501bce60 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -128,19 +128,22 @@ out_unlock: static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; - pmd_t *pmd = NULL; pgd = pgd_offset(mm, address); - pud = pud_alloc(mm, pgd, address); - if (pud) - /* - * Note that we didn't run this because the pmd was - * missing, the *pmd may be already established and in - * turn it may also be a trans_huge_pmd. - */ - pmd = pmd_alloc(mm, pud, address); - return pmd; + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, address); + if (!pud) + return NULL; + /* + * Note that we didn't run this because the pmd was + * missing, the *pmd may be already established and in + * turn it may also be a trans_huge_pmd. + */ + return pmd_alloc(mm, pud, address); } #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b4024d688f38..0dd80222b20b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -86,12 +86,12 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) } while (pmd++, addr = next, addr != end); } -static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) +static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; - pud = pud_offset(pgd, addr); + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_clear_huge(pud)) @@ -102,6 +102,22 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) } while (pud++, addr = next, addr != end); } +static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_clear_huge(p4d)) + continue; + if (p4d_none_or_clear_bad(p4d)) + continue; + vunmap_pud_range(p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + static void vunmap_page_range(unsigned long addr, unsigned long end) { pgd_t *pgd; @@ -113,7 +129,7 @@ static void vunmap_page_range(unsigned long addr, unsigned long end) next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - vunmap_pud_range(pgd, addr, next); + vunmap_p4d_range(pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -160,13 +176,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, return 0; } -static int vmap_pud_range(pgd_t *pgd, unsigned long addr, +static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pud_t *pud; unsigned long next; - pud = pud_alloc(&init_mm, pgd, addr); + pud = pud_alloc(&init_mm, p4d, addr); if (!pud) return -ENOMEM; do { @@ -177,6 +193,23 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, return 0; } +static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_alloc(&init_mm, pgd, addr); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + if (vmap_pud_range(p4d, addr, next, prot, pages, nr)) + return -ENOMEM; + } while (p4d++, addr = next, addr != end); + return 0; +} + /* * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and * will have pfns corresponding to the "pages" array. @@ -196,7 +229,7 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end, pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); + err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr); if (err) return err; } while (pgd++, addr = next, addr != end); @@ -237,6 +270,10 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for @@ -244,21 +281,23 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) */ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); - if (!pgd_none(*pgd)) { - pud_t *pud = pud_offset(pgd, addr); - if (!pud_none(*pud)) { - pmd_t *pmd = pmd_offset(pud, addr); - if (!pmd_none(*pmd)) { - pte_t *ptep, pte; - - ptep = pte_offset_map(pmd, addr); - pte = *ptep; - if (pte_present(pte)) - page = pte_page(pte); - pte_unmap(ptep); - } - } - } + if (pgd_none(*pgd)) + return NULL; + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return NULL; + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return NULL; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return NULL; + + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); return page; } EXPORT_SYMBOL(vmalloc_to_page); diff --git a/mm/vmstat.c b/mm/vmstat.c index 69f9aff39a2e..b1947f0cbee2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1065,6 +1065,9 @@ const char * const vmstat_text[] = { "thp_split_page_failed", "thp_deferred_split_page", "thp_split_pmd", +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + "thp_split_pud", +#endif "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif |