diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 9 | ||||
-rw-r--r-- | mm/debug.c | 6 | ||||
-rw-r--r-- | mm/huge_memory.c | 14 | ||||
-rw-r--r-- | mm/hugetlb.c | 1 | ||||
-rw-r--r-- | mm/kasan/quarantine.c | 7 | ||||
-rw-r--r-- | mm/khugepaged.c | 25 | ||||
-rw-r--r-- | mm/memcontrol.c | 113 | ||||
-rw-r--r-- | mm/memory.c | 12 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 6 | ||||
-rw-r--r-- | mm/mempolicy.c | 17 | ||||
-rw-r--r-- | mm/oom_kill.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 121 | ||||
-rw-r--r-- | mm/page_io.c | 3 | ||||
-rw-r--r-- | mm/readahead.c | 9 | ||||
-rw-r--r-- | mm/rmap.c | 7 | ||||
-rw-r--r-- | mm/shmem.c | 9 | ||||
-rw-r--r-- | mm/slub.c | 6 | ||||
-rw-r--r-- | mm/swapfile.c | 1 | ||||
-rw-r--r-- | mm/usercopy.c | 68 | ||||
-rw-r--r-- | mm/vmscan.c | 41 |
20 files changed, 279 insertions, 198 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 78a23c5c302d..be0ee11fa0d9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -262,7 +262,14 @@ config COMPACTION select MIGRATION depends on MMU help - Allows the compaction of memory for the allocation of huge pages. + Compaction is the only memory management component to form + high order (larger physically contiguous) memory blocks + reliably. The page allocator relies on compaction heavily and + the lack of the feature can lead to unexpected OOM killer + invocations for high order memory requests. You shouldn't + disable this option unless there really is a strong reason for + it and then we would be really interested to hear about that at + linux-mm@kvack.org. # # support for page migration diff --git a/mm/debug.c b/mm/debug.c index 8865bfb41b0b..74c7cae4f683 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -42,9 +42,11 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { + int mapcount = PageSlab(page) ? 0 : page_mapcount(page); + pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", - page, page_ref_count(page), page_mapcount(page), - page->mapping, page->index); + page, page_ref_count(page), mapcount, + page->mapping, page_to_pgoff(page)); if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2373f0a7d340..53ae6d00656a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1078,7 +1078,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, goto out; page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd); if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { @@ -1116,7 +1116,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, } skip_mlock: page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; - VM_BUG_ON_PAGE(!PageCompound(page), page); + VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); if (flags & FOLL_GET) get_page(page); @@ -1138,9 +1138,6 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) bool was_writable; int flags = 0; - /* A PROT_NONE fault should not end up here */ - BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); if (unlikely(!pmd_same(pmd, *fe->pmd))) goto out_unlock; @@ -1512,7 +1509,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t _pmd; - bool young, write, dirty; + bool young, write, dirty, soft_dirty; unsigned long addr; int i; @@ -1546,6 +1543,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, write = pmd_write(*pmd); young = pmd_young(*pmd); dirty = pmd_dirty(*pmd); + soft_dirty = pmd_soft_dirty(*pmd); pmdp_huge_split_prepare(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); @@ -1562,6 +1560,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t swp_entry; swp_entry = make_migration_entry(page + i, write); entry = swp_entry_to_pte(swp_entry); + if (soft_dirty) + entry = pte_swp_mksoft_dirty(entry); } else { entry = mk_pte(page + i, vma->vm_page_prot); entry = maybe_mkwrite(entry, vma); @@ -1569,6 +1569,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); + if (soft_dirty) + entry = pte_mksoft_dirty(entry); } if (dirty) SetPageDirty(page + i); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b9aa1b0b38b0..87e11d8ad536 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1448,6 +1448,7 @@ static void dissolve_free_huge_page(struct page *page) list_del(&page->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; + h->max_huge_pages--; update_and_free_page(h, page); } spin_unlock(&hugetlb_lock); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index b6728a33a4ac..baabaad4a4aa 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -217,11 +217,8 @@ void quarantine_reduce(void) new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); - if (WARN_ONCE(new_quarantine_size < percpu_quarantines, - "Too little memory, disabling global KASAN quarantine.\n")) - new_quarantine_size = 0; - else - new_quarantine_size -= percpu_quarantines; + new_quarantine_size = (new_quarantine_size < percpu_quarantines) ? + 0 : new_quarantine_size - percpu_quarantines; WRITE_ONCE(quarantine_size, new_quarantine_size); last = global_quarantine.head; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 79c52d0061af..728d7790dc2d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -838,7 +838,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) * value (scan code). */ -static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address) +static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + struct vm_area_struct **vmap) { struct vm_area_struct *vma; unsigned long hstart, hend; @@ -846,7 +847,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address) if (unlikely(khugepaged_test_exit(mm))) return SCAN_ANY_PROCESS; - vma = find_vma(mm, address); + *vmap = vma = find_vma(mm, address); if (!vma) return SCAN_VMA_NULL; @@ -881,6 +882,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, .pmd = pmd, }; + /* we only decide to swapin, if there is enough young ptes */ + if (referenced < HPAGE_PMD_NR/2) { + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); + return false; + } fe.pte = pte_offset_map(pmd, address); for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; fe.pte++, fe.address += PAGE_SIZE) { @@ -888,17 +894,12 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, if (!is_swap_pte(pteval)) continue; swapped_in++; - /* we only decide to swapin, if there is enough young ptes */ - if (referenced < HPAGE_PMD_NR/2) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } ret = do_swap_page(&fe, pteval); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); - if (hugepage_vma_revalidate(mm, address)) { + if (hugepage_vma_revalidate(mm, address, &fe.vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -923,7 +924,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - struct vm_area_struct *vma, int node, int referenced) { pmd_t *pmd, _pmd; @@ -933,6 +933,7 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *pmd_ptl, *pte_ptl; int isolated = 0, result = 0; struct mem_cgroup *memcg; + struct vm_area_struct *vma; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t gfp; @@ -961,7 +962,7 @@ static void collapse_huge_page(struct mm_struct *mm, } down_read(&mm->mmap_sem); - result = hugepage_vma_revalidate(mm, address); + result = hugepage_vma_revalidate(mm, address, &vma); if (result) { mem_cgroup_cancel_charge(new_page, memcg, true); up_read(&mm->mmap_sem); @@ -994,7 +995,7 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ down_write(&mm->mmap_sem); - result = hugepage_vma_revalidate(mm, address); + result = hugepage_vma_revalidate(mm, address, &vma); if (result) goto out; /* check if the pmd is still valid */ @@ -1202,7 +1203,7 @@ out_unmap: if (ret) { node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma, node, referenced); + collapse_huge_page(mm, address, hpage, node, referenced); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 66beca1ad92f..4be518d4e68a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1740,17 +1740,22 @@ static DEFINE_MUTEX(percpu_charge_mutex); static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; + unsigned long flags; bool ret = false; if (nr_pages > CHARGE_BATCH) return ret; - stock = &get_cpu_var(memcg_stock); + local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); if (memcg == stock->cached && stock->nr_pages >= nr_pages) { stock->nr_pages -= nr_pages; ret = true; } - put_cpu_var(memcg_stock); + + local_irq_restore(flags); + return ret; } @@ -1771,15 +1776,18 @@ static void drain_stock(struct memcg_stock_pcp *stock) stock->cached = NULL; } -/* - * This must be called under preempt disabled or must be called by - * a thread which is pinned to local cpu. - */ static void drain_local_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); + struct memcg_stock_pcp *stock; + unsigned long flags; + + local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); + + local_irq_restore(flags); } /* @@ -1788,14 +1796,19 @@ static void drain_local_stock(struct work_struct *dummy) */ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { - struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); + struct memcg_stock_pcp *stock; + unsigned long flags; + + local_irq_save(flags); + stock = this_cpu_ptr(&memcg_stock); if (stock->cached != memcg) { /* reset if necessary */ drain_stock(stock); stock->cached = memcg; } stock->nr_pages += nr_pages; - put_cpu_var(memcg_stock); + + local_irq_restore(flags); } /* @@ -2337,8 +2350,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) return 0; memcg = get_mem_cgroup_from_mm(current->mm); - if (!mem_cgroup_is_root(memcg)) + if (!mem_cgroup_is_root(memcg)) { ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (!ret) + __SetPageKmemcg(page); + } css_put(&memcg->css); return ret; } @@ -2365,6 +2381,11 @@ void memcg_kmem_uncharge(struct page *page, int order) page_counter_uncharge(&memcg->memsw, nr_pages); page->mem_cgroup = NULL; + + /* slab pages do not have PageKmemcg flag set */ + if (PageKmemcg(page)) + __ClearPageKmemcg(page); + css_put_many(&memcg->css, nr_pages); } #endif /* !CONFIG_SLOB */ @@ -4069,14 +4090,14 @@ static struct cftype mem_cgroup_legacy_files[] = { static DEFINE_IDR(mem_cgroup_idr); -static void mem_cgroup_id_get(struct mem_cgroup *memcg) +static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) { - atomic_inc(&memcg->id.ref); + atomic_add(n, &memcg->id.ref); } -static void mem_cgroup_id_put(struct mem_cgroup *memcg) +static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { - if (atomic_dec_and_test(&memcg->id.ref)) { + if (atomic_sub_and_test(n, &memcg->id.ref)) { idr_remove(&mem_cgroup_idr, memcg->id.id); memcg->id.id = 0; @@ -4085,6 +4106,16 @@ static void mem_cgroup_id_put(struct mem_cgroup *memcg) } } +static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + mem_cgroup_id_get_many(memcg, 1); +} + +static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + mem_cgroup_id_put_many(memcg, 1); +} + /** * mem_cgroup_from_id - look up a memcg from a memcg id * @id: the memcg id to look up @@ -4719,6 +4750,8 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) page_counter_uncharge(&mc.from->memsw, mc.moved_swap); + mem_cgroup_id_put_many(mc.from, mc.moved_swap); + /* * we charged both to->memory and to->memsw, so we * should uncharge to->memory. @@ -4726,9 +4759,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.to)) page_counter_uncharge(&mc.to->memory, mc.moved_swap); - css_put_many(&mc.from->css, mc.moved_swap); + mem_cgroup_id_get_many(mc.to, mc.moved_swap); + css_put_many(&mc.to->css, mc.moved_swap); - /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); @@ -5537,8 +5570,10 @@ static void uncharge_list(struct list_head *page_list) else nr_file += nr_pages; pgpgout++; - } else + } else { nr_kmem += 1 << compound_order(page); + __ClearPageKmemcg(page); + } page->mem_cgroup = NULL; } while (next != page_list); @@ -5781,6 +5816,24 @@ static int __init mem_cgroup_init(void) subsys_initcall(mem_cgroup_init); #ifdef CONFIG_MEMCG_SWAP +static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +{ + while (!atomic_inc_not_zero(&memcg->id.ref)) { + /* + * The root cgroup cannot be destroyed, so it's refcount must + * always be >= 1. + */ + if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + VM_BUG_ON(1); + break; + } + memcg = parent_mem_cgroup(memcg); + if (!memcg) + memcg = root_mem_cgroup; + } + return memcg; +} + /** * mem_cgroup_swapout - transfer a memsw charge to swap * @page: page whose memsw charge to transfer @@ -5790,7 +5843,7 @@ subsys_initcall(mem_cgroup_init); */ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *swap_memcg; unsigned short oldid; VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5805,16 +5858,27 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; - mem_cgroup_id_get(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + /* + * In case the memcg owning these pages has been offlined and doesn't + * have an ID allocated to it anymore, charge the closest online + * ancestor for the swap instead and transfer the memory+swap charge. + */ + swap_memcg = mem_cgroup_id_get_online(memcg); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_swap_statistics(swap_memcg, true); page->mem_cgroup = NULL; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); + if (memcg != swap_memcg) { + if (!mem_cgroup_is_root(swap_memcg)) + page_counter_charge(&swap_memcg->memsw, 1); + page_counter_uncharge(&memcg->memsw, 1); + } + /* * Interrupts should be disabled here because the caller holds the * mapping->tree_lock lock which is taken with interrupts-off. It is @@ -5853,11 +5917,14 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!memcg) return 0; + memcg = mem_cgroup_id_get_online(memcg); + if (!mem_cgroup_is_root(memcg) && - !page_counter_try_charge(&memcg->swap, 1, &counter)) + !page_counter_try_charge(&memcg->swap, 1, &counter)) { + mem_cgroup_id_put(memcg); return -ENOMEM; + } - mem_cgroup_id_get(memcg); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); diff --git a/mm/memory.c b/mm/memory.c index 83be99d9d8a1..793fe0f9841c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3351,9 +3351,6 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) bool was_writable = pte_write(pte); int flags = 0; - /* A PROT_NONE fault should not end up here */ - BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); - /* * The "pte" at this point cannot be used safely without * validation through pte_unmap_same(). It's of NUMA type but @@ -3458,6 +3455,11 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) return VM_FAULT_FALLBACK; } +static inline bool vma_is_accessible(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3524,7 +3526,7 @@ static int handle_pte_fault(struct fault_env *fe) if (!pte_present(entry)) return do_swap_page(fe, entry); - if (pte_protnone(entry)) + if (pte_protnone(entry) && vma_is_accessible(fe->vma)) return do_numa_page(fe, entry); fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); @@ -3590,7 +3592,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { - if (pmd_protnone(orig_pmd)) + if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&fe, orig_pmd); if ((fe.flags & FAULT_FLAG_WRITE) && diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3894b65b1555..b58906b6215c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1219,6 +1219,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_node(nid, zones_size, start_pfn, zholes_size); + pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* * The node we allocated has no zone fallback lists. For avoiding @@ -1249,6 +1250,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) static void rollback_node_hotadd(int nid, pg_data_t *pgdat) { arch_refresh_nodedata(nid, NULL); + free_percpu(pgdat->per_cpu_nodestats); arch_free_nodedata(pgdat); return; } @@ -1565,7 +1567,9 @@ static struct page *new_node_page(struct page *page, unsigned long private, return alloc_huge_page_node(page_hstate(compound_head(page)), next_node_in(nid, nmask)); - node_clear(nid, nmask); + if (nid != next_node_in(nid, nmask)) + node_clear(nid, nmask); + if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) gfp_mask |= __GFP_HIGHMEM; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d8c4e38fb5f4..2da72a5b6ecc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2336,6 +2336,23 @@ out: return ret; } +/* + * Drop the (possibly final) reference to task->mempolicy. It needs to be + * dropped after task->mempolicy is set to NULL so that any allocation done as + * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed + * policy. + */ +void mpol_put_task_policy(struct task_struct *task) +{ + struct mempolicy *pol; + + task_lock(task); + pol = task->mempolicy; + task->mempolicy = NULL; + task_unlock(task); + mpol_put(pol); +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7d0a275df822..d53a9aa00977 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -764,7 +764,7 @@ bool task_will_free_mem(struct task_struct *task) { struct mm_struct *mm = task->mm; struct task_struct *p; - bool ret; + bool ret = true; /* * Skip tasks without mm because it might have passed its exit_mm and diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fb975cec3518..a2214c64ed3c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1008,10 +1008,8 @@ static __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_enabled() && PageKmemcg(page)) { + if (memcg_kmem_enabled() && PageKmemcg(page)) memcg_kmem_uncharge(page, order); - __ClearPageKmemcg(page); - } if (check_free) bad += free_pages_check(page); if (bad) @@ -3139,54 +3137,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } -static inline bool -should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, - enum compact_result compact_result, - enum compact_priority *compact_priority, - int compaction_retries) -{ - int max_retries = MAX_COMPACT_RETRIES; - - if (!order) - return false; - - /* - * compaction considers all the zone as desperately out of memory - * so it doesn't really make much sense to retry except when the - * failure could be caused by insufficient priority - */ - if (compaction_failed(compact_result)) { - if (*compact_priority > MIN_COMPACT_PRIORITY) { - (*compact_priority)--; - return true; - } - return false; - } - - /* - * make sure the compaction wasn't deferred or didn't bail out early - * due to locks contention before we declare that we should give up. - * But do not retry if the given zonelist is not suitable for - * compaction. - */ - if (compaction_withdrawn(compact_result)) - return compaction_zonelist_suitable(ac, order, alloc_flags); - - /* - * !costly requests are much more important than __GFP_REPEAT - * costly ones because they are de facto nofail and invoke OOM - * killer to move on while costly can fail and users are ready - * to cope with that. 1/4 retries is rather arbitrary but we - * would need much more detailed feedback from compaction to - * make a better decision. - */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - max_retries /= 4; - if (compaction_retries <= max_retries) - return true; - - return false; -} #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, @@ -3197,6 +3147,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } +#endif /* CONFIG_COMPACTION */ + static inline bool should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, enum compact_result compact_result, @@ -3223,7 +3175,6 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla } return false; } -#endif /* CONFIG_COMPACTION */ /* Perform direct synchronous page reclaim */ static int @@ -3756,12 +3707,10 @@ no_zone: } out: - if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page) { - if (unlikely(memcg_kmem_charge(page, gfp_mask, order))) { - __free_pages(page, order); - page = NULL; - } else - __SetPageKmemcg(page); + if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && + unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { + __free_pages(page, order); + page = NULL; } if (kmemcheck_enabled && page) @@ -4064,7 +4013,7 @@ long si_mem_available(void) int lru; for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_page_state(NR_LRU_BASE + lru); + pages[lru] = global_node_page_state(NR_LRU_BASE + lru); for_each_zone(zone) wmark_low += zone->watermark[WMARK_LOW]; @@ -4411,7 +4360,7 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, do { zone_type--; zone = pgdat->node_zones + zone_type; - if (populated_zone(zone)) { + if (managed_zone(zone)) { zoneref_set_zone(zone, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); @@ -4649,7 +4598,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) for (j = 0; j < nr_nodes; j++) { node = node_order[j]; z = &NODE_DATA(node)->node_zones[zone_type]; - if (populated_zone(z)) { + if (managed_zone(z)) { zoneref_set_zone(z, &zonelist->_zonerefs[pos++]); check_highest_zone(zone_type); @@ -4761,6 +4710,8 @@ int local_memory_node(int node) } #endif +static void setup_min_unmapped_ratio(void); +static void setup_min_slab_ratio(void); #else /* CONFIG_NUMA */ static void set_zonelist_order(void) @@ -5882,9 +5833,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; - pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio) - / 100; - pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; zone->zone_pgdat = pgdat; @@ -6805,6 +6753,12 @@ int __meminit init_per_zone_wmark_min(void) setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); + +#ifdef CONFIG_NUMA + setup_min_unmapped_ratio(); + setup_min_slab_ratio(); +#endif + return 0; } core_initcall(init_per_zone_wmark_min) @@ -6846,43 +6800,58 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, } #ifdef CONFIG_NUMA +static void setup_min_unmapped_ratio(void) +{ + pg_data_t *pgdat; + struct zone *zone; + + for_each_online_pgdat(pgdat) + pgdat->min_unmapped_pages = 0; + + for_each_zone(zone) + zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * + sysctl_min_unmapped_ratio) / 100; +} + + int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - struct pglist_data *pgdat; - struct zone *zone; int rc; rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; + setup_min_unmapped_ratio(); + + return 0; +} + +static void setup_min_slab_ratio(void) +{ + pg_data_t *pgdat; + struct zone *zone; + for_each_online_pgdat(pgdat) pgdat->min_slab_pages = 0; for_each_zone(zone) - zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * - sysctl_min_unmapped_ratio) / 100; - return 0; + zone->zone_pgdat->min_slab_pages += (zone->managed_pages * + sysctl_min_slab_ratio) / 100; } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - struct pglist_data *pgdat; - struct zone *zone; int rc; rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; - for_each_online_pgdat(pgdat) - pgdat->min_slab_pages = 0; + setup_min_slab_ratio(); - for_each_zone(zone) - zone->zone_pgdat->min_slab_pages += (zone->managed_pages * - sysctl_min_slab_ratio) / 100; return 0; } #endif diff --git a/mm/page_io.c b/mm/page_io.c index 16bd82fad38c..eafe5ddc2b54 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -264,6 +264,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, int ret; struct swap_info_struct *sis = page_swap_info(page); + BUG_ON(!PageSwapCache(page)); if (sis->flags & SWP_FILE) { struct kiocb kiocb; struct file *swap_file = sis->swap_file; @@ -337,6 +338,7 @@ int swap_readpage(struct page *page) int ret = 0; struct swap_info_struct *sis = page_swap_info(page); + BUG_ON(!PageSwapCache(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); if (frontswap_load(page) == 0) { @@ -386,6 +388,7 @@ int swap_set_page_dirty(struct page *page) if (sis->flags & SWP_FILE) { struct address_space *mapping = sis->swap_file->f_mapping; + BUG_ON(!PageSwapCache(page)); return mapping->a_ops->set_page_dirty(page); } else { return __set_page_dirty_no_writeback(page); diff --git a/mm/readahead.c b/mm/readahead.c index 65ec288dc057..c8a955b1297e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,6 +8,7 @@ */ #include <linux/kernel.h> +#include <linux/dax.h> #include <linux/gfp.h> #include <linux/export.h> #include <linux/blkdev.h> @@ -544,6 +545,14 @@ do_readahead(struct address_space *mapping, struct file *filp, if (!mapping || !mapping->a_ops) return -EINVAL; + /* + * Readahead doesn't make sense for DAX inodes, but we don't want it + * to report a failure either. Instead, we just return success and + * don't do any work. + */ + if (dax_mapping(mapping)) + return 0; + return force_page_cache_readahead(mapping, filp, index, nr); } diff --git a/mm/rmap.c b/mm/rmap.c index 709bc83703b1..1ef36404e7b2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1284,8 +1284,9 @@ void page_add_file_rmap(struct page *page, bool compound) VM_BUG_ON_PAGE(!PageSwapBacked(page), page); __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); } else { - if (PageTransCompound(page)) { - VM_BUG_ON_PAGE(!PageLocked(page), page); + if (PageTransCompound(page) && page_mapping(page)) { + VM_WARN_ON_ONCE(!PageLocked(page)); + SetPageDoubleMap(compound_head(page)); if (PageMlocked(page)) clear_page_mlock(compound_head(page)); @@ -1303,7 +1304,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) { int i, nr = 1; - VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); + VM_BUG_ON_PAGE(compound && !PageHead(page), page); lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ diff --git a/mm/shmem.c b/mm/shmem.c index 886b1236e68a..898a6f224187 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -270,7 +270,7 @@ bool shmem_charge(struct inode *inode, long pages) info->alloced -= pages; shmem_recalc_inode(inode); spin_unlock_irqrestore(&info->lock, flags); - + shmem_unacct_blocks(info->flags, pages); return false; } percpu_counter_add(&sbinfo->used_blocks, pages); @@ -291,6 +291,7 @@ void shmem_uncharge(struct inode *inode, long pages) if (sbinfo->max_blocks) percpu_counter_sub(&sbinfo->used_blocks, pages); + shmem_unacct_blocks(info->flags, pages); } /* @@ -1980,7 +1981,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, return addr; sb = shm_mnt->mnt_sb; } - if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) + if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) return addr; } @@ -3975,7 +3976,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute shmem_enabled_attr = __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); +#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ +#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE bool shmem_huge_enabled(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); @@ -4006,7 +4009,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) return false; } } -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ +#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ #else /* !CONFIG_SHMEM */ diff --git a/mm/slub.c b/mm/slub.c index cead06394e9e..9adae58462f8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3629,6 +3629,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, */ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { + LIST_HEAD(discard); struct page *page, *h; BUG_ON(irqs_disabled()); @@ -3636,13 +3637,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { remove_partial(n, page); - discard_slab(s, page); + list_add(&page->lru, &discard); } else { list_slab_objects(s, page, "Objects remaining in %s on __kmem_cache_shutdown()"); } } spin_unlock_irq(&n->list_lock); + + list_for_each_entry_safe(page, h, &discard, lru) + discard_slab(s, page); } /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 78cfa292a29a..2657accc6e2b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2724,7 +2724,6 @@ int swapcache_prepare(swp_entry_t entry) struct swap_info_struct *page_swap_info(struct page *page) { swp_entry_t swap = { .val = page_private(page) }; - BUG_ON(!PageSwapCache(page)); return swap_info[swp_type(swap)]; } diff --git a/mm/usercopy.c b/mm/usercopy.c index 8ebae91a6b55..3c8da0af9695 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -83,7 +83,7 @@ static bool overlaps(const void *ptr, unsigned long n, unsigned long low, unsigned long check_high = check_low + n; /* Does not overlap if entirely above or entirely below. */ - if (check_low >= high || check_high < low) + if (check_low >= high || check_high <= low) return false; return true; @@ -124,7 +124,7 @@ static inline const char *check_kernel_text_object(const void *ptr, static inline const char *check_bogus_address(const void *ptr, unsigned long n) { /* Reject if object wraps past end of memory. */ - if (ptr + n < ptr) + if ((unsigned long)ptr + n < (unsigned long)ptr) return "<wrapped address>"; /* Reject if NULL or ZERO-allocation. */ @@ -134,31 +134,16 @@ static inline const char *check_bogus_address(const void *ptr, unsigned long n) return NULL; } -static inline const char *check_heap_object(const void *ptr, unsigned long n, - bool to_user) +/* Checks for allocs that are marked in some way as spanning multiple pages. */ +static inline const char *check_page_span(const void *ptr, unsigned long n, + struct page *page, bool to_user) { - struct page *page, *endpage; +#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN const void *end = ptr + n - 1; + struct page *endpage; bool is_reserved, is_cma; /* - * Some architectures (arm64) return true for virt_addr_valid() on - * vmalloced addresses. Work around this by checking for vmalloc - * first. - */ - if (is_vmalloc_addr(ptr)) - return NULL; - - if (!virt_addr_valid(ptr)) - return NULL; - - page = virt_to_head_page(ptr); - - /* Check slab allocator for flags and size. */ - if (PageSlab(page)) - return __check_heap_object(ptr, n, page); - - /* * Sometimes the kernel data regions are not marked Reserved (see * check below). And sometimes [_sdata,_edata) does not cover * rodata and/or bss, so check each range explicitly. @@ -186,7 +171,7 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, ((unsigned long)end & (unsigned long)PAGE_MASK))) return NULL; - /* Allow if start and end are inside the same compound page. */ + /* Allow if fully inside the same compound (__GFP_COMP) page. */ endpage = virt_to_head_page(end); if (likely(endpage == page)) return NULL; @@ -199,20 +184,47 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, is_reserved = PageReserved(page); is_cma = is_migrate_cma_page(page); if (!is_reserved && !is_cma) - goto reject; + return "<spans multiple pages>"; for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { page = virt_to_head_page(ptr); if (is_reserved && !PageReserved(page)) - goto reject; + return "<spans Reserved and non-Reserved pages>"; if (is_cma && !is_migrate_cma_page(page)) - goto reject; + return "<spans CMA and non-CMA pages>"; } +#endif return NULL; +} + +static inline const char *check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + struct page *page; + + /* + * Some architectures (arm64) return true for virt_addr_valid() on + * vmalloced addresses. Work around this by checking for vmalloc + * first. + * + * We also need to check for module addresses explicitly since we + * may copy static data from modules to userspace + */ + if (is_vmalloc_or_module_addr(ptr)) + return NULL; + + if (!virt_addr_valid(ptr)) + return NULL; + + page = virt_to_head_page(ptr); + + /* Check slab allocator for flags and size. */ + if (PageSlab(page)) + return __check_heap_object(ptr, n, page); -reject: - return "<spans multiple pages>"; + /* Verify object does not incorrectly span multiple pages. */ + return check_page_span(ptr, n, page, to_user); } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 374d95d04178..0fe8b7113868 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1665,7 +1665,7 @@ static bool inactive_reclaimable_pages(struct lruvec *lruvec, for (zid = sc->reclaim_idx; zid >= 0; zid--) { zone = &pgdat->node_zones[zid]; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE + @@ -2036,7 +2036,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, struct zone *zone = &pgdat->node_zones[zid]; unsigned long inactive_zone, active_zone; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; inactive_zone = zone_page_state(zone, @@ -2171,7 +2171,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = &pgdat->node_zones[z]; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; total_high_wmark += high_wmark_pages(zone); @@ -2303,23 +2303,6 @@ out: } } -#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -static void init_tlb_ubc(void) -{ - /* - * This deliberately does not clear the cpumask as it's expensive - * and unnecessary. If there happens to be data in there then the - * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and - * then will be cleared. - */ - current->tlb_ubc.flush_required = false; -} -#else -static inline void init_tlb_ubc(void) -{ -} -#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ - /* * This is a basic per-node page freer. Used by both kswapd and direct reclaim. */ @@ -2355,8 +2338,6 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); - init_tlb_ubc(); - blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -2510,7 +2491,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, /* If compaction would go ahead or the allocation would succeed, stop */ for (z = 0; z <= sc->reclaim_idx; z++) { struct zone *zone = &pgdat->node_zones[z]; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { @@ -2840,7 +2821,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!populated_zone(zone) || + if (!managed_zone(zone) || pgdat_reclaimable_pages(pgdat) == 0) continue; @@ -3141,7 +3122,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; if (!zone_balanced(zone, order, classzone_idx)) @@ -3169,7 +3150,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, sc->nr_to_reclaim = 0; for (z = 0; z <= sc->reclaim_idx; z++) { zone = pgdat->node_zones + z; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); @@ -3242,7 +3223,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) if (buffer_heads_over_limit) { for (i = MAX_NR_ZONES - 1; i >= 0; i--) { zone = pgdat->node_zones + i; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; sc.reclaim_idx = i; @@ -3262,7 +3243,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) */ for (i = classzone_idx; i >= 0; i--) { zone = pgdat->node_zones + i; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; if (zone_balanced(zone, sc.order, classzone_idx)) @@ -3508,7 +3489,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) pg_data_t *pgdat; int z; - if (!populated_zone(zone)) + if (!managed_zone(zone)) return; if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) @@ -3522,7 +3503,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) /* Only wake kswapd if all zones are unbalanced */ for (z = 0; z <= classzone_idx; z++) { zone = pgdat->node_zones + z; - if (!populated_zone(zone)) + if (!managed_zone(zone)) continue; if (zone_balanced(zone, order, classzone_idx)) |