From d70f2a14b72a4bc094cf3a92e4794644a7adc590 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 31 Jan 2018 16:15:51 -0800 Subject: include/linux/sched/mm.h: uninline mmdrop_async(), etc mmdrop_async() is only used in fork.c. Move that and its support functions into fork.c, uninline it all. Quite a lot of code gets moved around to avoid forward declarations. Cc: Ingo Molnar Cc: Michal Hocko Cc: Peter Zijlstra Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3d49b91b674d..bd422561a75e 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -11,7 +11,7 @@ /* * Routines for handling mm_structs */ -extern struct mm_struct * mm_alloc(void); +extern struct mm_struct *mm_alloc(void); /** * mmgrab() - Pin a &struct mm_struct. @@ -35,27 +35,7 @@ static inline void mmgrab(struct mm_struct *mm) atomic_inc(&mm->mm_count); } -/* mmdrop drops the mm and the page tables */ -extern void __mmdrop(struct mm_struct *); -static inline void mmdrop(struct mm_struct *mm) -{ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) - __mmdrop(mm); -} - -static inline void mmdrop_async_fn(struct work_struct *work) -{ - struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); - __mmdrop(mm); -} - -static inline void mmdrop_async(struct mm_struct *mm) -{ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) { - INIT_WORK(&mm->async_put_work, mmdrop_async_fn); - schedule_work(&mm->async_put_work); - } -} +extern void mmdrop(struct mm_struct *mm); /** * mmget() - Pin the address space associated with a &struct mm_struct. -- cgit v1.2.3 From 9852a7212324fd25f896932f4f4607ce47b0a22f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:16:19 -0800 Subject: mm: drop hotplug lock from lru_add_drain_all() Pulling cpu hotplug locks inside the mm core function like lru_add_drain_all just asks for problems and the recent lockdep splat [1] just proves this. While the usage in that particular case might be wrong we should avoid the locking as lru_add_drain_all() is used in many places. It seems that this is not all that hard to achieve actually. We have done the same thing for drain_all_pages which is analogous by commit a459eeb7b852 ("mm, page_alloc: do not depend on cpu hotplug locks inside the allocator"). All we have to care about is to handle - the work item might be executed on a different cpu in worker from unbound pool so it doesn't run on pinned on the cpu - we have to make sure that we do not race with page_alloc_cpu_dead calling lru_add_drain_cpu the first part is already handled because the worker calls lru_add_drain which disables preemption when calling lru_add_drain_cpu on the local cpu it is draining. The later is true because page_alloc_cpu_dead is called on the controlling CPU after the hotplugged CPU vanished completely. [1] http://lkml.kernel.org/r/089e0825eec8955c1f055c83d476@google.com [add a cpu hotplug locking interaction as per tglx] Link: http://lkml.kernel.org/r/20171116120535.23765-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Thomas Gleixner Cc: Tejun Heo Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index c2b8128799c1..0bd4c25016f9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -332,7 +332,6 @@ extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); -extern void lru_add_drain_all_cpuslocked(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); extern void mark_page_lazyfree(struct page *page); -- cgit v1.2.3 From c9019e9bf42e66d028d70d2da6206cad4dd9250d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 31 Jan 2018 16:16:37 -0800 Subject: mm: memcontrol: eliminate raw access to stat and event counters Replace all raw 'this_cpu_' modifications of the stat and event per-cpu counters with API functions such as mod_memcg_state(). This makes the code easier to read, but is also in preparation for the next patch, which changes the per-cpu implementation of those counters. Link: http://lkml.kernel.org/r/20171103153336.24044-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 69966c461d1c..2c80b69dd266 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -272,13 +272,6 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum memcg_event_item event) -{ - this_cpu_inc(memcg->stat->events[event]); - cgroup_file_notify(&memcg->events_file); -} - bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, @@ -627,15 +620,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); +/* idx can be of type enum memcg_event_item or vm_event_item */ +static inline void __count_memcg_events(struct mem_cgroup *memcg, + int idx, unsigned long count) +{ + if (!mem_cgroup_disabled()) + __this_cpu_add(memcg->stat->events[idx], count); +} + +/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) + int idx, unsigned long count) { if (!mem_cgroup_disabled()) this_cpu_add(memcg->stat->events[idx], count); } -/* idx can be of type enum memcg_stat_item or node_stat_item */ +/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_page_event(struct page *page, int idx) { @@ -654,12 +655,20 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) { - this_cpu_inc(memcg->stat->events[idx]); + count_memcg_events(memcg, idx, 1); if (idx == OOM_KILL) cgroup_file_notify(&memcg->events_file); } rcu_read_unlock(); } + +static inline void mem_cgroup_event(struct mem_cgroup *memcg, + enum memcg_event_item event) +{ + count_memcg_events(memcg, event, 1); + cgroup_file_notify(&memcg->events_file); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE void mem_cgroup_split_huge_fixup(struct page *head); #endif -- cgit v1.2.3 From 284542656e22c43fdada8c8cc0ca9ede8453eed7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 31 Jan 2018 16:16:41 -0800 Subject: mm: memcontrol: implement lruvec stat functions on top of each other The implementation of the lruvec stat functions and their variants for accounting through a page, or accounting from a preemptible context, are mostly identical and needlessly repetitive. Implement the lruvec_page functions by looking up the page's lruvec and then using the lruvec function. Implement the functions for preemptible contexts by disabling preemption before calling the atomic context functions. Link: http://lkml.kernel.org/r/20171103153336.24044-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2c80b69dd266..1ffc54ac4cc9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -569,51 +569,51 @@ static inline void __mod_lruvec_state(struct lruvec *lruvec, { struct mem_cgroup_per_node *pn; + /* Update node */ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + if (mem_cgroup_disabled()) return; + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + + /* Update memcg */ __mod_memcg_state(pn->memcg, idx, val); + + /* Update lruvec */ __this_cpu_add(pn->lruvec_stat->count[idx], val); } static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { - struct mem_cgroup_per_node *pn; - - mod_node_page_state(lruvec_pgdat(lruvec), idx, val); - if (mem_cgroup_disabled()) - return; - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - mod_memcg_state(pn->memcg, idx, val); - this_cpu_add(pn->lruvec_stat->count[idx], val); + preempt_disable(); + __mod_lruvec_state(lruvec, idx, val); + preempt_enable(); } static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { - struct mem_cgroup_per_node *pn; + pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; - __mod_node_page_state(page_pgdat(page), idx, val); - if (mem_cgroup_disabled() || !page->mem_cgroup) + /* Untracked pages have no memcg, no lruvec. Update only the node */ + if (!page->mem_cgroup) { + __mod_node_page_state(pgdat, idx, val); return; - __mod_memcg_state(page->mem_cgroup, idx, val); - pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; - __this_cpu_add(pn->lruvec_stat->count[idx], val); + } + + lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup); + __mod_lruvec_state(lruvec, idx, val); } static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { - struct mem_cgroup_per_node *pn; - - mod_node_page_state(page_pgdat(page), idx, val); - if (mem_cgroup_disabled() || !page->mem_cgroup) - return; - mod_memcg_state(page->mem_cgroup, idx, val); - pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; - this_cpu_add(pn->lruvec_stat->count[idx], val); + preempt_disable(); + __mod_lruvec_page_state(page, idx, val); + preempt_enable(); } unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, -- cgit v1.2.3 From a983b5ebee57209c99f68c8327072f25e0e6e3da Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 31 Jan 2018 16:16:45 -0800 Subject: mm: memcontrol: fix excessive complexity in memory.stat reporting We've seen memory.stat reads in top-level cgroups take up to fourteen seconds during a userspace bug that created tens of thousands of ghost cgroups pinned by lingering page cache. Even with a more reasonable number of cgroups, aggregating memory.stat is unnecessarily heavy. The complexity is this: nr_cgroups * nr_stat_items * nr_possible_cpus where the stat items are ~70 at this point. With 128 cgroups and 128 CPUs - decent, not enormous setups - reading the top-level memory.stat has to aggregate over a million per-cpu counters. This doesn't scale. Instead of spreading the source of truth across all CPUs, use the per-cpu counters merely to batch updates to shared atomic counters. This is the same as the per-cpu stocks we use for charging memory to the shared atomic page_counters, and also the way the global vmstat counters are implemented. Vmstat has elaborate spilling thresholds that depend on the number of CPUs, amount of memory, and memory pressure - carefully balancing the cost of counter updates with the amount of per-cpu error. That's because the vmstat counters are system-wide, but also used for decisions inside the kernel (e.g. NR_FREE_PAGES in the allocator). Neither is true for the memory controller. Use the same static batch size we already use for page_counter updates during charging. The per-cpu error in the stats will be 128k, which is an acceptable ratio of cores to memory accounting granularity. [hannes@cmpxchg.org: fix warning in __this_cpu_xchg() calls] Link: http://lkml.kernel.org/r/20171201135750.GB8097@cmpxchg.org Link: http://lkml.kernel.org/r/20171103153336.24044-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 96 ++++++++++++++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1ffc54ac4cc9..882046863581 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -108,7 +108,10 @@ struct lruvec_stat { */ struct mem_cgroup_per_node { struct lruvec lruvec; - struct lruvec_stat __percpu *lruvec_stat; + + struct lruvec_stat __percpu *lruvec_stat_cpu; + atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -227,10 +230,10 @@ struct mem_cgroup { spinlock_t move_lock; struct task_struct *move_lock_task; unsigned long move_lock_flags; - /* - * percpu counter. - */ - struct mem_cgroup_stat_cpu __percpu *stat; + + struct mem_cgroup_stat_cpu __percpu *stat_cpu; + atomic_long_t stat[MEMCG_NR_STAT]; + atomic_long_t events[MEMCG_NR_EVENTS]; unsigned long socket_pressure; @@ -265,6 +268,12 @@ struct mem_cgroup { /* WARNING: nodeinfo must be the last member here */ }; +/* + * size of first charge trial. "32" comes from vmscan.c's magic value. + * TODO: maybe necessary to use big numbers in big irons. + */ +#define MEMCG_CHARGE_BATCH 32U + extern struct mem_cgroup *root_mem_cgroup; static inline bool mem_cgroup_disabled(void) @@ -485,32 +494,38 @@ void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long val = 0; - int cpu; - - for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->count[idx], cpu); - - if (val < 0) - val = 0; - - return val; + long x = atomic_long_read(&memcg->stat[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - if (!mem_cgroup_disabled()) - __this_cpu_add(memcg->stat->count[idx], val); + long x; + + if (mem_cgroup_disabled()) + return; + + x = val + __this_cpu_read(memcg->stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &memcg->stat[idx]); + x = 0; + } + __this_cpu_write(memcg->stat_cpu->count[idx], x); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->count[idx], val); + preempt_disable(); + __mod_memcg_state(memcg, idx, val); + preempt_enable(); } /** @@ -548,26 +563,25 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; - long val = 0; - int cpu; + long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - for_each_possible_cpu(cpu) - val += per_cpu(pn->lruvec_stat->count[idx], cpu); - - if (val < 0) - val = 0; - - return val; + x = atomic_long_read(&pn->lruvec_stat[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } static inline void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; + long x; /* Update node */ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); @@ -581,7 +595,12 @@ static inline void __mod_lruvec_state(struct lruvec *lruvec, __mod_memcg_state(pn->memcg, idx, val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stat->count[idx], val); + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &pn->lruvec_stat[idx]); + x = 0; + } + __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); } static inline void mod_lruvec_state(struct lruvec *lruvec, @@ -624,16 +643,25 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, static inline void __count_memcg_events(struct mem_cgroup *memcg, int idx, unsigned long count) { - if (!mem_cgroup_disabled()) - __this_cpu_add(memcg->stat->events[idx], count); + unsigned long x; + + if (mem_cgroup_disabled()) + return; + + x = count + __this_cpu_read(memcg->stat_cpu->events[idx]); + if (unlikely(x > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &memcg->events[idx]); + x = 0; + } + __this_cpu_write(memcg->stat_cpu->events[idx], x); } -/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_events(struct mem_cgroup *memcg, int idx, unsigned long count) { - if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->events[idx], count); + preempt_disable(); + __count_memcg_events(memcg, idx, count); + preempt_enable(); } /* idx can be of type enum memcg_event_item or vm_event_item */ -- cgit v1.2.3 From a4ef87684108e5fef38cf289ee360f9b87a53cfd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 31 Jan 2018 16:17:06 -0800 Subject: mm: remove unused pgdat_reclaimable_pages() Remove unused function pgdat_reclaimable_pages() and node_page_state_snapshot() which becomes unused as well. Link: http://lkml.kernel.org/r/20171122094416.26019-1-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - include/linux/vmstat.h | 17 ----------------- 2 files changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0bd4c25016f9..7b6a59f722a3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -344,7 +344,6 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); -extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1779c9817b39..a4c2317d8b9f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -216,23 +216,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, return x; } -static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, - enum node_stat_item item) -{ - long x = atomic_long_read(&pgdat->vm_stat[item]); - -#ifdef CONFIG_SMP - int cpu; - for_each_online_cpu(cpu) - x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item]; - - if (x < 0) - x = 0; -#endif - return x; -} - - #ifdef CONFIG_NUMA extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); extern unsigned long sum_zone_node_page_state(int node, -- cgit v1.2.3 From d6cb41cc44c63492702281b1d329955ca767d399 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:17:10 -0800 Subject: mm, hugetlb: remove hugepages_treat_as_movable sysctl hugepages_treat_as_movable has been introduced by 396faf0303d2 ("Allow huge page allocations to use GFP_HIGH_MOVABLE") to allow hugetlb allocations from ZONE_MOVABLE even when hugetlb pages were not migrateable. The purpose of the movable zone was different at the time. It aimed at reducing memory fragmentation and hugetlb pages being long lived and large werre not contributing to the fragmentation so it was acceptable to use the zone back then. Things have changed though and the primary purpose of the zone became migratability guarantee. If we allow non migrateable hugetlb pages to be in ZONE_MOVABLE memory hotplug might fail to offline the memory. Remove the knob and only rely on hugepage_migration_supported to allow movable zones. Mel said: : Primarily it was aimed at allowing the hugetlb pool to safely shrink with : the ability to grow it again. The use case was for batched jobs, some of : which needed huge pages and others that did not but didn't want the memory : useless pinned in the huge pages pool. : : I suspect that more users rely on THP than hugetlbfs for flexible use of : huge pages with fallback options so I think that removing the option : should be ok. Link: http://lkml.kernel.org/r/20171003072619.8654-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Alexandru Moise <00moses.alexander00@gmail.com> Acked-by: Mel Gorman Cc: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 82a25880714a..6fcf140188d0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -129,7 +129,6 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; -- cgit v1.2.3 From 977fbdcd5986c9ff700bf276644d2b1973a53348 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:17:36 -0800 Subject: mm: add unmap_mapping_pages() Several users of unmap_mapping_range() would prefer to express their range in pages rather than bytes. Unfortuately, on a 32-bit kernel, you have to remember to cast your page number to a 64-bit type before shifting it, and four places in the current tree didn't remember to do that. That's a sign of a bad interface. Conveniently, unmap_mapping_range() actually converts from bytes into pages, so hoist the guts of unmap_mapping_range() into a new function unmap_mapping_pages() and convert the callers which want to use pages. Link: http://lkml.kernel.org/r/20171206142627.GD32044@bombadil.infradead.org Signed-off-by: Matthew Wilcox Reported-by: "zhangyi (F)" Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7fc92384977e..173d2484f6e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1312,8 +1312,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); @@ -1324,12 +1322,6 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address, int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); -static inline void unmap_shared_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen) -{ - unmap_mapping_range(mapping, holebegin, holelen, 0); -} - extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); @@ -1344,6 +1336,10 @@ extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); +void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows); +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -1360,10 +1356,20 @@ static inline int fixup_user_fault(struct task_struct *tsk, BUG(); return -EFAULT; } +static inline void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows) { } +static inline void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif -extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, - unsigned int gup_flags); +static inline void unmap_shared_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen) +{ + unmap_mapping_range(mapping, holebegin, holelen, 0); +} + +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, -- cgit v1.2.3 From 146500e9604cece72d4bed1cd15fac789220c795 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:17:40 -0800 Subject: mm: get 7% more pages in a pagevec We don't have to use an entire 'long' for the number of elements in the pagevec; we know it's a number between 0 and 14 (now 15). So we can store it in a char, and then the bool packs next to it and we still have two or six bytes of padding for more elements in the header. That gives us space to cram in an extra page. Link: http://lkml.kernel.org/r/20171206022521.GM26021@bombadil.infradead.org Signed-off-by: Matthew Wilcox Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 5fb6580f7f23..6dc456ac6136 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -9,14 +9,14 @@ #ifndef _LINUX_PAGEVEC_H #define _LINUX_PAGEVEC_H -/* 14 pointers + two long's align the pagevec structure to a power of two */ -#define PAGEVEC_SIZE 14 +/* 15 pointers + header align the pagevec structure to a power of two */ +#define PAGEVEC_SIZE 15 struct page; struct address_space; struct pagevec { - unsigned long nr; + unsigned char nr; bool percpu_pvec_drained; struct page *pages[PAGEVEC_SIZE]; }; -- cgit v1.2.3 From 5ff7091f5a2ca1b7b642ca0dbdede8f693a56926 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 31 Jan 2018 16:18:32 -0800 Subject: mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers") prevented the oom reaper from unmapping private anonymous memory with the oom reaper when the oom victim mm had mmu notifiers registered. The rationale is that doing mmu_notifier_invalidate_range_{start,end}() around the unmap_page_range(), which is needed, can block and the oom killer will stall forever waiting for the victim to exit, which may not be possible without reaping. That concern is real, but only true for mmu notifiers that have blockable invalidate_range_{start,end}() callbacks. This patch adds a "flags" field to mmu notifier ops that can set a bit to indicate that these callbacks do not block. The implementation is steered toward an expensive slowpath, such as after the oom reaper has grabbed mm->mmap_sem of a still alive oom victim. [rientjes@google.com: mmu_notifier_invalidate_range_end() can also call the invalidate_range() must not block, fix comment] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1801091339570.240101@chino.kir.corp.google.com [akpm@linux-foundation.org: make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1712141329500.74052@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Michal Hocko Acked-by: Paolo Bonzini Acked-by: Christian König Acked-by: Dimitri Sivanich Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Oded Gabbay Cc: Alex Deucher Cc: David Airlie Cc: Joerg Roedel Cc: Doug Ledford Cc: Jani Nikula Cc: Mike Marciniszyn Cc: Sean Hefty Cc: Boris Ostrovsky Cc: Jérôme Glisse Cc: Radim Krčmář Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b25dc9db19fc..2d07a1ed5a31 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -2,6 +2,7 @@ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H +#include #include #include #include @@ -10,6 +11,9 @@ struct mmu_notifier; struct mmu_notifier_ops; +/* mmu_notifier_ops flags */ +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01) + #ifdef CONFIG_MMU_NOTIFIER /* @@ -26,6 +30,15 @@ struct mmu_notifier_mm { }; struct mmu_notifier_ops { + /* + * Flags to specify behavior of callbacks for this MMU notifier. + * Used to determine which context an operation may be called. + * + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not + * block + */ + int flags; + /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are @@ -137,6 +150,10 @@ struct mmu_notifier_ops { * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. + * + * If both of these callbacks cannot block, and invalidate_range + * cannot block, mmu_notifier_ops.flags should have + * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range_start)(struct mmu_notifier *mn, struct mm_struct *mm, @@ -159,12 +176,13 @@ struct mmu_notifier_ops { * external TLB range needs to be flushed. For more in depth * discussion on this see Documentation/vm/mmu_notifier.txt * - * The invalidate_range() function is called under the ptl - * spin-lock and not allowed to sleep. - * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if * called between those functions. + * + * If this callback cannot block, and invalidate_range_{start,end} + * cannot block, mmu_notifier_ops.flags should have + * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -218,6 +236,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm); static inline void mmu_notifier_release(struct mm_struct *mm) { @@ -457,6 +476,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, { } +static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) +{ + return false; +} + static inline void mmu_notifier_mm_init(struct mm_struct *mm) { } -- cgit v1.2.3 From e20df2c6a86cf8e2caeb3665427d077bfb97f177 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:44 -0800 Subject: mm: align struct page more aesthetically Patch series "Restructure struct page", v2. This series does not attempt any grand restructuring. Instead, it cures the worst of the indentitis, fixes the documentation and reduces the ifdeffery. The only layout change is compound_dtor and compound_order are each reduced to one byte. This patch (of 8): Instead of an ifdef block at the end of the struct, which needed its own comment, define _struct_page_alignment up at the top where it fits nicely with the existing comment. Link: http://lkml.kernel.org/r/20171220155552.15884-2-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cfd0ac4e5e0e..4509f0cfaf39 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -39,6 +39,12 @@ struct hmm; * allows the use of atomic double word operations on the flags/mapping * and lru list pointers also. */ +#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE +#define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) +#else +#define _struct_page_alignment +#endif + struct page { /* First double word block */ unsigned long flags; /* Atomic flags, some possibly @@ -212,15 +218,7 @@ struct page { #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif -} -/* - * The struct page can be forced to be double word aligned so that atomic ops - * on double words work. The SLUB allocator can make use of such a feature. - */ -#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE - __aligned(2 * sizeof(unsigned long)) -#endif -; +} _struct_page_alignment; #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) -- cgit v1.2.3 From ca9c88c781b8e5d837068db6d1ca8e775fb7e154 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:47 -0800 Subject: mm: de-indent struct page I found the struct { union { struct { union { struct { } } } } } layout rather confusing. Fortunately, there is an easier way to write this. The innermost union is of four things which are the size of an int, so the ones which are used by slab/slob/slub can be pulled up two levels to be in the outermost union with 'counters'. That leaves us with struct { union { struct { atomic_t; atomic_t; } } } which has the same layout, but is easier to read. Output from the current git version of pahole, diffed with -uw to ignore the whitespace changes from the indentation: }; /* 16 8 */ union { long unsigned int counters; /* 24 8 */ - struct { - union { - atomic_t _mapcount; /* 24 4 */ unsigned int active; /* 24 4 */ struct { unsigned int inuse:16; /* 24:16 4 */ @@ -21,7 +18,8 @@ unsigned int frozen:1; /* 24: 0 4 */ }; /* 24 4 */ int units; /* 24 4 */ - }; /* 24 4 */ + struct { + atomic_t _mapcount; /* 24 4 */ atomic_t _refcount; /* 28 4 */ }; /* 24 8 */ }; /* 24 8 */ Link: http://lkml.kernel.org/r/20171220155552.15884-3-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4509f0cfaf39..27973166af28 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -84,28 +84,26 @@ struct page { */ unsigned counters; #endif - struct { + unsigned int active; /* SLAB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + int units; /* SLOB */ + + struct { /* Page cache */ + /* + * Count of ptes mapped in mms, to show when + * page is mapped & limit reverse map searches. + * + * Extra information about page type may be + * stored here for pages that are never mapped, + * in which case the value MUST BE <= -2. + * See page-flags.h for more details. + */ + atomic_t _mapcount; - union { - /* - * Count of ptes mapped in mms, to show when - * page is mapped & limit reverse map searches. - * - * Extra information about page type may be - * stored here for pages that are never mapped, - * in which case the value MUST BE <= -2. - * See page-flags.h for more details. - */ - atomic_t _mapcount; - - unsigned int active; /* SLAB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; - int units; /* SLOB */ - }; /* * Usage count, *USE WRAPPER FUNCTION* when manual * accounting. See page_ref.h -- cgit v1.2.3 From 4cf7c8bfb36f4b4dbc333bf844ea801d089f44f8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:51 -0800 Subject: mm: remove misleading alignment claims The "third double word block" isn't on 32-bit systems. The layout looks like this: unsigned long flags; struct address_space *mapping pgoff_t index; atomic_t _mapcount; atomic_t _refcount; which is 32 bytes on 64-bit, but 20 bytes on 32-bit. Nobody is trying to use the fact that it's double-word aligned today, so just remove the misleading claims. Link: http://lkml.kernel.org/r/20171220155552.15884-4-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Christoph Lameter Cc: Michal Hocko Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 27973166af28..c2294e6204e8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -33,11 +33,11 @@ struct hmm; * a page, though if it is a pagecache page, rmap structures can tell us * who is mapping it. * - * The objects in struct page are organized in double word blocks in - * order to allows us to use atomic double word operations on portions - * of struct page. That is currently only used by slub but the arrangement - * allows the use of atomic double word operations on the flags/mapping - * and lru list pointers also. + * SLUB uses cmpxchg_double() to atomically update its freelist and + * counters. That requires that freelist & counters be adjacent and + * double-word aligned. We align all struct pages to double-word + * boundaries, and ensure that 'freelist' is aligned within the + * struct. */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) @@ -113,8 +113,6 @@ struct page { }; /* - * Third double word block - * * WARNING: bit 0 of the first word encode PageTail(). That means * the rest users of the storage space MUST NOT use the bit to * avoid collision and false-positive PageTail(). @@ -175,7 +173,6 @@ struct page { #endif }; - /* Remainder is not double word aligned */ union { unsigned long private; /* Mapping-private opaque data: * usually used for buffer_heads -- cgit v1.2.3 From b26435a0115b245ea2dd705efcce877ec417bc74 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:55 -0800 Subject: mm: improve comment on page->mapping The comment on page->mapping is terse, and out of date (it does not mention the possibility of PAGE_MAPPING_MOVABLE). Instead, point the interested reader to page-flags.h where there is a much better comment. Link: http://lkml.kernel.org/r/20171220155552.15884-5-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c2294e6204e8..8c3b8cea22ee 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -50,15 +50,9 @@ struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ union { - struct address_space *mapping; /* If low bit clear, points to - * inode address_space, or NULL. - * If page mapped as anonymous - * memory, low bit is set, and - * it points to anon_vma object - * or KSM private structure. See - * PAGE_MAPPING_ANON and - * PAGE_MAPPING_KSM. - */ + /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ + struct address_space *mapping; + void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ /* page_deferred_list().next -- second tail page */ -- cgit v1.2.3 From 0dd4da5b110c6915d4244b8ed87a1c8d3945224b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:58 -0800 Subject: mm: introduce _slub_counter_t Instead of putting the ifdef in the middle of the definition of struct page, pull it forward to the rest of the ifdeffery around the SLUB cmpxchg_double optimisation. Link: http://lkml.kernel.org/r/20171220155552.15884-6-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8c3b8cea22ee..5521c9799c50 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -41,9 +41,15 @@ struct hmm; */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) +#define _slub_counter_t unsigned long #else -#define _struct_page_alignment +#define _slub_counter_t unsigned int #endif +#else /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ +#define _struct_page_alignment +#define _slub_counter_t unsigned int +#endif /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ struct page { /* First double word block */ @@ -66,18 +72,7 @@ struct page { }; union { -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - /* Used for cmpxchg_double in slub */ - unsigned long counters; -#else - /* - * Keep _refcount separate from slub cmpxchg_double data. - * As the rest of the double word is protected by slab_lock - * but _refcount is not. - */ - unsigned counters; -#endif + _slub_counter_t counters; unsigned int active; /* SLAB */ struct { /* SLUB */ unsigned inuse:16; -- cgit v1.2.3 From 036e7aa49fb29e0b49b99a56fa5611d4a5b99fb1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:19:02 -0800 Subject: mm: store compound_dtor / compound_order as bytes Neither of these values get even close to 256; compound_dtor is currently at a maximum of 3, and compound_order can't be over 64. No machine has inefficient access to bytes since EV5, and while those are still supported, we don't optimise for them any more. This does not shrink struct page, but it removes an ifdef and frees up 2-6 bytes for future use. diff of pahole output: struct callback_head callback_head; /* 32 16 */ struct { long unsigned int compound_head; /* 32 8 */ - unsigned int compound_dtor; /* 40 4 */ - unsigned int compound_order; /* 44 4 */ + unsigned char compound_dtor; /* 40 1 */ + unsigned char compound_order; /* 41 1 */ }; /* 32 16 */ }; /* 32 16 */ union { [mawilcox@microsoft.com: add comment] Link: http://lkml.kernel.org/r/20171221000144.GB2980@bombadil.infradead.org Link: http://lkml.kernel.org/r/20171220155552.15884-7-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5521c9799c50..3e7e99784656 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -136,19 +136,9 @@ struct page { unsigned long compound_head; /* If bit zero is set */ /* First tail page only */ -#ifdef CONFIG_64BIT - /* - * On 64 bit system we have enough space in struct page - * to encode compound_dtor and compound_order with - * unsigned int. It can help compiler generate better or - * smaller code on some archtectures. - */ - unsigned int compound_dtor; - unsigned int compound_order; -#else - unsigned short int compound_dtor; - unsigned short int compound_order; -#endif + unsigned char compound_dtor; + unsigned char compound_order; + /* two/six bytes available here */ }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS -- cgit v1.2.3 From be50015d7eec0e96b312468291d8209c1cc49908 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:19:06 -0800 Subject: mm: document how to use struct page Be really explicit about what bits / bytes are reserved for users that want to store extra information about the pages they allocate. Link: http://lkml.kernel.org/r/20171220155552.15884-8-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Reviewed-by: Randy Dunlap Acked-by: Michal Hocko Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3e7e99784656..3f1fae8fb140 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -31,7 +31,29 @@ struct hmm; * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us - * who is mapping it. + * who is mapping it. If you allocate the page using alloc_pages(), you + * can use some of the space in struct page for your own purposes. + * + * Pages that were once in the page cache may be found under the RCU lock + * even after they have been recycled to a different purpose. The page + * cache reads and writes some of the fields in struct page to pin the + * page before checking that it's still in the page cache. It is vital + * that all users of struct page: + * 1. Use the first word as PageFlags. + * 2. Clear or preserve bit 0 of page->compound_head. It is used as + * PageTail for compound pages, and the page cache must not see false + * positives. Some users put a pointer here (guaranteed to be at least + * 4-byte aligned), other users avoid using the field altogether. + * 3. page->_refcount must either not be used, or must be used in such a + * way that other CPUs temporarily incrementing and then decrementing the + * refcount does not cause problems. On receiving the page from + * alloc_pages(), the refcount will be positive. + * 4. Either preserve page->_mapcount or restore it to -1 before freeing it. + * + * If you allocate pages of order > 0, you can use the fields in the struct + * page associated with each page, but bear in mind that the pages may have + * been inserted individually into the page cache, so you must use the above + * four fields in a compatible way for each struct page. * * SLUB uses cmpxchg_double() to atomically update its freelist and * counters. That requires that freelist & counters be adjacent and -- cgit v1.2.3 From ab8928b72fd77d936034da4c077f1580619697f4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:19:11 -0800 Subject: mm: remove reference to PG_buddy PG_buddy doesn't exist any more. It's called PageBuddy now. Link: http://lkml.kernel.org/r/20171220155552.15884-9-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3f1fae8fb140..fd1af6b9591d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -175,13 +175,13 @@ struct page { }; union { - unsigned long private; /* Mapping-private opaque data: - * usually used for buffer_heads - * if PagePrivate set; used for - * swp_entry_t if PageSwapCache; - * indicates order in the buddy - * system if PG_buddy is set. - */ + /* + * Mapping-private opaque data: + * Usually used for buffer_heads if PagePrivate + * Used for swp_entry_t if PageSwapCache + * Indicates order in the buddy system if PageBuddy + */ + unsigned long private; #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; -- cgit v1.2.3 From e9d586a8217882eb4068e3ed94a5234ba6dead34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:14 -0800 Subject: shmem: unexport shmem_add_seals()/shmem_get_seals() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "memfd: add sealing to hugetlb-backed memory", v3. Recently, Mike Kravetz added hugetlbfs support to memfd. However, he didn't add sealing support. One of the reasons to use memfd is to have shared memory sealing when doing IPC or sharing memory with another process with some extra safety. qemu uses shared memory & hugetables with vhost-user (used by dpdk), so it is reasonable to use memfd now instead for convenience and security reasons. This patch (of 9): The functions are called through shmem_fcntl() only. And no danger in removing the EXPORTs as the routines only work with shmem file structs. Link: http://lkml.kernel.org/r/20171107122800.25517-2-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 06b295bec00d..e464815a7e4c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -112,8 +112,6 @@ extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_TMPFS -extern int shmem_add_seals(struct file *file, unsigned int seals); -extern int shmem_get_seals(struct file *file); extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); #else -- cgit v1.2.3 From 5aadc431a593ac1f3a026dfbceaa16cc4d5e15ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:18 -0800 Subject: shmem: rename functions that are memfd-related MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those functions are called for memfd files, backed by shmem or hugetlb (the next patches will handle hugetlb). Link: http://lkml.kernel.org/r/20171107122800.25517-3-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index e464815a7e4c..73b5e655a76e 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -112,11 +112,11 @@ extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_TMPFS -extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); +extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); #else -static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) +static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) { return -EINVAL; } -- cgit v1.2.3 From da14c1e524a56d62b846f73ae44fd722d63747b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:22 -0800 Subject: hugetlb: expose hugetlbfs_inode_info in header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hugetlbfs inode information will need to be accessed by code in mm/shmem.c for file sealing operations. Move inode information definition from .c file to header for needed access. Link: http://lkml.kernel.org/r/20171107122800.25517-4-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6fcf140188d0..d02301e3f232 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -270,6 +270,16 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) return sb->s_fs_info; } +struct hugetlbfs_inode_info { + struct shared_policy policy; + struct inode vfs_inode; +}; + +static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) +{ + return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); +} + extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, -- cgit v1.2.3 From ff62a34210441103108d435ae8a00a777c4dcb99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:25 -0800 Subject: hugetlb: implement memfd sealing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements memfd sealing, similar to shmem: - WRITE: deny fallocate(PUNCH_HOLE). mmap() write is denied in memfd_add_seals(). write() doesn't exist for hugetlbfs. - SHRINK: added similar check as shmem_setattr() - GROW: added similar check as shmem_setattr() & shmem_fallocate() Except write() operation that doesn't exist with hugetlbfs, that should make sealing as close as it can be to shmem support. Link: http://lkml.kernel.org/r/20171107122800.25517-5-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d02301e3f232..944e6e8bd572 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -273,6 +273,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) struct hugetlbfs_inode_info { struct shared_policy policy; struct inode vfs_inode; + unsigned int seals; }; static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) -- cgit v1.2.3 From 9c3760eb80880f3e02546e0a2ef479e1454986b3 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 31 Jan 2018 16:19:59 -0800 Subject: zswap: only save zswap header when necessary We waste sizeof(swp_entry_t) for zswap header when using zsmalloc as zpool driver because zsmalloc doesn't support eviction. Add zpool_evictable() to detect if zpool is potentially evictable, and use it in zswap to avoid waste memory for zswap header. [yuzhao@google.com: The zpool->" prefix is a result of copy & paste] Link: http://lkml.kernel.org/r/20180110225626.110330-1-yuzhao@google.com Link: http://lkml.kernel.org/r/20180110224741.83751-1-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Dan Streetman Reviewed-by: Sergey Senozhatsky Cc: Seth Jennings Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zpool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 004ba807df96..7238865e75b0 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver *driver); int zpool_unregister_driver(struct zpool_driver *driver); +bool zpool_evictable(struct zpool *pool); + #endif -- cgit v1.2.3 From def9b71ee651a6fee93a10734b94f93a69cdb2d4 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Wed, 31 Jan 2018 16:20:26 -0800 Subject: include/linux/mmzone.h: fix explanation of lower bits in the SPARSEMEM mem_map pointer The comment is confusing. On the one hand, it refers to 32-bit alignment (struct page alignment on 32-bit platforms), but this would only guarantee that the 2 lowest bits must be zero. On the other hand, it claims that at least 3 bits are available, and 3 bits are actually used. This is not broken, because there is a stronger alignment guarantee, just less obvious. Let's fix the comment to make it clear how many bits are available and why. Although memmap arrays are allocated in various places, the resulting pointer is encoded eventually, so I am adding a BUG_ON() here to enforce at runtime that all expected bits are indeed available. I have also added a BUILD_BUG_ON to check that PFN_SECTION_SHIFT is sufficient, because this part of the calculation can be easily checked at build time. [ptesarik@suse.com: v2] Link: http://lkml.kernel.org/r/20180125100516.589ea6af@ezekiel.suse.cz Link: http://lkml.kernel.org/r/20180119080908.3a662e6f@ezekiel.suse.cz Signed-off-by: Petr Tesarik Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Cc: Kemi Wang Cc: YASUAKI ISHIMATSU Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67f2e3c38939..7522a6987595 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1166,8 +1166,16 @@ extern unsigned long usemap_size(void); /* * We use the lower bits of the mem_map pointer to store - * a little bit of information. There should be at least - * 3 bits here due to 32-bit alignment. + * a little bit of information. The pointer is calculated + * as mem_map - section_nr_to_pfn(pnum). The result is + * aligned to the minimum alignment of the two values: + * 1. All mem_map arrays are page-aligned. + * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT + * lowest bits. PFN_SECTION_SHIFT is arch-specific + * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the + * worst combination is powerpc with 256k pages, + * which results in PFN_SECTION_SHIFT equal 6. + * To sum it up, at least 6 bits are available. */ #define SECTION_MARKED_PRESENT (1UL<<0) #define SECTION_HAS_MEM_MAP (1UL<<1) -- cgit v1.2.3 From ab5ac90aecf5685eb630c42c396f5f14726b0afd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:20:48 -0800 Subject: mm, hugetlb: do not rely on overcommit limit during migration hugepage migration relies on __alloc_buddy_huge_page to get a new page. This has 2 main disadvantages. 1) it doesn't allow to migrate any huge page if the pool is used completely which is not an exceptional case as the pool is static and unused memory is just wasted. 2) it leads to a weird semantic when migration between two numa nodes might increase the pool size of the destination NUMA node while the page is in use. The issue is caused by per NUMA node surplus pages tracking (see free_huge_page). Address both issues by changing the way how we allocate and account pages allocated for migration. Those should temporal by definition. So we mark them that way (we will abuse page flags in the 3rd page) and update free_huge_page to free such pages to the page allocator. Page migration path then just transfers the temporal status from the new page to the old one which will be freed on the last reference. The global surplus count will never change during this path but we still have to be careful when migrating a per-node suprlus page. This is now handled in move_hugetlb_state which is called from the migration path and it copies the hugetlb specific page state and fixes up the accounting when needed Rename __alloc_buddy_huge_page to __alloc_surplus_huge_page to better reflect its purpose. The new allocation routine for the migration path is __alloc_migrate_huge_page. The user visible effect of this patch is that migrated pages are really temporal and they travel between NUMA nodes as per the migration request: Before migration /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:1 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0 After /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:1 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0 with the previous implementation, both nodes would have nr_hugepages:1 until the page is freed. Link: http://lkml.kernel.org/r/20180103093213.26329-4-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 944e6e8bd572..66992348531e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -119,6 +119,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); +void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -157,6 +158,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); bool is_hugetlb_entry_migration(pte_t pte); + #else /* !CONFIG_HUGETLB_PAGE */ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) @@ -197,6 +199,7 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list) return false; } #define putback_active_hugepage(p) do {} while (0) +#define move_hugetlb_state(old, new, reason) do {} while (0) static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) -- cgit v1.2.3 From ebd637235890a3fa6a6d4bb57522098f2f59c693 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:21:00 -0800 Subject: hugetlb, mempolicy: fix the mbind hugetlb migration do_mbind migration code relies on alloc_huge_page_noerr for hugetlb pages. alloc_huge_page_noerr uses alloc_huge_page which is a highlevel allocation function which has to take care of reserves, overcommit or hugetlb cgroup accounting. None of that is really required for the page migration because the new page is only temporal and either will replace the original page or it will be dropped. This is essentially as for other migration call paths and there shouldn't be any reason to handle mbind in a special way. The current implementation is even suboptimal because the migration might fail just because the hugetlb cgroup limit is reached, or the overcommit is saturated. Fix this by making mbind like other hugetlb migration paths. Add a new migration helper alloc_huge_page_vma as a wrapper around alloc_huge_page_nodemask with additional mempolicy handling. alloc_huge_page_noerr has no more users and it can go. Link: http://lkml.kernel.org/r/20180103093213.26329-7-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 66992348531e..612a29b7f6c6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -356,10 +356,9 @@ struct huge_bootmem_page { struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_node(struct hstate *h, int nid); -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask); +struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -537,7 +536,7 @@ struct hstate {}; #define alloc_huge_page(v, a, r) NULL #define alloc_huge_page_node(h, nid) NULL #define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL -#define alloc_huge_page_noerr(v, a, r) NULL +#define alloc_huge_page_vma(vma, address) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_sizelog(s) NULL -- cgit v1.2.3 From 389c8178d0904f944887ccca2256ff9d79c12e8e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:21:03 -0800 Subject: hugetlb, mbind: fall back to default policy if vma is NULL Dan Carpenter has noticed that mbind migration callback (new_page) can get a NULL vma pointer and choke on it inside alloc_huge_page_vma which relies on the VMA to get the hstate. We used to BUG_ON this case but the BUG_+ON has been removed recently by "hugetlb, mempolicy: fix the mbind hugetlb migration". The proper way to handle this is to get the hstate from the migrated page and rely on huge_node (resp. get_vma_policy) do the right thing with null VMA. We are currently falling back to the default mempolicy in that case which is in line what THP path is doing here. Link: http://lkml.kernel.org/r/20180110104712.GR1732@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Dan Carpenter Cc: Naoya Horiguchi Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 612a29b7f6c6..36fa6a2a82e3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -358,7 +358,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, struct page *alloc_huge_page_node(struct hstate *h, int nid); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask); -struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address); +struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, + unsigned long address); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -536,7 +537,7 @@ struct hstate {}; #define alloc_huge_page(v, a, r) NULL #define alloc_huge_page_node(h, nid) NULL #define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL -#define alloc_huge_page_vma(vma, address) NULL +#define alloc_huge_page_vma(h, vma, address) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_sizelog(s) NULL -- cgit v1.2.3 From 3f56a2f8030071cf86520ef4fc3045ba6856e610 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Wed, 31 Jan 2018 16:21:27 -0800 Subject: mm: remove PG_highmem description Commit cbe37d093707 ("[PATCH] mm: remove PG_highmem") removed PG_highmem to save a page flag. So the description of PG_highmem is no longer needed. Link: http://lkml.kernel.org/r/1517391212-2950-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Acked-by: Michal Hocko Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3ec44e27aa9d..50c2b8786831 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -46,11 +46,6 @@ * guarantees that this bit is cleared for a page when it first is entered into * the page cache. * - * PG_highmem pages are not permanently mapped into the kernel virtual address - * space, they need to be kmapped separately for doing IO on the pages. The - * struct page (these bits with information) are always mapped into kernel - * address space... - * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! -- cgit v1.2.3