From 27ee57c93ff00b8a2d6c6dd6b0b3dddda7b43b77 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:35 -0700 Subject: mm: memcontrol: report slab usage in cgroup2 memory.stat Show how much memory is used for storing reclaimable and unreclaimable in-kernel data structures allocated from slab caches. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f0c4bec6565b..e7af4834ffea 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -53,6 +53,8 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_NSTATS, /* default hierarchy stats */ MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS, + MEMCG_SLAB_RECLAIMABLE, + MEMCG_SLAB_UNRECLAIMABLE, MEMCG_NR_STAT, }; @@ -883,6 +885,20 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) if (memcg_kmem_enabled()) __memcg_kmem_put_cache(cachep); } + +/** + * memcg_kmem_update_page_stat - update kmem page state statistics + * @page: the page + * @idx: page state item to account + * @val: number of pages (positive or negative) + */ +static inline void memcg_kmem_update_page_stat(struct page *page, + enum mem_cgroup_stat_index idx, int val) +{ + if (memcg_kmem_enabled() && page->mem_cgroup) + this_cpu_add(page->mem_cgroup->stat->count[idx], val); +} + #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -928,6 +944,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) { } + +static inline void memcg_kmem_update_page_stat(struct page *page, + enum mem_cgroup_stat_index idx, int val) +{ +} #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.2.3 From 12580e4b54ba8a1b22ec977c200be0174ca42348 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:38 -0700 Subject: mm: memcontrol: report kernel stack usage in cgroup2 memory.stat Show how much memory is allocated to kernel stacks. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e7af4834ffea..d6300313b298 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,9 +52,10 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, /* default hierarchy stats */ - MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS, + MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS, MEMCG_SLAB_RECLAIMABLE, MEMCG_SLAB_UNRECLAIMABLE, + MEMCG_SOCK, MEMCG_NR_STAT, }; -- cgit v1.2.3 From 832fc1de01aea28255cb11d270679b7f1273f0d7 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 17 Mar 2016 14:17:41 -0700 Subject: /proc/kpageflags: return KPF_BUDDY for "tail" buddy pages Currently /proc/kpageflags returns nothing for "tail" buddy pages, which is inconvenient when grasping how free pages are distributed. This patch sets KPF_BUDDY for such pages. With this patch: $ grep MemFree /proc/meminfo ; tools/vm/page-types -b buddy MemFree: 3134992 kB flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000400 779272 3044 __________B_______________________________ buddy 0x0000000000000c00 4385 17 __________BM______________________________ buddy,mmap total 783657 3061 783657 pages is 3134628 kB (roughly consistent with the global counter,) so it's OK. [akpm@linux-foundation.org: update comment, per Naoya] Signed-off-by: Naoya Horiguchi Reviewed-by: Vladimir Davydov > Cc: Konstantin Khlebnikov Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 19724e6ebd26..597695523679 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -593,6 +593,8 @@ static inline void __ClearPageBuddy(struct page *page) atomic_set(&page->_mapcount, -1); } +extern bool is_free_buddy_page(struct page *page); + #define PAGE_BALLOON_MAPCOUNT_VALUE (-256) static inline int PageBalloon(struct page *page) -- cgit v1.2.3 From 698b1b30642f1ff0ea10ef1de9745ab633031377 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:08 -0700 Subject: mm, compaction: introduce kcompactd Memory compaction can be currently performed in several contexts: - kswapd balancing a zone after a high-order allocation failure - direct compaction to satisfy a high-order allocation, including THP page fault attemps - khugepaged trying to collapse a hugepage - manually from /proc The purpose of compaction is two-fold. The obvious purpose is to satisfy a (pending or future) high-order allocation, and is easy to evaluate. The other purpose is to keep overal memory fragmentation low and help the anti-fragmentation mechanism. The success wrt the latter purpose is more The current situation wrt the purposes has a few drawbacks: - compaction is invoked only when a high-order page or hugepage is not available (or manually). This might be too late for the purposes of keeping memory fragmentation low. - direct compaction increases latency of allocations. Again, it would be better if compaction was performed asynchronously to keep fragmentation low, before the allocation itself comes. - (a special case of the previous) the cost of compaction during THP page faults can easily offset the benefits of THP. - kswapd compaction appears to be complex, fragile and not working in some scenarios. It could also end up compacting for a high-order allocation request when it should be reclaiming memory for a later order-0 request. To improve the situation, we should be able to benefit from an equivalent of kswapd, but for compaction - i.e. a background thread which responds to fragmentation and the need for high-order allocations (including hugepages) somewhat proactively. One possibility is to extend the responsibilities of kswapd, which could however complicate its design too much. It should be better to let kswapd handle reclaim, as order-0 allocations are often more critical than high-order ones. Another possibility is to extend khugepaged, but this kthread is a single instance and tied to THP configs. This patch goes with the option of a new set of per-node kthreads called kcompactd, and lays the foundations, without introducing any new tunables. The lifecycle mimics kswapd kthreads, including the memory hotplug hooks. For compaction, kcompactd uses the standard compaction_suitable() and ompact_finished() criteria and the deferred compaction functionality. Unlike direct compaction, it uses only sync compaction, as there's no allocation latency to minimize. This patch doesn't yet add a call to wakeup_kcompactd. The kswapd compact/reclaim loop for high-order pages will be replaced by waking up kcompactd in the next patch with the description of what's wrong with the old approach. Waking up of the kcompactd threads is also tied to kswapd activity and follows these rules: - we don't want to affect any fastpaths, so wake up kcompactd only from the slowpath, as it's done for kswapd - if kswapd is doing reclaim, it's more important than compaction, so don't invoke kcompactd until kswapd goes to sleep - the target order used for kswapd is passed to kcompactd Future possible future uses for kcompactd include the ability to wake up kcompactd on demand in special situations, such as when hugepages are not available (currently not done due to __GFP_NO_KSWAPD) or when a fragmentation event (i.e. __rmqueue_fallback()) occurs. It's also possible to perform periodic compaction with kcompactd. [arnd@arndb.de: fix build errors with kcompactd] [paul.gortmaker@windriver.com: don't use modular references for non modular code] Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Arnd Bergmann Signed-off-by: Paul Gortmaker Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 16 ++++++++++++++++ include/linux/mmzone.h | 6 ++++++ include/linux/vm_event_item.h | 1 + 3 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 4cd4ddf64cc7..d7c8de583a23 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -52,6 +52,10 @@ extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); extern bool compaction_restarting(struct zone *zone, int order); +extern int kcompactd_run(int nid); +extern void kcompactd_stop(int nid); +extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); + #else static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int alloc_flags, @@ -84,6 +88,18 @@ static inline bool compaction_deferred(struct zone *zone, int order) return true; } +static inline int kcompactd_run(int nid) +{ + return 0; +} +static inline void kcompactd_stop(int nid) +{ +} + +static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +{ +} + #endif /* CONFIG_COMPACTION */ #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6de02ac378a0..bdd9a270a813 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -668,6 +668,12 @@ typedef struct pglist_data { mem_hotplug_begin/end() */ int kswapd_max_order; enum zone_type classzone_idx; +#ifdef CONFIG_COMPACTION + int kcompactd_max_order; + enum zone_type kcompactd_classzone_idx; + wait_queue_head_t kcompactd_wait; + struct task_struct *kcompactd; +#endif #ifdef CONFIG_NUMA_BALANCING /* Lock serializing the migrate rate limiting window */ spinlock_t numabalancing_migrate_lock; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 67c1dbd19c6d..58ecc056ee45 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -53,6 +53,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, COMPACTISOLATED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, + KCOMPACTD_WAKE, #endif #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, -- cgit v1.2.3 From ee91ef6173e81819f5ff610c2485802081635657 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 17 Mar 2016 14:18:21 -0700 Subject: bufferhead: force inlining of buffer head flag operations With both gcc 4.7.2 and 4.9.2, sometimes gcc mysteriously doesn't inline very small functions we expect to be inlined. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 With this .config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os, set_buffer_foo(), clear_buffer_foo() and similar functions get deinlined about 60 times. Examples of disassembly: (14 copies, 43 calls): 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 80 0f 20 lock orb $0x20,(%rdi) 5d pop %rbp c3 retq (3 copies, 34 calls): 48 8b 07 mov (%rdi),%rax 55 push %rbp 48 89 e5 mov %rsp,%rbp 48 c1 e8 05 shr $0x5,%rax 83 e0 01 and $0x1,%eax 5d pop %rbp c3 retq (5 copies, 13 calls): 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 80 0f 40 lock orb $0x40,(%rdi) 5d pop %rbp c3 retq This patch fixes this via s/inline/__always_inline/. This decreases vmlinux by about 3 kbytes. text data bss dec hex filename 88200439 19905208 36421632 144527279 89d4faf vmlinux2 88197239 19905240 36421632 144524111 89d434f vmlinux Signed-off-by: Denys Vlasenko Cc: Ingo Molnar Cc: Thomas Graf Cc: Peter Zijlstra Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 89d9aa9e79bf..c67f052cc5e5 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -82,15 +82,15 @@ struct buffer_head { * and buffer_foo() functions. */ #define BUFFER_FNS(bit, name) \ -static inline void set_buffer_##name(struct buffer_head *bh) \ +static __always_inline void set_buffer_##name(struct buffer_head *bh) \ { \ set_bit(BH_##bit, &(bh)->b_state); \ } \ -static inline void clear_buffer_##name(struct buffer_head *bh) \ +static __always_inline void clear_buffer_##name(struct buffer_head *bh) \ { \ clear_bit(BH_##bit, &(bh)->b_state); \ } \ -static inline int buffer_##name(const struct buffer_head *bh) \ +static __always_inline int buffer_##name(const struct buffer_head *bh) \ { \ return test_bit(BH_##bit, &(bh)->b_state); \ } @@ -99,11 +99,11 @@ static inline int buffer_##name(const struct buffer_head *bh) \ * test_set_buffer_foo() and test_clear_buffer_foo() */ #define TAS_BUFFER_FNS(bit, name) \ -static inline int test_set_buffer_##name(struct buffer_head *bh) \ +static __always_inline int test_set_buffer_##name(struct buffer_head *bh) \ { \ return test_and_set_bit(BH_##bit, &(bh)->b_state); \ } \ -static inline int test_clear_buffer_##name(struct buffer_head *bh) \ +static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \ { \ return test_and_clear_bit(BH_##bit, &(bh)->b_state); \ } \ -- cgit v1.2.3 From 4b0f326163f09e6d67f09dd5e1a70093cee710fc Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 17 Mar 2016 14:18:24 -0700 Subject: include/linux/page-flags.h: force inlining of selected page flag modifications Sometimes gcc mysteriously doesn't inline very small functions we expect to be inlined. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 With this .config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os, the following functions get deinlined many times. Examples of disassembly: (43 copies, 141 calls): 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 80 0f 08 lock orb $0x8,(%rdi) 5d pop %rbp c3 retq (10 copies, 134 calls): 48 8b 07 mov (%rdi),%rax 55 push %rbp 48 89 e5 mov %rsp,%rbp 48 c1 e8 0b shr $0xb,%rax 83 e0 01 and $0x1,%eax 5d pop %rbp c3 retq This patch fixes this via s/inline/__always_inline/. Code size decrease after the patch is ~7k: text data bss dec hex filename 92125002 20826048 36417536 149368586 8e72f0a vmlinux 92118087 20826112 36417536 149361735 8e71447 vmlinux7_pageops_after Signed-off-by: Denys Vlasenko Cc: Ingo Molnar Cc: Thomas Graf Cc: Peter Zijlstra Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 597695523679..f4ed4f1b0c77 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -144,12 +144,12 @@ static inline struct page *compound_head(struct page *page) return page; } -static inline int PageTail(struct page *page) +static __always_inline int PageTail(struct page *page) { return READ_ONCE(page->compound_head) & 1; } -static inline int PageCompound(struct page *page) +static __always_inline int PageCompound(struct page *page) { return test_bit(PG_head, &page->flags) || PageTail(page); } @@ -184,31 +184,31 @@ static inline int PageCompound(struct page *page) * Macros to create function definitions for page flags */ #define TESTPAGEFLAG(uname, lname, policy) \ -static inline int Page##uname(struct page *page) \ +static __always_inline int Page##uname(struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ -static inline void SetPage##uname(struct page *page) \ +static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy) \ -static inline void ClearPage##uname(struct page *page) \ +static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags); } #define __SETPAGEFLAG(uname, lname, policy) \ -static inline void __SetPage##uname(struct page *page) \ +static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ -static inline void __ClearPage##uname(struct page *page) \ +static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ -static inline int TestSetPage##uname(struct page *page) \ +static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy) \ -static inline int TestClearPage##uname(struct page *page) \ +static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } #define PAGEFLAG(uname, lname, policy) \ @@ -371,7 +371,7 @@ PAGEFLAG(Idle, idle, PF_ANY) #define PAGE_MAPPING_KSM 2 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) -static inline int PageAnon(struct page *page) +static __always_inline int PageAnon(struct page *page) { page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; @@ -384,7 +384,7 @@ static inline int PageAnon(struct page *page) * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ -static inline int PageKsm(struct page *page) +static __always_inline int PageKsm(struct page *page) { page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == @@ -415,14 +415,14 @@ static inline int PageUptodate(struct page *page) return ret; } -static inline void __SetPageUptodate(struct page *page) +static __always_inline void __SetPageUptodate(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); smp_wmb(); __set_bit(PG_uptodate, &page->flags); } -static inline void SetPageUptodate(struct page *page) +static __always_inline void SetPageUptodate(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); /* @@ -456,12 +456,12 @@ static inline void set_page_writeback_keepwrite(struct page *page) __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) -static inline void set_compound_head(struct page *page, struct page *head) +static __always_inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); } -static inline void clear_compound_head(struct page *page) +static __always_inline void clear_compound_head(struct page *page) { WRITE_ONCE(page->compound_head, 0); } -- cgit v1.2.3 From b6ecd2dea4435a771a99c497a6ac5df6d3618c5a Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:33 -0700 Subject: mm: memcontrol: zap memcg_kmem_online helper As kmem accounting is now either enabled for all cgroups or disabled system-wide, there's no point in having memcg_kmem_online() helper - instead one can use memcg_kmem_enabled() and mem_cgroup_online(), as shrink_slab() now does. There are only two places left where this helper is used - __memcg_kmem_charge() and memcg_create_kmem_cache(). The former can only be called if memcg_kmem_enabled() returned true. Since the cgroup it operates on is online, mem_cgroup_is_root() check will be enough. memcg_create_kmem_cache() can't use mem_cgroup_online() helper instead of memcg_kmem_online(), because it relies on the fact that in memcg_offline_kmem() memcg->kmem_state is changed before memcg_deactivate_kmem_caches() is called, but there we can just open-code the check. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d6300313b298..bc8e4e22f58f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -795,11 +795,6 @@ static inline bool memcg_kmem_enabled(void) return static_branch_unlikely(&memcg_kmem_enabled_key); } -static inline bool memcg_kmem_online(struct mem_cgroup *memcg) -{ - return memcg->kmem_state == KMEM_ONLINE; -} - /* * In general, we'll do everything in our power to not incur in any overhead * for non-memcg users for the kmem functions. Not even a function call, if we @@ -909,11 +904,6 @@ static inline bool memcg_kmem_enabled(void) return false; } -static inline bool memcg_kmem_online(struct mem_cgroup *memcg) -{ - return false; -} - static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { return 0; -- cgit v1.2.3 From 0a6b76dd23fa08c5fd7b68acdb55018a37afd4aa Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:18:42 -0700 Subject: mm: workingset: make shadow node shrinker memcg aware Workingset code was recently made memcg aware, but shadow node shrinker is still global. As a result, one small cgroup can consume all memory available for shadow nodes, possibly hurting other cgroups by reclaiming their shadow nodes, even though reclaim distances stored in its shadow nodes have no effect. To avoid this, we need to make shadow node shrinker memcg aware. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bc8e4e22f58f..1191d79aa495 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -403,6 +403,9 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages); +unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask); + static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { @@ -661,6 +664,13 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, { } +static inline unsigned long +mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) +{ + return 0; +} + static inline void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { -- cgit v1.2.3 From f9719a03de51e13526d614e79d002f838770b2d6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:18:45 -0700 Subject: thp, vmstats: count deferred split events Count how many times we put a THP in split queue. Currently, it happens on partial unmap of a THP. Rapidly growing value can indicate that an application behaves unfriendly wrt THP: often fault in huge page and then unmap part of it. This leads to unnecessary memory fragmentation and the application may require tuning. The event also can help with debugging kernel [mis-]behaviour. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 58ecc056ee45..ec084321fe09 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -72,6 +72,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_COLLAPSE_ALLOC_FAILED, THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, + THP_DEFERRED_SPLIT_PAGE, THP_SPLIT_PMD, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, -- cgit v1.2.3 From ea606cf5d8df370e7932460dfd960b21f20e7c6d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 17 Mar 2016 14:18:48 -0700 Subject: mm: move max_map_count bits into mm.h max_map_count sysctl unrelated to scheduler. Move its bits from include/linux/sched/sysctl.h to include/linux/mm.h. Signed-off-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 21 +++++++++++++++++++++ include/linux/sched/sysctl.h | 21 --------------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index dbf1eddab964..6922adf41938 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -82,6 +82,27 @@ extern int mmap_rnd_compat_bits __read_mostly; #define mm_forbids_zeropage(X) (0) #endif +/* + * Default maximum number of active map areas, this limits the number of vmas + * per mm struct. Users can overwrite this number by sysctl but there is a + * problem. + * + * When a program's coredump is generated as ELF format, a section is created + * per a vma. In ELF, the number of sections is represented in unsigned short. + * This means the number of sections should be smaller than 65535 at coredump. + * Because the kernel adds some informative sections to a image of program at + * generating coredump, we need some margin. The number of extra sections is + * 1-3 now and depends on arch. We use "5" as safe margin, here. + * + * ELF extended numbering allows more than 65535 sections, so 16-bit bound is + * not a hard limit any more. Although some userspace tools can be surprised by + * that. + */ +#define MAPCOUNT_ELF_CORE_MARGIN (5) +#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) + +extern int sysctl_max_map_count; + extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 4f080ab4f2cd..22db1e63707e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -14,27 +14,6 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, enum { sysctl_hung_task_timeout_secs = 0 }; #endif -/* - * Default maximum number of active map areas, this limits the number of vmas - * per mm struct. Users can overwrite this number by sysctl but there is a - * problem. - * - * When a program's coredump is generated as ELF format, a section is created - * per a vma. In ELF, the number of sections is represented in unsigned short. - * This means the number of sections should be smaller than 65535 at coredump. - * Because the kernel adds some informative sections to a image of program at - * generating coredump, we need some margin. The number of extra sections is - * 1-3 now and depends on arch. We use "5" as safe margin, here. - * - * ELF extended numbering allows more than 65535 sections, so 16-bit bound is - * not a hard limit any more. Although some userspace tools can be surprised by - * that. - */ -#define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) - -extern int sysctl_max_map_count; - extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; -- cgit v1.2.3 From bcf6691797f425b301f629bb783b7ff2d0bcfa5a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:18:53 -0700 Subject: mm, tracing: refresh __def_vmaflag_names Get list of VMA flags up-to-date and sort it to match VM_* definition order. [vbabka@suse.cz: add a note above vmaflag definitions to update the names when changing] Signed-off-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6922adf41938..b3202612bb95 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -143,6 +143,7 @@ extern unsigned int kobjsize(const void *objp); /* * vm_flags in vm_area_struct, see mm_types.h. + * When changing, update also include/trace/events/mmflags.h */ #define VM_NONE 0x00000000 -- cgit v1.2.3 From d02bd27bd33dd7e8d22594cd568b81be0cb584cd Mon Sep 17 00:00:00 2001 From: Igor Redko Date: Thu, 17 Mar 2016 14:19:05 -0700 Subject: mm/page_alloc.c: calculate 'available' memory in a separate function Add a new field, VIRTIO_BALLOON_S_AVAIL, to virtio_balloon memory statistics protocol, corresponding to 'Available' in /proc/meminfo. It indicates to the hypervisor how big the balloon can be inflated without pushing the guest system to swap. This metric would be very useful in VM orchestration software to improve memory management of different VMs under overcommit. This patch (of 2): Factor out calculation of the available memory counter into a separate exportable function, in order to be able to use it in other parts of the kernel. In particular, it appears a relevant metric to report to the hypervisor via virtio-balloon statistics interface (in a followup patch). Signed-off-by: Igor Redko Signed-off-by: Denis V. Lunev Reviewed-by: Roman Kagan Cc: Michael S. Tsirkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b3202612bb95..db9df3f78de1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1875,6 +1875,7 @@ extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(unsigned int flags); +extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); -- cgit v1.2.3 From 3ed3a4f0ddffece942bb2661924d87be4ce63cb7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:19:11 -0700 Subject: mm: cleanup *pte_alloc* interfaces There are few things about *pte_alloc*() helpers worth cleaning up: - 'vma' argument is unused, let's drop it; - most __pte_alloc() callers do speculative check for pmd_none(), before taking ptl: let's introduce pte_alloc() macro which does the check. The only direct user of __pte_alloc left is userfaultfd, which has different expectation about atomicity wrt pmd. - pte_alloc_map() and pte_alloc_map_lock() are redefined using pte_alloc(). [sudeep.holla@arm.com: fix build for arm64 hugetlbpage] [sfr@canb.auug.org.au: fix arch/arm/mm/mmu.c some more] Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Signed-off-by: Sudeep Holla Acked-by: Kirill A. Shutemov Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index db9df3f78de1..75d1907b9009 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1545,8 +1545,7 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) } #endif -int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, - pmd_t *pmd, unsigned long address); +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); /* @@ -1672,15 +1671,15 @@ static inline void pgtable_page_dtor(struct page *page) pte_unmap(pte); \ } while (0) -#define pte_alloc_map(mm, vma, pmd, address) \ - ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \ - pmd, address))? \ - NULL: pte_offset_map(pmd, address)) +#define pte_alloc(mm, pmd, address) \ + (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd, address)) + +#define pte_alloc_map(mm, pmd, address) \ + (pte_alloc(mm, pmd, address) ? NULL : pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \ - pmd, address))? \ - NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) + (pte_alloc(mm, pmd, address) ? \ + NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ -- cgit v1.2.3 From 795ae7a0de6b834a0cc202aa55c190ef81496665 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:19:14 -0700 Subject: mm: scale kswapd watermarks in proportion to memory In machines with 140G of memory and enterprise flash storage, we have seen read and write bursts routinely exceed the kswapd watermarks and cause thundering herds in direct reclaim. Unfortunately, the only way to tune kswapd aggressiveness is through adjusting min_free_kbytes - the system's emergency reserves - which is entirely unrelated to the system's latency requirements. In order to get kswapd to maintain a 250M buffer of free memory, the emergency reserves need to be set to 1G. That is a lot of memory wasted for no good reason. On the other hand, it's reasonable to assume that allocation bursts and overall allocation concurrency scale with memory capacity, so it makes sense to make kswapd aggressiveness a function of that as well. Change the kswapd watermark scale factor from the currently fixed 25% of the tunable emergency reserve to a tunable 0.1% of memory. Beyond 1G of memory, this will produce bigger watermark steps than the current formula in default settings. Ensure that the new formula never chooses steps smaller than that, i.e. 25% of the emergency reserve. On a 140G machine, this raises the default watermark steps - the distance between min and low, and low and high - from 16M to 143M. Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + include/linux/mmzone.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 75d1907b9009..f7fd64227d3a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1889,6 +1889,7 @@ extern void zone_pcp_reset(struct zone *zone); /* page_alloc.c */ extern int min_free_kbytes; +extern int watermark_scale_factor; /* nommu.c */ extern atomic_long_t mmap_pages_allocated; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bdd9a270a813..c60df9257cc7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -841,6 +841,8 @@ static inline int is_highmem(struct zone *zone) struct ctl_table; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, + void __user *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -- cgit v1.2.3 From b14a1ef58e8acb7fa64c1c8f745b21fd8a577aba Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Thu, 17 Mar 2016 14:19:17 -0700 Subject: mm: remove unnecessary description about a non-exist gfp flag Since __GFP_NOACCOUNT was removed by commit 20b5c3039863 ("Revert 'gfp: add __GFP_NOACCOUNT'"), its description is not necessary. Signed-off-by: Satoru Takeuchi Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index bb16dfeb917e..c083d0820a87 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -105,8 +105,6 @@ struct vm_area_struct; * * __GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the __GFP_MEMALLOC flag if both are set. - * - * __GFP_NOACCOUNT ignores the accounting for kmemcg limit enforcement. */ #define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) -- cgit v1.2.3 From 444eb2a449ef36fe115431ed7b71467c4563c7f1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 17 Mar 2016 14:19:23 -0700 Subject: mm: thp: set THP defrag by default to madvise and add a stall-free defrag option THP defrag is enabled by default to direct reclaim/compact but not wake kswapd in the event of a THP allocation failure. The problem is that THP allocation requests potentially enter reclaim/compaction. This potentially incurs a severe stall that is not guaranteed to be offset by reduced TLB misses. While there has been considerable effort to reduce the impact of reclaim/compaction, it is still a high cost and workloads that should fit in memory fail to do so. Specifically, a simple anon/file streaming workload will enter direct reclaim on NUMA at least even though the working set size is 80% of RAM. It's been years and it's time to throw in the towel. First, this patch defines THP defrag as follows; madvise: A failed allocation will direct reclaim/compact if the application requests it never: Neither reclaim/compact nor wake kswapd defer: A failed allocation will wake kswapd/kcompactd always: A failed allocation will direct reclaim/compact (historical behaviour) khugepaged defrag will enter direct/reclaim but not wake kswapd. Next it sets the default defrag option to be "madvise" to only enter direct reclaim/compaction for applications that specifically requested it. Lastly, it removes a check from the page allocator slowpath that is related to __GFP_THISNODE to allow "defer" to work. The callers that really cares are slub/slab and they are updated accordingly. The slab one may be surprising because it also corrects a comment as kswapd was never woken up by that path. This means that a THP fault will no longer stall for most applications by default and the ideal for most users that get THP if they are immediately available. There are still options for users that prefer a stall at startup of a new application by either restoring historical behaviour with "always" or pick a half-way point with "defer" where kswapd does some of the work in the background and wakes kcompactd if necessary. THP defrag for khugepaged remains enabled and will enter direct/reclaim but no wakeup kswapd or kcompactd. After this patch a THP allocation failure will quickly fallback and rely on khugepaged to recover the situation at some time in the future. In some cases, this will reduce THP usage but the benefit of THP is hard to measure and not a universal win where as a stall to reclaim/compaction is definitely measurable and can be painful. The first test for this is using "usemem" to read a large file and write a large anonymous mapping (to avoid the zero page) multiple times. The total size of the mappings is 80% of RAM and the benchmark simply measures how long it takes to complete. It uses multiple threads to see if that is a factor. On UMA, the performance is almost identical so is not reported but on NUMA, we see this usemem 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Amean System-1 102.86 ( 0.00%) 46.81 ( 54.50%) Amean System-4 37.85 ( 0.00%) 34.02 ( 10.12%) Amean System-7 48.12 ( 0.00%) 46.89 ( 2.56%) Amean System-12 51.98 ( 0.00%) 56.96 ( -9.57%) Amean System-21 80.16 ( 0.00%) 79.05 ( 1.39%) Amean System-30 110.71 ( 0.00%) 107.17 ( 3.20%) Amean System-48 127.98 ( 0.00%) 124.83 ( 2.46%) Amean Elapsd-1 185.84 ( 0.00%) 105.51 ( 43.23%) Amean Elapsd-4 26.19 ( 0.00%) 25.58 ( 2.33%) Amean Elapsd-7 21.65 ( 0.00%) 21.62 ( 0.16%) Amean Elapsd-12 18.58 ( 0.00%) 17.94 ( 3.43%) Amean Elapsd-21 17.53 ( 0.00%) 16.60 ( 5.33%) Amean Elapsd-30 17.45 ( 0.00%) 17.13 ( 1.84%) Amean Elapsd-48 15.40 ( 0.00%) 15.27 ( 0.82%) For a single thread, the benchmark completes 43.23% faster with this patch applied with smaller benefits as the thread increases. Similar, notice the large reduction in most cases in system CPU usage. The overall CPU time is 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 User 10357.65 10438.33 System 3988.88 3543.94 Elapsed 2203.01 1634.41 Which is substantial. Now, the reclaim figures 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 Minor Faults 128458477 278352931 Major Faults 2174976 225 Swap Ins 16904701 0 Swap Outs 17359627 0 Allocation stalls 43611 0 DMA allocs 0 0 DMA32 allocs 19832646 19448017 Normal allocs 614488453 580941839 Movable allocs 0 0 Direct pages scanned 24163800 0 Kswapd pages scanned 0 0 Kswapd pages reclaimed 0 0 Direct pages reclaimed 20691346 0 Compaction stalls 42263 0 Compaction success 938 0 Compaction failures 41325 0 This patch eliminates almost all swapping and direct reclaim activity. There is still overhead but it's from NUMA balancing which does not identify that it's pointless trying to do anything with this workload. I also tried the thpscale benchmark which forces a corner case where compaction can be used heavily and measures the latency of whether base or huge pages were used thpscale Fault Latencies 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Amean fault-base-1 5288.84 ( 0.00%) 2817.12 ( 46.73%) Amean fault-base-3 6365.53 ( 0.00%) 3499.11 ( 45.03%) Amean fault-base-5 6526.19 ( 0.00%) 4363.06 ( 33.15%) Amean fault-base-7 7142.25 ( 0.00%) 4858.08 ( 31.98%) Amean fault-base-12 13827.64 ( 0.00%) 10292.11 ( 25.57%) Amean fault-base-18 18235.07 ( 0.00%) 13788.84 ( 24.38%) Amean fault-base-24 21597.80 ( 0.00%) 24388.03 (-12.92%) Amean fault-base-30 26754.15 ( 0.00%) 19700.55 ( 26.36%) Amean fault-base-32 26784.94 ( 0.00%) 19513.57 ( 27.15%) Amean fault-huge-1 4223.96 ( 0.00%) 2178.57 ( 48.42%) Amean fault-huge-3 2194.77 ( 0.00%) 2149.74 ( 2.05%) Amean fault-huge-5 2569.60 ( 0.00%) 2346.95 ( 8.66%) Amean fault-huge-7 3612.69 ( 0.00%) 2997.70 ( 17.02%) Amean fault-huge-12 3301.75 ( 0.00%) 6727.02 (-103.74%) Amean fault-huge-18 6696.47 ( 0.00%) 6685.72 ( 0.16%) Amean fault-huge-24 8000.72 ( 0.00%) 9311.43 (-16.38%) Amean fault-huge-30 13305.55 ( 0.00%) 9750.45 ( 26.72%) Amean fault-huge-32 9981.71 ( 0.00%) 10316.06 ( -3.35%) The average time to fault pages is substantially reduced in the majority of caseds but with the obvious caveat that fewer THPs are actually used in this adverse workload 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Percentage huge-1 0.71 ( 0.00%) 14.04 (1865.22%) Percentage huge-3 10.77 ( 0.00%) 33.05 (206.85%) Percentage huge-5 60.39 ( 0.00%) 38.51 (-36.23%) Percentage huge-7 45.97 ( 0.00%) 34.57 (-24.79%) Percentage huge-12 68.12 ( 0.00%) 40.07 (-41.17%) Percentage huge-18 64.93 ( 0.00%) 47.82 (-26.35%) Percentage huge-24 62.69 ( 0.00%) 44.23 (-29.44%) Percentage huge-30 43.49 ( 0.00%) 55.38 ( 27.34%) Percentage huge-32 50.72 ( 0.00%) 51.90 ( 2.35%) 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 Minor Faults 37429143 47564000 Major Faults 1916 1558 Swap Ins 1466 1079 Swap Outs 2936863 149626 Allocation stalls 62510 3 DMA allocs 0 0 DMA32 allocs 6566458 6401314 Normal allocs 216361697 216538171 Movable allocs 0 0 Direct pages scanned 25977580 17998 Kswapd pages scanned 0 3638931 Kswapd pages reclaimed 0 207236 Direct pages reclaimed 8833714 88 Compaction stalls 103349 5 Compaction success 270 4 Compaction failures 103079 1 Note again that while this does swap as it's an aggressive workload, the direct relcim activity and allocation stalls is substantially reduced. There is some kswapd activity but ftrace showed that the kswapd activity was due to normal wakeups from 4K pages being allocated. Compaction-related stalls and activity are almost eliminated. I also tried the stutter benchmark. For this, I do not have figures for NUMA but it's something that does impact UMA so I'll report what is available stutter 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Min mmap 7.3571 ( 0.00%) 7.3438 ( 0.18%) 1st-qrtle mmap 7.5278 ( 0.00%) 17.9200 (-138.05%) 2nd-qrtle mmap 7.6818 ( 0.00%) 21.6055 (-181.25%) 3rd-qrtle mmap 11.0889 ( 0.00%) 21.8881 (-97.39%) Max-90% mmap 27.8978 ( 0.00%) 22.1632 ( 20.56%) Max-93% mmap 28.3202 ( 0.00%) 22.3044 ( 21.24%) Max-95% mmap 28.5600 ( 0.00%) 22.4580 ( 21.37%) Max-99% mmap 29.6032 ( 0.00%) 25.5216 ( 13.79%) Max mmap 4109.7289 ( 0.00%) 4813.9832 (-17.14%) Mean mmap 12.4474 ( 0.00%) 19.3027 (-55.07%) This benchmark is trying to fault an anonymous mapping while there is a heavy IO load -- a scenario that desktop users used to complain about frequently. This shows a mix because the ideal case of mapping with THP is not hit as often. However, note that 99% of the mappings complete 13.79% faster. The CPU usage here is particularly interesting 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 User 67.50 0.99 System 1327.88 91.30 Elapsed 2079.00 2128.98 And once again we look at the reclaim figures 4.4.0 4.4.0 kcompactd-v1r1nodefrag-v1r3 Minor Faults 335241922 1314582827 Major Faults 715 819 Swap Ins 0 0 Swap Outs 0 0 Allocation stalls 532723 0 DMA allocs 0 0 DMA32 allocs 1822364341 1177950222 Normal allocs 1815640808 1517844854 Movable allocs 0 0 Direct pages scanned 21892772 0 Kswapd pages scanned 20015890 41879484 Kswapd pages reclaimed 19961986 41822072 Direct pages reclaimed 21892741 0 Compaction stalls 1065755 0 Compaction success 514 0 Compaction failures 1065241 0 Allocation stalls and all direct reclaim activity is eliminated as well as compaction-related stalls. THP gives impressive gains in some cases but only if they are quickly available. We're not going to reach the point where they are completely free so lets take the costs out of the fast paths finally and defer the cost to kswapd, kcompactd and khugepaged where it belongs. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- include/linux/huge_mm.h | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c083d0820a87..11d56c6e7ef2 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -257,7 +257,7 @@ struct vm_area_struct; #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) #define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \ - ~__GFP_KSWAPD_RECLAIM) + ~__GFP_RECLAIM) /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 459fd25b378e..a4cecb4801ec 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -41,7 +41,8 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, @@ -71,12 +72,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); ((__vma)->vm_flags & VM_HUGEPAGE))) && \ !((__vma)->vm_flags & VM_NOHUGEPAGE) && \ !is_vma_temporary_stack(__vma)) -#define transparent_hugepage_defrag(__vma) \ - ((transparent_hugepage_flags & \ - (1<vm_flags & VM_HUGEPAGE)) #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1< Date: Thu, 17 Mar 2016 14:19:26 -0700 Subject: mm: introduce page reference manipulation functions The success of CMA allocation largely depends on the success of migration and key factor of it is page reference count. Until now, page reference is manipulated by direct calling atomic functions so we cannot follow up who and where manipulate it. Then, it is hard to find actual reason of CMA allocation failure. CMA allocation should be guaranteed to succeed so finding offending place is really important. In this patch, call sites where page reference is manipulated are converted to introduced wrapper function. This is preparation step to add tracepoint to each page reference manipulation function. With this facility, we can easily find reason of CMA allocation failure. There is no functional change in this patch. In addition, this patch also converts reference read sites. It will help a second step that renames page._count to something else and prevents later attempt to direct access to it (Suggested by Andrew). Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: "Kirill A. Shutemov" Cc: Sergey Senozhatsky Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 25 ++++---------- include/linux/page_ref.h | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pagemap.h | 19 ++--------- 3 files changed, 94 insertions(+), 35 deletions(-) create mode 100644 include/linux/page_ref.h (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f7fd64227d3a..997fc2e5d9d8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -22,6 +22,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -386,8 +387,8 @@ static inline int pmd_devmap(pmd_t pmd) */ static inline int put_page_testzero(struct page *page) { - VM_BUG_ON_PAGE(atomic_read(&page->_count) == 0, page); - return atomic_dec_and_test(&page->_count); + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); + return page_ref_dec_and_test(page); } /* @@ -398,7 +399,7 @@ static inline int put_page_testzero(struct page *page) */ static inline int get_page_unless_zero(struct page *page) { - return atomic_inc_not_zero(&page->_count); + return page_ref_add_unless(page, 1, 0); } extern int page_is_ram(unsigned long pfn); @@ -486,11 +487,6 @@ static inline int total_mapcount(struct page *page) } #endif -static inline int page_count(struct page *page) -{ - return atomic_read(&compound_head(page)->_count); -} - static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); @@ -498,15 +494,6 @@ static inline struct page *virt_to_head_page(const void *x) return compound_head(page); } -/* - * Setup the page count before being freed into the page allocator for - * the first time (boot or memory hotplug) - */ -static inline void init_page_count(struct page *page) -{ - atomic_set(&page->_count, 1); -} - void __put_page(struct page *page); void put_pages_list(struct list_head *pages); @@ -716,8 +703,8 @@ static inline void get_page(struct page *page) * Getting a normal page or the head of a compound page * requires to already have an elevated page->_count. */ - VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); - atomic_inc(&page->_count); + VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); + page_ref_inc(page); if (unlikely(is_zone_device_page(page))) get_zone_device_page(page); diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h new file mode 100644 index 000000000000..30f5817f6b8e --- /dev/null +++ b/include/linux/page_ref.h @@ -0,0 +1,85 @@ +#ifndef _LINUX_PAGE_REF_H +#define _LINUX_PAGE_REF_H + +#include +#include +#include + +static inline int page_ref_count(struct page *page) +{ + return atomic_read(&page->_count); +} + +static inline int page_count(struct page *page) +{ + return atomic_read(&compound_head(page)->_count); +} + +static inline void set_page_count(struct page *page, int v) +{ + atomic_set(&page->_count, v); +} + +/* + * Setup the page count before being freed into the page allocator for + * the first time (boot or memory hotplug) + */ +static inline void init_page_count(struct page *page) +{ + set_page_count(page, 1); +} + +static inline void page_ref_add(struct page *page, int nr) +{ + atomic_add(nr, &page->_count); +} + +static inline void page_ref_sub(struct page *page, int nr) +{ + atomic_sub(nr, &page->_count); +} + +static inline void page_ref_inc(struct page *page) +{ + atomic_inc(&page->_count); +} + +static inline void page_ref_dec(struct page *page) +{ + atomic_dec(&page->_count); +} + +static inline int page_ref_sub_and_test(struct page *page, int nr) +{ + return atomic_sub_and_test(nr, &page->_count); +} + +static inline int page_ref_dec_and_test(struct page *page) +{ + return atomic_dec_and_test(&page->_count); +} + +static inline int page_ref_dec_return(struct page *page) +{ + return atomic_dec_return(&page->_count); +} + +static inline int page_ref_add_unless(struct page *page, int nr, int u) +{ + return atomic_add_unless(&page->_count, nr, u); +} + +static inline int page_ref_freeze(struct page *page, int count) +{ + return likely(atomic_cmpxchg(&page->_count, count, 0) == count); +} + +static inline void page_ref_unfreeze(struct page *page, int count) +{ + VM_BUG_ON_PAGE(page_count(page) != 0, page); + VM_BUG_ON(count == 0); + + atomic_set(&page->_count, count); +} + +#endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 183b15ea052b..1ebd65c91422 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -165,7 +165,7 @@ static inline int page_cache_get_speculative(struct page *page) * SMP requires. */ VM_BUG_ON_PAGE(page_count(page) == 0, page); - atomic_inc(&page->_count); + page_ref_inc(page); #else if (unlikely(!get_page_unless_zero(page))) { @@ -194,10 +194,10 @@ static inline int page_cache_add_speculative(struct page *page, int count) VM_BUG_ON(!in_atomic()); # endif VM_BUG_ON_PAGE(page_count(page) == 0, page); - atomic_add(count, &page->_count); + page_ref_add(page, count); #else - if (unlikely(!atomic_add_unless(&page->_count, count, 0))) + if (unlikely(!page_ref_add_unless(page, count, 0))) return 0; #endif VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page); @@ -205,19 +205,6 @@ static inline int page_cache_add_speculative(struct page *page, int count) return 1; } -static inline int page_freeze_refs(struct page *page, int count) -{ - return likely(atomic_cmpxchg(&page->_count, count, 0) == count); -} - -static inline void page_unfreeze_refs(struct page *page, int count) -{ - VM_BUG_ON_PAGE(page_count(page) != 0, page); - VM_BUG_ON(count == 0); - - atomic_set(&page->_count, count); -} - #ifdef CONFIG_NUMA extern struct page *__page_cache_alloc(gfp_t gfp); #else -- cgit v1.2.3 From 95813b8faa0cd315f61a8b9d9c523792370b693e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:19:29 -0700 Subject: mm/page_ref: add tracepoint to track down page reference manipulation CMA allocation should be guaranteed to succeed by definition, but, unfortunately, it would be failed sometimes. It is hard to track down the problem, because it is related to page reference manipulation and we don't have any facility to analyze it. This patch adds tracepoints to track down page reference manipulation. With it, we can find exact reason of failure and can fix the problem. Following is an example of tracepoint output. (note: this example is stale version that printing flags as the number. Recent version will print it as human readable string.) <...>-9018 [004] 92.678375: page_ref_set: pfn=0x17ac9 flags=0x0 count=1 mapcount=0 mapping=(nil) mt=4 val=1 <...>-9018 [004] 92.678378: kernel_stack: => get_page_from_freelist (ffffffff81176659) => __alloc_pages_nodemask (ffffffff81176d22) => alloc_pages_vma (ffffffff811bf675) => handle_mm_fault (ffffffff8119e693) => __do_page_fault (ffffffff810631ea) => trace_do_page_fault (ffffffff81063543) => do_async_page_fault (ffffffff8105c40a) => async_page_fault (ffffffff817581d8) [snip] <...>-9018 [004] 92.678379: page_ref_mod: pfn=0x17ac9 flags=0x40048 count=2 mapcount=1 mapping=0xffff880015a78dc1 mt=4 val=1 [snip] ... ... <...>-9131 [001] 93.174468: test_pages_isolated: start_pfn=0x17800 end_pfn=0x17c00 fin_pfn=0x17ac9 ret=fail [snip] <...>-9018 [004] 93.174843: page_ref_mod_and_test: pfn=0x17ac9 flags=0x40068 count=0 mapcount=0 mapping=0xffff880015a78dc1 mt=4 val=-1 ret=1 => release_pages (ffffffff8117c9e4) => free_pages_and_swap_cache (ffffffff811b0697) => tlb_flush_mmu_free (ffffffff81199616) => tlb_finish_mmu (ffffffff8119a62c) => exit_mmap (ffffffff811a53f7) => mmput (ffffffff81073f47) => do_exit (ffffffff810794e9) => do_group_exit (ffffffff81079def) => SyS_exit_group (ffffffff81079e74) => entry_SYSCALL_64_fastpath (ffffffff817560b6) This output shows that problem comes from exit path. In exit path, to improve performance, pages are not freed immediately. They are gathered and processed by batch. During this process, migration cannot be possible and CMA allocation is failed. This problem is hard to find without this page reference tracepoint facility. Enabling this feature bloat kernel text 30 KB in my configuration. text data bss dec hex filename 12127327 2243616 1507328 15878271 f2487f vmlinux_disabled 12157208 2258880 1507328 15923416 f2f8d8 vmlinux_enabled Note that, due to header file dependency problem between mm.h and tracepoint.h, this feature has to open code the static key functions for tracepoints. Proposed by Steven Rostedt in following link. https://lkml.org/lkml/2015/12/9/699 [arnd@arndb.de: crypto/async_pq: use __free_page() instead of put_page()] [iamjoonsoo.kim@lge.com: fix build failure for xtensa] [akpm@linux-foundation.org: tweak Kconfig text, per Vlastimil] Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: "Kirill A. Shutemov" Cc: Sergey Senozhatsky Acked-by: Steven Rostedt Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ref.h | 98 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 93 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 30f5817f6b8e..e596d5d9540e 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -4,6 +4,62 @@ #include #include #include +#include + +extern struct tracepoint __tracepoint_page_ref_set; +extern struct tracepoint __tracepoint_page_ref_mod; +extern struct tracepoint __tracepoint_page_ref_mod_and_test; +extern struct tracepoint __tracepoint_page_ref_mod_and_return; +extern struct tracepoint __tracepoint_page_ref_mod_unless; +extern struct tracepoint __tracepoint_page_ref_freeze; +extern struct tracepoint __tracepoint_page_ref_unfreeze; + +#ifdef CONFIG_DEBUG_PAGE_REF + +/* + * Ideally we would want to use the trace__enabled() helper + * functions. But due to include header file issues, that is not + * feasible. Instead we have to open code the static key functions. + * + * See trace_##name##_enabled(void) in include/linux/tracepoint.h + */ +#define page_ref_tracepoint_active(t) static_key_false(&(t).key) + +extern void __page_ref_set(struct page *page, int v); +extern void __page_ref_mod(struct page *page, int v); +extern void __page_ref_mod_and_test(struct page *page, int v, int ret); +extern void __page_ref_mod_and_return(struct page *page, int v, int ret); +extern void __page_ref_mod_unless(struct page *page, int v, int u); +extern void __page_ref_freeze(struct page *page, int v, int ret); +extern void __page_ref_unfreeze(struct page *page, int v); + +#else + +#define page_ref_tracepoint_active(t) false + +static inline void __page_ref_set(struct page *page, int v) +{ +} +static inline void __page_ref_mod(struct page *page, int v) +{ +} +static inline void __page_ref_mod_and_test(struct page *page, int v, int ret) +{ +} +static inline void __page_ref_mod_and_return(struct page *page, int v, int ret) +{ +} +static inline void __page_ref_mod_unless(struct page *page, int v, int u) +{ +} +static inline void __page_ref_freeze(struct page *page, int v, int ret) +{ +} +static inline void __page_ref_unfreeze(struct page *page, int v) +{ +} + +#endif static inline int page_ref_count(struct page *page) { @@ -18,6 +74,8 @@ static inline int page_count(struct page *page) static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); + if (page_ref_tracepoint_active(__tracepoint_page_ref_set)) + __page_ref_set(page, v); } /* @@ -32,46 +90,74 @@ static inline void init_page_count(struct page *page) static inline void page_ref_add(struct page *page, int nr) { atomic_add(nr, &page->_count); + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) + __page_ref_mod(page, nr); } static inline void page_ref_sub(struct page *page, int nr) { atomic_sub(nr, &page->_count); + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) + __page_ref_mod(page, -nr); } static inline void page_ref_inc(struct page *page) { atomic_inc(&page->_count); + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) + __page_ref_mod(page, 1); } static inline void page_ref_dec(struct page *page) { atomic_dec(&page->_count); + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) + __page_ref_mod(page, -1); } static inline int page_ref_sub_and_test(struct page *page, int nr) { - return atomic_sub_and_test(nr, &page->_count); + int ret = atomic_sub_and_test(nr, &page->_count); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) + __page_ref_mod_and_test(page, -nr, ret); + return ret; } static inline int page_ref_dec_and_test(struct page *page) { - return atomic_dec_and_test(&page->_count); + int ret = atomic_dec_and_test(&page->_count); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) + __page_ref_mod_and_test(page, -1, ret); + return ret; } static inline int page_ref_dec_return(struct page *page) { - return atomic_dec_return(&page->_count); + int ret = atomic_dec_return(&page->_count); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) + __page_ref_mod_and_return(page, -1, ret); + return ret; } static inline int page_ref_add_unless(struct page *page, int nr, int u) { - return atomic_add_unless(&page->_count, nr, u); + int ret = atomic_add_unless(&page->_count, nr, u); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless)) + __page_ref_mod_unless(page, nr, ret); + return ret; } static inline int page_ref_freeze(struct page *page, int count) { - return likely(atomic_cmpxchg(&page->_count, count, 0) == count); + int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze)) + __page_ref_freeze(page, count, ret); + return ret; } static inline void page_ref_unfreeze(struct page *page, int count) @@ -80,6 +166,8 @@ static inline void page_ref_unfreeze(struct page *page, int count) VM_BUG_ON(count == 0); atomic_set(&page->_count, count); + if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze)) + __page_ref_unfreeze(page, count); } #endif -- cgit v1.2.3 From b11a7b94100cba5ec926a181894c2897a22651b9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 17 Mar 2016 14:19:41 -0700 Subject: mm: exclude ZONE_DEVICE from GFP_ZONE_TABLE ZONE_DEVICE (merged in 4.3) and ZONE_CMA (proposed) are examples of new mm zones that are bumping up against the current maximum limit of 4 zones, i.e. 2 bits in page->flags for the GFP_ZONE_TABLE. The GFP_ZONE_TABLE poses an interesting constraint since include/linux/gfp.h gets included by the 32-bit portion of a 64-bit build. We need to be careful to only build the table for zones that have a corresponding gfp_t flag. GFP_ZONES_SHIFT is introduced for this purpose. This patch does not attempt to solve the problem of adding a new zone that also has a corresponding GFP_ flag. Vlastimil points out that ZONE_DEVICE, by depending on x86_64 and SPARSEMEM_VMEMMAP implies that SECTIONS_WIDTH is zero. In other words even though ZONE_DEVICE does not fit in GFP_ZONE_TABLE it is free to consume another bit in page->flags (expand ZONES_WIDTH) with room to spare. Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931 Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"") Signed-off-by: Dan Williams Reported-by: Mark Reported-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Cc: Joonsoo Kim Cc: Dave Hansen Cc: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 33 ++++++++++++++++++++------------- include/linux/page-flags-layout.h | 2 ++ 2 files changed, 22 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 11d56c6e7ef2..570383a41853 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -331,22 +331,29 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) * - * ZONES_SHIFT must be <= 2 on 32 bit platforms. + * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms. */ -#if 16 * ZONES_SHIFT > BITS_PER_LONG -#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer +#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4 +/* ZONE_DEVICE is not a valid GFP zone specifier */ +#define GFP_ZONES_SHIFT 2 +#else +#define GFP_ZONES_SHIFT ZONES_SHIFT +#endif + +#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG +#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer #endif #define GFP_ZONE_TABLE ( \ - (ZONE_NORMAL << 0 * ZONES_SHIFT) \ - | (OPT_ZONE_DMA << ___GFP_DMA * ZONES_SHIFT) \ - | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * ZONES_SHIFT) \ - | (OPT_ZONE_DMA32 << ___GFP_DMA32 * ZONES_SHIFT) \ - | (ZONE_NORMAL << ___GFP_MOVABLE * ZONES_SHIFT) \ - | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * ZONES_SHIFT) \ - | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * ZONES_SHIFT) \ - | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * ZONES_SHIFT) \ + (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \ + | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \ + | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \ + | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \ + | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \ + | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \ + | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\ + | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\ ) /* @@ -371,8 +378,8 @@ static inline enum zone_type gfp_zone(gfp_t flags) enum zone_type z; int bit = (__force int) (flags & GFP_ZONEMASK); - z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) & - ((1 << ZONES_SHIFT) - 1); + z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & + ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); return z; } diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index da523661500a..77b078c103b2 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -17,6 +17,8 @@ #define ZONES_SHIFT 1 #elif MAX_NR_ZONES <= 4 #define ZONES_SHIFT 2 +#elif MAX_NR_ZONES <= 8 +#define ZONES_SHIFT 3 #else #error ZONES_SHIFT -- too many zones configured adjust calculation #endif -- cgit v1.2.3 From 0e8fb9312fbaf1a687dd731b04d8ab3121c4ff5a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 17 Mar 2016 14:19:55 -0700 Subject: mm: remove VM_FAULT_MINOR The define has a comment from Nick Piggin from 2007: /* For backwards compat. Remove me quickly. */ I guess 9 years should not be too hurried sense of 'quickly' even for kernel measures. Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 997fc2e5d9d8..7d42501c8bb4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1052,8 +1052,6 @@ static inline void clear_page_pfmemalloc(struct page *page) * just gets major/minor fault counters bumped up. */ -#define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */ - #define VM_FAULT_OOM 0x0001 #define VM_FAULT_SIGBUS 0x0002 #define VM_FAULT_MAJOR 0x0004 -- cgit v1.2.3 From b97731992d00f09456726bfc5ab6641c07773038 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:01 -0700 Subject: rmap: introduce rmap_walk_locked() This patchset rewrites freeze_page() and unfreeze_page() using try_to_unmap() and remove_migration_ptes(). Result is much simpler, but somewhat slower. Migration 8GiB worth of PMD-mapped THP: Baseline 20.21 +/- 0.393 Patched 20.73 +/- 0.082 Slowdown 1.03x It's 3% slower, comparing to 14% in v1. I don't it should be a stopper. Splitting of PTE-mapped pages slowed more. But this is not a common case. Migration 8GiB worth of PMD-mapped THP: Baseline 20.39 +/- 0.225 Patched 22.43 +/- 0.496 Slowdown 1.10x rmap_walk_locked() is the same as rmap_walk(), but the caller takes care of the relevant rmap lock. This is preparation for switching THP splitting from custom rmap walk in freeze_page()/unfreeze_page() to the generic one. There is no support for KSM pages for now: not clear which lock is implied. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index a07f42bedda3..a5875e9b4a27 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -266,6 +266,7 @@ struct rmap_walk_control { }; int rmap_walk(struct page *page, struct rmap_walk_control *rwc); +int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ -- cgit v1.2.3 From 2a52bcbcc688eecead2953143f7ef695b8e44575 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:04 -0700 Subject: rmap: extend try_to_unmap() to be usable by split_huge_page() Add support for two ttu_flags: - TTU_SPLIT_HUGE_PMD would split PMD if it's there, before trying to unmap page; - TTU_RMAP_LOCKED indicates that caller holds relevant rmap lock; Also, change rwc->done to !page_mapcount() instead of !page_mapped(). try_to_unmap() works on pte level, so we are really interested in the mappedness of this small page rather than of the compound page it's a part of. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 7 +++++++ include/linux/rmap.h | 3 +++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a4cecb4801ec..01ad22e938b0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -106,6 +106,9 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, __split_huge_pmd(__vma, __pmd, __address); \ } while (0) + +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address); + #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif @@ -173,6 +176,10 @@ static inline int split_huge_page(struct page *page) static inline void deferred_split_huge_page(struct page *page) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) + +static inline void split_huge_pmd_address(struct vm_area_struct *vma, + unsigned long address) {} + static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index a5875e9b4a27..3d975e2252d4 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -86,6 +86,7 @@ enum ttu_flags { TTU_MIGRATION = 2, /* migration mode */ TTU_MUNLOCK = 4, /* munlock mode */ TTU_LZFREE = 8, /* lazy free mode */ + TTU_SPLIT_HUGE_PMD = 16, /* split huge PMD if any */ TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ @@ -93,6 +94,8 @@ enum ttu_flags { TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ + TTU_RMAP_LOCKED = (1 << 12) /* do not grab rmap lock: + * caller holds it */ }; #ifdef CONFIG_MMU -- cgit v1.2.3 From e388466de4a2a1a50c43bfaeacc0c8254d9e7cb2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:07 -0700 Subject: mm: make remove_migration_ptes() beyond mm/migration.c Make remove_migration_ptes() available to be used in split_huge_page(). New parameter 'locked' added: as with try_to_umap() we need a way to indicate that caller holds rmap lock. We also shouldn't try to mlock() pte-mapped huge pages: pte-mapeed THP pages are never mlocked. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3d975e2252d4..49eb4f8ebac9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -243,6 +243,8 @@ int page_mkclean(struct page *); */ int try_to_munlock(struct page *); +void remove_migration_ptes(struct page *old, struct page *new, bool locked); + /* * Called by memory-failure.c to kill processes. */ -- cgit v1.2.3 From fec89c109f3a7737fe3a7bf0f40d1fb7709d353b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:20:10 -0700 Subject: thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers freeze_page() and unfreeze_page() helpers evolved in rather complex beasts. It would be nice to cut complexity of this code. This patch rewrites freeze_page() using standard try_to_unmap(). unfreeze_page() is rewritten with remove_migration_ptes(). The result is much simpler. But the new variant is somewhat slower for PTE-mapped THPs. Current helpers iterates over VMAs the compound page is mapped to, and then over ptes within this VMA. New helpers iterates over small page, then over VMA the small page mapped to, and only then find relevant pte. We have short cut for PMD-mapped THP: we directly install migration entries on PMD split. I don't think the slowdown is critical, considering how much simpler result is and that split_huge_page() is quite rare nowadays. It only happens due memory pressure or migration. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 01ad22e938b0..5307dfb3f8ec 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -96,18 +96,20 @@ static inline int split_huge_page(struct page *page) void deferred_split_huge_page(struct page *page); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address); + unsigned long address, bool freeze); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ - __split_huge_pmd(__vma, __pmd, __address); \ + __split_huge_pmd(__vma, __pmd, __address, \ + false); \ } while (0) -void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address); +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, + bool freeze, struct page *page); #if HPAGE_PMD_ORDER >= MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" @@ -178,7 +180,7 @@ static inline void deferred_split_huge_page(struct page *page) {} do { } while (0) static inline void split_huge_pmd_address(struct vm_area_struct *vma, - unsigned long address) {} + unsigned long address, bool freeze, struct page *page) {} static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) -- cgit v1.2.3 From da8b44d5a9f8bf26da637b7336508ca534d6b319 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 17 Mar 2016 14:20:51 -0700 Subject: timer: convert timer_slack_ns from unsigned long to u64 This patchset introduces a /proc//timerslack_ns interface which would allow controlling processes to be able to set the timerslack value on other processes in order to save power by avoiding wakeups (Something Android currently does via out-of-tree patches). The first patch tries to fix the internal timer_slack_ns usage which was defined as a long, which limits the slack range to ~4 seconds on 32bit systems. It converts it to a u64, which provides the same basically unlimited slack (500 years) on both 32bit and 64bit machines. The second patch introduces the /proc//timerslack_ns interface which allows the full 64bit slack range for a task to be read or set on both 32bit and 64bit machines. With these two patches, on a 32bit machine, after setting the slack on bash to 10 seconds: $ time sleep 1 real 0m10.747s user 0m0.001s sys 0m0.005s The first patch is a little ugly, since I had to chase the slack delta arguments through a number of functions converting them to u64s. Let me know if it makes sense to break that up more or not. Other than that things are fairly straightforward. This patch (of 2): The timer_slack_ns value in the task struct is currently a unsigned long. This means that on 32bit applications, the maximum slack is just over 4 seconds. However, on 64bit machines, its much much larger (~500 years). This disparity could make application development a little (as well as the default_slack) to a u64. This means both 32bit and 64bit systems have the same effective internal slack range. Now the existing ABI via PR_GET_TIMERSLACK and PR_SET_TIMERSLACK specify the interface as a unsigned long, so we preserve that limitation on 32bit systems, where SET_TIMERSLACK can only set the slack to a unsigned long value, and GET_TIMERSLACK will return ULONG_MAX if the slack is actually larger then what can be stored by an unsigned long. This patch also modifies hrtimer functions which specified the slack delta as a unsigned long. Signed-off-by: John Stultz Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Kees Cook Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/freezer.h | 2 +- include/linux/hrtimer.h | 12 +++++++----- include/linux/poll.h | 2 +- include/linux/sched.h | 4 ++-- 4 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 6b7fd9cf5ea2..dd03e837ebb7 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -231,7 +231,7 @@ static inline long freezable_schedule_timeout_killable_unsafe(long timeout) * call this with locks held. */ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode) + u64 delta, const enum hrtimer_mode mode) { int __retval; freezer_do_not_count(); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 2ead22dd74a0..c98c6539e2c2 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -220,7 +220,7 @@ static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time timer->node.expires = ktime_add_safe(time, delta); } -static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta) +static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); @@ -378,7 +378,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long range_ns, const enum hrtimer_mode mode); + u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer on the current CPU @@ -399,7 +399,7 @@ extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { - unsigned long delta; + u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); @@ -477,10 +477,12 @@ extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); -extern int schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, - unsigned long delta, const enum hrtimer_mode mode, int clock); + u64 delta, + const enum hrtimer_mode mode, + int clock); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ diff --git a/include/linux/poll.h b/include/linux/poll.h index c08386fb3e08..9fb4f40d9a26 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq); extern void poll_freewait(struct poll_wqueues *pwq); extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); -extern long select_estimate_accuracy(struct timespec *tv); +extern u64 select_estimate_accuracy(struct timespec *tv); static inline int poll_schedule(struct poll_wqueues *pwq, int state) diff --git a/include/linux/sched.h b/include/linux/sched.h index eb7f2f84009b..3284d07edec7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1792,8 +1792,8 @@ struct task_struct { * time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ - unsigned long timer_slack_ns; - unsigned long default_timer_slack_ns; + u64 timer_slack_ns; + u64 default_timer_slack_ns; #ifdef CONFIG_KASAN unsigned int kasan_depth; -- cgit v1.2.3 From 93e205a728e6cb8d7d11f6836e289798a1de25e2 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 17 Mar 2016 14:21:15 -0700 Subject: fix Christoph's email addresses There are various email addresses for me throughout the kernel. Use the one that will always be valid. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/quicklist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h index bd466439c588..3bdfa70bc642 100644 --- a/include/linux/quicklist.h +++ b/include/linux/quicklist.h @@ -5,7 +5,7 @@ * as needed after allocation when they are freed. Per cpu lists of pages * are kept that only contain node local pages. * - * (C) 2007, SGI. Christoph Lameter + * (C) 2007, SGI. Christoph Lameter */ #include #include -- cgit v1.2.3 From 26a247fd9f8d2df1965b7f22c9be2fb3a48603f3 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 17 Mar 2016 14:21:35 -0700 Subject: include/linux/list_bl.h: use bool instead of int for boolean functions hlist_bl_unhashed() and hlist_bl_empty() are all boolean functions, so return bool instead of int. Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list_bl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index ee7229a6c06a..cb483305e1f5 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -48,7 +48,7 @@ static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) #define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member) -static inline int hlist_bl_unhashed(const struct hlist_bl_node *h) +static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h) { return !h->pprev; } @@ -68,7 +68,7 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h, h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); } -static inline int hlist_bl_empty(const struct hlist_bl_head *h) +static inline bool hlist_bl_empty(const struct hlist_bl_head *h) { return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); } -- cgit v1.2.3 From f67c07f07fca95a7f330b8bb928eabaf9fcce75d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 Mar 2016 14:21:42 -0700 Subject: radix-tree: add an explicit include of bitops.h The radix-tree header uses the __ffs() function, which is defined in bitops.h. The current kernel headers implicitly include bitops.h, but the userspace test harness does not. Signed-off-by: Matthew Wilcox Cc: Johannes Weiner Cc: Matthew Wilcox Cc: "Kirill A. Shutemov" Cc: Ross Zwisler Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index f54be7082207..39598b9cf1d9 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -21,6 +21,7 @@ #ifndef _LINUX_RADIX_TREE_H #define _LINUX_RADIX_TREE_H +#include #include #include #include -- cgit v1.2.3 From e61452365372570253b2b1de84bab0cdb2e62c64 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 Mar 2016 14:21:54 -0700 Subject: radix_tree: add support for multi-order entries With huge pages, it is convenient to have the radix tree be able to return an entry that covers multiple indices. Previous attempts to deal with the problem have involved inserting N duplicate entries, which is a waste of memory and leads to problems trying to handle aliased tags, or probing the tree multiple times to find alternative entries which might cover the requested index. This approach inserts one canonical entry into the tree for a given range of indices, and may also insert other entries in order to ensure that lookups find the canonical entry. This solution only tolerates inserting powers of two that are greater than the fanout of the tree. If we wish to expand the radix tree's abilities to support large-ish pages that is less than the fanout at the penultimate level of the tree, then we would need to add one more step in lookup to ensure that any sibling nodes in the final level of the tree are dereferenced and we return the canonical entry that they reference. Signed-off-by: Matthew Wilcox Cc: Johannes Weiner Cc: Matthew Wilcox Cc: "Kirill A. Shutemov" Cc: Ross Zwisler Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 39598b9cf1d9..b211f145c811 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -271,8 +271,15 @@ static inline void radix_tree_replace_slot(void **pslot, void *item) } int __radix_tree_create(struct radix_tree_root *root, unsigned long index, - struct radix_tree_node **nodep, void ***slotp); -int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); + unsigned order, struct radix_tree_node **nodep, + void ***slotp); +int __radix_tree_insert(struct radix_tree_root *, unsigned long index, + unsigned order, void *); +static inline int radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *entry) +{ + return __radix_tree_insert(root, index, 0, entry); +} void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); -- cgit v1.2.3 From 7165092fe5ca70bae722ac6cd78421cfd0eec18d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 Mar 2016 14:22:06 -0700 Subject: radix-tree,shmem: introduce radix_tree_iter_next() shmem likes to occasionally drop the lock, schedule, then reacqire the lock and continue with the iteration from the last place it left off. This is currently done with a pretty ugly goto. Introduce radix_tree_iter_next() and use it throughout shmem.c. [koct9i@gmail.com: fix bug in radix_tree_iter_next() for tagged iteration] Signed-off-by: Matthew Wilcox Cc: Hugh Dickins Signed-off-by: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index b211f145c811..51a97ac8bfbf 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -402,6 +402,22 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter) return NULL; } +/** + * radix_tree_iter_next - resume iterating when the chunk may be invalid + * @iter: iterator state + * + * If the iterator needs to release then reacquire a lock, the chunk may + * have been invalidated by an insertion or deletion. Call this function + * to continue the iteration from the next index. + */ +static inline __must_check +void **radix_tree_iter_next(struct radix_tree_iter *iter) +{ + iter->next_index = iter->index + 1; + iter->tags = 0; + return NULL; +} + /** * radix_tree_chunk_size - get current chunk size * -- cgit v1.2.3 From 56b060814e2d87d6646a85a2f4609c73587399ca Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 17 Mar 2016 14:22:14 -0700 Subject: lib/string: introduce match_string() helper Occasionally we have to search for an occurrence of a string in an array of strings. Make a simple helper for that purpose. Signed-off-by: Andy Shevchenko Cc: "David S. Miller" Cc: Bartlomiej Zolnierkiewicz Cc: David Airlie Cc: David Woodhouse Cc: Dmitry Eremin-Solenikov Cc: Greg Kroah-Hartman Cc: Heikki Krogerus Cc: Linus Walleij Cc: Mika Westerberg Cc: Rafael J. Wysocki Cc: Sebastian Reichel Cc: Tejun Heo Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 9eebc66d957a..0f235e80d355 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -130,6 +130,8 @@ extern void argv_free(char **argv); extern bool sysfs_streq(const char *s1, const char *s2); extern int strtobool(const char *s, bool *res); +int match_string(const char * const *array, size_t n, const char *string); + #ifdef CONFIG_BINARY_PRINTF int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); -- cgit v1.2.3 From e3bde9568d992c5f985e6e30731a5f9f9bef7b13 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 17 Mar 2016 14:22:47 -0700 Subject: include/linux/unaligned: force inlining of byteswap operations Sometimes gcc mysteriously doesn't inline very small functions we expect to be inlined. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 With this .config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os, the following functions get deinlined many times. Examples of disassembly: (24 copies, 108 calls): 66 8b 07 mov (%rdi),%ax 55 push %rbp 48 89 e5 mov %rsp,%rbp 86 e0 xchg %ah,%al 5d pop %rbp c3 retq (25 copies, 181 calls): 8b 07 mov (%rdi),%eax 55 push %rbp 48 89 e5 mov %rsp,%rbp 0f c8 bswap %eax 5d pop %rbp c3 retq (23 copies, 94 calls): 48 8b 07 mov (%rdi),%rax 55 push %rbp 48 89 e5 mov %rsp,%rbp 48 0f c8 bswap %rax 5d pop %rbp c3 retq (2 copies, 11 calls): 89 f8 mov %edi,%eax 55 push %rbp c1 ef 08 shr $0x8,%edi c1 e0 08 shl $0x8,%eax 09 c7 or %eax,%edi 48 89 e5 mov %rsp,%rbp 66 89 3e mov %di,(%rsi) (8 copies, 43 calls): 55 push %rbp 0f cf bswap %edi 89 3e mov %edi,(%rsi) 48 89 e5 mov %rsp,%rbp 5d pop %rbp c3 retq (26 copies, 157 calls): 55 push %rbp 48 0f cf bswap %rdi 48 89 3e mov %rdi,(%rsi) 48 89 e5 mov %rsp,%rbp 5d pop %rbp c3 retq This patch fixes this via s/inline/__always_inline/. It only affects arches with efficient unaligned access insns, such as x86. (arched which lack such ops do not include linux/unaligned/access_ok.h) Code size decrease after the patch is ~8.5k: text data bss dec hex filename 92197848 20826112 36417536 149441496 8e84bd8 vmlinux 92189231 20826144 36417536 149432911 8e82a4f vmlinux6_unaligned_be_after Signed-off-by: Denys Vlasenko Acked-by: Ingo Molnar Cc: Thomas Graf Cc: Peter Zijlstra Cc: David Rientjes Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/unaligned/access_ok.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/unaligned/access_ok.h b/include/linux/unaligned/access_ok.h index 99c1b4d20b0f..33383ca23837 100644 --- a/include/linux/unaligned/access_ok.h +++ b/include/linux/unaligned/access_ok.h @@ -4,62 +4,62 @@ #include #include -static inline u16 get_unaligned_le16(const void *p) +static __always_inline u16 get_unaligned_le16(const void *p) { return le16_to_cpup((__le16 *)p); } -static inline u32 get_unaligned_le32(const void *p) +static __always_inline u32 get_unaligned_le32(const void *p) { return le32_to_cpup((__le32 *)p); } -static inline u64 get_unaligned_le64(const void *p) +static __always_inline u64 get_unaligned_le64(const void *p) { return le64_to_cpup((__le64 *)p); } -static inline u16 get_unaligned_be16(const void *p) +static __always_inline u16 get_unaligned_be16(const void *p) { return be16_to_cpup((__be16 *)p); } -static inline u32 get_unaligned_be32(const void *p) +static __always_inline u32 get_unaligned_be32(const void *p) { return be32_to_cpup((__be32 *)p); } -static inline u64 get_unaligned_be64(const void *p) +static __always_inline u64 get_unaligned_be64(const void *p) { return be64_to_cpup((__be64 *)p); } -static inline void put_unaligned_le16(u16 val, void *p) +static __always_inline void put_unaligned_le16(u16 val, void *p) { *((__le16 *)p) = cpu_to_le16(val); } -static inline void put_unaligned_le32(u32 val, void *p) +static __always_inline void put_unaligned_le32(u32 val, void *p) { *((__le32 *)p) = cpu_to_le32(val); } -static inline void put_unaligned_le64(u64 val, void *p) +static __always_inline void put_unaligned_le64(u64 val, void *p) { *((__le64 *)p) = cpu_to_le64(val); } -static inline void put_unaligned_be16(u16 val, void *p) +static __always_inline void put_unaligned_be16(u16 val, void *p) { *((__be16 *)p) = cpu_to_be16(val); } -static inline void put_unaligned_be32(u32 val, void *p) +static __always_inline void put_unaligned_be32(u32 val, void *p) { *((__be32 *)p) = cpu_to_be32(val); } -static inline void put_unaligned_be64(u64 val, void *p) +static __always_inline void put_unaligned_be64(u64 val, void *p) { *((__be64 *)p) = cpu_to_be64(val); } -- cgit v1.2.3 From ef951599074ba4fad2d0efa0a977129b41e6d203 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Mar 2016 14:22:50 -0700 Subject: lib: move strtobool() to kstrtobool() Create the kstrtobool_from_user() helper and move strtobool() logic into the new kstrtobool() (matching all the other kstrto* functions). Provides an inline wrapper for existing strtobool() callers. Signed-off-by: Kees Cook Cc: Joe Perches Cc: Andy Shevchenko Cc: Rasmus Villemoes Cc: Daniel Borkmann Cc: Amitkumar Karwar Cc: Nishant Sarmukadam Cc: Kalle Valo Cc: Steve French Cc: Michael Ellerman Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 ++ include/linux/string.h | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f31638c6e873..f4fa2b29c38c 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -357,6 +357,7 @@ int __must_check kstrtou16(const char *s, unsigned int base, u16 *res); int __must_check kstrtos16(const char *s, unsigned int base, s16 *res); int __must_check kstrtou8(const char *s, unsigned int base, u8 *res); int __must_check kstrtos8(const char *s, unsigned int base, s8 *res); +int __must_check kstrtobool(const char *s, bool *res); int __must_check kstrtoull_from_user(const char __user *s, size_t count, unsigned int base, unsigned long long *res); int __must_check kstrtoll_from_user(const char __user *s, size_t count, unsigned int base, long long *res); @@ -368,6 +369,7 @@ int __must_check kstrtou16_from_user(const char __user *s, size_t count, unsigne int __must_check kstrtos16_from_user(const char __user *s, size_t count, unsigned int base, s16 *res); int __must_check kstrtou8_from_user(const char __user *s, size_t count, unsigned int base, u8 *res); int __must_check kstrtos8_from_user(const char __user *s, size_t count, unsigned int base, s8 *res); +int __must_check kstrtobool_from_user(const char __user *s, size_t count, bool *res); static inline int __must_check kstrtou64_from_user(const char __user *s, size_t count, unsigned int base, u64 *res) { diff --git a/include/linux/string.h b/include/linux/string.h index 0f235e80d355..d3993a79a325 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -128,7 +128,11 @@ extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); extern bool sysfs_streq(const char *s1, const char *s2); -extern int strtobool(const char *s, bool *res); +extern int kstrtobool(const char *s, bool *res); +static inline int strtobool(const char *s, bool *res) +{ + return kstrtobool(s, res); +} int match_string(const char * const *array, size_t n, const char *string); -- cgit v1.2.3 From 4cc7ecb7f2a60e8deb783b8fbf7c1ae467acb920 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Mar 2016 14:23:00 -0700 Subject: param: convert some "on"/"off" users to strtobool This changes several users of manual "on"/"off" parsing to use strtobool. Some side-effects: - these uses will now parse y/n/1/0 meaningfully too - the early_param uses will now bubble up parse errors Signed-off-by: Kees Cook Acked-by: Heiko Carstens Acked-by: Michael Ellerman Cc: Amitkumar Karwar Cc: Andy Shevchenko Cc: Daniel Borkmann Cc: Joe Perches Cc: Kalle Valo Cc: Martin Schwidefsky Cc: Nishant Sarmukadam Cc: Rasmus Villemoes Cc: Steve French Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tick.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index 21f73649a4dc..62be0786d6d0 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -111,7 +111,7 @@ enum tick_dep_bits { #define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) #ifdef CONFIG_NO_HZ_COMMON -extern int tick_nohz_enabled; +extern bool tick_nohz_enabled; extern int tick_nohz_tick_stopped(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); -- cgit v1.2.3