From e8ecde25f5e08f89b61d86c32bbb56b405e90c32 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 14 Jan 2016 17:52:59 -0500 Subject: Make sure that highmem pages are not added to symlink page cache inode_nohighmem() is sufficient to make sure that page_get_link() won't try to allocate a highmem page. Moreover, it is sufficient to make sure that page_symlink/__page_symlink won't do the same thing. However, any filesystem that manually preseeds the symlink's page cache upon symlink(2) needs to make sure that the page it inserts there won't be a highmem one. Fortunately, only nfs and shmem have run afoul of that... Signed-off-by: Al Viro --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 5813b7fa85b6..642471b0ddea 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2469,6 +2469,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s inode->i_op = &shmem_short_symlink_operations; inode->i_link = info->symlink; } else { + inode_nohighmem(inode); error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); if (error) { iput(inode); @@ -2476,7 +2477,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s } inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; - inode_nohighmem(inode); memcpy(page_address(page), symname, len); SetPageUptodate(page); set_page_dirty(page); -- cgit v1.2.3 From d8ad47d83f95abe2dfece1338633e376fec3bd31 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:17:56 -0800 Subject: mm/slab.c use list_first_entry_or_null() Simplify the code with list_first_entry_or_null(). Signed-off-by: Geliang Tang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 4765c97ce690..6bb046649450 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2791,18 +2791,18 @@ retry: } while (batchcount > 0) { - struct list_head *entry; struct page *page; /* Get slab alloc is to come from. */ - entry = n->slabs_partial.next; - if (entry == &n->slabs_partial) { + page = list_first_entry_or_null(&n->slabs_partial, + struct page, lru); + if (!page) { n->free_touched = 1; - entry = n->slabs_free.next; - if (entry == &n->slabs_free) + page = list_first_entry_or_null(&n->slabs_free, + struct page, lru); + if (!page) goto must_grow; } - page = list_entry(entry, struct page, lru); check_spinlock_acquired(cachep); /* @@ -3085,7 +3085,6 @@ retry: static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - struct list_head *entry; struct page *page; struct kmem_cache_node *n; void *obj; @@ -3098,15 +3097,16 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, retry: check_irq_off(); spin_lock(&n->list_lock); - entry = n->slabs_partial.next; - if (entry == &n->slabs_partial) { + page = list_first_entry_or_null(&n->slabs_partial, + struct page, lru); + if (!page) { n->free_touched = 1; - entry = n->slabs_free.next; - if (entry == &n->slabs_free) + page = list_first_entry_or_null(&n->slabs_free, + struct page, lru); + if (!page) goto must_grow; } - page = list_entry(entry, struct page, lru); check_spinlock_acquired_node(cachep, nodeid); STATS_INC_NODEALLOCS(cachep); -- cgit v1.2.3 From 73c0219d8eca4114d81626032055598bc0a17130 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:17:59 -0800 Subject: mm/slab.c: use list_for_each_entry in cache_flusharray Simplify the code with list_for_each_entry(). Signed-off-by: Geliang Tang Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 6bb046649450..5d5aa3bbdc3f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3338,17 +3338,12 @@ free_done: #if STATS { int i = 0; - struct list_head *p; - - p = n->slabs_free.next; - while (p != &(n->slabs_free)) { - struct page *page; + struct page *page; - page = list_entry(p, struct page, lru); + list_for_each_entry(page, &n->slabs_free, lru) { BUG_ON(page->active); i++; - p = p->next; } STATS_SET_FREEABLE(cachep, i); } -- cgit v1.2.3 From 7aa0d22785deea2725a23716823edd96e65c2ff6 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:18:02 -0800 Subject: mm/slab.c: add a helper function get_first_slab Add a new helper function get_first_slab() that get the first slab from a kmem_cache_node. Signed-off-by: Geliang Tang Acked-by: Christoph Lameter Acked-by: David Rientjes Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 5d5aa3bbdc3f..6ecc697a8bc4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2756,6 +2756,21 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #define cache_free_debugcheck(x,objp,z) (objp) #endif +static struct page *get_first_slab(struct kmem_cache_node *n) +{ + struct page *page; + + page = list_first_entry_or_null(&n->slabs_partial, + struct page, lru); + if (!page) { + n->free_touched = 1; + page = list_first_entry_or_null(&n->slabs_free, + struct page, lru); + } + + return page; +} + static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, bool force_refill) { @@ -2793,15 +2808,9 @@ retry: while (batchcount > 0) { struct page *page; /* Get slab alloc is to come from. */ - page = list_first_entry_or_null(&n->slabs_partial, - struct page, lru); - if (!page) { - n->free_touched = 1; - page = list_first_entry_or_null(&n->slabs_free, - struct page, lru); - if (!page) - goto must_grow; - } + page = get_first_slab(n); + if (!page) + goto must_grow; check_spinlock_acquired(cachep); @@ -3097,15 +3106,9 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, retry: check_irq_off(); spin_lock(&n->list_lock); - page = list_first_entry_or_null(&n->slabs_partial, - struct page, lru); - if (!page) { - n->free_touched = 1; - page = list_first_entry_or_null(&n->slabs_free, - struct page, lru); - if (!page) - goto must_grow; - } + page = get_first_slab(n); + if (!page) + goto must_grow; check_spinlock_acquired_node(cachep, nodeid); -- cgit v1.2.3 From 20b5c30398639b458371c228abfda829854b61c5 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:08 -0800 Subject: Revert "gfp: add __GFP_NOACCOUNT" This reverts commit 8f4fc071b192 ("gfp: add __GFP_NOACCOUNT"). Black-list kmem accounting policy (aka __GFP_NOACCOUNT) turned out to be fragile and difficult to maintain, because there seem to be many more allocations that should not be accounted than those that should be. Besides, false accounting an allocation might result in much worse consequences than not accounting at all, namely increased memory consumption due to pinned dead kmem caches. So it was decided to switch to the white-list policy. This patch reverts bits introducing the black-list policy. The white-list policy will be introduced later in the series. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 19423a45d7d7..25c0ad36fe38 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -122,8 +122,7 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \ - __GFP_NOACCOUNT)) | \ +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ __GFP_NORETRY | __GFP_NOMEMALLOC | \ __GFP_NOWARN) -- cgit v1.2.3 From a9bb7e620efdfd29b6d1c238041173e411670996 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:12 -0800 Subject: memcg: only account kmem allocations marked as __GFP_ACCOUNT Black-list kmem accounting policy (aka __GFP_NOACCOUNT) turned out to be fragile and difficult to maintain, because there seem to be many more allocations that should not be accounted than those that should be. Besides, false accounting an allocation might result in much worse consequences than not accounting at all, namely increased memory consumption due to pinned dead kmem caches. So this patch switches kmem accounting to the white-policy: now only those kmem allocations that are marked as __GFP_ACCOUNT are accounted to memcg. Currently, no kmem allocations are marked like this. The following patches will mark several kmem allocations that are known to be easily triggered from userspace and therefore should be accounted to memcg. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d666df5ef95..ca58bfcdadac 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3402,7 +3402,8 @@ EXPORT_SYMBOL(__free_page_frag); /* * alloc_kmem_pages charges newly allocated pages to the kmem resource counter - * of the current memory cgroup. + * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is + * equivalent to alloc_pages. * * It should be used when the caller would like to use kmalloc, but since the * allocation is large, it has to fall back to the page allocator. -- cgit v1.2.3 From 230e9fc2860450fbb1f33bdcf9093d92d7d91f5b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:15 -0800 Subject: slab: add SLAB_ACCOUNT flag Currently, if we want to account all objects of a particular kmem cache, we have to pass __GFP_ACCOUNT to each kmem_cache_alloc call, which is inconvenient. This patch introduces SLAB_ACCOUNT flag which if passed to kmem_cache_create will force accounting for every allocation from this cache even if __GFP_ACCOUNT is not passed. This patch does not make any of the existing caches use this flag - it will be done later in the series. Note, a cache with SLAB_ACCOUNT cannot be merged with a cache w/o SLAB_ACCOUNT, because merged caches share the same kmem_cache struct and hence cannot have different sets of SLAB_* flags. Thus using this flag will probably reduce the number of merged slabs even if kmem accounting is not used (only compiled in). Signed-off-by: Vladimir Davydov Suggested-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++++++- mm/slab.h | 5 +++-- mm/slab_common.c | 3 ++- mm/slub.c | 2 ++ 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 14cb1db4c52b..4bd6c4513393 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2356,7 +2356,7 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, * Can't be called in interrupt context or from kernel threads. * This function needs to be called with rcu_read_lock() held. */ -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; @@ -2364,6 +2364,12 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) VM_BUG_ON(!is_root_cache(cachep)); + if (cachep->flags & SLAB_ACCOUNT) + gfp |= __GFP_ACCOUNT; + + if (!(gfp & __GFP_ACCOUNT)) + return cachep; + if (current->memcg_kmem_skip_account) return cachep; diff --git a/mm/slab.h b/mm/slab.h index 7b6087197997..c63b8699cfa3 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -128,10 +128,11 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #if defined(CONFIG_SLAB) #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ - SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ + SLAB_NOTRACK | SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_NOTRACK) + SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT) #else #define SLAB_CACHE_FLAGS (0) #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index 3c6a86b4ec25..e016178063e1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -37,7 +37,8 @@ struct kmem_cache *kmem_cache; SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB) -#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK) +#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ + SLAB_NOTRACK | SLAB_ACCOUNT) /* * Merge control. If this is set then no merging of slab caches will occur. diff --git a/mm/slub.c b/mm/slub.c index 46997517406e..2d0e610d195a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5362,6 +5362,8 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'F'; if (!(s->flags & SLAB_NOTRACK)) *p++ = 't'; + if (s->flags & SLAB_ACCOUNT) + *p++ = 'A'; if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); -- cgit v1.2.3 From 37f08dda29dac8a595999b8d3eaa9bf0f763dd9d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:18 -0800 Subject: vmalloc: allow to account vmalloc to memcg Make vmalloc family functions allocate vmalloc area pages with alloc_kmem_pages so that if __GFP_ACCOUNT is set they will be accounted to memcg. This is needed, at least, to account alloc_fdmem allocations. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8e3c9c5a3042..9a58f9a1874d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1477,7 +1477,7 @@ static void __vunmap(const void *addr, int deallocate_pages) struct page *page = area->pages[i]; BUG_ON(!page); - __free_page(page); + __free_kmem_pages(page, 0); } if (area->flags & VM_VPAGES) @@ -1608,9 +1608,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask); + page = alloc_kmem_pages(alloc_mask, order); else - page = alloc_pages_node(node, alloc_mask, order); + page = alloc_kmem_pages_node(node, alloc_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ -- cgit v1.2.3 From 5d097056c9a017a3b720849efb5432f37acabbac Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:18:21 -0800 Subject: kmemcg: account certain kmem allocations to memcg Mark those kmem allocations that are known to be easily triggered from userspace as __GFP_ACCOUNT/SLAB_ACCOUNT, which makes them accounted to memcg. For the list, see below: - threadinfo - task_struct - task_delay_info - pid - cred - mm_struct - vm_area_struct and vm_region (nommu) - anon_vma and anon_vma_chain - signal_struct - sighand_struct - fs_struct - files_struct - fdtable and fdtable->full_fds_bits - dentry and external_name - inode for all filesystems. This is the most tedious part, because most filesystems overwrite the alloc_inode method. The list is far from complete, so feel free to add more objects. Nevertheless, it should be close to "account everything" approach and keep most workloads within bounds. Malevolent users will be able to breach the limit, but this was possible even with the former "account everything" approach (simply because it did not account everything in fact). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Greg Thelen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 2 +- mm/rmap.c | 6 ++++-- mm/shmem.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 92be862c859b..fbf6f0f1d6c9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -560,7 +560,7 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); - vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); } /* diff --git a/mm/rmap.c b/mm/rmap.c index b577fbb98d4b..3c3f1d21f075 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -428,8 +428,10 @@ static void anon_vma_ctor(void *data) void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); - anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, + anon_vma_ctor); + anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, + SLAB_PANIC|SLAB_ACCOUNT); } /* diff --git a/mm/shmem.c b/mm/shmem.c index 5813b7fa85b6..9e60093aca3f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3064,7 +3064,7 @@ static int shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, SLAB_PANIC, shmem_init_inode); + 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); return 0; } -- cgit v1.2.3 From ab7a5af7fd9cc38576b432690367bbabc8da99b2 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Thu, 14 Jan 2016 15:18:24 -0800 Subject: mm/mlock.c: drop unneeded initialization in munlock_vma_pages_range() Before usage page pointer initialized by NULL is reinitialized by follow_page_mask(). Drop useless init of page pointer in the beginning of loop. Signed-off-by: Alexey Klimov Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 339d9e0949b6..9cb87cbc4071 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -425,7 +425,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, vma->vm_flags &= VM_LOCKED_CLEAR_MASK; while (start < end) { - struct page *page = NULL; + struct page *page; unsigned int page_mask; unsigned long page_increm; struct pagevec pvec; -- cgit v1.2.3 From 0b57d6ba0bd11a41a791cf6c5bbf3b55630dc70f Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 14 Jan 2016 15:18:27 -0800 Subject: mm/mmap.c: remove redundant local variables for may_expand_vm() Simplify may_expand_vm(). [akpm@linux-foundation.org: further simplification, per Naoya Horiguchi] Signed-off-by: Chen Gang Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 2ce04a649f6b..9da9c27c33a2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2988,14 +2988,7 @@ out: */ int may_expand_vm(struct mm_struct *mm, unsigned long npages) { - unsigned long cur = mm->total_vm; /* pages */ - unsigned long lim; - - lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; - - if (cur + npages > lim) - return 0; - return 1; + return mm->total_vm + npages <= rlimit(RLIMIT_AS) >> PAGE_SHIFT; } static int special_mapping_fault(struct vm_area_struct *vma, -- cgit v1.2.3 From 3aa2385111168187f24a6db04697c6fab0fab9b4 Mon Sep 17 00:00:00 2001 From: yalin wang Date: Thu, 14 Jan 2016 15:18:30 -0800 Subject: mm/vmscan.c: change trace_mm_vmscan_writepage() proto type Move trace_reclaim_flags() into trace function, so that we don't need caculate these flags if the trace is disabled. Signed-off-by: yalin wang Reviewed-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 2aec4241b42a..fd7823ccf301 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -594,7 +594,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } - trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); + trace_mm_vmscan_writepage(page); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } -- cgit v1.2.3 From 4a8c7bb59ac85b038c29adf6d32ff56e11fbb267 Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Thu, 14 Jan 2016 15:18:36 -0800 Subject: mm/mempolicy.c: convert the shared_policy lock to a rwlock When running the SPECint_rate gcc on some very large boxes it was noticed that the system was spending lots of time in mpol_shared_policy_lookup(). The gamess benchmark can also show it and is what I mostly used to chase down the issue since the setup for that I found to be easier. To be clear the binaries were on tmpfs because of disk I/O requirements. We then used text replication to avoid icache misses and having all the copies banging on the memory where the instruction code resides. This results in us hitting a bottleneck in mpol_shared_policy_lookup() since lookup is serialised by the shared_policy lock. I have only reproduced this on very large (3k+ cores) boxes. The problem starts showing up at just a few hundred ranks getting worse until it threatens to livelock once it gets large enough. For example on the gamess benchmark at 128 ranks this area consumes only ~1% of time, at 512 ranks it consumes nearly 13%, and at 2k ranks it is over 90%. To alleviate the contention in this area I converted the spinlock to an rwlock. This allows a large number of lookups to happen simultaneously. The results were quite good reducing this consumtion at max ranks to around 2%. [akpm@linux-foundation.org: tidy up code comments] Signed-off-by: Nathan Zimmer Acked-by: David Rientjes Acked-by: Vlastimil Babka Cc: Nadia Yvette Chambers Cc: Naoya Horiguchi Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 87a177917cb2..d8caff071a30 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2142,12 +2142,14 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. - * They are protected by the sp->lock spinlock, which should be held + * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ -/* lookup first element intersecting start-end */ -/* Caller holds sp->lock */ +/* + * lookup first element intersecting start-end. Caller holds sp->lock for + * reading or for writing + */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2178,8 +2180,10 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) return rb_entry(n, struct sp_node, nd); } -/* Insert a new shared policy into the list. */ -/* Caller holds sp->lock */ +/* + * Insert a new shared policy into the list. Caller holds sp->lock for + * writing. + */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; @@ -2211,13 +2215,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - spin_lock(&sp->lock); + read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - spin_unlock(&sp->lock); + read_unlock(&sp->lock); return pol; } @@ -2360,7 +2364,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, int ret = 0; restart: - spin_lock(&sp->lock); + write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2393,7 +2397,7 @@ restart: } if (new) sp_insert(sp, new); - spin_unlock(&sp->lock); + write_unlock(&sp->lock); ret = 0; err_out: @@ -2405,7 +2409,7 @@ err_out: return ret; alloc_new: - spin_unlock(&sp->lock); + write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) @@ -2431,7 +2435,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - spin_lock_init(&sp->lock); + rwlock_init(&sp->lock); if (mpol) { struct vm_area_struct pvma; @@ -2497,14 +2501,14 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - spin_lock(&p->lock); + write_lock(&p->lock); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(p, n); } - spin_unlock(&p->lock); + write_unlock(&p->lock); } #ifdef CONFIG_NUMA_BALANCING -- cgit v1.2.3 From fea85cff11de4377040deff0e85eb2793fb078aa Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 14 Jan 2016 15:18:39 -0800 Subject: mm/page_isolation.c: return last tested pfn rather than failure indicator This is preparation step to report test failed pfn in new tracepoint to analyze cma allocation failure problem. There is no functional change in this patch. Signed-off-by: Joonsoo Kim Acked-by: David Rientjes Acked-by: Michal Nazarewicz Cc: Minchan Kim Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 4568fd58f70a..029a171d35dc 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -212,7 +212,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * * Returns 1 if all pages in the range are isolated. */ -static int +static unsigned long __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, bool skip_hwpoisoned_pages) { @@ -237,9 +237,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, else break; } - if (pfn < end_pfn) - return 0; - return 1; + + return pfn; } int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, @@ -248,7 +247,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn, flags; struct page *page; struct zone *zone; - int ret; /* * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages @@ -266,10 +264,11 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, /* Check all pages are free or marked as ISOLATED */ zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn, + pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, skip_hwpoisoned_pages); spin_unlock_irqrestore(&zone->lock, flags); - return ret ? 0 : -EBUSY; + + return pfn < end_pfn ? -EBUSY : 0; } struct page *alloc_migrate_target(struct page *page, unsigned long private, -- cgit v1.2.3 From 0f0848e5118a4cb2cb92cef0c3af6f647649ff47 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 14 Jan 2016 15:18:42 -0800 Subject: mm/page_isolation.c: add new tracepoint, test_pages_isolated cma allocation should be guranteeded to succeed. But sometimes it can fail in the current implementation. To track down the problem, we need to know which page is problematic and this new tracepoint will report it. Signed-off-by: Joonsoo Kim Acked-by: Michal Nazarewicz Acked-by: David Rientjes Cc: Minchan Kim Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 029a171d35dc..f484b9300fe3 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -9,6 +9,9 @@ #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + static int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) { @@ -268,6 +271,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, skip_hwpoisoned_pages); spin_unlock_irqrestore(&zone->lock, flags); + trace_test_pages_isolated(start_pfn, end_pfn, pfn); + return pfn < end_pfn ? -EBUSY : 0; } -- cgit v1.2.3 From 8ef5849fa8a2c7894885e27ee8c8fe06e21a88fd Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 14 Jan 2016 15:18:45 -0800 Subject: mm/cma: always check which page caused allocation failure Now, we have tracepoint in test_pages_isolated() to notify pfn which cannot be isolated. But, in alloc_contig_range(), some error path doesn't call test_pages_isolated() so it's still hard to know exact pfn that causes allocation failure. This patch change this situation by calling test_pages_isolated() in almost error path. In allocation failure case, some overhead is added by this change, but, allocation failure is really rare event so it would not matter. In fatal signal pending case, we don't call test_pages_isolated() because this failure is intentional one. There was a bogus outer_start problem due to unchecked buddy order and this patch also fix it. Before this patch, it didn't matter, because end result is same thing. But, after this patch, tracepoint will report failed pfn so it should be accurate. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Acked-by: Michal Nazarewicz Cc: David Rientjes Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ca58bfcdadac..2f6d30db4c94 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6725,8 +6725,12 @@ int alloc_contig_range(unsigned long start, unsigned long end, if (ret) return ret; + /* + * In case of -EBUSY, we'd like to know which page causes problem. + * So, just fall through. We will check it in test_pages_isolated(). + */ ret = __alloc_contig_migrate_range(&cc, start, end); - if (ret) + if (ret && ret != -EBUSY) goto done; /* @@ -6753,12 +6757,25 @@ int alloc_contig_range(unsigned long start, unsigned long end, outer_start = start; while (!PageBuddy(pfn_to_page(outer_start))) { if (++order >= MAX_ORDER) { - ret = -EBUSY; - goto done; + outer_start = start; + break; } outer_start &= ~0UL << order; } + if (outer_start != start) { + order = page_order(pfn_to_page(outer_start)); + + /* + * outer_start page could be small order buddy page and + * it doesn't include start page. Adjust outer_start + * in this case to report failed page properly + * on tracepoint in test_pages_isolated() + */ + if (outer_start + (1UL << order) <= start) + outer_start = start; + } + /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, false)) { pr_info("%s: [%lx, %lx) PFNs busy\n", -- cgit v1.2.3 From ba5e9579433aefcdccdec207601e124d3bdf2a71 Mon Sep 17 00:00:00 2001 From: yalin wang Date: Thu, 14 Jan 2016 15:18:48 -0800 Subject: mm: change mm_vmscan_lru_shrink_inactive() proto types Move node_id zone_idx shrink flags into trace function, so thay we don't need caculate these args if the trace is disabled, and will make this function have less arguments. Signed-off-by: yalin wang Reviewed-by: Steven Rostedt Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index fd7823ccf301..35dd57e99282 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1691,11 +1691,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, current_may_throttle()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); - trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, - zone_idx(zone), - nr_scanned, nr_reclaimed, - sc->priority, - trace_shrink_flags(file)); + trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed, + sc->priority, file); return nr_reclaimed; } -- cgit v1.2.3 From b4ad0c7e004a2cc0e52790eff72f5176b59ca386 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 14 Jan 2016 15:18:54 -0800 Subject: mm/memblock.c: memblock_is_memory()/reserved() can be boolean Make memblock_is_memory() and memblock_is_reserved return bool to improve readability due to these particular functions only using either one or zero as their return value. No functional change. Signed-off-by: Yaowei Bai Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 07ff069fef25..9695398a14c0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1528,12 +1528,12 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr return -1; } -int __init memblock_is_reserved(phys_addr_t addr) +bool __init memblock_is_reserved(phys_addr_t addr) { return memblock_search(&memblock.reserved, addr) != -1; } -int __init_memblock memblock_is_memory(phys_addr_t addr) +bool __init_memblock memblock_is_memory(phys_addr_t addr) { return memblock_search(&memblock.memory, addr) != -1; } -- cgit v1.2.3 From c00eb15a8914b8ba84032a36044a5aaf7f71709d Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 14 Jan 2016 15:19:00 -0800 Subject: mm/zonelist: enumerate zonelists array index Hardcoding index to zonelists array in gfp_zonelist() is not a good idea, let's enumerate it to improve readability. No functional change. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] [n-horiguchi@ah.jp.nec.com: fix warning in comparing enumerator] Signed-off-by: Yaowei Bai Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2f6d30db4c94..e40e702ce919 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4148,8 +4148,7 @@ static void set_zonelist_order(void) static void build_zonelists(pg_data_t *pgdat) { - int j, node, load; - enum zone_type i; + int i, node, load; nodemask_t used_mask; int local_node, prev_node; struct zonelist *zonelist; @@ -4169,7 +4168,7 @@ static void build_zonelists(pg_data_t *pgdat) nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); - j = 0; + i = 0; while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { /* @@ -4186,12 +4185,12 @@ static void build_zonelists(pg_data_t *pgdat) if (order == ZONELIST_ORDER_NODE) build_zonelists_in_node_order(pgdat, node); else - node_order[j++] = node; /* remember order */ + node_order[i++] = node; /* remember order */ } if (order == ZONELIST_ORDER_ZONE) { /* calculate node order -- i.e., DMA last! */ - build_zonelists_in_zone_order(pgdat, j); + build_zonelists_in_zone_order(pgdat, i); } build_thisnode_zonelists(pgdat); -- cgit v1.2.3 From fde82aaa731de8a23d817971f6080041a4917d06 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 14 Jan 2016 15:19:03 -0800 Subject: mm/page_alloc.c: get rid of __alloc_pages_high_priority() __alloc_pages_high_priority doesn't do anything special other than it calls get_page_from_freelist and loops around GFP_NOFAIL allocation until it succeeds. It would be better if the first part was done in __alloc_pages_slowpath where we modify the zonelist because this would be easier to read and understand. Opencoding the function into its only caller allows to simplify it a bit as well. This patch doesn't introduce any functional changes. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: David Rientjes Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 36 +++++++++--------------------------- 1 file changed, 9 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e40e702ce919..1f3c26992e90 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2876,28 +2876,6 @@ retry: return page; } -/* - * This is called in the allocator slow-path if the allocation request is of - * sufficient urgency to ignore watermarks and take other desperate measures - */ -static inline struct page * -__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, - const struct alloc_context *ac) -{ - struct page *page; - - do { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS, ac); - - if (!page && gfp_mask & __GFP_NOFAIL) - wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, - HZ/50); - } while (!page && (gfp_mask & __GFP_NOFAIL)); - - return page; -} - static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) { struct zoneref *z; @@ -3042,12 +3020,16 @@ retry: * allocations are system rather than user orientated */ ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); + do { + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); + if (page) + goto got_pg; - page = __alloc_pages_high_priority(gfp_mask, order, ac); - - if (page) { - goto got_pg; - } + if (gfp_mask & __GFP_NOFAIL) + wait_iff_congested(ac->preferred_zone, + BLK_RW_ASYNC, HZ/50); + } while (gfp_mask & __GFP_NOFAIL); } /* Caller is not willing to reclaim, we can't balance anything */ -- cgit v1.2.3 From 33d5310306ec244d96533da5f9183e05a7a51106 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 14 Jan 2016 15:19:05 -0800 Subject: mm/page_alloc.c: do not loop over ALLOC_NO_WATERMARKS without triggering reclaim __alloc_pages_slowpath is looping over ALLOC_NO_WATERMARKS requests if __GFP_NOFAIL is requested. This is fragile because we are basically relying on somebody else to make the reclaim (be it the direct reclaim or OOM killer) for us. The caller might be holding resources (e.g. locks) which block other other reclaimers from making any progress for example. Remove the retry loop and rely on __alloc_pages_slowpath to invoke all allowed reclaim steps and retry logic. We have to be careful about __GFP_NOFAIL allocations from the PF_MEMALLOC context even though this is a very bad idea to begin with because no progress can be gurateed at all. We shouldn't break the __GFP_NOFAIL semantic here though. It could be argued that this is essentially GFP_NOWAIT context which we do not support but PF_MEMALLOC is much harder to check for existing users because they might happen deep down the code path performed much later after setting the flag so we cannot really rule out there is no kernel path triggering this combination. Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: David Rientjes Cc: Tetsuo Handa Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1f3c26992e90..2a6fe377cafc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3020,32 +3020,36 @@ retry: * allocations are system rather than user orientated */ ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); - do { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS, ac); - if (page) - goto got_pg; - - if (gfp_mask & __GFP_NOFAIL) - wait_iff_congested(ac->preferred_zone, - BLK_RW_ASYNC, HZ/50); - } while (gfp_mask & __GFP_NOFAIL); + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); + if (page) + goto got_pg; } /* Caller is not willing to reclaim, we can't balance anything */ if (!can_direct_reclaim) { /* - * All existing users of the deprecated __GFP_NOFAIL are - * blockable, so warn of any new users that actually allow this - * type of allocation to fail. + * All existing users of the __GFP_NOFAIL are blockable, so warn + * of any new users that actually allow this type of allocation + * to fail. */ WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); goto nopage; } /* Avoid recursion of direct reclaim */ - if (current->flags & PF_MEMALLOC) + if (current->flags & PF_MEMALLOC) { + /* + * __GFP_NOFAIL request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do a work for us. + */ + if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { + cond_resched(); + goto retry; + } goto nopage; + } /* Avoid allocations with no watermarks from looping endlessly */ if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) -- cgit v1.2.3 From 6219c2a2ec990f80a586216172c811a9099c5cdf Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:19:08 -0800 Subject: mm/vmalloc.c: use list_{next,first}_entry To make the intention clearer, use list_{next,first}_entry instead of list_entry. Signed-off-by: Geliang Tang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9a58f9a1874d..7007fe85840e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -441,8 +441,7 @@ nocache: if (list_is_last(&first->list, &vmap_area_list)) goto found; - first = list_entry(first->list.next, - struct vmap_area, list); + first = list_next_entry(first, list); } found: @@ -2559,10 +2558,10 @@ static void *s_start(struct seq_file *m, loff_t *pos) struct vmap_area *va; spin_lock(&vmap_area_lock); - va = list_entry((&vmap_area_list)->next, typeof(*va), list); + va = list_first_entry(&vmap_area_list, typeof(*va), list); while (n > 0 && &va->list != &vmap_area_list) { n--; - va = list_entry(va->list.next, typeof(*va), list); + va = list_next_entry(va, list); } if (!n && &va->list != &vmap_area_list) return va; @@ -2576,7 +2575,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) struct vmap_area *va = p, *next; ++*pos; - next = list_entry(va->list.next, typeof(*va), list); + next = list_next_entry(va, list); if (&next->list != &vmap_area_list) return next; -- cgit v1.2.3 From 5b80287a65da927742c6d43b1369bd5ed133aad1 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 14 Jan 2016 15:19:11 -0800 Subject: mm/mmzone.c: memmap_valid_within() can be boolean Make memmap_valid_within return bool due to this particular function only using either one or zero as its return value. No functional change. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmzone.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmzone.c b/mm/mmzone.c index 7d87ebb0d632..52687fb4de6f 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -72,16 +72,16 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, } #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL -int memmap_valid_within(unsigned long pfn, +bool memmap_valid_within(unsigned long pfn, struct page *page, struct zone *zone) { if (page_to_pfn(page) != pfn) - return 0; + return false; if (page_zone(page) != zone) - return 0; + return false; - return 1; + return true; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ -- cgit v1.2.3 From 6a15a37097c7e02390bb08d83dac433c9f10144f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 14 Jan 2016 15:19:20 -0800 Subject: mm, proc: reduce cost of /proc/pid/smaps for shmem mappings The previous patch has improved swap accounting for shmem mapping, which however made /proc/pid/smaps more expensive for shmem mappings, as we consult the radix tree for each pte_none entry, so the overal complexity is O(n*log(n)). We can reduce this significantly for mappings that cannot contain COWed pages, because then we can either use the statistics tha shmem object itself tracks (if the mapping contains the whole object, or the swap usage of the whole object is zero), or use the radix tree iterator, which is much more effective than repeated find_get_entry() calls. This patch therefore introduces a function shmem_swap_usage(vma) and makes /proc/pid/smaps use it when possible. Only for writable private mappings of shmem objects (i.e. tmpfs files) with the shmem object itself (partially) swapped outwe have to resort to the find_get_entry() approach. Hopefully such mappings are relatively uncommon. To demonstrate the diference, I have measured this on a process that creates a 2GB mapping and dirties single pages with a stride of 2MB, and time how long does it take to cat /proc/pid/smaps of this process 100 times. Private writable mapping of a /dev/shm/file (the most complex case): real 0m3.831s user 0m0.180s sys 0m3.212s Shared mapping of an almost full mapping of a partially swapped /dev/shm/file (which needs to employ the radix tree iterator). real 0m1.351s user 0m0.096s sys 0m0.768s Same, but with /dev/shm/file not swapped (so no radix tree walk needed) real 0m0.935s user 0m0.128s sys 0m0.344s Private anonymous mapping: real 0m0.949s user 0m0.116s sys 0m0.348s The cost is now much closer to the private anonymous mapping case, unless the shmem mapping is private and writable. Signed-off-by: Vlastimil Babka Cc: Hugh Dickins Cc: Jerome Marchand Cc: Konstantin Khlebnikov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 9e60093aca3f..e978621de1ef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -359,6 +359,76 @@ static int shmem_free_swap(struct address_space *mapping, return 0; } +/* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given vma is swapped out. + * + * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. + */ +unsigned long shmem_swap_usage(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long swapped; + pgoff_t start, end; + struct radix_tree_iter iter; + void **slot; + struct page *page; + + /* Be careful as we don't hold info->lock */ + swapped = READ_ONCE(info->swapped); + + /* + * The easier cases are when the shmem object has nothing in swap, or + * the vma maps it whole. Then we can simply use the stats that we + * already track. + */ + if (!swapped) + return 0; + + if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) + return swapped << PAGE_SHIFT; + + swapped = 0; + + /* Here comes the more involved part */ + start = linear_page_index(vma, vma->vm_start); + end = linear_page_index(vma, vma->vm_end); + + rcu_read_lock(); + +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + if (iter.index >= end) + break; + + page = radix_tree_deref_slot(slot); + + /* + * This should only be possible to happen at index 0, so we + * don't need to reset the counter, nor do we risk infinite + * restarts. + */ + if (radix_tree_deref_retry(page)) + goto restart; + + if (radix_tree_exceptional_entry(page)) + swapped++; + + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + + rcu_read_unlock(); + + return swapped << PAGE_SHIFT; +} + /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ -- cgit v1.2.3 From 48131e03ca4ed71d73fbe55c311a258c6fa2a090 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 14 Jan 2016 15:19:23 -0800 Subject: mm, proc: reduce cost of /proc/pid/smaps for unpopulated shmem mappings Following the previous patch, further reduction of /proc/pid/smaps cost is possible for private writable shmem mappings with unpopulated areas where the page walk invokes the .pte_hole function. We can use radix tree iterator for each such area instead of calling find_get_entry() in a loop. This is possible at the extra maintenance cost of introducing another shmem function shmem_partial_swap_usage(). To demonstrate the diference, I have measured this on a process that creates a private writable 2GB mapping of a partially swapped out /dev/shm/file (which cannot employ the optimizations from the prvious patch) and doesn't populate it at all. I time how long does it take to cat /proc/pid/smaps of this process 100 times. Before this patch: real 0m3.831s user 0m0.180s sys 0m3.212s After this patch: real 0m1.176s user 0m0.180s sys 0m0.684s The time is similar to the case where a radix tree iterator is employed on the whole mapping. Signed-off-by: Vlastimil Babka Cc: Hugh Dickins Cc: Jerome Marchand Cc: Konstantin Khlebnikov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 65 ++++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index e978621de1ef..760d90cf2a41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -361,41 +361,18 @@ static int shmem_free_swap(struct address_space *mapping, /* * Determine (in bytes) how many of the shmem object's pages mapped by the - * given vma is swapped out. + * given offsets are swapped out. * * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ -unsigned long shmem_swap_usage(struct vm_area_struct *vma) +unsigned long shmem_partial_swap_usage(struct address_space *mapping, + pgoff_t start, pgoff_t end) { - struct inode *inode = file_inode(vma->vm_file); - struct shmem_inode_info *info = SHMEM_I(inode); - struct address_space *mapping = inode->i_mapping; - unsigned long swapped; - pgoff_t start, end; struct radix_tree_iter iter; void **slot; struct page *page; - - /* Be careful as we don't hold info->lock */ - swapped = READ_ONCE(info->swapped); - - /* - * The easier cases are when the shmem object has nothing in swap, or - * the vma maps it whole. Then we can simply use the stats that we - * already track. - */ - if (!swapped) - return 0; - - if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) - return swapped << PAGE_SHIFT; - - swapped = 0; - - /* Here comes the more involved part */ - start = linear_page_index(vma, vma->vm_start); - end = linear_page_index(vma, vma->vm_end); + unsigned long swapped = 0; rcu_read_lock(); @@ -429,6 +406,40 @@ restart: return swapped << PAGE_SHIFT; } +/* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given vma is swapped out. + * + * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. + */ +unsigned long shmem_swap_usage(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long swapped; + + /* Be careful as we don't hold info->lock */ + swapped = READ_ONCE(info->swapped); + + /* + * The easier cases are when the shmem object has nothing in swap, or + * the vma maps it whole. Then we can simply use the stats that we + * already track. + */ + if (!swapped) + return 0; + + if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) + return swapped << PAGE_SHIFT; + + /* Here comes the more involved part */ + return shmem_partial_swap_usage(mapping, + linear_page_index(vma, vma->vm_start), + linear_page_index(vma, vma->vm_end)); +} + /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ -- cgit v1.2.3 From eca56ff906bdd0239485e8b47154a6e73dd9a2f3 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 14 Jan 2016 15:19:26 -0800 Subject: mm, shmem: add internal shmem resident memory accounting Currently looking at /proc//status or statm, there is no way to distinguish shmem pages from pages mapped to a regular file (shmem pages are mapped to /dev/zero), even though their implication in actual memory use is quite different. The internal accounting currently counts shmem pages together with regular files. As a preparation to extend the userspace interfaces, this patch adds MM_SHMEMPAGES counter to mm_rss_stat to account for shmem pages separately from MM_FILEPAGES. The next patch will expose it to userspace - this patch doesn't change the exported values yet, by adding up MM_SHMEMPAGES to MM_FILEPAGES at places where MM_FILEPAGES was used before. The only user-visible change after this patch is the OOM killer message that separates the reported "shmem-rss" from "file-rss". [vbabka@suse.cz: forward-porting, tweak changelog] Signed-off-by: Jerome Marchand Signed-off-by: Vlastimil Babka Acked-by: Konstantin Khlebnikov Acked-by: Michal Hocko Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 30 ++++++++++-------------------- mm/oom_kill.c | 5 +++-- mm/rmap.c | 12 +++--------- 3 files changed, 16 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index c387430f06c3..f7026c035940 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -832,10 +832,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } else if (is_migration_entry(entry)) { page = migration_entry_to_page(entry); - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; + rss[mm_counter(page)]++; if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { @@ -874,10 +871,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (page) { get_page(page); page_dup_rmap(page); - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; + rss[mm_counter(page)]++; } out_set_pte: @@ -1113,9 +1107,8 @@ again: tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else { + + if (!PageAnon(page)) { if (pte_dirty(ptent)) { force_flush = 1; set_page_dirty(page); @@ -1123,8 +1116,8 @@ again: if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); - rss[MM_FILEPAGES]--; } + rss[mm_counter(page)]--; page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); @@ -1146,11 +1139,7 @@ again: struct page *page; page = migration_entry_to_page(entry); - - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else - rss[MM_FILEPAGES]--; + rss[mm_counter(page)]--; } if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); @@ -1460,7 +1449,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, mm_counter_file(page)); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -2097,7 +2086,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter_fast(mm, MM_FILEPAGES); + dec_mm_counter_fast(mm, + mm_counter_file(old_page)); inc_mm_counter_fast(mm, MM_ANONPAGES); } } else { @@ -2820,7 +2810,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page); } set_pte_at(vma->vm_mm, address, pte, entry); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c12680993ff3..dc490c06941b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -585,10 +585,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, */ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mark_oom_victim(victim); - pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), - K(get_mm_counter(victim->mm, MM_FILEPAGES))); + K(get_mm_counter(victim->mm, MM_FILEPAGES)), + K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); task_unlock(victim); /* diff --git a/mm/rmap.c b/mm/rmap.c index 3c3f1d21f075..622756c16ac8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1364,10 +1364,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageHuge(page)) { hugetlb_count_sub(1 << compound_order(page), mm); } else { - if (PageAnon(page)) - dec_mm_counter(mm, MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter(page)); } set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); @@ -1377,10 +1374,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * interest anymore. Simply discard the pte, vmscan * will take care of the rest. */ - if (PageAnon(page)) - dec_mm_counter(mm, MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter(page)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { swp_entry_t entry; pte_t swp_pte; @@ -1420,7 +1414,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); } else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter_file(page)); page_remove_rmap(page); page_cache_release(page); -- cgit v1.2.3 From 146693471f1df44115cf4aaf13c33618619ad855 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:19:32 -0800 Subject: mm, thp: use list_first_entry_or_null() Simplify the code with list_first_entry_or_null(). Signed-off-by: Geliang Tang Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pgtable-generic.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 7d3db0247983..4c681baff363 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -176,13 +176,10 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) /* FIFO */ pgtable = pmd_huge_pte(mm, pmdp); - if (list_empty(&pgtable->lru)) - pmd_huge_pte(mm, pmdp) = NULL; - else { - pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, - struct page, lru); + pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru, + struct page, lru); + if (pmd_huge_pte(mm, pmdp)) list_del(&pgtable->lru); - } return pgtable; } #endif -- cgit v1.2.3 From 244d63ee345bd9d45c87f665ef5e3f7bcd5db45b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 14 Jan 2016 15:19:35 -0800 Subject: mm, vmalloc: remove VM_VPAGES VM_VPAGES is unnecessary, it's easier to check is_vmalloc_addr() when reading /proc/vmallocinfo. [akpm@linux-foundation.org: remove VM_VPAGES reference via kvfree()] Signed-off-by: David Rientjes Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7007fe85840e..58ceeb107960 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1479,10 +1479,7 @@ static void __vunmap(const void *addr, int deallocate_pages) __free_kmem_pages(page, 0); } - if (area->flags & VM_VPAGES) - vfree(area->pages); - else - kfree(area->pages); + kvfree(area->pages); } kfree(area); @@ -1592,7 +1589,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, PAGE_KERNEL, node, area->caller); - area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, nested_gfp, node); } @@ -2650,7 +2646,7 @@ static int s_show(struct seq_file *m, void *p) if (v->flags & VM_USERMAP) seq_puts(m, " user"); - if (v->flags & VM_VPAGES) + if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); show_numa_info(m, v); -- cgit v1.2.3 From 316bda0e6cc5f36f94b4af8bded16d642c90ad75 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:19:38 -0800 Subject: vmscan: do not force-scan file lru if its absolute size is small We assume there is enough inactive page cache if the size of inactive file lru is greater than the size of active file lru, in which case we force-scan file lru ignoring anonymous pages. While this logic works fine when there are plenty of page cache pages, it fails if the size of file lru is small (several MB): in this case (lru_size >> prio) will be 0 for normal scan priorities, as a result, if inactive file lru happens to be larger than active file lru, anonymous pages of a cgroup will never get evicted unless the system experiences severe memory pressure, even if there are gigabytes of unused anonymous memory there, which is unfair in respect to other cgroups, whose workloads might be page cache oriented. This patch attempts to fix this by elaborating the "enough inactive page cache" check: it makes it not only check that inactive lru size > active lru size, but also that we will scan something from the cgroup at the current scan priority. If these conditions do not hold, we proceed to SCAN_FRACT as usual. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 35dd57e99282..1dab3a3ddcd3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2043,10 +2043,16 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, } /* - * There is enough inactive page cache, do not reclaim - * anything from the anonymous working set right now. + * If there is enough inactive page cache, i.e. if the size of the + * inactive list is greater than that of the active list *and* the + * inactive list actually has some pages to scan on this priority, we + * do not reclaim anything from the anonymous working set right now. + * Without the second condition we could end up never scanning an + * lruvec even if it has plenty of old anonymous pages unless the + * system is under heavy pressure. */ - if (!inactive_file_is_low(lruvec)) { + if (!inactive_file_is_low(lruvec) && + get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } -- cgit v1.2.3 From 9ee11ba4251dddf1b0e507d184b25b1bd7820773 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 14 Jan 2016 15:19:41 -0800 Subject: memcg: do not allow to disable tcp accounting after limit is set There are two bits defined for cg_proto->flags - MEMCG_SOCK_ACTIVATED and MEMCG_SOCK_ACTIVE - both are set in tcp_update_limit, but the former is never cleared while the latter can be cleared by unsetting the limit. This allows to disable tcp socket accounting for new sockets after it was enabled by writing -1 to memory.kmem.tcp.limit_in_bytes while still guaranteeing that memcg_socket_limit_enabled static key will be decremented on memcg destruction. This functionality looks dubious, because it is not clear what a use case would be. By enabling tcp accounting a user accepts the price. If they then find the performance degradation unacceptable, they can always restart their workload with tcp accounting disabled. It does not seem there is any need to flip it while the workload is running. Besides, it contradicts to how kmem accounting API works: writing whatever to memory.kmem.limit_in_bytes enables kmem accounting for the cgroup in question, after which it cannot be disabled. Therefore one might expect that writing -1 to memory.kmem.tcp.limit_in_bytes just enables socket accounting w/o limiting it, which might be useful by itself, but it isn't true. Since this API peculiarity is not documented anywhere, I propose to drop it. This will allow to simplify the code by dropping cg_proto->flags. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4bd6c4513393..0bc140d998ad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -316,7 +316,7 @@ void sock_update_memcg(struct sock *sk) rcu_read_lock(); memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) && + if (cg_proto && cg_proto->active && css_tryget_online(&memcg->css)) { sk->sk_cgrp = cg_proto; } -- cgit v1.2.3 From 9f6c399ddc369856d787bd43ccd43b19ed1e4e32 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 14 Jan 2016 15:19:47 -0800 Subject: mm, vmscan: consider isolated pages in zone_reclaimable_pages zone_reclaimable_pages counts how many pages are reclaimable in the given zone. This currently includes all pages on file lrus and anon lrus if there is an available swap storage. We do not consider NR_ISOLATED_{ANON,FILE} counters though which is not correct because these counters reflect temporarily isolated pages which are still reclaimable because they either get back to their LRU or get freed either by the page reclaim or page migration. The number of these pages might be sufficiently high to confuse users of zone_reclaimable_pages (e.g. mbind can migrate large ranges of memory at once). Signed-off-by: Michal Hocko Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: David Rientjes Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 1dab3a3ddcd3..1bbfd623630e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -197,11 +197,13 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) unsigned long nr; nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); + zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_ISOLATED_FILE); if (get_nr_swap_pages() > 0) nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); + zone_page_state(zone, NR_INACTIVE_ANON) + + zone_page_state(zone, NR_ISOLATED_ANON); return nr; } -- cgit v1.2.3 From bc36f7017c39d1fe3c01aecd09cb2fe14753be90 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Thu, 14 Jan 2016 15:19:50 -0800 Subject: mm/mmap.c: remove incorrect MAP_FIXED flag comparison from mmap_region The following flag comparison in mmap_region makes no sense: if (!(vm_flags & MAP_FIXED)) return -ENOMEM; The condition is always false and thus the above "return -ENOMEM" is never executed. The vm_flags must not be compared with MAP_FIXED flag. The vm_flags may only be compared with VM_* flags. MAP_FIXED has the same value as VM_MAYREAD. Hitting the rlimit is a slow path and find_vma_intersection should realize that there is no overlapping VMA for !MAP_FIXED case pretty quickly. Signed-off-by: Piotr Kwapulinski Acked-by: Michal Hocko Cc: Oleg Nesterov Cc: Chris Metcalf Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 9da9c27c33a2..c311bfd8005b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1551,9 +1551,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * MAP_FIXED may remove pages of mappings that intersects with * requested mapping. Account for the pages it would unmap. */ - if (!(vm_flags & MAP_FIXED)) - return -ENOMEM; - nr_pages = count_vma_pages_range(mm, addr, addr + len); if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) -- cgit v1.2.3 From d07e22597d1d355829b7b18ac19afa912cf758d1 Mon Sep 17 00:00:00 2001 From: Daniel Cashman Date: Thu, 14 Jan 2016 15:19:53 -0800 Subject: mm: mmap: add new /proc tunable for mmap_base ASLR Address Space Layout Randomization (ASLR) provides a barrier to exploitation of user-space processes in the presence of security vulnerabilities by making it more difficult to find desired code/data which could help an attack. This is done by adding a random offset to the location of regions in the process address space, with a greater range of potential offset values corresponding to better protection/a larger search-space for brute force, but also to greater potential for fragmentation. The offset added to the mmap_base address, which provides the basis for the majority of the mappings for a process, is set once on process exec in arch_pick_mmap_layout() and is done via hard-coded per-arch values, which reflect, hopefully, the best compromise for all systems. The trade-off between increased entropy in the offset value generation and the corresponding increased variability in address space fragmentation is not absolute, however, and some platforms may tolerate higher amounts of entropy. This patch introduces both new Kconfig values and a sysctl interface which may be used to change the amount of entropy used for offset generation on a system. The direct motivation for this change was in response to the libstagefright vulnerabilities that affected Android, specifically to information provided by Google's project zero at: http://googleprojectzero.blogspot.com/2015/09/stagefrightened.html The attack presented therein, by Google's project zero, specifically targeted the limited randomness used to generate the offset added to the mmap_base address in order to craft a brute-force-based attack. Concretely, the attack was against the mediaserver process, which was limited to respawning every 5 seconds, on an arm device. The hard-coded 8 bits used resulted in an average expected success rate of defeating the mmap ASLR after just over 10 minutes (128 tries at 5 seconds a piece). With this patch, and an accompanying increase in the entropy value to 16 bits, the same attack would take an average expected time of over 45 hours (32768 tries), which makes it both less feasible and more likely to be noticed. The introduced Kconfig and sysctl options are limited by per-arch minimum and maximum values, the minimum of which was chosen to match the current hard-coded value and the maximum of which was chosen so as to give the greatest flexibility without generating an invalid mmap_base address, generally a 3-4 bits less than the number of bits in the user-space accessible virtual address space. When decided whether or not to change the default value, a system developer should consider that mmap_base address could be placed anywhere up to 2^(value) bits away from the non-randomized location, which would introduce variable-sized areas above and below the mmap_base address such that the maximum vm_area_struct size may be reduced, preventing very large allocations. This patch (of 4): ASLR only uses as few as 8 bits to generate the random offset for the mmap base address on 32 bit architectures. This value was chosen to prevent a poorly chosen value from dividing the address space in such a way as to prevent large allocations. This may not be an issue on all platforms. Allow the specification of a minimum number of bits so that platforms desiring greater ASLR protection may determine where to place the trade-off. Signed-off-by: Daniel Cashman Cc: Russell King Acked-by: Kees Cook Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Don Zickus Cc: Eric W. Biederman Cc: Heinrich Schuchardt Cc: Josh Poimboeuf Cc: Kirill A. Shutemov Cc: Naoya Horiguchi Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Thomas Gleixner Cc: David Rientjes Cc: Mark Salyzyn Cc: Jeff Vander Stoep Cc: Nick Kralevich Cc: Catalin Marinas Cc: Will Deacon Cc: "H. Peter Anvin" Cc: Hector Marco-Gisbert Cc: Borislav Petkov Cc: Ralf Baechle Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index c311bfd8005b..f32b84ad621a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -58,6 +58,18 @@ #define arch_rebalance_pgtables(addr, len) (addr) #endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS +const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; +const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; +int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS +const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN; +const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; +int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; +#endif + + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -- cgit v1.2.3 From fec4eb2c8d89bf8d18136df3d688c42b7959a057 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 14 Jan 2016 15:20:09 -0800 Subject: mm/compaction: improve comment for compact_memory tunable knob handler sysctl_compaction_handler() is the handler function for compact_memory tunable knob under /proc/sys/vm, add the missing knob name to make this more accurate in comment. No functional change. Signed-off-by: Yaowei Bai Acked-by: Vlastimil Babka Acked-by: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index de3e1e71cd9f..ac6c6943d2ce 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1708,7 +1708,10 @@ static void compact_nodes(void) /* The written value is actually unused, all memory is compacted */ int sysctl_compact_memory; -/* This is the entry point for compacting all nodes via /proc/sys/vm */ +/* + * This is the entry point for compacting all nodes via + * /proc/sys/vm/compact_memory + */ int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { -- cgit v1.2.3 From c20cd45eb01748f0fba77a504f956b000df4ea73 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 14 Jan 2016 15:20:12 -0800 Subject: mm: allow GFP_{FS,IO} for page_cache_read page cache allocation page_cache_read has been historically using page_cache_alloc_cold to allocate a new page. This means that mapping_gfp_mask is used as the base for the gfp_mask. Many filesystems are setting this mask to GFP_NOFS to prevent from fs recursion issues. page_cache_read is called from the vm_operations_struct::fault() context during the page fault. This context doesn't need the reclaim protection normally. ceph and ocfs2 which call filemap_fault from their fault handlers seem to be OK because they are not taking any fs lock before invoking generic implementation. xfs which takes XFS_MMAPLOCK_SHARED is safe from the reclaim recursion POV because this lock serializes truncate and punch hole with the page faults and it doesn't get involved in the reclaim. There is simply no reason to deliberately use a weaker allocation context when a __GFP_FS | __GFP_IO can be used. The GFP_NOFS protection might be even harmful. There is a push to fail GFP_NOFS allocations rather than loop within allocator indefinitely with a very limited reclaim ability. Once we start failing those requests the OOM killer might be triggered prematurely because the page cache allocation failure is propagated up the page fault path and end up in pagefault_out_of_memory. We cannot play with mapping_gfp_mask directly because that would be racy wrt. parallel page faults and it might interfere with other users who really rely on NOFS semantic from the stored gfp_mask. The mask is also inode proper so it would even be a layering violation. What we can do instead is to push the gfp_mask into struct vm_fault and allow fs layer to overwrite it should the callback need to be called with a different allocation context. Initialize the default to (mapping_gfp_mask | __GFP_FS | __GFP_IO) because this should be safe from the page fault path normally. Why do we care about mapping_gfp_mask at all then? Because this doesn't hold only reclaim protection flags but it also might contain zone and movability restrictions (GFP_DMA32, __GFP_MOVABLE and others) so we have to respect those. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Acked-by: Jan Kara Acked-by: Vlastimil Babka Cc: Tetsuo Handa Cc: Mel Gorman Cc: Dave Chinner Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 9 ++++----- mm/memory.c | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 1bb007624b53..ff42d31c891a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1812,19 +1812,18 @@ EXPORT_SYMBOL(generic_file_read_iter); * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static int page_cache_read(struct file *file, pgoff_t offset) +static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) { struct address_space *mapping = file->f_mapping; struct page *page; int ret; do { - page = page_cache_alloc_cold(mapping); + page = __page_cache_alloc(gfp_mask|__GFP_COLD); if (!page) return -ENOMEM; - ret = add_to_page_cache_lru(page, mapping, offset, - mapping_gfp_constraint(mapping, GFP_KERNEL)); + ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) @@ -2005,7 +2004,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, offset); + error = page_cache_read(file, offset, vmf->gfp_mask); /* * The page we want has now been added to the page cache. diff --git a/mm/memory.c b/mm/memory.c index f7026c035940..d4e4d37c1989 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1938,6 +1938,20 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo copy_user_highpage(dst, src, va, vma); } +static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) +{ + struct file *vm_file = vma->vm_file; + + if (vm_file) + return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; + + /* + * Special mappings (e.g. VDSO) do not have any file so fake + * a default GFP_KERNEL for them. + */ + return GFP_KERNEL; +} + /* * Notify the address space that the page is about to become writable so that * it can prohibit this or wait for the page to get into an appropriate state. @@ -1953,6 +1967,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.gfp_mask = __get_fault_gfp_mask(vma); vmf.page = page; vmf.cow_page = NULL; @@ -2757,6 +2772,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.flags = flags; vmf.page = NULL; + vmf.gfp_mask = __get_fault_gfp_mask(vma); vmf.cow_page = cow_page; ret = vma->vm_ops->fault(vma, &vmf); @@ -2923,6 +2939,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.max_pgoff = max_pgoff; vmf.flags = flags; + vmf.gfp_mask = __get_fault_gfp_mask(vma); vma->vm_ops->map_pages(vma, &vmf); } -- cgit v1.2.3 From a8d0143730d7b42c9fe6d1435d92ecce6863a62a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:20:15 -0800 Subject: mm: page_alloc: generalize the dirty balance reserve The dirty balance reserve that dirty throttling has to consider is merely memory not available to userspace allocations. There is nothing writeback-specific about it. Generalize the name so that it's reusable outside of that context. Signed-off-by: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 14 ++++++++++++-- mm/page_alloc.c | 21 +++------------------ 2 files changed, 15 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d15d88c8efa1..6fe7d15bd1f7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -278,7 +278,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone) unsigned long nr_pages; nr_pages = zone_page_state(zone, NR_FREE_PAGES); - nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + /* + * Pages reserved for the kernel should not be considered + * dirtyable, to prevent a situation where reclaim has to + * clean pages in order to balance the zones. + */ + nr_pages -= min(nr_pages, zone->totalreserve_pages); nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); @@ -332,7 +337,12 @@ static unsigned long global_dirtyable_memory(void) unsigned long x; x = global_page_state(NR_FREE_PAGES); - x -= min(x, dirty_balance_reserve); + /* + * Pages reserved for the kernel should not be considered + * dirtyable, to prevent a situation where reclaim has to + * clean pages in order to balance the zones. + */ + x -= min(x, totalreserve_pages); x += global_page_state(NR_INACTIVE_FILE); x += global_page_state(NR_ACTIVE_FILE); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a6fe377cafc..1e9a56065400 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -114,13 +114,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; -/* - * When calculating the number of globally allowed dirty pages, there - * is a certain number of per-zone reserves that should not be - * considered dirtyable memory. This is the sum of those reserves - * over all existing zones that contribute dirtyable memory. - */ -unsigned long dirty_balance_reserve __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -5942,20 +5935,12 @@ static void calculate_totalreserve_pages(void) if (max > zone->managed_pages) max = zone->managed_pages; + + zone->totalreserve_pages = max; + reserve_pages += max; - /* - * Lowmem reserves are not available to - * GFP_HIGHUSER page cache allocations and - * kswapd tries to balance zones to their high - * watermark. As a result, neither should be - * regarded as dirtyable memory, to prevent a - * situation where reclaim has to clean pages - * in order to balance the zones. - */ - zone->dirty_balance_reserve = max; } } - dirty_balance_reserve = reserve_pages; totalreserve_pages = reserve_pages; } -- cgit v1.2.3 From 6ac0206bc0d13381e3ede3594bc0a3f8cd1d8ec9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 14 Jan 2016 15:20:28 -0800 Subject: mm/page_alloc.c: remove unnecessary parameter from __rmqueue Commit 0aaa29a56e4f ("mm, page_alloc: reserve pageblocks for high-order atomic allocations on demand") added an unnecessary and unused parameter to __rmqueue. It was a parameter that was used in an earlier version of the patch and then left behind. This patch cleans it up. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1e9a56065400..fbff97d7b298 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1781,7 +1781,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) * Call me with the zone->lock already held. */ static struct page *__rmqueue(struct zone *zone, unsigned int order, - int migratetype, gfp_t gfp_flags) + int migratetype) { struct page *page; @@ -1811,7 +1811,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - struct page *page = __rmqueue(zone, order, migratetype, 0); + struct page *page = __rmqueue(zone, order, migratetype); if (unlikely(page == NULL)) break; @@ -2234,7 +2234,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, trace_mm_page_alloc_zone_locked(page, order, migratetype); } if (!page) - page = __rmqueue(zone, order, migratetype, gfp_flags); + page = __rmqueue(zone, order, migratetype); spin_unlock(&zone->lock); if (!page) goto failed; -- cgit v1.2.3 From a16601c5458eb702f26cd48b9e8e1a9471700e72 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:20:30 -0800 Subject: mm/page_alloc.c: use list_{first,last}_entry instead of list_entry To make the intention clearer, use list_{first,last}_entry instead of list_entry. Signed-off-by: Geliang Tang Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fbff97d7b298..b9747aa0fb59 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -805,7 +805,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, do { int mt; /* migratetype of the to-be-freed page */ - page = list_entry(list->prev, struct page, lru); + page = list_last_entry(list, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); @@ -1410,11 +1410,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, /* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < MAX_ORDER; ++current_order) { area = &(zone->free_area[current_order]); - if (list_empty(&area->free_list[migratetype])) - continue; - - page = list_entry(area->free_list[migratetype].next, + page = list_first_entry_or_null(&area->free_list[migratetype], struct page, lru); + if (!page) + continue; list_del(&page->lru); rmv_page_order(page); area->nr_free--; @@ -1693,12 +1692,12 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) for (order = 0; order < MAX_ORDER; order++) { struct free_area *area = &(zone->free_area[order]); - if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + page = list_first_entry_or_null( + &area->free_list[MIGRATE_HIGHATOMIC], + struct page, lru); + if (!page) continue; - page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next, - struct page, lru); - /* * It should never happen but changes to locking could * inadvertently allow a per-cpu drain to add pages @@ -1746,7 +1745,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) if (fallback_mt == -1) continue; - page = list_entry(area->free_list[fallback_mt].next, + page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); if (can_steal) steal_suitable_fallback(zone, page, start_migratetype); @@ -2205,9 +2204,9 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, } if (cold) - page = list_entry(list->prev, struct page, lru); + page = list_last_entry(list, struct page, lru); else - page = list_entry(list->next, struct page, lru); + page = list_first_entry(list, struct page, lru); list_del(&page->lru); pcp->count--; -- cgit v1.2.3 From 86760a2c6e827858f8eaf020b12b72b3210faf79 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:20:33 -0800 Subject: mm/page_alloc.c: use list_for_each_entry in mark_free_pages() Use list_for_each_entry instead of list_for_each + list_entry to simplify the code. Signed-off-by: Geliang Tang Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b9747aa0fb59..d7f5bc895157 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1980,7 +1980,7 @@ void mark_free_pages(struct zone *zone) unsigned long pfn, max_zone_pfn; unsigned long flags; unsigned int order, t; - struct list_head *curr; + struct page *page; if (zone_is_empty(zone)) return; @@ -1990,17 +1990,17 @@ void mark_free_pages(struct zone *zone) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - + page = pfn_to_page(pfn); if (!swsusp_page_is_forbidden(page)) swsusp_unset_page_free(page); } for_each_migratetype_order(order, t) { - list_for_each(curr, &zone->free_area[order].free_list[t]) { + list_for_each_entry(page, + &zone->free_area[order].free_list[t], lru) { unsigned long i; - pfn = page_to_pfn(list_entry(curr, struct page, lru)); + pfn = page_to_pfn(page); for (i = 0; i < (1UL << order); i++) swsusp_set_page_free(pfn_to_page(pfn + i)); } -- cgit v1.2.3 From 5020e285856cb406224e6f977fd893a006077806 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 14 Jan 2016 15:20:36 -0800 Subject: mm, oom: give __GFP_NOFAIL allocations access to memory reserves __GFP_NOFAIL is a big hammer used to ensure that the allocation request can never fail. This is a strong requirement and as such it also deserves a special treatment when the system is OOM. The primary problem here is that the allocation request might have come with some locks held and the oom victim might be blocked on the same locks. This is basically an OOM deadlock situation. This patch tries to reduce the risk of such a deadlocks by giving __GFP_NOFAIL allocations a special treatment and let them dive into memory reserves after oom killer invocation. This should help them to make a progress and release resources they are holding. The OOM victim should compensate for the reserves consumption. Signed-off-by: Michal Hocko Suggested-by: Andrea Arcangeli Cc: Mel Gorman Cc: Johannes Weiner Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d7f5bc895157..ce63d603820f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2732,8 +2732,21 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) + if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; + + if (gfp_mask & __GFP_NOFAIL) { + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); + /* + * fallback to ignore cpuset restriction if our nodes + * are depleted + */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); + } + } out: mutex_unlock(&oom_lock); return page; -- cgit v1.2.3 From f14516fbf0f6bec7d98c1cb5b5c73ccd2bb4a5e9 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 14 Jan 2016 15:20:39 -0800 Subject: mm/memblock: remove rgnbase and rgnsize variables Remove rgnbase and rgnsize variables from memblock_overlaps_region(). We use these variables only for passing to the memblock_addrs_overlap() function and that's all. Let's remove them. Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 9695398a14c0..c33a2a2ae69f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -96,13 +96,10 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type, { unsigned long i; - for (i = 0; i < type->cnt; i++) { - phys_addr_t rgnbase = type->regions[i].base; - phys_addr_t rgnsize = type->regions[i].size; - if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) + for (i = 0; i < type->cnt; i++) + if (memblock_addrs_overlap(base, size, type->regions[i].base, + type->regions[i].size)) break; - } - return i < type->cnt; } -- cgit v1.2.3 From 8c9c1701c7c23a57ebfd1a0b27b87053ae43cfb5 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 14 Jan 2016 15:20:42 -0800 Subject: mm/memblock: introduce for_each_memblock_type() We already have the for_each_memblock() macro in which provides ability to iterate over memblock regions of a known type. The for_each_memblock() macro allows us to pass the pointer to the struct memblock_type, instead we need to pass name of the type. This patch introduces a new macro for_each_memblock_type() which allows us iterate over memblock regions with the given type when the type is unknown. Signed-off-by: Alexander Kuleshov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index c33a2a2ae69f..d2ed81e59a94 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -525,7 +525,8 @@ int __init_memblock memblock_add_range(struct memblock_type *type, bool insert = false; phys_addr_t obase = base; phys_addr_t end = base + memblock_cap_size(base, &size); - int i, nr_new; + int idx, nr_new; + struct memblock_region *rgn; if (!size) return 0; @@ -549,8 +550,7 @@ repeat: base = obase; nr_new = 0; - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; + for_each_memblock_type(type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -569,7 +569,7 @@ repeat: WARN_ON(flags != rgn->flags); nr_new++; if (insert) - memblock_insert_region(type, i++, base, + memblock_insert_region(type, idx++, base, rbase - base, nid, flags); } @@ -581,7 +581,7 @@ repeat: if (base < end) { nr_new++; if (insert) - memblock_insert_region(type, i, base, end - base, + memblock_insert_region(type, idx, base, end - base, nid, flags); } @@ -648,7 +648,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, int *start_rgn, int *end_rgn) { phys_addr_t end = base + memblock_cap_size(base, &size); - int i; + int idx; + struct memblock_region *rgn; *start_rgn = *end_rgn = 0; @@ -660,8 +661,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, if (memblock_double_array(type, base, size) < 0) return -ENOMEM; - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; + for_each_memblock_type(type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -678,7 +678,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->base = base; rgn->size -= base - rbase; type->total_size -= base - rbase; - memblock_insert_region(type, i, rbase, base - rbase, + memblock_insert_region(type, idx, rbase, base - rbase, memblock_get_region_node(rgn), rgn->flags); } else if (rend > end) { @@ -689,14 +689,14 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->base = end; rgn->size -= end - rbase; type->total_size -= end - rbase; - memblock_insert_region(type, i--, rbase, end - rbase, + memblock_insert_region(type, idx--, rbase, end - rbase, memblock_get_region_node(rgn), rgn->flags); } else { /* @rgn is fully contained, record it */ if (!*end_rgn) - *start_rgn = i; - *end_rgn = i + 1; + *start_rgn = idx; + *end_rgn = idx + 1; } } @@ -1638,12 +1638,12 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name { unsigned long long base, size; unsigned long flags; - int i; + int idx; + struct memblock_region *rgn; pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; + for_each_memblock_type(type, rgn) { char nid_buf[32] = ""; base = rgn->base; @@ -1655,7 +1655,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name memblock_get_region_node(rgn)); #endif pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", - name, i, base, base + size - 1, size, nid_buf, flags); + name, idx, base, base + size - 1, size, nid_buf, flags); } } -- cgit v1.2.3 From a8ae49917077facbe4ed7b3a89bf025b1cefa0ed Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:20:45 -0800 Subject: mm/swapfile.c: use list_{next,first}_entry To make the intention clearer, use list_{next,first}_entry instead of list_entry(). Signed-off-by: Geliang Tang Cc: "Kirill A. Shutemov" Cc: Jerome Marchand Cc: Vlastimil Babka Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 58877312cf6b..77551a553f57 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -165,8 +165,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, int found_extent = 0; while (nr_pages) { - struct list_head *lh; - if (se->start_page <= start_page && start_page < se->start_page + se->nr_pages) { pgoff_t offset = start_page - se->start_page; @@ -188,8 +186,7 @@ static void discard_swap_cluster(struct swap_info_struct *si, break; } - lh = se->list.next; - se = list_entry(lh, struct swap_extent, list); + se = list_next_entry(se, list); } } @@ -903,7 +900,7 @@ int swp_swapcount(swp_entry_t entry) VM_BUG_ON(page_private(page) != SWP_CONTINUED); do { - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); map = kmap_atomic(page); tmp_count = map[offset]; kunmap_atomic(map); @@ -1633,14 +1630,11 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) se = start_se; for ( ; ; ) { - struct list_head *lh; - if (se->start_page <= offset && offset < (se->start_page + se->nr_pages)) { return se->start_block + (offset - se->start_page); } - lh = se->list.next; - se = list_entry(lh, struct swap_extent, list); + se = list_next_entry(se, list); sis->curr_swap_extent = se; BUG_ON(se == start_se); /* It *must* be present */ } @@ -1664,7 +1658,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis) while (!list_empty(&sis->first_swap_extent.list)) { struct swap_extent *se; - se = list_entry(sis->first_swap_extent.list.next, + se = list_first_entry(&sis->first_swap_extent.list, struct swap_extent, list); list_del(&se->list); kfree(se); -- cgit v1.2.3 From 7546934570f48283b7fe64a56d1c984fcb1e341e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 14 Jan 2016 15:20:48 -0800 Subject: mm/compaction.c: __compact_pgdat() code cleanuup This patch uses is_via_compact_memory() to distinguish compaction from sysfs or sysctl. And, this patch also reduces indentation on compaction_defer_reset() by filtering these cases first before checking watermark. There is no functional change. Signed-off-by: Joonsoo Kim Acked-by: Yaowei Bai Acked-by: David Rientjes Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index ac6c6943d2ce..585de54dbe8c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1658,14 +1658,15 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) !compaction_deferred(zone, cc->order)) compact_zone(zone, cc); - if (cc->order > 0) { - if (zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0)) - compaction_defer_reset(zone, cc->order, false); - } - VM_BUG_ON(!list_empty(&cc->freepages)); VM_BUG_ON(!list_empty(&cc->migratepages)); + + if (is_via_compact_memory(cc->order)) + continue; + + if (zone_watermark_ok(zone, cc->order, + low_wmark_pages(zone), 0, 0)) + compaction_defer_reset(zone, cc->order, false); } } -- cgit v1.2.3 From c8ad6302c2b664e73f0abb3517c16531e294b9d7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:20:51 -0800 Subject: mm/readahead.c, mm/vmscan.c: use lru_to_page instead of list_to_page list_to_page() in readahead.c is the same as lru_to_page() in vmscan.c. So I move lru_to_page to internal.h and drop list_to_page(). Signed-off-by: Geliang Tang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 2 ++ mm/readahead.c | 8 +++----- mm/vmscan.c | 2 -- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 38e24b89e4c4..016452d2fede 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -119,6 +119,8 @@ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); extern bool zone_reclaimable(struct zone *zone); +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + /* * in mm/rmap.c: */ diff --git a/mm/readahead.c b/mm/readahead.c index ba22d7fe0afb..0aff760b09d4 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -32,8 +32,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) } EXPORT_SYMBOL_GPL(file_ra_state_init); -#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) - /* * see if a page needs releasing upon read_cache_pages() failure * - the caller of read_cache_pages() may have set PG_private or PG_fscache @@ -64,7 +62,7 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping, struct page *victim; while (!list_empty(pages)) { - victim = list_to_page(pages); + victim = lru_to_page(pages); list_del(&victim->lru); read_cache_pages_invalidate_page(mapping, victim); } @@ -87,7 +85,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, int ret = 0; while (!list_empty(pages)) { - page = list_to_page(pages); + page = lru_to_page(pages); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, mapping_gfp_constraint(mapping, GFP_KERNEL))) { @@ -125,7 +123,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, } for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_to_page(pages); + struct page *page = lru_to_page(pages); list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, page->index, mapping_gfp_constraint(mapping, GFP_KERNEL))) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 1bbfd623630e..e36d766dade9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -106,8 +106,6 @@ struct scan_control { unsigned long nr_reclaimed; }; -#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) - #ifdef ARCH_HAS_PREFETCH #define prefetch_prev_lru_page(_page, _base, _field) \ do { \ -- cgit v1.2.3 From 036404183eb3ebff93dea7b572fc718e9d652c7f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:20:54 -0800 Subject: mm/ksm.c: use list_for_each_entry_safe Use list_for_each_entry_safe() instead of list_for_each_safe() to simplify the code. Signed-off-by: Geliang Tang Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index b5cd647daa52..2d162c5625f6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -740,8 +740,7 @@ static int remove_stable_node(struct stable_node *stable_node) static int remove_all_stable_nodes(void) { - struct stable_node *stable_node; - struct list_head *this, *next; + struct stable_node *stable_node, *next; int nid; int err = 0; @@ -756,8 +755,7 @@ static int remove_all_stable_nodes(void) cond_resched(); } } - list_for_each_safe(this, next, &migrate_nodes) { - stable_node = list_entry(this, struct stable_node, list); + list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (remove_stable_node(stable_node)) err = -EBUSY; cond_resched(); @@ -1583,13 +1581,11 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) * so prune them once before each full scan. */ if (!ksm_merge_across_nodes) { - struct stable_node *stable_node; - struct list_head *this, *next; + struct stable_node *stable_node, *next; struct page *page; - list_for_each_safe(this, next, &migrate_nodes) { - stable_node = list_entry(this, - struct stable_node, list); + list_for_each_entry_safe(stable_node, next, + &migrate_nodes, list) { page = get_ksm_page(stable_node, false); if (page) put_page(page); @@ -2012,8 +2008,7 @@ static void wait_while_offlining(void) static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { - struct stable_node *stable_node; - struct list_head *this, *next; + struct stable_node *stable_node, *next; struct rb_node *node; int nid; @@ -2034,8 +2029,7 @@ static void ksm_check_stable_tree(unsigned long start_pfn, cond_resched(); } } - list_for_each_safe(this, next, &migrate_nodes) { - stable_node = list_entry(this, struct stable_node, list); + list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) remove_node_from_stable_tree(stable_node); -- cgit v1.2.3 From 7d828602e5ef3297a69392a2d31264e4ab9c8bb7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:20:56 -0800 Subject: mm: memcontrol: export root_mem_cgroup A later patch will need this symbol in files other than memcontrol.c, so export it now and replace mem_cgroup_root_css at the same time. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: David S. Miller Reviewed-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 2 +- mm/memcontrol.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7340353f8aea..cc5d29d2da9b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -672,7 +672,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { - bdi->wb.memcg_css = mem_cgroup_root_css; + bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; } return ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0bc140d998ad..44ed2dee8f0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -76,9 +76,9 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); +struct mem_cgroup *root_mem_cgroup __read_mostly; + #define MEM_CGROUP_RECLAIM_RETRIES 5 -static struct mem_cgroup *root_mem_cgroup __read_mostly; -struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@ -4241,7 +4241,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* root ? */ if (parent_css == NULL) { root_mem_cgroup = memcg; - mem_cgroup_root_css = &memcg->css; page_counter_init(&memcg->memory, NULL); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; -- cgit v1.2.3 From 3d596f7b907b0281b997cf30c92994a71ad0a1a9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:05 -0800 Subject: net: tcp_memcontrol: protect all tcp_memcontrol calls by jump-label Move the jump-label from sock_update_memcg() and sock_release_memcg() to the callsite, and so eliminate those function calls when socket accounting is not enabled. This also eliminates the need for dummy functions because the calls will be optimized away if the Kconfig options are not enabled. Signed-off-by: Johannes Weiner Acked-by: David S. Miller Reviewed-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 56 +++++++++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 44ed2dee8f0c..d9344dad207e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -293,46 +293,40 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) void sock_update_memcg(struct sock *sk) { - if (mem_cgroup_sockets_enabled) { - struct mem_cgroup *memcg; - struct cg_proto *cg_proto; + struct mem_cgroup *memcg; + struct cg_proto *cg_proto; - BUG_ON(!sk->sk_prot->proto_cgroup); + BUG_ON(!sk->sk_prot->proto_cgroup); - /* Socket cloning can throw us here with sk_cgrp already - * filled. It won't however, necessarily happen from - * process context. So the test for root memcg given - * the current task's memcg won't help us in this case. - * - * Respecting the original socket's memcg is a better - * decision in this case. - */ - if (sk->sk_cgrp) { - BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - css_get(&sk->sk_cgrp->memcg->css); - return; - } + /* Socket cloning can throw us here with sk_cgrp already + * filled. It won't however, necessarily happen from + * process context. So the test for root memcg given + * the current task's memcg won't help us in this case. + * + * Respecting the original socket's memcg is a better + * decision in this case. + */ + if (sk->sk_cgrp) { + BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); + css_get(&sk->sk_cgrp->memcg->css); + return; + } - rcu_read_lock(); - memcg = mem_cgroup_from_task(current); - cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (cg_proto && cg_proto->active && - css_tryget_online(&memcg->css)) { - sk->sk_cgrp = cg_proto; - } - rcu_read_unlock(); + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + cg_proto = sk->sk_prot->proto_cgroup(memcg); + if (cg_proto && cg_proto->active && + css_tryget_online(&memcg->css)) { + sk->sk_cgrp = cg_proto; } + rcu_read_unlock(); } EXPORT_SYMBOL(sock_update_memcg); void sock_release_memcg(struct sock *sk) { - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { - struct mem_cgroup *memcg; - WARN_ON(!sk->sk_cgrp->memcg); - memcg = sk->sk_cgrp->memcg; - css_put(&sk->sk_cgrp->memcg->css); - } + WARN_ON(!sk->sk_cgrp->memcg); + css_put(&sk->sk_cgrp->memcg->css); } struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) -- cgit v1.2.3 From e805605c721021879a1469bdae45c6f80bc985f4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:14 -0800 Subject: net: tcp_memcontrol: sanitize tcp memory accounting callbacks There won't be a tcp control soft limit, so integrating the memcg code into the global skmem limiting scheme complicates things unnecessarily. Replace this with simple and clear charge and uncharge calls--hidden behind a jump label--to account skb memory. Note that this is not purely aesthetic: as a result of shoehorning the per-memcg code into the same memory accounting functions that handle the global level, the old code would compare the per-memcg consumption against the smaller of the per-memcg limit and the global limit. This allowed the total consumption of multiple sockets to exceed the global limit, as long as the individual sockets stayed within bounds. After this change, the code will always compare the per-memcg consumption to the per-memcg limit, and the global consumption to the global limit, and thus close this loophole. Without a soft limit, the per-memcg memory pressure state in sockets is generally questionable. However, we did it until now, so we continue to enter it when the hard limit is hit, and packets are dropped, to let other sockets in the cgroup know that they shouldn't grow their transmit windows, either. However, keep it simple in the new callback model and leave memory pressure lazily when the next packet is accepted (as opposed to doing it synchroneously when packets are processed). When packets are dropped, network performance will already be in the toilet, so that should be a reasonable trade-off. As described above, consumption is now checked on the per-memcg level and the global level separately. Likewise, memory pressure states are maintained on both the per-memcg level and the global level, and a socket is considered under pressure when either level asserts as much. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d9344dad207e..f5de783860b8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -338,6 +338,38 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(tcp_proto_cgroup); +/** + * mem_cgroup_charge_skmem - charge socket memory + * @proto: proto to charge + * @nr_pages: number of pages to charge + * + * Charges @nr_pages to @proto. Returns %true if the charge fit within + * @proto's configured limit, %false if the charge had to be forced. + */ +bool mem_cgroup_charge_skmem(struct cg_proto *proto, unsigned int nr_pages) +{ + struct page_counter *counter; + + if (page_counter_try_charge(&proto->memory_allocated, + nr_pages, &counter)) { + proto->memory_pressure = 0; + return true; + } + page_counter_charge(&proto->memory_allocated, nr_pages); + proto->memory_pressure = 1; + return false; +} + +/** + * mem_cgroup_uncharge_skmem - uncharge socket memory + * @proto - proto to uncharge + * @nr_pages - number of pages to uncharge + */ +void mem_cgroup_uncharge_skmem(struct cg_proto *proto, unsigned int nr_pages) +{ + page_counter_uncharge(&proto->memory_allocated, nr_pages); +} + #endif #ifdef CONFIG_MEMCG_KMEM -- cgit v1.2.3 From baac50bbc3cdfd184ebf586b1704edbfcee866df Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:17 -0800 Subject: net: tcp_memcontrol: simplify linkage between socket and page counter There won't be any separate counters for socket memory consumed by protocols other than TCP in the future. Remove the indirection and link sockets directly to their owning memory cgroup. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 57 ++++++++++++++++++++++----------------------------------- 1 file changed, 22 insertions(+), 35 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f5de783860b8..eaaa86126277 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -294,9 +294,6 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) void sock_update_memcg(struct sock *sk) { struct mem_cgroup *memcg; - struct cg_proto *cg_proto; - - BUG_ON(!sk->sk_prot->proto_cgroup); /* Socket cloning can throw us here with sk_cgrp already * filled. It won't however, necessarily happen from @@ -306,68 +303,58 @@ void sock_update_memcg(struct sock *sk) * Respecting the original socket's memcg is a better * decision in this case. */ - if (sk->sk_cgrp) { - BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - css_get(&sk->sk_cgrp->memcg->css); + if (sk->sk_memcg) { + BUG_ON(mem_cgroup_is_root(sk->sk_memcg)); + css_get(&sk->sk_memcg->css); return; } rcu_read_lock(); memcg = mem_cgroup_from_task(current); - cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (cg_proto && cg_proto->active && - css_tryget_online(&memcg->css)) { - sk->sk_cgrp = cg_proto; - } + if (memcg != root_mem_cgroup && + memcg->tcp_mem.active && + css_tryget_online(&memcg->css)) + sk->sk_memcg = memcg; rcu_read_unlock(); } EXPORT_SYMBOL(sock_update_memcg); void sock_release_memcg(struct sock *sk) { - WARN_ON(!sk->sk_cgrp->memcg); - css_put(&sk->sk_cgrp->memcg->css); -} - -struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) -{ - if (!memcg || mem_cgroup_is_root(memcg)) - return NULL; - - return &memcg->tcp_mem; + WARN_ON(!sk->sk_memcg); + css_put(&sk->sk_memcg->css); } -EXPORT_SYMBOL(tcp_proto_cgroup); /** * mem_cgroup_charge_skmem - charge socket memory - * @proto: proto to charge + * @memcg: memcg to charge * @nr_pages: number of pages to charge * - * Charges @nr_pages to @proto. Returns %true if the charge fit within - * @proto's configured limit, %false if the charge had to be forced. + * Charges @nr_pages to @memcg. Returns %true if the charge fit within + * @memcg's configured limit, %false if the charge had to be forced. */ -bool mem_cgroup_charge_skmem(struct cg_proto *proto, unsigned int nr_pages) +bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { struct page_counter *counter; - if (page_counter_try_charge(&proto->memory_allocated, + if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated, nr_pages, &counter)) { - proto->memory_pressure = 0; + memcg->tcp_mem.memory_pressure = 0; return true; } - page_counter_charge(&proto->memory_allocated, nr_pages); - proto->memory_pressure = 1; + page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages); + memcg->tcp_mem.memory_pressure = 1; return false; } /** * mem_cgroup_uncharge_skmem - uncharge socket memory - * @proto - proto to uncharge + * @memcg - memcg to uncharge * @nr_pages - number of pages to uncharge */ -void mem_cgroup_uncharge_skmem(struct cg_proto *proto, unsigned int nr_pages) +void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { - page_counter_uncharge(&proto->memory_allocated, nr_pages); + page_counter_uncharge(&memcg->tcp_mem.memory_allocated, nr_pages); } #endif @@ -3653,7 +3640,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) if (ret) return ret; - return mem_cgroup_sockets_init(memcg, ss); + return tcp_init_cgroup(memcg, ss); } static void memcg_deactivate_kmem(struct mem_cgroup *memcg) @@ -3709,7 +3696,7 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) static_key_slow_dec(&memcg_kmem_enabled_key); WARN_ON(page_counter_read(&memcg->kmem)); } - mem_cgroup_sockets_destroy(memcg); + tcp_destroy_cgroup(memcg); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -- cgit v1.2.3 From 80e95fe0fdcde2812c341ad4209d62dc1a7af53b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:20 -0800 Subject: mm: memcontrol: generalize the socket accounting jump label The unified hierarchy memory controller is going to use this jump label as well to control the networking callbacks. Move it to the memory controller code and give it a more generic name. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index eaaa86126277..08ef3d2ca663 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -291,6 +291,9 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) /* Writing them here to avoid exposing memcg's inner layout */ #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) +struct static_key memcg_sockets_enabled_key; +EXPORT_SYMBOL(memcg_sockets_enabled_key); + void sock_update_memcg(struct sock *sk) { struct mem_cgroup *memcg; -- cgit v1.2.3 From 7941d2145abc4def5583f9d8d0b2e02647b6d1de Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:23 -0800 Subject: mm: memcontrol: do not account memory+swap on unified hierarchy The unified hierarchy memory controller doesn't expose the memory+swap counter to userspace, but its accounting is hardcoded in all charge paths right now, including the per-cpu charge cache ("the stock"). To avoid adding yet more pointless memory+swap accounting with the socket memory support in unified hierarchy, disable the counter altogether when in unified hierarchy mode. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 08ef3d2ca663..6903e7d8c7be 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -87,6 +87,12 @@ int do_swap_account __read_mostly; #define do_swap_account 0 #endif +/* Whether legacy memory+swap accounting is active */ +static bool do_memsw_account(void) +{ + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; +} + static const char * const mem_cgroup_stat_names[] = { "cache", "rss", @@ -1199,7 +1205,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) if (count < limit) margin = limit - count; - if (do_swap_account) { + if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); limit = READ_ONCE(memcg->memsw.limit); if (count <= limit) @@ -1302,7 +1308,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont(":"); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) continue; pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], K(mem_cgroup_read_stat(iter, i))); @@ -1925,7 +1931,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) if (stock->nr_pages) { page_counter_uncharge(&old->memory, stock->nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&old->memsw, stock->nr_pages); css_put_many(&old->css, stock->nr_pages); stock->nr_pages = 0; @@ -2055,11 +2061,11 @@ retry: if (consume_stock(memcg, nr_pages)) return 0; - if (!do_swap_account || + if (!do_memsw_account() || page_counter_try_charge(&memcg->memsw, batch, &counter)) { if (page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, batch); mem_over_limit = mem_cgroup_from_counter(counter, memory); } else { @@ -2146,7 +2152,7 @@ force: * temporarily by force charging it. */ page_counter_charge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); @@ -2183,7 +2189,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) return; page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); css_put_many(&memcg->css, nr_pages); @@ -2469,7 +2475,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) page_counter_uncharge(&memcg->kmem, nr_pages); page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); page->mem_cgroup = NULL; @@ -3184,7 +3190,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) continue; seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); @@ -3206,14 +3212,14 @@ static int memcg_stat_show(struct seq_file *m, void *v) } seq_printf(m, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); - if (do_swap_account) + if (do_memsw_account()) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { unsigned long long val = 0; - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) continue; for_each_mem_cgroup_tree(mi, memcg) val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; @@ -3344,7 +3350,7 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg) { while (memcg) { __mem_cgroup_threshold(memcg, false); - if (do_swap_account) + if (do_memsw_account()) __mem_cgroup_threshold(memcg, true); memcg = parent_mem_cgroup(memcg); @@ -4497,7 +4503,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * we call find_get_page() with swapper_space directly. */ page = find_get_page(swap_address_space(ent), ent.val); - if (do_swap_account) + if (do_memsw_account()) entry->val = ent.val; return page; @@ -4532,7 +4538,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, page = find_get_entry(mapping, pgoff); if (radix_tree_exceptional_entry(page)) { swp_entry_t swp = radix_to_swp_entry(page); - if (do_swap_account) + if (do_memsw_account()) *entry = swp; page = find_get_page(swap_address_space(swp), swp.val); } @@ -5325,7 +5331,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, if (page->mem_cgroup) goto out; - if (do_swap_account) { + if (do_memsw_account()) { swp_entry_t ent = { .val = page_private(page), }; unsigned short id = lookup_swap_cgroup_id(ent); @@ -5399,7 +5405,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, memcg_check_events(memcg, page); local_irq_enable(); - if (do_swap_account && PageSwapCache(page)) { + if (do_memsw_account() && PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; /* * The swap entry might not get freed for a long time, @@ -5448,7 +5454,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, if (!mem_cgroup_is_root(memcg)) { page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); memcg_oom_recover(memcg); } @@ -5656,7 +5662,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); - if (!do_swap_account) + if (!do_memsw_account()) return; memcg = page->mem_cgroup; @@ -5696,7 +5702,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) struct mem_cgroup *memcg; unsigned short id; - if (!do_swap_account) + if (!do_memsw_account()) return; id = swap_cgroup_record(entry, 0); -- cgit v1.2.3 From 1109208766d9fa7059a9b66ad488e66d99ce49af Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:26 -0800 Subject: mm: memcontrol: move socket code for unified hierarchy accounting The unified hierarchy memory controller will account socket memory. Move the infrastructure functions accordingly. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 148 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 74 insertions(+), 74 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6903e7d8c7be..6aac8d2e31d7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -294,80 +294,6 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return mem_cgroup_from_css(css); } -/* Writing them here to avoid exposing memcg's inner layout */ -#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) - -struct static_key memcg_sockets_enabled_key; -EXPORT_SYMBOL(memcg_sockets_enabled_key); - -void sock_update_memcg(struct sock *sk) -{ - struct mem_cgroup *memcg; - - /* Socket cloning can throw us here with sk_cgrp already - * filled. It won't however, necessarily happen from - * process context. So the test for root memcg given - * the current task's memcg won't help us in this case. - * - * Respecting the original socket's memcg is a better - * decision in this case. - */ - if (sk->sk_memcg) { - BUG_ON(mem_cgroup_is_root(sk->sk_memcg)); - css_get(&sk->sk_memcg->css); - return; - } - - rcu_read_lock(); - memcg = mem_cgroup_from_task(current); - if (memcg != root_mem_cgroup && - memcg->tcp_mem.active && - css_tryget_online(&memcg->css)) - sk->sk_memcg = memcg; - rcu_read_unlock(); -} -EXPORT_SYMBOL(sock_update_memcg); - -void sock_release_memcg(struct sock *sk) -{ - WARN_ON(!sk->sk_memcg); - css_put(&sk->sk_memcg->css); -} - -/** - * mem_cgroup_charge_skmem - charge socket memory - * @memcg: memcg to charge - * @nr_pages: number of pages to charge - * - * Charges @nr_pages to @memcg. Returns %true if the charge fit within - * @memcg's configured limit, %false if the charge had to be forced. - */ -bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) -{ - struct page_counter *counter; - - if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated, - nr_pages, &counter)) { - memcg->tcp_mem.memory_pressure = 0; - return true; - } - page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages); - memcg->tcp_mem.memory_pressure = 1; - return false; -} - -/** - * mem_cgroup_uncharge_skmem - uncharge socket memory - * @memcg - memcg to uncharge - * @nr_pages - number of pages to uncharge - */ -void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) -{ - page_counter_uncharge(&memcg->tcp_mem.memory_allocated, nr_pages); -} - -#endif - #ifdef CONFIG_MEMCG_KMEM /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. @@ -5607,6 +5533,80 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) commit_charge(newpage, memcg, true); } +/* Writing them here to avoid exposing memcg's inner layout */ +#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) + +struct static_key memcg_sockets_enabled_key; +EXPORT_SYMBOL(memcg_sockets_enabled_key); + +void sock_update_memcg(struct sock *sk) +{ + struct mem_cgroup *memcg; + + /* Socket cloning can throw us here with sk_cgrp already + * filled. It won't however, necessarily happen from + * process context. So the test for root memcg given + * the current task's memcg won't help us in this case. + * + * Respecting the original socket's memcg is a better + * decision in this case. + */ + if (sk->sk_memcg) { + BUG_ON(mem_cgroup_is_root(sk->sk_memcg)); + css_get(&sk->sk_memcg->css); + return; + } + + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + if (memcg != root_mem_cgroup && + memcg->tcp_mem.active && + css_tryget_online(&memcg->css)) + sk->sk_memcg = memcg; + rcu_read_unlock(); +} +EXPORT_SYMBOL(sock_update_memcg); + +void sock_release_memcg(struct sock *sk) +{ + WARN_ON(!sk->sk_memcg); + css_put(&sk->sk_memcg->css); +} + +/** + * mem_cgroup_charge_skmem - charge socket memory + * @memcg: memcg to charge + * @nr_pages: number of pages to charge + * + * Charges @nr_pages to @memcg. Returns %true if the charge fit within + * @memcg's configured limit, %false if the charge had to be forced. + */ +bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + struct page_counter *counter; + + if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated, + nr_pages, &counter)) { + memcg->tcp_mem.memory_pressure = 0; + return true; + } + page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages); + memcg->tcp_mem.memory_pressure = 1; + return false; +} + +/** + * mem_cgroup_uncharge_skmem - uncharge socket memory + * @memcg - memcg to uncharge + * @nr_pages - number of pages to uncharge + */ +void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + page_counter_uncharge(&memcg->tcp_mem.memory_allocated, nr_pages); +} + +#endif + /* * subsys_initcall() for memory controller. * -- cgit v1.2.3 From f7e1cb6ec51b041335b5ad4dd7aefb37a56d79a6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:29 -0800 Subject: mm: memcontrol: account socket memory in unified hierarchy memory controller Socket memory can be a significant share of overall memory consumed by common workloads. In order to provide reasonable resource isolation in the unified hierarchy, this type of memory needs to be included in the tracking/accounting of a cgroup under active memory resource control. Overhead is only incurred when a non-root control group is created AND the memory controller is instructed to track and account the memory footprint of that group. cgroup.memory=nosocket can be specified on the boot commandline to override any runtime configuration and forcibly exclude socket memory from active memory resource control. Signed-off-by: Johannes Weiner Acked-by: David S. Miller Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 122 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 98 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6aac8d2e31d7..60ebc486c2aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -80,6 +80,9 @@ struct mem_cgroup *root_mem_cgroup __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 +/* Socket memory accounting disabled? */ +static bool cgroup_memory_nosocket; + /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP int do_swap_account __read_mostly; @@ -1945,6 +1948,26 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } +static void reclaim_high(struct mem_cgroup *memcg, + unsigned int nr_pages, + gfp_t gfp_mask) +{ + do { + if (page_counter_read(&memcg->memory) <= memcg->high) + continue; + mem_cgroup_events(memcg, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + } while ((memcg = parent_mem_cgroup(memcg))); +} + +static void high_work_func(struct work_struct *work) +{ + struct mem_cgroup *memcg; + + memcg = container_of(work, struct mem_cgroup, high_work); + reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL); +} + /* * Scheduled by try_charge() to be executed from the userland return path * and reclaims memory over the high limit. @@ -1952,20 +1975,13 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, void mem_cgroup_handle_over_high(void) { unsigned int nr_pages = current->memcg_nr_pages_over_high; - struct mem_cgroup *memcg, *pos; + struct mem_cgroup *memcg; if (likely(!nr_pages)) return; - pos = memcg = get_mem_cgroup_from_mm(current->mm); - - do { - if (page_counter_read(&pos->memory) <= pos->high) - continue; - mem_cgroup_events(pos, MEMCG_HIGH, 1); - try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); - } while ((pos = parent_mem_cgroup(pos))); - + memcg = get_mem_cgroup_from_mm(current->mm); + reclaim_high(memcg, nr_pages, GFP_KERNEL); css_put(&memcg->css); current->memcg_nr_pages_over_high = 0; } @@ -2100,6 +2116,11 @@ done_restock: */ do { if (page_counter_read(&memcg->memory) > memcg->high) { + /* Don't bother a random interrupted task */ + if (in_interrupt()) { + schedule_work(&memcg->high_work); + break; + } current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; @@ -4150,6 +4171,8 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; + cancel_work_sync(&memcg->high_work); + mem_cgroup_remove_from_trees(memcg); for_each_node(node) @@ -4196,6 +4219,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, NULL); } + INIT_WORK(&memcg->high_work, high_work_func); memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); memcg->move_charge_at_immigrate = 0; @@ -4267,6 +4291,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (ret) return ret; +#ifdef CONFIG_INET + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + static_key_slow_inc(&memcg_sockets_enabled_key); +#endif + /* * Make sure the memcg is initialized: mem_cgroup_iter() * orders reading memcg->initialized against its callers @@ -4313,6 +4342,10 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); memcg_destroy_kmem(memcg); +#ifdef CONFIG_INET + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + static_key_slow_dec(&memcg_sockets_enabled_key); +#endif __mem_cgroup_free(memcg); } @@ -5533,8 +5566,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) commit_charge(newpage, memcg, true); } -/* Writing them here to avoid exposing memcg's inner layout */ -#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) +#ifdef CONFIG_INET struct static_key memcg_sockets_enabled_key; EXPORT_SYMBOL(memcg_sockets_enabled_key); @@ -5559,10 +5591,15 @@ void sock_update_memcg(struct sock *sk) rcu_read_lock(); memcg = mem_cgroup_from_task(current); - if (memcg != root_mem_cgroup && - memcg->tcp_mem.active && - css_tryget_online(&memcg->css)) + if (memcg == root_mem_cgroup) + goto out; +#ifdef CONFIG_MEMCG_KMEM + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active) + goto out; +#endif + if (css_tryget_online(&memcg->css)) sk->sk_memcg = memcg; +out: rcu_read_unlock(); } EXPORT_SYMBOL(sock_update_memcg); @@ -5583,15 +5620,30 @@ void sock_release_memcg(struct sock *sk) */ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { - struct page_counter *counter; + gfp_t gfp_mask = GFP_KERNEL; - if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated, - nr_pages, &counter)) { - memcg->tcp_mem.memory_pressure = 0; - return true; +#ifdef CONFIG_MEMCG_KMEM + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + struct page_counter *counter; + + if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated, + nr_pages, &counter)) { + memcg->tcp_mem.memory_pressure = 0; + return true; + } + page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages); + memcg->tcp_mem.memory_pressure = 1; + return false; } - page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages); - memcg->tcp_mem.memory_pressure = 1; +#endif + /* Don't block in the packet receive path */ + if (in_softirq()) + gfp_mask = GFP_NOWAIT; + + if (try_charge(memcg, gfp_mask, nr_pages) == 0) + return true; + + try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); return false; } @@ -5602,10 +5654,32 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) */ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) { - page_counter_uncharge(&memcg->tcp_mem.memory_allocated, nr_pages); +#ifdef CONFIG_MEMCG_KMEM + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + page_counter_uncharge(&memcg->tcp_mem.memory_allocated, + nr_pages); + return; + } +#endif + page_counter_uncharge(&memcg->memory, nr_pages); + css_put_many(&memcg->css, nr_pages); } -#endif +#endif /* CONFIG_INET */ + +static int __init cgroup_memory(char *s) +{ + char *token; + + while ((token = strsep(&s, ",")) != NULL) { + if (!*token) + continue; + if (!strcmp(token, "nosocket")) + cgroup_memory_nosocket = true; + } + return 0; +} +__setup("cgroup.memory=", cgroup_memory); /* * subsys_initcall() for memory controller. -- cgit v1.2.3 From 8e8ae645249b85c8ed6c178557f8db8613a6bcc7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:32 -0800 Subject: mm: memcontrol: hook up vmpressure to socket pressure Let the networking stack know when a memcg is under reclaim pressure so that it can clamp its transmit windows accordingly. Whenever the reclaim efficiency of a cgroup's LRU lists drops low enough for a MEDIUM or HIGH vmpressure event to occur, assert a pressure state in the socket and tcp memory code that tells it to curb consumption growth from sockets associated with said control group. Traditionally, vmpressure reports for the entire subtree of a memcg under pressure, which drops useful information on the individual groups reclaimed. However, it's too late to change the userinterface, so add a second reporting mode that reports on the level of reclaim instead of at the level of pressure, and use that report for sockets. vmpressure events are naturally edge triggered, so for hysteresis assert socket pressure for a second to allow for subsequent vmpressure events to occur before letting the socket code return to normal. This will likely need finetuning for a wider variety of workloads, but for now stick to the vmpressure presets and keep hysteresis simple. Signed-off-by: Johannes Weiner Acked-by: David S. Miller Reviewed-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 +++---------- mm/vmpressure.c | 78 +++++++++++++++++++++++++++++++++++++++++++-------------- mm/vmscan.c | 10 +++++++- 3 files changed, 71 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 60ebc486c2aa..df7f144a5a4b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1113,9 +1113,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) return ret; } -#define mem_cgroup_from_counter(counter, member) \ - container_of(counter, struct mem_cgroup, member) - /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup * @memcg: the memory cgroup @@ -4183,17 +4180,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) kfree(memcg); } -/* - * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. - */ -struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) -{ - if (!memcg->memory.parent) - return NULL; - return mem_cgroup_from_counter(memcg->memory.parent, memory); -} -EXPORT_SYMBOL(parent_mem_cgroup); - static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -4233,6 +4219,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); +#endif +#ifdef CONFIG_INET + memcg->socket_pressure = jiffies; #endif return &memcg->css; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index c5afd573d7da..506f03e4be47 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -137,14 +137,11 @@ struct vmpressure_event { }; static bool vmpressure_event(struct vmpressure *vmpr, - unsigned long scanned, unsigned long reclaimed) + enum vmpressure_levels level) { struct vmpressure_event *ev; - enum vmpressure_levels level; bool signalled = false; - level = vmpressure_calc_level(scanned, reclaimed); - mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { @@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work) struct vmpressure *vmpr = work_to_vmpressure(work); unsigned long scanned; unsigned long reclaimed; + enum vmpressure_levels level; spin_lock(&vmpr->sr_lock); /* @@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work) * here. No need for any locks here since we don't care if * vmpr->reclaimed is in sync. */ - scanned = vmpr->scanned; + scanned = vmpr->tree_scanned; if (!scanned) { spin_unlock(&vmpr->sr_lock); return; } - reclaimed = vmpr->reclaimed; - vmpr->scanned = 0; - vmpr->reclaimed = 0; + reclaimed = vmpr->tree_reclaimed; + vmpr->tree_scanned = 0; + vmpr->tree_reclaimed = 0; spin_unlock(&vmpr->sr_lock); + level = vmpressure_calc_level(scanned, reclaimed); + do { - if (vmpressure_event(vmpr, scanned, reclaimed)) + if (vmpressure_event(vmpr, level)) break; /* * If not handled, propagate the event upward into the @@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work) * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle + * @tree: legacy subtree mode * @scanned: number of pages scanned * @reclaimed: number of pages reclaimed * @@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work) * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw * pressure index is then further refined and averaged over time. * + * If @tree is set, vmpressure is in traditional userspace reporting + * mode: @memcg is considered the pressure root and userspace is + * notified of the entire subtree's reclaim efficiency. + * + * If @tree is not set, reclaim efficiency is recorded for @memcg, and + * only in-kernel users are notified. + * * This function does not return any value. */ -void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); @@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, if (!scanned) return; - spin_lock(&vmpr->sr_lock); - vmpr->scanned += scanned; - vmpr->reclaimed += reclaimed; - scanned = vmpr->scanned; - spin_unlock(&vmpr->sr_lock); + if (tree) { + spin_lock(&vmpr->sr_lock); + vmpr->tree_scanned += scanned; + vmpr->tree_reclaimed += reclaimed; + scanned = vmpr->scanned; + spin_unlock(&vmpr->sr_lock); - if (scanned < vmpressure_win) - return; - schedule_work(&vmpr->work); + if (scanned < vmpressure_win) + return; + schedule_work(&vmpr->work); + } else { + enum vmpressure_levels level; + + /* For now, no users for root-level efficiency */ + if (memcg == root_mem_cgroup) + return; + + spin_lock(&vmpr->sr_lock); + scanned = vmpr->scanned += scanned; + reclaimed = vmpr->reclaimed += reclaimed; + if (scanned < vmpressure_win) { + spin_unlock(&vmpr->sr_lock); + return; + } + vmpr->scanned = vmpr->reclaimed = 0; + spin_unlock(&vmpr->sr_lock); + + level = vmpressure_calc_level(scanned, reclaimed); + + if (level > VMPRESSURE_LOW) { + /* + * Let the socket buffer allocator know that + * we are having trouble reclaiming LRU pages. + * + * For hysteresis keep the pressure state + * asserted for a second in which subsequent + * pressure events can occur. + */ + memcg->socket_pressure = jiffies + HZ; + } + } } /** @@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * to the vmpressure() basically means that we signal 'critical' * level. */ - vmpressure(gfp, memcg, vmpressure_win, 0); + vmpressure(gfp, memcg, true, vmpressure_win, 0); } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index e36d766dade9..bb0cbd4c9f01 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2396,6 +2396,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg = mem_cgroup_iter(root, NULL, &reclaim); do { unsigned long lru_pages; + unsigned long reclaimed; unsigned long scanned; struct lruvec *lruvec; int swappiness; @@ -2408,6 +2409,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); + reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; shrink_lruvec(lruvec, swappiness, sc, &lru_pages); @@ -2418,6 +2420,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg, sc->nr_scanned - scanned, lru_pages); + /* Record the group's reclaim efficiency */ + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + /* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the @@ -2449,7 +2456,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, reclaim_state->reclaimed_slab = 0; } - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + /* Record the subtree's reclaim efficiency */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); -- cgit v1.2.3 From ef12947c9c5a96af549c49f10e5503f0612a397c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Jan 2016 15:21:34 -0800 Subject: mm: memcontrol: switch to the updated jump-label API According to the direct use of struct static_key is deprecated. Update the socket and slab accounting code accordingly. Signed-off-by: Johannes Weiner Acked-by: David S. Miller Reported-by: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index df7f144a5a4b..54eae4f19d80 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -346,7 +346,7 @@ void memcg_put_cache_ids(void) * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ -struct static_key memcg_kmem_enabled_key; +DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /* CONFIG_MEMCG_KMEM */ @@ -2907,7 +2907,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, err = page_counter_limit(&memcg->kmem, nr_pages); VM_BUG_ON(err); - static_key_slow_inc(&memcg_kmem_enabled_key); + static_branch_inc(&memcg_kmem_enabled_key); /* * A memory cgroup is considered kmem-active as soon as it gets * kmemcg_id. Setting the id after enabling static branching will @@ -3646,7 +3646,7 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) { if (memcg->kmem_acct_activated) { memcg_destroy_kmem_caches(memcg); - static_key_slow_dec(&memcg_kmem_enabled_key); + static_branch_dec(&memcg_kmem_enabled_key); WARN_ON(page_counter_read(&memcg->kmem)); } tcp_destroy_cgroup(memcg); @@ -4282,7 +4282,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) #ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) - static_key_slow_inc(&memcg_sockets_enabled_key); + static_branch_inc(&memcg_sockets_enabled_key); #endif /* @@ -4333,7 +4333,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) memcg_destroy_kmem(memcg); #ifdef CONFIG_INET if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) - static_key_slow_dec(&memcg_sockets_enabled_key); + static_branch_dec(&memcg_sockets_enabled_key); #endif __mem_cgroup_free(memcg); } @@ -5557,7 +5557,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) #ifdef CONFIG_INET -struct static_key memcg_sockets_enabled_key; +DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(memcg_sockets_enabled_key); void sock_update_memcg(struct sock *sk) -- cgit v1.2.3 From 686739f6af5e8d5687ffebbf1193ff066aada6d9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 14 Jan 2016 15:21:37 -0800 Subject: memcg: avoid vmpressure oops when memcg disabled A CONFIG_MEMCG=y kernel booted with "cgroup_disable=memory" crashes on a NULL memcg (but non-NULL root_mem_cgroup) when vmpressure kicks in. Here's the patch I use to avoid that, but you might prefer a test on mem_cgroup_disabled() somewhere. Signed-off-by: Hugh Dickins Acked-by: Johannes Weiner Cc: David S. Miller Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmpressure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 506f03e4be47..9a6c0704211c 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -260,7 +260,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, enum vmpressure_levels level; /* For now, no users for root-level efficiency */ - if (memcg == root_mem_cgroup) + if (!memcg || memcg == root_mem_cgroup) return; spin_lock(&vmpr->sr_lock); -- cgit v1.2.3 From 0eb77e9880321915322d42913c3b53241739c8aa Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 14 Jan 2016 15:21:40 -0800 Subject: vmstat: make vmstat_updater deferrable again and shut down on idle Currently the vmstat updater is not deferrable as a result of commit ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update"). This in turn can cause multiple interruptions of the applications because the vmstat updater may run at Make vmstate_update deferrable again and provide a function that folds the differentials when the processor is going to idle mode thus addressing the issue of the above commit in a clean way. Note that the shepherd thread will continue scanning the differentials from another processor and will reenable the vmstat workers if it detects any changes. Fixes: ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update") Signed-off-by: Christoph Lameter Cc: Michal Hocko Cc: Johannes Weiner Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 69 +++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index c54fd2924f25..83a003bc3cae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -460,7 +460,7 @@ static int fold_diff(int *diff) * * The function returns the number of global counters updated. */ -static int refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(bool do_pagesets) { struct zone *zone; int i; @@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void) #endif } } - cond_resched(); #ifdef CONFIG_NUMA - /* - * Deal with draining the remote pageset of this - * processor - * - * Check if there are pages remaining in this pageset - * if not then there is nothing to expire. - */ - if (!__this_cpu_read(p->expire) || + if (do_pagesets) { + cond_resched(); + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!__this_cpu_read(p->expire) || !__this_cpu_read(p->pcp.count)) - continue; + continue; - /* - * We never drain zones local to this processor. - */ - if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); - continue; - } + /* + * We never drain zones local to this processor. + */ + if (zone_to_nid(zone) == numa_node_id()) { + __this_cpu_write(p->expire, 0); + continue; + } - if (__this_cpu_dec_return(p->expire)) - continue; + if (__this_cpu_dec_return(p->expire)) + continue; - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); - changes++; + if (__this_cpu_read(p->pcp.count)) { + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } } #endif } @@ -1386,7 +1388,7 @@ static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats()) { + if (refresh_cpu_vm_stats(true)) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1417,6 +1419,23 @@ static void vmstat_update(struct work_struct *w) } } +/* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + do { + if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + cancel_delayed_work(this_cpu_ptr(&vmstat_work)); + + } while (refresh_cpu_vm_stats(false)); +} + /* * Check if the diffs for a certain cpu indicate that * an update is needed. @@ -1449,7 +1468,7 @@ static bool need_update(int cpu) */ static void vmstat_shepherd(struct work_struct *w); -static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); +static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); static void vmstat_shepherd(struct work_struct *w) { -- cgit v1.2.3 From bb5b8589767a300014ba21c5984e6a4d15f0702d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 14 Jan 2016 15:21:43 -0800 Subject: mm: make sure isolate_lru_page() is never called for tail page The VM_BUG_ON_PAGE() would catch such cases if any still exists. Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index bb0cbd4c9f01..108bd119f2f6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1426,6 +1426,7 @@ int isolate_lru_page(struct page *page) int ret = -EBUSY; VM_BUG_ON_PAGE(!page_count(page), page); + VM_BUG_ON_PAGE(PageTail(page), page); if (PageLRU(page)) { struct zone *zone = page_zone(page); -- cgit v1.2.3 From 0d576d20ccdeffbb7ba28053107fa6f838b67f3a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:21:49 -0800 Subject: mm/swapfile.c: use list_for_each_entry_safe in free_swap_count_continuations Use list_for_each_entry_safe() instead of list_for_each_safe() to simplify the code. Signed-off-by: Geliang Tang Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 77551a553f57..e6b8591a3ed2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2953,11 +2953,10 @@ static void free_swap_count_continuations(struct swap_info_struct *si) struct page *head; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head)) { - struct list_head *this, *next; - list_for_each_safe(this, next, &head->lru) { - struct page *page; - page = list_entry(this, struct page, lru); - list_del(this); + struct page *page, *next; + + list_for_each_entry_safe(page, next, &head->lru, lru) { + list_del(&page->lru); __free_page(page); } } -- cgit v1.2.3 From 3e89e1c5ea84211b99199c7636a7d73c50c6b512 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 14 Jan 2016 15:21:52 -0800 Subject: hugetlb: make mm and fs code explicitly non-modular The Kconfig currently controlling compilation of this code is: config HUGETLBFS bool "HugeTLB file system support" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering gets moved to earlier levels when we use the more appropriate initcalls here. Originally I had the fs part and the mm part as separate commits, just by happenstance of the nature of how I detected these non-modular use cases. But that can possibly introduce regressions if the patch merge ordering puts the fs part 1st -- as the 0-day testing reported a splat at mount time. Investigating with "initcall_debug" showed that the delta was init_hugetlbfs_fs being called _before_ hugetlb_init instead of after. So both the fs change and the mm change are here together. In addition, it worked before due to luck of link order, since they were both in the same initcall category. So we now have the fs part using fs_initcall, and the mm part using subsys_initcall, which puts it one bucket earlier. It now passes the basic sanity test that failed in earlier 0-day testing. We delete the MODULE_LICENSE tag and capture that information at the top of the file alongside author comments, etc. We don't replace module.h with init.h since the file already has that. Also note that MODULE_ALIAS is a no-op for non-modular code. Signed-off-by: Paul Gortmaker Reported-by: kernel test robot Cc: Nadia Yvette Chambers Cc: Alexander Viro Cc: Naoya Horiguchi Reviewed-by: Mike Kravetz Cc: David Rientjes Cc: Hillf Danton Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 39 +-------------------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ef6963b577fd..be934df69b85 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4,7 +4,6 @@ */ #include #include -#include #include #include #include @@ -2549,25 +2548,6 @@ static void hugetlb_unregister_node(struct node *node) nhs->hugepages_kobj = NULL; } -/* - * hugetlb module exit: unregister hstate attributes from node devices - * that have them. - */ -static void hugetlb_unregister_all_nodes(void) -{ - int nid; - - /* - * disable node device registrations. - */ - register_hugetlbfs_with_node(NULL, NULL); - - /* - * remove hstate attributes from any nodes that have them. - */ - for (nid = 0; nid < nr_node_ids; nid++) - hugetlb_unregister_node(node_devices[nid]); -} /* * Register hstate attributes for a single node device. @@ -2632,27 +2612,10 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) return NULL; } -static void hugetlb_unregister_all_nodes(void) { } - static void hugetlb_register_all_nodes(void) { } #endif -static void __exit hugetlb_exit(void) -{ - struct hstate *h; - - hugetlb_unregister_all_nodes(); - - for_each_hstate(h) { - kobject_put(hstate_kobjs[hstate_index(h)]); - } - - kobject_put(hugepages_kobj); - kfree(hugetlb_fault_mutex_table); -} -module_exit(hugetlb_exit); - static int __init hugetlb_init(void) { int i; @@ -2690,7 +2653,7 @@ static int __init hugetlb_init(void) mutex_init(&hugetlb_fault_mutex_table[i]); return 0; } -module_init(hugetlb_init); +subsys_initcall(hugetlb_init); /* Should be called on processing a hugepagesz=... option */ void __init hugetlb_add_hstate(unsigned int order) -- cgit v1.2.3 From 6f754ba4cfcc044078d4836056ac45404e1b6e85 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 14 Jan 2016 15:21:55 -0800 Subject: memory-hotplug: don't BUG() in register_memory_resource() Out of memory condition is not a bug and while we can't add new memory in such case crashing the system seems wrong. Propagating the return value from register_memory_resource() requires interface change. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Igor Mammedov Acked-by: David Rientjes Cc: Tang Chen Cc: Naoya Horiguchi Cc: Xishi Qiu Cc: Sheng Yong Cc: Zhu Guihua Cc: Dan Williams Cc: David Vrabel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a042a9d537bb..92f95952692b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -131,7 +131,8 @@ static struct resource *register_memory_resource(u64 start, u64 size) { struct resource *res; res = kzalloc(sizeof(struct resource), GFP_KERNEL); - BUG_ON(!res); + if (!res) + return ERR_PTR(-ENOMEM); res->name = "System RAM"; res->start = start; @@ -140,7 +141,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) if (request_resource(&iomem_resource, res) < 0) { pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); - res = NULL; + return ERR_PTR(-EEXIST); } return res; } @@ -1312,8 +1313,8 @@ int __ref add_memory(int nid, u64 start, u64 size) int ret; res = register_memory_resource(start, size); - if (!res) - return -EEXIST; + if (IS_ERR(res)) + return PTR_ERR(res); ret = add_memory_resource(nid, res); if (ret < 0) -- cgit v1.2.3 From d72ee911130631b50a8ccc615a7d4622c2062194 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:22:01 -0800 Subject: mm: move lru_to_page to mm_inline.h Move lru_to_page() from internal.h to mm_inline.h. Signed-off-by: Geliang Tang Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 2 -- mm/readahead.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 016452d2fede..38e24b89e4c4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -119,8 +119,6 @@ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); extern bool zone_reclaimable(struct zone *zone); -#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) - /* * in mm/rmap.c: */ diff --git a/mm/readahead.c b/mm/readahead.c index 0aff760b09d4..20e58e820e44 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "internal.h" -- cgit v1.2.3 From 84638335900f1995495838fe1bd4870c43ec1f67 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 14 Jan 2016 15:22:07 -0800 Subject: mm: rework virtual memory accounting When inspecting a vague code inside prctl(PR_SET_MM_MEM) call (which testing the RLIMIT_DATA value to figure out if we're allowed to assign new @start_brk, @brk, @start_data, @end_data from mm_struct) it's been commited that RLIMIT_DATA in a form it's implemented now doesn't do anything useful because most of user-space libraries use mmap() syscall for dynamic memory allocations. Linus suggested to convert RLIMIT_DATA rlimit into something suitable for anonymous memory accounting. But in this patch we go further, and the changes are bundled together as: * keep vma counting if CONFIG_PROC_FS=n, will be used for limits * replace mm->shared_vm with better defined mm->data_vm * account anonymous executable areas as executable * account file-backed growsdown/up areas as stack * drop struct file* argument from vm_stat_account * enforce RLIMIT_DATA for size of data areas This way code looks cleaner: now code/stack/data classification depends only on vm_flags state: VM_EXEC & ~VM_WRITE -> code (VmExe + VmLib in proc) VM_GROWSUP | VM_GROWSDOWN -> stack (VmStk) VM_WRITE & ~VM_SHARED & !stack -> data (VmData) The rest (VmSize - VmData - VmStk - VmExe - VmLib) could be called "shared", but that might be strange beast like readonly-private or VM_IO area. - RLIMIT_AS limits whole address space "VmSize" - RLIMIT_STACK limits stack "VmStk" (but each vma individually) - RLIMIT_DATA now limits "VmData" Signed-off-by: Konstantin Khlebnikov Signed-off-by: Cyrill Gorcunov Cc: Quentin Casasnovas Cc: Vegard Nossum Acked-by: Linus Torvalds Cc: Willy Tarreau Cc: Andy Lutomirski Cc: Kees Cook Cc: Vladimir Davydov Cc: Pavel Emelyanov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 4 ++-- mm/mmap.c | 63 ++++++++++++++++++++++++++++++----------------------------- mm/mprotect.c | 8 ++++++-- mm/mremap.c | 7 ++++--- 4 files changed, 44 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index 668aa35191ca..5d2072ed8d5e 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -175,7 +175,7 @@ void dump_mm(const struct mm_struct *mm) "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" - "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" + "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" @@ -209,7 +209,7 @@ void dump_mm(const struct mm_struct *mm) mm_nr_pmds((struct mm_struct *)mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, - mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, + mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, diff --git a/mm/mmap.c b/mm/mmap.c index f32b84ad621a..b3f00b616b81 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1220,24 +1220,6 @@ none: return NULL; } -#ifdef CONFIG_PROC_FS -void vm_stat_account(struct mm_struct *mm, unsigned long flags, - struct file *file, long pages) -{ - const unsigned long stack_flags - = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); - - mm->total_vm += pages; - - if (file) { - mm->shared_vm += pages; - if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) - mm->exec_vm += pages; - } else if (flags & stack_flags) - mm->stack_vm += pages; -} -#endif /* CONFIG_PROC_FS */ - /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr @@ -1556,7 +1538,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long charged = 0; /* Check against address space limit. */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; /* @@ -1565,7 +1547,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ nr_pages = count_vma_pages_range(mm, addr, addr + len); - if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) + if (!may_expand_vm(mm, vm_flags, + (len >> PAGE_SHIFT) - nr_pages)) return -ENOMEM; } @@ -1664,7 +1647,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, out: perf_event_mmap(vma); - vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); + vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))) @@ -2111,7 +2094,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns unsigned long new_start, actual_size; /* address space limit tests */ - if (!may_expand_vm(mm, grow)) + if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ @@ -2208,8 +2191,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, - vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; anon_vma_interval_tree_post_update_vma(vma); @@ -2284,8 +2266,7 @@ int expand_downwards(struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, - vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; @@ -2399,7 +2380,7 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_flags & VM_ACCOUNT) nr_accounted += nrpages; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); + vm_stat_account(mm, vma->vm_flags, -nrpages); vma = remove_vma(vma); } while (vma); vm_unacct_memory(nr_accounted); @@ -2769,7 +2750,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) } /* Check against address space limits *after* clearing old maps... */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) + if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) @@ -2804,6 +2785,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; + mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; @@ -2995,9 +2977,28 @@ out: * Return true if the calling process may expand its vm space by the passed * number of pages */ -int may_expand_vm(struct mm_struct *mm, unsigned long npages) +bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) { - return mm->total_vm + npages <= rlimit(RLIMIT_AS) >> PAGE_SHIFT; + if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) + return false; + + if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS & + (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE) + return mm->data_vm + npages <= rlimit(RLIMIT_DATA); + + return true; +} + +void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) +{ + mm->total_vm += npages; + + if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC) + mm->exec_vm += npages; + else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) + mm->stack_vm += npages; + else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + mm->data_vm += npages; } static int special_mapping_fault(struct vm_area_struct *vma, @@ -3079,7 +3080,7 @@ static struct vm_area_struct *__install_special_mapping( if (ret) goto out; - mm->total_vm += len >> PAGE_SHIFT; + vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); perf_event_mmap(vma); diff --git a/mm/mprotect.c b/mm/mprotect.c index ef5be8eaab00..c764402c464f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -278,6 +278,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * even if read-only so there is no need to account for them here */ if (newflags & VM_WRITE) { + /* Check space limits when area turns into data. */ + if (!may_expand_vm(mm, newflags, nrpages) && + may_expand_vm(mm, oldflags, nrpages)) + return -ENOMEM; if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; @@ -334,8 +338,8 @@ success: populate_vma_page_range(vma, start, end, NULL); } - vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); - vm_stat_account(mm, newflags, vma->vm_file, nrpages); + vm_stat_account(mm, oldflags, -nrpages); + vm_stat_account(mm, newflags, nrpages); perf_event_mmap(vma); return 0; diff --git a/mm/mremap.c b/mm/mremap.c index de824e72c3e8..e55b157865d5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -317,7 +317,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, * If this were a serious issue, we'd add a flag to do_munmap(). */ hiwater_vm = mm->hiwater_vm; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT); /* Tell pfnmap has moved from this vma */ if (unlikely(vma->vm_flags & VM_PFNMAP)) @@ -383,7 +383,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, return ERR_PTR(-EAGAIN); } - if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) + if (!may_expand_vm(mm, vma->vm_flags, + (new_len - old_len) >> PAGE_SHIFT)) return ERR_PTR(-ENOMEM); if (vma->vm_flags & VM_ACCOUNT) { @@ -545,7 +546,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); + vm_stat_account(mm, vma->vm_flags, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; locked = true; -- cgit v1.2.3 From fec174d6699e2e39d34c7c6ee2fe360e518e68c0 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 14 Jan 2016 15:22:13 -0800 Subject: mm/page_isolation: use macro to judge the alignment Signed-off-by: Wang Xiaoqiang Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f484b9300fe3..5e139fec6c6c 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -165,8 +165,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long undo_pfn; struct page *page; - BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); - BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); + BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); + BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); for (pfn = start_pfn; pfn < end_pfn; -- cgit v1.2.3 From 7d2eba0557c18f7522b98befed98799990dd4fdb Mon Sep 17 00:00:00 2001 From: Ebru Akagunduz Date: Thu, 14 Jan 2016 15:22:19 -0800 Subject: mm: add tracepoint for scanning pages This patch series makes swapin readahead up to a certain number to gain more thp performance and adds tracepoint for khugepaged_scan_pmd, collapse_huge_page, __collapse_huge_page_isolate. This patch series was written to deal with programs that access most, but not all, of their memory after they get swapped out. Currently these programs do not get their memory collapsed into THPs after the system swapped their memory out, while they would get THPs before swapping happened. This patch series was tested with a test program, it allocates 400MB of memory, writes to it, and then sleeps. I force the system to swap out all. Afterwards, the test program touches the area by writing and leaves a piece of it without writing. This shows how much swap in readahead made by the patch. Test results: After swapped out ------------------------------------------------------------------- | Anonymous | AnonHugePages | Swap | Fraction | ------------------------------------------------------------------- With patch | 90076 kB | 88064 kB | 309928 kB | %99 | ------------------------------------------------------------------- Without patch | 194068 kB | 192512 kB | 205936 kB | %99 | ------------------------------------------------------------------- After swapped in ------------------------------------------------------------------- | Anonymous | AnonHugePages | Swap | Fraction | ------------------------------------------------------------------- With patch | 201408 kB | 198656 kB | 198596 kB | %98 | ------------------------------------------------------------------- Without patch | 292624 kB | 192512 kB | 107380 kB | %65 | ------------------------------------------------------------------- This patch (of 3): Using static tracepoints, data of functions is recorded. It is good to automatize debugging without doing a lot of changes in the source code. This patch adds tracepoint for khugepaged_scan_pmd, collapse_huge_page and __collapse_huge_page_isolate. [dan.carpenter@oracle.com: add a missing tab] Signed-off-by: Ebru Akagunduz Acked-by: Kirill A. Shutemov Acked-by: Rik van Riel Cc: Naoya Horiguchi Cc: Andrea Arcangeli Cc: Joonsoo Kim Cc: Xie XiuQi Cc: Cyrill Gorcunov Cc: Mel Gorman Cc: David Rientjes Cc: Vlastimil Babka Cc: Aneesh Kumar K.V Cc: Hugh Dickins Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 166 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 134 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 62fe06bb7d04..f952f055fdcf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -31,6 +31,33 @@ #include #include "internal.h" +enum scan_result { + SCAN_FAIL, + SCAN_SUCCEED, + SCAN_PMD_NULL, + SCAN_EXCEED_NONE_PTE, + SCAN_PTE_NON_PRESENT, + SCAN_PAGE_RO, + SCAN_NO_REFERENCED_PAGE, + SCAN_PAGE_NULL, + SCAN_SCAN_ABORT, + SCAN_PAGE_COUNT, + SCAN_PAGE_LRU, + SCAN_PAGE_LOCK, + SCAN_PAGE_ANON, + SCAN_ANY_PROCESS, + SCAN_VMA_NULL, + SCAN_VMA_CHECK, + SCAN_ADDRESS_RANGE, + SCAN_SWAP_CACHE_PAGE, + SCAN_DEL_PAGE_LRU, + SCAN_ALLOC_HUGE_PAGE_FAIL, + SCAN_CGROUP_CHARGE_FAIL +}; + +#define CREATE_TRACE_POINTS +#include + /* * By default transparent hugepage support is disabled in order that avoid * to risk increase the memory footprint of applications without a guaranteed @@ -2198,26 +2225,33 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { - struct page *page; + struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0; + int none_or_zero = 0, result = 0; bool referenced = false, writable = false; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) + ++none_or_zero <= khugepaged_max_ptes_none) { continue; - else + } else { + result = SCAN_EXCEED_NONE_PTE; goto out; + } } - if (!pte_present(pteval)) + if (!pte_present(pteval)) { + result = SCAN_PTE_NON_PRESENT; goto out; + } page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) + if (unlikely(!page)) { + result = SCAN_PAGE_NULL; goto out; + } VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page); @@ -2229,8 +2263,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * is needed to serialize against split_huge_page * when invoked from the VM. */ - if (!trylock_page(page)) + if (!trylock_page(page)) { + result = SCAN_PAGE_LOCK; goto out; + } /* * cannot use mapcount: can't collapse if there's a gup pin. @@ -2239,6 +2275,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, */ if (page_count(page) != 1 + !!PageSwapCache(page)) { unlock_page(page); + result = SCAN_PAGE_COUNT; goto out; } if (pte_write(pteval)) { @@ -2246,6 +2283,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } else { if (PageSwapCache(page) && !reuse_swap_page(page)) { unlock_page(page); + result = SCAN_SWAP_CACHE_PAGE; goto out; } /* @@ -2260,6 +2298,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, */ if (isolate_lru_page(page)) { unlock_page(page); + result = SCAN_DEL_PAGE_LRU; goto out; } /* 0 stands for page_is_file_cache(page) == false */ @@ -2273,10 +2312,21 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } - if (likely(referenced && writable)) - return 1; + if (likely(writable)) { + if (likely(referenced)) { + result = SCAN_SUCCEED; + trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero, + referenced, writable, result); + return 1; + } + } else { + result = SCAN_PAGE_RO; + } + out: release_pte_pages(pte, _pte); + trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero, + referenced, writable, result); return 0; } @@ -2513,7 +2563,7 @@ static void collapse_huge_page(struct mm_struct *mm, pgtable_t pgtable; struct page *new_page; spinlock_t *pmd_ptl, *pte_ptl; - int isolated; + int isolated, result = 0; unsigned long hstart, hend; struct mem_cgroup *memcg; unsigned long mmun_start; /* For mmu_notifiers */ @@ -2528,12 +2578,15 @@ static void collapse_huge_page(struct mm_struct *mm, /* release the mmap_sem read lock. */ new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); - if (!new_page) - return; + if (!new_page) { + result = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out_nolock; + } - if (unlikely(mem_cgroup_try_charge(new_page, mm, - gfp, &memcg))) - return; + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) { + result = SCAN_CGROUP_CHARGE_FAIL; + goto out_nolock; + } /* * Prevent all access to pagetables with the exception of @@ -2541,21 +2594,31 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ down_write(&mm->mmap_sem); - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(khugepaged_test_exit(mm))) { + result = SCAN_ANY_PROCESS; goto out; + } vma = find_vma(mm, address); - if (!vma) + if (!vma) { + result = SCAN_VMA_NULL; goto out; + } hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; - if (address < hstart || address + HPAGE_PMD_SIZE > hend) + if (address < hstart || address + HPAGE_PMD_SIZE > hend) { + result = SCAN_ADDRESS_RANGE; goto out; - if (!hugepage_vma_check(vma)) + } + if (!hugepage_vma_check(vma)) { + result = SCAN_VMA_CHECK; goto out; + } pmd = mm_find_pmd(mm, address); - if (!pmd) + if (!pmd) { + result = SCAN_PMD_NULL; goto out; + } anon_vma_lock_write(vma->anon_vma); @@ -2592,6 +2655,7 @@ static void collapse_huge_page(struct mm_struct *mm, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); + result = SCAN_FAIL; goto out; } @@ -2629,10 +2693,15 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = NULL; khugepaged_pages_collapsed++; + result = SCAN_SUCCEED; out_up_write: up_write(&mm->mmap_sem); + trace_mm_collapse_huge_page(mm, isolated, result); return; +out_nolock: + trace_mm_collapse_huge_page(mm, isolated, result); + return; out: mem_cgroup_cancel_charge(new_page, memcg); goto out_up_write; @@ -2645,8 +2714,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none_or_zero = 0; - struct page *page; + int ret = 0, none_or_zero = 0, result = 0; + struct page *page = NULL; unsigned long _address; spinlock_t *ptl; int node = NUMA_NO_NODE; @@ -2655,8 +2724,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); pmd = mm_find_pmd(mm, address); - if (!pmd) + if (!pmd) { + result = SCAN_PMD_NULL; goto out; + } memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2665,19 +2736,25 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) + ++none_or_zero <= khugepaged_max_ptes_none) { continue; - else + } else { + result = SCAN_EXCEED_NONE_PTE; goto out_unmap; + } } - if (!pte_present(pteval)) + if (!pte_present(pteval)) { + result = SCAN_PTE_NON_PRESENT; goto out_unmap; + } if (pte_write(pteval)) writable = true; page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) + if (unlikely(!page)) { + result = SCAN_PAGE_NULL; goto out_unmap; + } /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. @@ -2685,26 +2762,49 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * hit record. */ node = page_to_nid(page); - if (khugepaged_scan_abort(node)) + if (khugepaged_scan_abort(node)) { + result = SCAN_SCAN_ABORT; goto out_unmap; + } khugepaged_node_load[node]++; VM_BUG_ON_PAGE(PageCompound(page), page); - if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + if (!PageLRU(page)) { + result = SCAN_SCAN_ABORT; + goto out_unmap; + } + if (PageLocked(page)) { + result = SCAN_PAGE_LOCK; + goto out_unmap; + } + if (!PageAnon(page)) { + result = SCAN_PAGE_ANON; goto out_unmap; + } + /* * cannot use mapcount: can't collapse if there's a gup pin. * The page must only be referenced by the scanned process * and page swap cache. */ - if (page_count(page) != 1 + !!PageSwapCache(page)) + if (page_count(page) != 1 + !!PageSwapCache(page)) { + result = SCAN_PAGE_COUNT; goto out_unmap; + } if (pte_young(pteval) || page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } - if (referenced && writable) - ret = 1; + if (writable) { + if (referenced) { + result = SCAN_SUCCEED; + ret = 1; + } else { + result = SCAN_NO_REFERENCED_PAGE; + } + } else { + result = SCAN_PAGE_RO; + } out_unmap: pte_unmap_unlock(pte, ptl); if (ret) { @@ -2713,6 +2813,8 @@ out_unmap: collapse_huge_page(mm, address, hpage, vma, node); } out: + trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced, + none_or_zero, result); return ret; } -- cgit v1.2.3 From f58fb5e7f0ab05c9083869c1ec27854af2afc7b7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 14 Jan 2016 15:22:38 -0800 Subject: mm/zbud.c: use list_last_entry() instead of list_tail_entry() list_last_entry*( has been defined in list.h, so replace list_tail_entry() with it. Signed-off-by: Geliang Tang Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zbud.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/zbud.c b/mm/zbud.c index d8a181fd779b..b42322e50f63 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -463,9 +463,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle) spin_unlock(&pool->lock); } -#define list_tail_entry(ptr, type, member) \ - list_entry((ptr)->prev, type, member) - /** * zbud_reclaim_page() - evicts allocations from a pool page and frees it * @pool: pool from which a page will attempt to be evicted @@ -514,7 +511,7 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) return -EINVAL; } for (i = 0; i < retries; i++) { - zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru); + zhdr = list_last_entry(&pool->lru, struct zbud_header, lru); list_del(&zhdr->lru); list_del(&zhdr->buddy); /* Protect zbud page against free */ -- cgit v1.2.3 From 7dfa4612204b511c934ca2a0e4f306f9981bd9aa Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Thu, 14 Jan 2016 15:22:40 -0800 Subject: zsmalloc: reorganize struct size_class to pack 4 bytes hole Reoder the pages_per_zspage field in struct size_class which can eliminate the 4 bytes hole between it and stats field. Signed-off-by: Weijie Yang Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9f15bdd9163c..e7414cec220b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -213,10 +213,10 @@ struct size_class { int size; unsigned int index; - /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ - int pages_per_zspage; struct zs_size_stat stats; + /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ + int pages_per_zspage; /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ bool huge; }; -- cgit v1.2.3