From 2847cf95c68fa5fa391c58a669e761c4b0c8fc57 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 12 Dec 2014 16:55:01 -0800 Subject: mm/debug-pagealloc: cleanup page guard code Page guard is used by debug-pagealloc feature. Currently, it is open-coded, but, I think that more abstraction of it makes core page allocator code more readable. There is no functional difference. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Gioh Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df542feaac3b..2e8b7f39605a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -439,18 +439,29 @@ static int __init debug_guardpage_minorder_setup(char *buf) } __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); -static inline void set_page_guard_flag(struct page *page) +static inline void set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); + INIT_LIST_HEAD(&page->lru); + set_page_private(page, order); + /* Guard pages are not available for any usage */ + __mod_zone_freepage_state(zone, -(1 << order), migratetype); } -static inline void clear_page_guard_flag(struct page *page) +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) { __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); + set_page_private(page, 0); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else -static inline void set_page_guard_flag(struct page *page) { } -static inline void clear_page_guard_flag(struct page *page) { } +static inline void set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) {} +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) {} #endif static inline void set_page_order(struct page *page, unsigned int order) @@ -581,12 +592,7 @@ static inline void __free_one_page(struct page *page, * merge with it and move up one order. */ if (page_is_guard(buddy)) { - clear_page_guard_flag(buddy); - set_page_private(buddy, 0); - if (!is_migrate_isolate(migratetype)) { - __mod_zone_freepage_state(zone, 1 << order, - migratetype); - } + clear_page_guard(zone, buddy, order, migratetype); } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; @@ -861,23 +867,17 @@ static inline void expand(struct zone *zone, struct page *page, size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); -#ifdef CONFIG_DEBUG_PAGEALLOC - if (high < debug_guardpage_minorder()) { + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && + high < debug_guardpage_minorder()) { /* * Mark as guard pages (or page), that will allow to * merge back to allocator when buddy will be freed. * Corresponding page table entries will not be touched, * pages will stay not present in virtual address space */ - INIT_LIST_HEAD(&page[size].lru); - set_page_guard_flag(&page[size]); - set_page_private(&page[size], high); - /* Guard pages are not available for any usage */ - __mod_zone_freepage_state(zone, -(1 << high), - migratetype); + set_page_guard(zone, &page[size], high, migratetype); continue; } -#endif list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); -- cgit v1.2.3 From eefa864b701d78dc9753c70a3540a2e9ae192595 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 12 Dec 2014 16:55:46 -0800 Subject: mm/page_ext: resurrect struct page extending code for debugging When we debug something, we'd like to insert some information to every page. For this purpose, we sometimes modify struct page itself. But, this has drawbacks. First, it requires re-compile. This makes us hesitate to use the powerful debug feature so development process is slowed down. And, second, sometimes it is impossible to rebuild the kernel due to third party module dependency. At third, system behaviour would be largely different after re-compile, because it changes size of struct page greatly and this structure is accessed by every part of kernel. Keeping this as it is would be better to reproduce errornous situation. This feature is intended to overcome above mentioned problems. This feature allocates memory for extended data per page in certain place rather than the struct page itself. This memory can be accessed by the accessor functions provided by this code. During the boot process, it checks whether allocation of huge chunk of memory is needed or not. If not, it avoids allocating memory at all. With this advantage, we can include this feature into the kernel in default and can avoid rebuild and solve related problems. Until now, memcg uses this technique. But, now, memcg decides to embed their variable to struct page itself and it's code to extend struct page has been removed. I'd like to use this code to develop debug feature, so this patch resurrect it. To help these things to work well, this patch introduces two callbacks for clients. One is the need callback which is mandatory if user wants to avoid useless memory allocation at boot-time. The other is optional, init callback, which is used to do proper initialization after memory is allocated. Detailed explanation about purpose of these functions is in code comment. Please refer it. Others are completely same with previous extension code in memcg. Signed-off-by: Joonsoo Kim Cc: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Dave Hansen Cc: Michal Nazarewicz Cc: Jungsoo Son Cc: Ingo Molnar Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2e8b7f39605a..b64666cf5865 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -4856,6 +4857,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); + pgdat_page_ext_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; -- cgit v1.2.3 From e30825f1869a75b29a69dc8e0aaaaccc492092cf Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 12 Dec 2014 16:55:49 -0800 Subject: mm/debug-pagealloc: prepare boottime configurable on/off Until now, debug-pagealloc needs extra flags in struct page, so we need to recompile whole source code when we decide to use it. This is really painful, because it takes some time to recompile and sometimes rebuild is not possible due to third party module depending on struct page. So, we can't use this good feature in many cases. Now, we have the page extension feature that allows us to insert extra flags to outside of struct page. This gets rid of third party module issue mentioned above. And, this allows us to determine if we need extra memory for this page extension in boottime. With these property, we can avoid using debug-pagealloc in boottime with low computational overhead in the kernel built with CONFIG_DEBUG_PAGEALLOC. This will help our development process greatly. This patch is the preparation step to achive above goal. debug-pagealloc originally uses extra field of struct page, but, after this patch, it will use field of struct page_ext. Because memory for page_ext is allocated later than initialization of page allocator in CONFIG_SPARSEMEM, we should disable debug-pagealloc feature temporarily until initialization of page_ext. This patch implements this. Signed-off-by: Joonsoo Kim Cc: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Dave Hansen Cc: Michal Nazarewicz Cc: Jungsoo Son Cc: Ingo Molnar Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b64666cf5865..e0a39d328ca1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -56,7 +56,7 @@ #include #include #include -#include +#include #include #include @@ -425,6 +425,22 @@ static inline void prep_zero_page(struct page *page, unsigned int order, #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; +bool _debug_guardpage_enabled __read_mostly; + +static bool need_debug_guardpage(void) +{ + return true; +} + +static void init_debug_guardpage(void) +{ + _debug_guardpage_enabled = true; +} + +struct page_ext_operations debug_guardpage_ops = { + .need = need_debug_guardpage, + .init = init_debug_guardpage, +}; static int __init debug_guardpage_minorder_setup(char *buf) { @@ -443,7 +459,14 @@ __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); static inline void set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { - __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); + struct page_ext *page_ext; + + if (!debug_guardpage_enabled()) + return; + + page_ext = lookup_page_ext(page); + __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + INIT_LIST_HEAD(&page->lru); set_page_private(page, order); /* Guard pages are not available for any usage */ @@ -453,12 +476,20 @@ static inline void set_page_guard(struct zone *zone, struct page *page, static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) { - __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); + struct page_ext *page_ext; + + if (!debug_guardpage_enabled()) + return; + + page_ext = lookup_page_ext(page); + __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + set_page_private(page, 0); if (!is_migrate_isolate(migratetype)) __mod_zone_freepage_state(zone, (1 << order), migratetype); } #else +struct page_ext_operations debug_guardpage_ops = { NULL, }; static inline void set_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -869,6 +900,7 @@ static inline void expand(struct zone *zone, struct page *page, VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && + debug_guardpage_enabled() && high < debug_guardpage_minorder()) { /* * Mark as guard pages (or page), that will allow to -- cgit v1.2.3 From 031bc5743f158b2d5498294f489e534a31251626 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 12 Dec 2014 16:55:52 -0800 Subject: mm/debug-pagealloc: make debug-pagealloc boottime configurable Now, we have prepared to avoid using debug-pagealloc in boottime. So introduce new kernel-parameter to disable debug-pagealloc in boottime, and makes related functions to be disabled in this case. Only non-intuitive part is change of guard page functions. Because guard page is effective only if debug-pagealloc is enabled, turning off according to debug-pagealloc is reasonable thing to do. Signed-off-by: Joonsoo Kim Cc: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Dave Hansen Cc: Michal Nazarewicz Cc: Jungsoo Son Cc: Ingo Molnar Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e0a39d328ca1..303d38516807 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -425,15 +425,35 @@ static inline void prep_zero_page(struct page *page, unsigned int order, #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; +bool _debug_pagealloc_enabled __read_mostly; bool _debug_guardpage_enabled __read_mostly; +static int __init early_debug_pagealloc(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + _debug_pagealloc_enabled = true; + + return 0; +} +early_param("debug_pagealloc", early_debug_pagealloc); + static bool need_debug_guardpage(void) { + /* If we don't use debug_pagealloc, we don't need guard page */ + if (!debug_pagealloc_enabled()) + return false; + return true; } static void init_debug_guardpage(void) { + if (!debug_pagealloc_enabled()) + return; + _debug_guardpage_enabled = true; } -- cgit v1.2.3 From 48c96a3685795e52903e60c7ee115e5e22e7d640 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 12 Dec 2014 16:56:01 -0800 Subject: mm/page_owner: keep track of page owners This is the page owner tracking code which is introduced so far ago. It is resident on Andrew's tree, though, nobody tried to upstream so it remain as is. Our company uses this feature actively to debug memory leak or to find a memory hogger so I decide to upstream this feature. This functionality help us to know who allocates the page. When allocating a page, we store some information about allocation in extra memory. Later, if we need to know status of all pages, we can get and analyze it from this stored information. In previous version of this feature, extra memory is statically defined in struct page, but, in this version, extra memory is allocated outside of struct page. It enables us to turn on/off this feature at boottime without considerable memory waste. Although we already have tracepoint for tracing page allocation/free, using it to analyze page owner is rather complex. We need to enlarge the trace buffer for preventing overlapping until userspace program launched. And, launched program continually dump out the trace buffer for later analysis and it would change system behaviour with more possibility rather than just keeping it in memory, so bad for debug. Moreover, we can use page_owner feature further for various purposes. For example, we can use it for fragmentation statistics implemented in this patch. And, I also plan to implement some CMA failure debugging feature using this interface. I'd like to give the credit for all developers contributed this feature, but, it's not easy because I don't know exact history. Sorry about that. Below is people who has "Signed-off-by" in the patches in Andrew's tree. Contributor: Alexander Nyberg Mel Gorman Dave Hansen Minchan Kim Michal Nazarewicz Andrew Morton Jungsoo Son Signed-off-by: Joonsoo Kim Cc: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Dave Hansen Cc: Michal Nazarewicz Cc: Jungsoo Son Cc: Ingo Molnar Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 303d38516807..c13b6b29add2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -813,6 +814,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) if (bad) return false; + reset_page_owner(page, order); + if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), PAGE_SIZE << order); @@ -988,6 +991,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); + set_page_owner(page, order, gfp_flags); + return 0; } @@ -1560,8 +1565,11 @@ void split_page(struct page *page, unsigned int order) split_page(virt_to_page(page[0].shadow), order); #endif - for (i = 1; i < (1 << order); i++) + set_page_owner(page, 0, 0); + for (i = 1; i < (1 << order); i++) { set_page_refcounted(page + i); + set_page_owner(page + i, 0, 0); + } } EXPORT_SYMBOL_GPL(split_page); @@ -1601,6 +1609,7 @@ int __isolate_free_page(struct page *page, unsigned int order) } } + set_page_owner(page, order, 0); return 1UL << order; } -- cgit v1.2.3 From 6b4f7799c6a5703ac6b8c0649f4c22f00fa07513 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 12 Dec 2014 16:56:13 -0800 Subject: mm: vmscan: invoke slab shrinkers from shrink_zone() The slab shrinkers are currently invoked from the zonelist walkers in kswapd, direct reclaim, and zone reclaim, all of which roughly gauge the eligible LRU pages and assemble a nodemask to pass to NUMA-aware shrinkers, which then again have to walk over the nodemask. This is redundant code, extra runtime work, and fairly inaccurate when it comes to the estimation of actually scannable LRU pages. The code duplication will only get worse when making the shrinkers cgroup-aware and requiring them to have out-of-band cgroup hierarchy walks as well. Instead, invoke the shrinkers from shrink_zone(), which is where all reclaimers end up, to avoid this duplication. Take the count for eligible LRU pages out of get_scan_count(), which considers many more factors than just the availability of swap space, like zone_reclaimable_pages() currently does. Accumulate the number over all visited lruvecs to get the per-zone value. Some nodes have multiple zones due to memory addressing restrictions. To avoid putting too much pressure on the shrinkers, only invoke them once for each such node, using the class zone of the allocation as the pivot zone. For now, this integrates the slab shrinking better into the reclaim logic and gets rid of duplicative invocations from kswapd, direct reclaim, and zone reclaim. It also prepares for cgroup-awareness, allowing memcg-capable shrinkers to be added at the lruvec level without much duplication of both code and runtime work. This changes kswapd behavior, which used to invoke the shrinkers for each zone, but with scan ratios gathered from the entire node, resulting in meaningless pressure quantities on multi-zone nodes. Zone reclaim behavior also changes. It used to shrink slabs until the same amount of pages were shrunk as were reclaimed from the LRUs. Now it merely invokes the shrinkers once with the zone's scan ratio, which makes the shrinkers go easier on caches that implement aging and would prefer feeding back pressure from recently used slab objects to unused LRU pages. [vdavydov@parallels.com: assure class zone is populated] Signed-off-by: Johannes Weiner Cc: Dave Chinner Signed-off-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c13b6b29add2..9d3870862293 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6284,9 +6284,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (!PageLRU(page)) found++; /* - * If there are RECLAIMABLE pages, we need to check it. - * But now, memory offline itself doesn't call shrink_slab() - * and it still to be fixed. + * If there are RECLAIMABLE pages, we need to check + * it. But now, memory offline itself doesn't call + * shrink_node_slabs() and it still to be fixed. */ /* * If the page is not RAM, page_count()should be 0. -- cgit v1.2.3 From ba914f481507a0542a7c8a3fc15d89414bc2ebf3 Mon Sep 17 00:00:00 2001 From: Zhong Hongbo Date: Fri, 12 Dec 2014 16:56:21 -0800 Subject: mm: remove the highmem zones' memmap in the highmem zone Since 01cefaef40c4 ("mm: provide more accurate estimation of pages occupied by memmap") allocate the pages from lowmem for the highmem zones' memmap. So It is not need to reserver the memmap's for the highmem. A 2G DDR3 for the arm platform: On node 0 totalpages: 524288 free_area_init_node: node 0, pgdat 80ccd380, node_mem_map 80d38000 DMA zone: 3568 pages used for memmap DMA zone: 0 pages reserved DMA zone: 456704 pages, LIFO batch:31 HighMem zone: 528 pages used for memmap HighMem zone: 67584 pages, LIFO batch:15 On node 0 totalpages: 524288 free_area_init_node: node 0, pgdat 80cd6f40, node_mem_map 80d42000 DMA zone: 3568 pages used for memmap DMA zone: 0 pages reserved DMA zone: 456704 pages, LIFO batch:31 HighMem zone: 67584 pages, LIFO batch:15 Signed-off-by: Hongbo Zhong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d3870862293..fa974d87f60d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4937,16 +4937,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, * and per-cpu initialisations */ memmap_pages = calc_memmap_size(size, realsize); - if (freesize >= memmap_pages) { - freesize -= memmap_pages; - if (memmap_pages) - printk(KERN_DEBUG - " %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); - } else - printk(KERN_WARNING - " %s zone: %lu pages exceeds freesize %lu\n", - zone_names[j], memmap_pages, freesize); + if (!is_highmem_idx(j)) { + if (freesize >= memmap_pages) { + freesize -= memmap_pages; + if (memmap_pages) + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); + } else + printk(KERN_WARNING + " %s zone: %lu pages exceeds freesize %lu\n", + zone_names[j], memmap_pages, freesize); + } /* Account for reserved pages */ if (j == 0 && freesize > dma_reserve) { -- cgit v1.2.3