From 155cbfc802e4d9d01637e4bddb23091983a58b37 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 8 May 2012 17:24:14 +0200 Subject: mm: use KERN_CONT in printk() continuation lines Cc: Andrew Morton Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a712fb9e04ce..89b9af2ca263 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4763,12 +4763,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(" %-8s ", zone_names[i]); + printk(KERN_CONT " %-8s ", zone_names[i]); if (arch_zone_lowest_possible_pfn[i] == arch_zone_highest_possible_pfn[i]) - printk("empty\n"); + printk(KERN_CONT "empty\n"); else - printk("%0#10lx -> %0#10lx\n", + printk(KERN_CONT "%0#10lx -> %0#10lx\n", arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); } -- cgit v1.2.3 From 93278814d3590eba0ee360b8d69a35c7f2203ea8 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 10 May 2012 13:01:44 -0700 Subject: mm: fix division by 0 in percpu_pagelist_fraction() percpu_pagelist_fraction_sysctl_handler() has only considered -EINVAL as a possible error from proc_dointvec_minmax(). If any other error is returned, it would proceed to divide by zero since percpu_pagelist_fraction wasn't getting initialized at any point. For example, writing 0 bytes into the proc file would trigger the issue. Signed-off-by: Sasha Levin Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a712fb9e04ce..b21b3db15a7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -105,7 +105,7 @@ unsigned long totalreserve_pages __read_mostly; */ unsigned long dirty_balance_reserve __read_mostly; -int percpu_pagelist_fraction; +int percpu_pagelist_fraction = 8; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; #ifdef CONFIG_PM_SLEEP @@ -5203,7 +5203,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, int ret; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || (ret == -EINVAL)) + if (!write || (ret < 0)) return ret; for_each_populated_zone(zone) { for_each_possible_cpu(cpu) { -- cgit v1.2.3 From 1b76b02f15c70d5f392ee2e231fbd20a26063a77 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 11 May 2012 01:00:07 -0700 Subject: mm: raise MemFree by reverting percpu_pagelist_fraction to 0 Why is there less MemFree than there used to be? It perturbed a test, so I've just been bisecting linux-next, and now find the offender went upstream yesterday. Commit 93278814d359 "mm: fix division by 0 in percpu_pagelist_fraction()" mistakenly initialized percpu_pagelist_fraction to the sysctl's minimum 8, which leaves 1/8th of memory on percpu lists (on each cpu??); but most of us expect it to be left unset at 0 (and it's not then used as a divisor). MemTotal: 8061476kB 8061476kB 8061476kB 8061476kB 8061476kB 8061476kB Repetitive test with percpu_pagelist_fraction 8: MemFree: 6948420kB 6237172kB 6949696kB 6840692kB 6949048kB 6862984kB Same test with percpu_pagelist_fraction back to 0: MemFree: 7945000kB 7944908kB 7948568kB 7949060kB 7948796kB 7948812kB Signed-off-by: Hugh Dickins [ We really should fix the crazy sysctl interface too, but that's a separate thing - Linus ] Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b21b3db15a7f..918330f71dba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -105,7 +105,7 @@ unsigned long totalreserve_pages __read_mostly; */ unsigned long dirty_balance_reserve __read_mostly; -int percpu_pagelist_fraction = 8; +int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; #ifdef CONFIG_PM_SLEEP -- cgit v1.2.3 From 5f63b720b62925ef3c6a85473dcd547b0fd90616 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 11 Jan 2012 15:16:11 +0100 Subject: mm: page_alloc: remove trailing whitespace Signed-off-by: Michal Nazarewicz Signed-off-by: Marek Szyprowski Acked-by: Mel Gorman --- mm/page_alloc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 918330f71dba..6fb46c1589b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -513,10 +513,10 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * free pages of length of (1 << order) and marked with _mapcount -2. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the - * other. That is, if we allocate a small block, and both were - * free, the remainder of the region must be split into blocks. + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. * If a block is freed, and its buddy is also free, then this - * triggers coalescing into a block of larger size. + * triggers coalescing into a block of larger size. * * -- wli */ @@ -1061,17 +1061,17 @@ retry_reserve: return page; } -/* +/* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. * Returns the number of new pages which were placed at *list. */ -static int rmqueue_bulk(struct zone *zone, unsigned int order, +static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, int cold) { int i; - + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype); @@ -4301,7 +4301,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; pgdat_page_cgroup_init(pgdat); - + for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; -- cgit v1.2.3 From 041d3a8cdc18dc375a128d90bbb753949a81b1fb Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Thu, 29 Dec 2011 13:09:50 +0100 Subject: mm: page_alloc: introduce alloc_contig_range() This commit adds the alloc_contig_range() function which tries to allocate given range of pages. It tries to migrate all already allocated pages that fall in the range thus freeing them. Once all pages in the range are freed they are removed from the buddy system thus allocated for the caller to use. Signed-off-by: Michal Nazarewicz Signed-off-by: Marek Szyprowski Acked-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Tested-by: Rob Clark Tested-by: Ohad Ben-Cohen Tested-by: Benjamin Gaignard Tested-by: Robert Nelson Tested-by: Barry Song --- mm/page_alloc.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6fb46c1589b9..2c38a30d064e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -5550,6 +5551,193 @@ out: spin_unlock_irqrestore(&zone->lock, flags); } +#ifdef CONFIG_CMA + +static unsigned long pfn_max_align_down(unsigned long pfn) +{ + return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, + pageblock_nr_pages) - 1); +} + +static unsigned long pfn_max_align_up(unsigned long pfn) +{ + return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, + pageblock_nr_pages)); +} + +static struct page * +__alloc_contig_migrate_alloc(struct page *page, unsigned long private, + int **resultp) +{ + return alloc_page(GFP_HIGHUSER_MOVABLE); +} + +/* [start, end) must belong to a single zone. */ +static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) +{ + /* This function is based on compact_zone() from compaction.c. */ + + unsigned long pfn = start; + unsigned int tries = 0; + int ret = 0; + + struct compact_control cc = { + .nr_migratepages = 0, + .order = -1, + .zone = page_zone(pfn_to_page(start)), + .sync = true, + }; + INIT_LIST_HEAD(&cc.migratepages); + + migrate_prep_local(); + + while (pfn < end || !list_empty(&cc.migratepages)) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + if (list_empty(&cc.migratepages)) { + cc.nr_migratepages = 0; + pfn = isolate_migratepages_range(cc.zone, &cc, + pfn, end); + if (!pfn) { + ret = -EINTR; + break; + } + tries = 0; + } else if (++tries == 5) { + ret = ret < 0 ? ret : -EBUSY; + break; + } + + ret = migrate_pages(&cc.migratepages, + __alloc_contig_migrate_alloc, + 0, false, true); + } + + putback_lru_pages(&cc.migratepages); + return ret > 0 ? 0 : ret; +} + +/** + * alloc_contig_range() -- tries to allocate given range of pages + * @start: start PFN to allocate + * @end: one-past-the-last PFN to allocate + * + * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES + * aligned, however it's the caller's responsibility to guarantee that + * we are the only thread that changes migrate type of pageblocks the + * pages fall in. + * + * The PFN range must belong to a single zone. + * + * Returns zero on success or negative error code. On success all + * pages which PFN is in [start, end) are allocated for the caller and + * need to be freed with free_contig_range(). + */ +int alloc_contig_range(unsigned long start, unsigned long end) +{ + struct zone *zone = page_zone(pfn_to_page(start)); + unsigned long outer_start, outer_end; + int ret = 0, order; + + /* + * What we do here is we mark all pageblocks in range as + * MIGRATE_ISOLATE. Because pageblock and max order pages may + * have different sizes, and due to the way page allocator + * work, we align the range to biggest of the two pages so + * that page allocator won't try to merge buddies from + * different pageblocks and change MIGRATE_ISOLATE to some + * other migration type. + * + * Once the pageblocks are marked as MIGRATE_ISOLATE, we + * migrate the pages from an unaligned range (ie. pages that + * we are interested in). This will put all the pages in + * range back to page allocator as MIGRATE_ISOLATE. + * + * When this is done, we take the pages in range from page + * allocator removing them from the buddy system. This way + * page allocator will never consider using them. + * + * This lets us mark the pageblocks back as + * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the + * aligned range but not in the unaligned, original range are + * put back to page allocator so that buddy can use them. + */ + + ret = start_isolate_page_range(pfn_max_align_down(start), + pfn_max_align_up(end)); + if (ret) + goto done; + + ret = __alloc_contig_migrate_range(start, end); + if (ret) + goto done; + + /* + * Pages from [start, end) are within a MAX_ORDER_NR_PAGES + * aligned blocks that are marked as MIGRATE_ISOLATE. What's + * more, all pages in [start, end) are free in page allocator. + * What we are going to do is to allocate all pages from + * [start, end) (that is remove them from page allocator). + * + * The only problem is that pages at the beginning and at the + * end of interesting range may be not aligned with pages that + * page allocator holds, ie. they can be part of higher order + * pages. Because of this, we reserve the bigger range and + * once this is done free the pages we are not interested in. + * + * We don't have to hold zone->lock here because the pages are + * isolated thus they won't get removed from buddy. + */ + + lru_add_drain_all(); + drain_all_pages(); + + order = 0; + outer_start = start; + while (!PageBuddy(pfn_to_page(outer_start))) { + if (++order >= MAX_ORDER) { + ret = -EBUSY; + goto done; + } + outer_start &= ~0UL << order; + } + + /* Make sure the range is really isolated. */ + if (test_pages_isolated(outer_start, end)) { + pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", + outer_start, end); + ret = -EBUSY; + goto done; + } + + outer_end = isolate_freepages_range(outer_start, end); + if (!outer_end) { + ret = -EBUSY; + goto done; + } + + /* Free head and tail (if any) */ + if (start != outer_start) + free_contig_range(outer_start, start - outer_start); + if (end != outer_end) + free_contig_range(end, outer_end - end); + +done: + undo_isolate_page_range(pfn_max_align_down(start), + pfn_max_align_up(end)); + return ret; +} + +void free_contig_range(unsigned long pfn, unsigned nr_pages) +{ + for (; nr_pages--; ++pfn) + __free_page(pfn_to_page(pfn)); +} +#endif + #ifdef CONFIG_MEMORY_HOTREMOVE /* * All pages in the range must be isolated before calling this. -- cgit v1.2.3 From 6d4a49160de2c684fb59fa627bce80e200224331 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 11 Jan 2012 15:31:33 +0100 Subject: mm: page_alloc: change fallbacks array handling This commit adds a row for MIGRATE_ISOLATE type to the fallbacks array which was missing from it. It also, changes the array traversal logic a little making MIGRATE_RESERVE an end marker. The letter change, removes the implicit MIGRATE_UNMOVABLE from the end of each row which was read by __rmqueue_fallback() function. Signed-off-by: Michal Nazarewicz Signed-off-by: Marek Szyprowski Acked-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Tested-by: Rob Clark Tested-by: Ohad Ben-Cohen Tested-by: Benjamin Gaignard Tested-by: Robert Nelson Tested-by: Barry Song --- mm/page_alloc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2c38a30d064e..d6b580c660f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -875,11 +875,12 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted */ -static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { +static int fallbacks[MIGRATE_TYPES][3] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ }; /* @@ -974,12 +975,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order; --current_order) { - for (i = 0; i < MIGRATE_TYPES - 1; i++) { + for (i = 0;; i++) { migratetype = fallbacks[start_migratetype][i]; /* MIGRATE_RESERVE handled later if necessary */ if (migratetype == MIGRATE_RESERVE) - continue; + break; area = &(zone->free_area[current_order]); if (list_empty(&area->free_list[migratetype])) -- cgit v1.2.3 From 47118af076f64844b4f423bc2f545b2da9dab50d Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Thu, 29 Dec 2011 13:09:50 +0100 Subject: mm: mmzone: MIGRATE_CMA migration type added The MIGRATE_CMA migration type has two main characteristics: (i) only movable pages can be allocated from MIGRATE_CMA pageblocks and (ii) page allocator will never change migration type of MIGRATE_CMA pageblocks. This guarantees (to some degree) that page in a MIGRATE_CMA page block can always be migrated somewhere else (unless there's no memory left in the system). It is designed to be used for allocating big chunks (eg. 10MiB) of physically contiguous memory. Once driver requests contiguous memory, pages from MIGRATE_CMA pageblocks may be migrated away to create a contiguous block. To minimise number of migrations, MIGRATE_CMA migration type is the last type tried when page allocator falls back to other migration types when requested. Signed-off-by: Michal Nazarewicz Signed-off-by: Marek Szyprowski Signed-off-by: Kyungmin Park Acked-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Tested-by: Rob Clark Tested-by: Ohad Ben-Cohen Tested-by: Benjamin Gaignard Tested-by: Robert Nelson Tested-by: Barry Song --- mm/page_alloc.c | 76 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 16 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d6b580c660f5..0869eb1e9461 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -750,6 +750,24 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) __free_pages(page, order); } +#ifdef CONFIG_CMA +/* Free whole pageblock and set it's migration type to MIGRATE_CMA. */ +void __init init_cma_reserved_pageblock(struct page *page) +{ + unsigned i = pageblock_nr_pages; + struct page *p = page; + + do { + __ClearPageReserved(p); + set_page_count(p, 0); + } while (++p, --i); + + set_page_refcounted(page); + set_pageblock_migratetype(page, MIGRATE_CMA); + __free_pages(page, pageblock_order); + totalram_pages += pageblock_nr_pages; +} +#endif /* * The order of subdivision here is critical for the IO subsystem. @@ -875,10 +893,15 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted */ -static int fallbacks[MIGRATE_TYPES][3] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, +static int fallbacks[MIGRATE_TYPES][4] = { + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, +#ifdef CONFIG_CMA + [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, + [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ +#else + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, +#endif [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ }; @@ -995,11 +1018,18 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * pages to the preferred allocation list. If falling * back for a reclaimable kernel allocation, be more * aggressive about taking ownership of free pages + * + * On the other hand, never change migration + * type of MIGRATE_CMA pageblocks nor move CMA + * pages on different free lists. We don't + * want unmovable pages to be allocated from + * MIGRATE_CMA areas. */ - if (unlikely(current_order >= (pageblock_order >> 1)) || - start_migratetype == MIGRATE_RECLAIMABLE || - page_group_by_mobility_disabled) { - unsigned long pages; + if (!is_migrate_cma(migratetype) && + (unlikely(current_order >= pageblock_order / 2) || + start_migratetype == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled)) { + int pages; pages = move_freepages_block(zone, page, start_migratetype); @@ -1017,11 +1047,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) rmv_page_order(page); /* Take ownership for orders >= pageblock_order */ - if (current_order >= pageblock_order) + if (current_order >= pageblock_order && + !is_migrate_cma(migratetype)) change_pageblock_range(page, current_order, start_migratetype); - expand(zone, page, order, current_order, area, migratetype); + expand(zone, page, order, current_order, area, + is_migrate_cma(migratetype) + ? migratetype : start_migratetype); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype); @@ -1072,7 +1105,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, int cold) { - int i; + int mt = migratetype, i; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -1093,7 +1126,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, list_add(&page->lru, list); else list_add_tail(&page->lru, list); - set_page_private(page, migratetype); + if (IS_ENABLED(CONFIG_CMA)) { + mt = get_pageblock_migratetype(page); + if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) + mt = migratetype; + } + set_page_private(page, mt); list = &page->lru; } __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); @@ -1373,8 +1411,12 @@ int split_free_page(struct page *page) if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; - for (; page < endpage; page += pageblock_nr_pages) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + for (; page < endpage; page += pageblock_nr_pages) { + int mt = get_pageblock_migratetype(page); + if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) + set_pageblock_migratetype(page, + MIGRATE_MOVABLE); + } } return 1 << order; @@ -5414,14 +5456,16 @@ static int __count_immobile_pages(struct zone *zone, struct page *page, int count) { unsigned long pfn, iter, found; + int mt; + /* * For avoiding noise data, lru_add_drain_all() should be called * If ZONE_MOVABLE, the zone never contains immobile pages */ if (zone_idx(zone) == ZONE_MOVABLE) return true; - - if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) + mt = get_pageblock_migratetype(page); + if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) return true; pfn = page_to_pfn(page); -- cgit v1.2.3 From 0815f3d81d76dfbf2abcfd93a85ff0a6008fe4c0 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Tue, 3 Apr 2012 15:06:15 +0200 Subject: mm: page_isolation: MIGRATE_CMA isolation functions added This commit changes various functions that change pages and pageblocks migrate type between MIGRATE_ISOLATE and MIGRATE_MOVABLE in such a way as to allow to work with MIGRATE_CMA migrate type. Signed-off-by: Michal Nazarewicz Signed-off-by: Marek Szyprowski Reviewed-by: KAMEZAWA Hiroyuki Tested-by: Rob Clark Tested-by: Ohad Ben-Cohen Tested-by: Benjamin Gaignard Tested-by: Robert Nelson Tested-by: Barry Song --- mm/page_alloc.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0869eb1e9461..116c087f76bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5582,7 +5582,7 @@ out: return ret; } -void unset_migratetype_isolate(struct page *page) +void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags; @@ -5590,8 +5590,8 @@ void unset_migratetype_isolate(struct page *page) spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(zone, page, MIGRATE_MOVABLE); + set_pageblock_migratetype(page, migratetype); + move_freepages_block(zone, page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); } @@ -5669,6 +5669,10 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate * @end: one-past-the-last PFN to allocate + * @migratetype: migratetype of the underlaying pageblocks (either + * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks + * in range must have the same migratetype and it must + * be either of the two. * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES * aligned, however it's the caller's responsibility to guarantee that @@ -5681,7 +5685,8 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) * pages which PFN is in [start, end) are allocated for the caller and * need to be freed with free_contig_range(). */ -int alloc_contig_range(unsigned long start, unsigned long end) +int alloc_contig_range(unsigned long start, unsigned long end, + unsigned migratetype) { struct zone *zone = page_zone(pfn_to_page(start)); unsigned long outer_start, outer_end; @@ -5712,7 +5717,7 @@ int alloc_contig_range(unsigned long start, unsigned long end) */ ret = start_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end)); + pfn_max_align_up(end), migratetype); if (ret) goto done; @@ -5772,7 +5777,7 @@ int alloc_contig_range(unsigned long start, unsigned long end) done: undo_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end)); + pfn_max_align_up(end), migratetype); return ret; } -- cgit v1.2.3 From cfd3da1e49bb95c355c01c0f502d657deb3d34a4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Apr 2011 21:36:42 +0000 Subject: mm: Serialize access to min_free_kbytes There is a race between the min_free_kbytes sysctl, memory hotplug and transparent hugepage support enablement. Memory hotplug uses a zonelists_mutex to avoid a race when building zonelists. Reuse it to serialise watermark updates. [a.p.zijlstra@chello.nl: Older patch fixed the race with spinlock] Signed-off-by: Mel Gorman Signed-off-by: Marek Szyprowski Reviewed-by: KAMEZAWA Hiroyuki Tested-by: Barry Song --- mm/page_alloc.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 116c087f76bb..8be37bcda0b2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5020,14 +5020,7 @@ static void setup_per_zone_lowmem_reserve(void) calculate_totalreserve_pages(); } -/** - * setup_per_zone_wmarks - called when min_free_kbytes changes - * or when memory is hot-{added|removed} - * - * Ensures that the watermark[min,low,high] values for each zone are set - * correctly with respect to min_free_kbytes. - */ -void setup_per_zone_wmarks(void) +static void __setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -5082,6 +5075,20 @@ void setup_per_zone_wmarks(void) calculate_totalreserve_pages(); } +/** + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-{added|removed} + * + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. + */ +void setup_per_zone_wmarks(void) +{ + mutex_lock(&zonelists_mutex); + __setup_per_zone_wmarks(); + mutex_unlock(&zonelists_mutex); +} + /* * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance -- cgit v1.2.3 From bba9071087108d3de70bea274e35064cc480487b Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 25 Jan 2012 12:09:52 +0100 Subject: mm: extract reclaim code from __alloc_pages_direct_reclaim() This patch extracts common reclaim code from __alloc_pages_direct_reclaim() function to separate function: __perform_reclaim() which can be later used by alloc_contig_range(). Signed-off-by: Marek Szyprowski Signed-off-by: Kyungmin Park Cc: Michal Nazarewicz Acked-by: Mel Gorman Tested-by: Rob Clark Tested-by: Ohad Ben-Cohen Tested-by: Benjamin Gaignard Tested-by: Robert Nelson Tested-by: Barry Song --- mm/page_alloc.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8be37bcda0b2..4615531dcf66 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2130,16 +2130,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } #endif /* CONFIG_COMPACTION */ -/* The really slow allocator path where we enter direct reclaim */ -static inline struct page * -__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) +/* Perform direct synchronous page reclaim */ +static int +__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, + nodemask_t *nodemask) { - struct page *page = NULL; struct reclaim_state reclaim_state; - bool drained = false; + int progress; cond_resched(); @@ -2150,7 +2147,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; - *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@ -2158,6 +2155,21 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, cond_resched(); + return progress; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + struct page *page = NULL; + bool drained = false; + + *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, + nodemask); if (unlikely(!(*did_some_progress))) return NULL; -- cgit v1.2.3 From 49f223a9cd96c7293d7258ff88c2bdf83065f69c Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 25 Jan 2012 12:49:24 +0100 Subject: mm: trigger page reclaim in alloc_contig_range() to stabilise watermarks alloc_contig_range() performs memory allocation so it also should keep track on keeping the correct level of memory watermarks. This commit adds a call to *_slowpath style reclaim to grab enough pages to make sure that the final collection of contiguous pages from freelists will not starve the system. Signed-off-by: Marek Szyprowski Signed-off-by: Kyungmin Park CC: Michal Nazarewicz Tested-by: Rob Clark Tested-by: Ohad Ben-Cohen Tested-by: Benjamin Gaignard Tested-by: Robert Nelson Tested-by: Barry Song --- mm/page_alloc.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4615531dcf66..22348ae1005d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5079,6 +5079,11 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + + zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); + zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); + zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); + setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -5684,6 +5689,54 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) return ret > 0 ? 0 : ret; } +/* + * Update zone's cma pages counter used for watermark level calculation. + */ +static inline void __update_cma_watermarks(struct zone *zone, int count) +{ + unsigned long flags; + spin_lock_irqsave(&zone->lock, flags); + zone->min_cma_pages += count; + spin_unlock_irqrestore(&zone->lock, flags); + setup_per_zone_wmarks(); +} + +/* + * Trigger memory pressure bump to reclaim some pages in order to be able to + * allocate 'count' pages in single page units. Does similar work as + *__alloc_pages_slowpath() function. + */ +static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + struct zonelist *zonelist = node_zonelist(0, gfp_mask); + int did_some_progress = 0; + int order = 1; + + /* + * Increase level of watermarks to force kswapd do his job + * to stabilise at new watermark level. + */ + __update_cma_watermarks(zone, count); + + /* Obey watermarks as if the page was being allocated */ + while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { + wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); + + did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, + NULL); + if (!did_some_progress) { + /* Exhausted what can be done so it's blamo time */ + out_of_memory(zonelist, gfp_mask, order, NULL, false); + } + } + + /* Restore original watermark levels. */ + __update_cma_watermarks(zone, -count); + + return count; +} + /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate @@ -5782,6 +5835,13 @@ int alloc_contig_range(unsigned long start, unsigned long end, goto done; } + /* + * Reclaim enough pages to make sure that contiguous allocation + * will not starve the system. + */ + __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); + + /* Grab isolated pages from freelists. */ outer_end = isolate_freepages_range(outer_start, end); if (!outer_end) { ret = -EBUSY; -- cgit v1.2.3 From 58f42fd54144346898e6dc6d6ae3acd4c591b42f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 11 May 2012 09:37:13 +0200 Subject: cma: fix migration mode __alloc_contig_migrate_range calls migrate_pages with wrong argument for migrate_mode. Fix it. Cc: Marek Szyprowski Signed-off-by: Minchan Kim Acked-by: Michal Nazarewicz Signed-off-by: Marek Szyprowski --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22348ae1005d..ed85c02c1f6e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5682,7 +5682,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) ret = migrate_pages(&cc.migratepages, __alloc_contig_migrate_alloc, - 0, false, true); + 0, false, MIGRATE_SYNC); } putback_lru_pages(&cc.migratepages); -- cgit v1.2.3 From 31fe62b9586643953f0c0c37a6357dafc69034e2 Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Wed, 23 May 2012 13:33:35 +0000 Subject: mm: add a low limit to alloc_large_system_hash UDP stack needs a minimum hash size value for proper operation and also uses alloc_large_system_hash() for proper NUMA distribution of its hash tables and automatic sizing depending on available system memory. On some low memory situations, udp_table_init() must ignore the alloc_large_system_hash() result and reallocs a bigger memory area. As we cannot easily free old hash table, we leak it and kmemleak can issue a warning. This patch adds a low limit parameter to alloc_large_system_hash() to solve this problem. We then specify UDP_HTABLE_SIZE_MIN for UDP/UDPLite hash table allocation. Reported-by: Mark Asselstine Reported-by: Tim Bird Signed-off-by: Eric Dumazet Cc: Paul Gortmaker Signed-off-by: David S. Miller --- mm/page_alloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 918330f71dba..b7af568f0ed9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5242,9 +5242,10 @@ void *__init alloc_large_system_hash(const char *tablename, int flags, unsigned int *_hash_shift, unsigned int *_hash_mask, - unsigned long limit) + unsigned long low_limit, + unsigned long high_limit) { - unsigned long long max = limit; + unsigned long long max = high_limit; unsigned long log2qty, size; void *table = NULL; @@ -5282,6 +5283,8 @@ void *__init alloc_large_system_hash(const char *tablename, } max = min(max, 0x80000000ULL); + if (numentries < low_limit) + numentries = low_limit; if (numentries > max) numentries = max; -- cgit v1.2.3 From a62e2f4f508863da8e0c2f2b42f5252a87330297 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 29 May 2012 15:06:30 -0700 Subject: mm: print physical addresses consistently with other parts of kernel Print physical address info in a style consistent with the %pR style used elsewhere in the kernel. For example: -Zone PFN ranges: +Zone ranges: - DMA32 0x00000010 -> 0x00100000 + DMA32 [mem 0x00010000-0xffffffff] - Normal 0x00100000 -> 0x01080000 + Normal [mem 0x100000000-0x107fffffff] Signed-off-by: Bjorn Helgaas Cc: Yinghai Lu Cc: Konrad Rzeszutek Wilk Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bab8e3bc4202..8d71ad8ffa48 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4815,7 +4815,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) find_zone_movable_pfns_for_nodes(); /* Print out the zone ranges */ - printk("Zone PFN ranges:\n"); + printk("Zone ranges:\n"); for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; @@ -4824,22 +4824,25 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) arch_zone_highest_possible_pfn[i]) printk(KERN_CONT "empty\n"); else - printk(KERN_CONT "%0#10lx -> %0#10lx\n", - arch_zone_lowest_possible_pfn[i], - arch_zone_highest_possible_pfn[i]); + printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", + arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, + (arch_zone_highest_possible_pfn[i] + << PAGE_SHIFT) - 1); } /* Print out the PFNs ZONE_MOVABLE begins at in each node */ - printk("Movable zone start PFN for each node\n"); + printk("Movable zone start for each node\n"); for (i = 0; i < MAX_NUMNODES; i++) { if (zone_movable_pfn[i]) - printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); + printk(" Node %d: %#010lx\n", i, + zone_movable_pfn[i] << PAGE_SHIFT); } /* Print out the early_node_map[] */ - printk("Early memory PFN ranges\n"); + printk("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); + printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, + start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); /* Initialise every node */ mminit_verify_pageflags_layout(); -- cgit v1.2.3 From 955c1cd7401565671b064e499115344ec8067dfd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 May 2012 15:06:31 -0700 Subject: mm/page_alloc.c: remove pageblock_default_order() This has always been broken: one version takes an unsigned int and the other version takes no arguments. This bug was hidden because one version of set_pageblock_order() was a macro which doesn't evaluate its argument. Simplify it all and remove pageblock_default_order() altogether. Reported-by: rajman mekaco Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Tejun Heo Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d71ad8ffa48..84f2c599d5d4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4300,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE -/* Return a sensible default order for the pageblock size. */ -static inline int pageblock_default_order(void) -{ - if (HPAGE_SHIFT > PAGE_SHIFT) - return HUGETLB_PAGE_ORDER; - - return MAX_ORDER-1; -} - /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -static inline void __init set_pageblock_order(unsigned int order) +static inline void __init set_pageblock_order(void) { + unsigned int order; + /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) return; + if (HPAGE_SHIFT > PAGE_SHIFT) + order = HUGETLB_PAGE_ORDER; + else + order = MAX_ORDER - 1; + /* * Assume the largest contiguous order of interest is a huge page. - * This value may be variable depending on boot parameters on IA64 + * This value may be variable depending on boot parameters on IA64 and + * powerpc. */ pageblock_order = order; } @@ -4326,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order) /* * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() - * and pageblock_default_order() are unused as pageblock_order is set - * at compile-time. See include/linux/pageblock-flags.h for the values of - * pageblock_order based on the kernel config + * is unused as pageblock_order is set at compile-time. See + * include/linux/pageblock-flags.h for the values of pageblock_order based on + * the kernel config */ -static inline int pageblock_default_order(unsigned int order) +static inline void set_pageblock_order(void) { - return MAX_ORDER-1; } -#define set_pageblock_order(x) do {} while (0) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ @@ -4422,7 +4419,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, if (!size) continue; - set_pageblock_order(pageblock_default_order()); + set_pageblock_order(); setup_usemap(pgdat, zone, size); ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); -- cgit v1.2.3 From 5ceb9ce6fe9462a298bb2cd5c9f1ca6cb80a0199 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 29 May 2012 15:06:37 -0700 Subject: mm: compaction: handle incorrect MIGRATE_UNMOVABLE type pageblocks When MIGRATE_UNMOVABLE pages are freed from MIGRATE_UNMOVABLE type pageblock (and some MIGRATE_MOVABLE pages are left in it) waiting until an allocation takes ownership of the block may take too long. The type of the pageblock remains unchanged so the pageblock cannot be used as a migration target during compaction. Fix it by: * Adding enum compact_mode (COMPACT_ASYNC_[MOVABLE,UNMOVABLE], and COMPACT_SYNC) and then converting sync field in struct compact_control to use it. * Adding nr_pageblocks_skipped field to struct compact_control and tracking how many destination pageblocks were of MIGRATE_UNMOVABLE type. If COMPACT_ASYNC_MOVABLE mode compaction ran fully in try_to_compact_pages() (COMPACT_COMPLETE) it implies that there is not a suitable page for allocation. In this case then check how if there were enough MIGRATE_UNMOVABLE pageblocks to try a second pass in COMPACT_ASYNC_UNMOVABLE mode. * Scanning the MIGRATE_UNMOVABLE pageblocks (during COMPACT_SYNC and COMPACT_ASYNC_UNMOVABLE compaction modes) and building a count based on finding PageBuddy pages, page_count(page) == 0 or PageLRU pages. If all pages within the MIGRATE_UNMOVABLE pageblock are in one of those three sets change the whole pageblock type to MIGRATE_MOVABLE. My particular test case (on a ARM EXYNOS4 device with 512 MiB, which means 131072 standard 4KiB pages in 'Normal' zone) is to: - allocate 120000 pages for kernel's usage - free every second page (60000 pages) of memory just allocated - allocate and use 60000 pages from user space - free remaining 60000 pages of kernel memory (now we have fragmented memory occupied mostly by user space pages) - try to allocate 100 order-9 (2048 KiB) pages for kernel's usage The results: - with compaction disabled I get 11 successful allocations - with compaction enabled - 14 successful allocations - with this patch I'm able to get all 100 successful allocations NOTE: If we can make kswapd aware of order-0 request during compaction, we can enhance kswapd with changing mode to COMPACT_ASYNC_FULL (COMPACT_ASYNC_MOVABLE + COMPACT_ASYNC_UNMOVABLE). Please see the following thread: http://marc.info/?l=linux-mm&m=133552069417068&w=2 [minchan@kernel.org: minor cleanups] Cc: Mel Gorman Cc: Minchan Kim Cc: Rik van Riel Cc: Marek Szyprowski Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 84f2c599d5d4..457b4de122f4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -219,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -static void set_pageblock_migratetype(struct page *page, int migratetype) +void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled)) @@ -954,8 +954,8 @@ static int move_freepages(struct zone *zone, return pages_moved; } -static int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) +int move_freepages_block(struct zone *zone, struct page *page, + int migratetype) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -5657,7 +5657,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .sync = true, + .mode = COMPACT_SYNC, }; INIT_LIST_HEAD(&cc.migratepages); -- cgit v1.2.3 From be9cd873e2a706a688e37224d48e135efd8c0d26 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 29 May 2012 15:06:44 -0700 Subject: mm/buddy: dump PG_compound_lock page flag The array pageflag_names[] does conversion from page flags into their corresponding names so that a meaningful representation of the corresponding page flag can be printed. This mechanism is used while dumping page frames. However, the array missed PG_compound_lock. So the PG_compound_lock page flag would be printed as a digital number instead of a meaningful string. The patch fixes that and prints "compound_lock" for the PG_compound_lock page flag. Signed-off-by: Gavin Shan Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 457b4de122f4..72869ea5c513 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5972,6 +5972,9 @@ static struct trace_print_flags pageflag_names[] = { #endif #ifdef CONFIG_MEMORY_FAILURE {1UL << PG_hwpoison, "hwpoison" }, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + {1UL << PG_compound_lock, "compound_lock" }, #endif {-1UL, NULL }, }; -- cgit v1.2.3 From acc50c110cf6f1a8bd1af410b2c7bc29ca624678 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 29 May 2012 15:06:44 -0700 Subject: mm: page_alloc: catch out-of-date list of page flag names String tables with names of enum items are always prone to go out of sync with the enums themselves. Ensure during compile time that the name table of page flags has the same size as the page flags enum. Signed-off-by: Johannes Weiner Cc: Gavin Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 72869ea5c513..5712898c951b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5985,6 +5985,8 @@ static void dump_page_flags(unsigned long flags) unsigned long mask; int i; + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) - 1 != __NR_PAGEFLAGS); + printk(KERN_ALERT "page flags: %#lx(", flags); /* remove zone id */ -- cgit v1.2.3 From 51300cef41c9355a7d428fe0714888d3c72a6f2f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 May 2012 15:06:44 -0700 Subject: mm/page_alloc.c: cleanups - make pageflag_names[] const - remove null termination of pageflag_names[] Cc: Johannes Weiner Cc: Gavin Shan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5712898c951b..4fc462b5fcf1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5938,7 +5938,7 @@ bool is_free_buddy_page(struct page *page) } #endif -static struct trace_print_flags pageflag_names[] = { +static const struct trace_print_flags pageflag_names[] = { {1UL << PG_locked, "locked" }, {1UL << PG_error, "error" }, {1UL << PG_referenced, "referenced" }, @@ -5976,7 +5976,6 @@ static struct trace_print_flags pageflag_names[] = { #ifdef CONFIG_TRANSPARENT_HUGEPAGE {1UL << PG_compound_lock, "compound_lock" }, #endif - {-1UL, NULL }, }; static void dump_page_flags(unsigned long flags) @@ -5985,14 +5984,14 @@ static void dump_page_flags(unsigned long flags) unsigned long mask; int i; - BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) - 1 != __NR_PAGEFLAGS); + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); printk(KERN_ALERT "page flags: %#lx(", flags); /* remove zone id */ flags &= (1UL << NR_PAGEFLAGS) - 1; - for (i = 0; pageflag_names[i].name && flags; i++) { + for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { mask = pageflag_names[i].mask; if ((flags & mask) != mask) -- cgit v1.2.3 From 89abfab133ef1f5902abafb744df72793213ac19 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:53 -0700 Subject: mm/memcg: move reclaim_stat into lruvec With mem_cgroup_disabled() now explicit, it becomes clear that the zone_reclaim_stat structure actually belongs in lruvec, per-zone when memcg is disabled but per-memcg per-zone when it's enabled. We can delete mem_cgroup_get_reclaim_stat(), and change update_page_reclaim_stat() to update just the one set of stats, the one which get_scan_count() will actually use. Signed-off-by: Hugh Dickins Signed-off-by: Konstantin Khlebnikov Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Reviewed-by: Minchan Kim Reviewed-by: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4fc462b5fcf1..8cbfc38e68ac 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4410,10 +4410,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(lru) INIT_LIST_HEAD(&zone->lruvec.lists[lru]); - zone->reclaim_stat.recent_rotated[0] = 0; - zone->reclaim_stat.recent_rotated[1] = 0; - zone->reclaim_stat.recent_scanned[0] = 0; - zone->reclaim_stat.recent_scanned[1] = 0; + zone->lruvec.reclaim_stat.recent_rotated[0] = 0; + zone->lruvec.reclaim_stat.recent_rotated[1] = 0; + zone->lruvec.reclaim_stat.recent_scanned[0] = 0; + zone->lruvec.reclaim_stat.recent_scanned[1] = 0; zap_zone_vm_stats(zone); zone->flags = 0; if (!size) -- cgit v1.2.3 From 7f5e86c2ccc1480946d2c869d7f7d5278e828092 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 29 May 2012 15:06:58 -0700 Subject: mm: add link from struct lruvec to struct zone This is the first stage of struct mem_cgroup_zone removal. Further patches replace struct mem_cgroup_zone with a pointer to struct lruvec. If CONFIG_CGROUP_MEM_RES_CTLR=n lruvec_zone() is just container_of(). Signed-off-by: Konstantin Khlebnikov Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8cbfc38e68ac..6092f331b32e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4358,7 +4358,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; - enum lru_list lru; size = zone_spanned_pages_in_node(nid, j, zones_size); realsize = size - zone_absent_pages_in_node(nid, j, @@ -4408,12 +4407,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone_pcp_init(zone); - for_each_lru(lru) - INIT_LIST_HEAD(&zone->lruvec.lists[lru]); - zone->lruvec.reclaim_stat.recent_rotated[0] = 0; - zone->lruvec.reclaim_stat.recent_rotated[1] = 0; - zone->lruvec.reclaim_stat.recent_scanned[0] = 0; - zone->lruvec.reclaim_stat.recent_scanned[1] = 0; + lruvec_init(&zone->lruvec, zone); zap_zone_vm_stats(zone); zone->flags = 0; if (!size) -- cgit v1.2.3