From 47f3a867f6310d6abfa185ab12baaba7ed1d69af Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 6 Jan 2006 00:10:32 -0800 Subject: [PATCH] mm: fix __alloc_pages cpuset ALLOC_* flags Two changes to the setting of the ALLOC_CPUSET flag in mm/page_alloc.c:__alloc_pages() - A bug fix - the "ignoring mins" case should not be honoring ALLOC_CPUSET. This case of all cases, since it is handling a request that will free up more memory than is asked for (exiting tasks, e.g.) should be allowed to escape cpuset constraints when memory is tight. - A logic change to make it simpler. Honor cpusets even on GFP_ATOMIC (!wait) requests. With this, cpuset confinement applies to all requests except ALLOC_NO_WATERMARKS, so that in a subsequent cleanup patch, I can remove the ALLOC_CPUSET flag entirely. Since I don't know any real reason this logic has to be either way, I am choosing the path of the simplest code. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe14a8c87fc2..1e49dc7cd619 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -903,8 +903,7 @@ restart: alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) alloc_flags |= ALLOC_HIGH; - if (wait) - alloc_flags |= ALLOC_CPUSET; + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations @@ -926,7 +925,7 @@ restart: nofail_alloc: /* go through the zonelist yet again, ignoring mins */ page = get_page_from_freelist(gfp_mask, order, - zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); + zonelist, ALLOC_NO_WATERMARKS); if (page) goto got_pg; if (gfp_mask & __GFP_NOFAIL) { -- cgit v1.2.3 From a94b3ab7eab4edcc9b2cb474b188f774c331adf7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 6 Jan 2006 00:10:51 -0800 Subject: [PATCH] mm: remove arch independent NODES_SPAN_OTHER_NODES The NODES_SPAN_OTHER_NODES config option was created so that DISCONTIGMEM could handle pSeries numa layouts. However, support for DISCONTIGMEM has been replaced by SPARSEMEM on powerpc. As a result, this config option and supporting code is no longer needed. I have already sent a patch to Paul that removes the option from powerpc specific code. This removes the arch independent piece. Doesn't really matter which is applied first. Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1e49dc7cd619..07825c637a58 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1708,8 +1708,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { if (!early_pfn_valid(pfn)) continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); set_page_count(page, 1); -- cgit v1.2.3 From c54ad30c784b84d0275152d0ca80985b21471811 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:56 -0800 Subject: [PATCH] mm: pagealloc opt Slightly optimise some page allocation and freeing functions by taking advantage of knowing whether or not interrupts are disabled. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07825c637a58..680cbe5b6ba2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -375,11 +375,10 @@ static int free_pages_bulk(struct zone *zone, int count, struct list_head *list, unsigned int order) { - unsigned long flags; struct page *page = NULL; int ret = 0; - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; while (!list_empty(list) && count--) { @@ -389,12 +388,13 @@ free_pages_bulk(struct zone *zone, int count, __free_pages_bulk(page, zone, order); ret++; } - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); return ret; } void __free_pages_ok(struct page *page, unsigned int order) { + unsigned long flags; LIST_HEAD(list); int i; int reserved = 0; @@ -415,7 +415,9 @@ void __free_pages_ok(struct page *page, unsigned int order) list_add(&page->lru, &list); mod_page_state(pgfree, 1 << order); kernel_map_pages(page, 1<lock, flags); + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { page = __rmqueue(zone, order); if (page == NULL) @@ -552,7 +553,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, allocated++; list_add_tail(&page->lru, list); } - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); return allocated; } @@ -589,6 +590,7 @@ void drain_remote_pages(void) #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { + unsigned long flags; struct zone *zone; int i; @@ -600,8 +602,10 @@ static void __drain_pages(unsigned int cpu) struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; + local_irq_save(flags); pcp->count -= free_pages_bulk(zone, pcp->count, &pcp->list, 0); + local_irq_restore(flags); } } } @@ -744,7 +748,7 @@ again: if (pcp->count <= pcp->low) pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); - if (pcp->count) { + if (likely(pcp->count)) { page = list_entry(pcp->list.next, struct page, lru); list_del(&page->lru); pcp->count--; -- cgit v1.2.3 From 77a8a78834561398fb4cb1480afa7b0e80b1dd53 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:57 -0800 Subject: [PATCH] mm: set_page_refs opt Inline set_page_refs. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 680cbe5b6ba2..6d513faf050b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -453,23 +453,6 @@ expand(struct zone *zone, struct page *page, return page; } -void set_page_refs(struct page *page, int order) -{ -#ifdef CONFIG_MMU - set_page_count(page, 1); -#else - int i; - - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. - * - eg: access_process_vm() - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page + i, 1); -#endif /* CONFIG_MMU */ -} - /* * This page is about to be returned from the page allocator */ -- cgit v1.2.3 From 92be2e33b155ee76399f51f41fb061f850d02f08 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:57 -0800 Subject: [PATCH] mm: microopt conditions Micro optimise some conditionals where we don't need lazy evaluation. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d513faf050b..b0647b515277 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -336,9 +336,9 @@ static inline void __free_pages_bulk (struct page *page, static inline int free_pages_check(const char *function, struct page *page) { - if ( page_mapcount(page) || - page->mapping != NULL || - page_count(page) != 0 || + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | 1 << PG_private | @@ -348,7 +348,7 @@ static inline int free_pages_check(const char *function, struct page *page) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved ))) + 1 << PG_reserved )))) bad_page(function, page); if (PageDirty(page)) __ClearPageDirty(page); @@ -458,9 +458,9 @@ expand(struct zone *zone, struct page *page, */ static int prep_new_page(struct page *page, int order) { - if ( page_mapcount(page) || - page->mapping != NULL || - page_count(page) != 0 || + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | 1 << PG_private | @@ -471,7 +471,7 @@ static int prep_new_page(struct page *page, int order) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved ))) + 1 << PG_reserved )))) bad_page(__FUNCTION__, page); /* -- cgit v1.2.3 From 13e7444b0ec59f96d81a4e8c379d5f38fc5f2cc1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:58 -0800 Subject: [PATCH] mm: remove bad_range bad_range is supposed to be a temporary check. It would be a pity to throw it out. Make it depend on CONFIG_DEBUG_VM instead. CONFIG_HOLES_IN_ZONE systems were relying on this to check pfn_valid in the page allocator. Add that to page_is_buddy instead. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b0647b515277..088712f2ac02 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -81,6 +81,7 @@ int min_free_kbytes = 1024; unsigned long __initdata nr_kernel_pages; unsigned long __initdata nr_all_pages; +#ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { int ret = 0; @@ -122,6 +123,13 @@ static int bad_range(struct zone *zone, struct page *page) return 0; } +#else +static inline int bad_range(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + static void bad_page(const char *function, struct page *page) { printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", @@ -255,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order) /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if - * (a) the buddy is free && - * (b) the buddy is on the buddy system && - * (c) a page and its buddy have the same order. + * (a) the buddy is not in a hole && + * (b) the buddy is free && + * (c) the buddy is on the buddy system && + * (d) a page and its buddy have the same order. * for recording page's order, we use page_private(page) and PG_private. * */ static inline int page_is_buddy(struct page *page, int order) { +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + if (PagePrivate(page) && (page_order(page) == order) && page_count(page) == 0) @@ -314,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page, struct free_area *area; struct page *buddy; - combined_idx = __find_combined_index(page_idx, order); buddy = __page_find_buddy(page, page_idx, order); - - if (bad_range(zone, buddy)) - break; if (!page_is_buddy(buddy, order)) break; /* Move the buddy up one level. */ + list_del(&buddy->lru); area = zone->free_area + order; area->nr_free--; rmv_page_order(buddy); + combined_idx = __find_combined_index(page_idx, order); page = page + (combined_idx - page_idx); page_idx = combined_idx; order++; -- cgit v1.2.3 From 2d92c5c9150a2a9ca3dc25da58d5042e17a96b6a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:10:59 -0800 Subject: [PATCH] mm: remove pcp low struct per_cpu_pages.low is useless. Remove it. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 088712f2ac02..7cff958e7813 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -740,7 +740,7 @@ again: page = NULL; pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); - if (pcp->count <= pcp->low) + if (!pcp->count) pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); if (likely(pcp->count)) { @@ -1345,10 +1345,9 @@ void show_free_areas(void) pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", + printk("cpu %d %s: high %d, batch %d used:%d\n", cpu, temperature ? "cold" : "hot", - pageset->pcp[temperature].low, pageset->pcp[temperature].high, pageset->pcp[temperature].batch, pageset->pcp[temperature].count); @@ -1790,14 +1789,12 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp[0]; /* hot */ pcp->count = 0; - pcp->low = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); INIT_LIST_HEAD(&pcp->list); pcp = &p->pcp[1]; /* cold*/ pcp->count = 0; - pcp->low = 0; pcp->high = 2 * batch; pcp->batch = max(1UL, batch/2); INIT_LIST_HEAD(&pcp->list); @@ -2193,12 +2190,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, "\n cpu: %i pcp: %i" "\n count: %i" - "\n low: %i" "\n high: %i" "\n batch: %i", i, j, pageset->pcp[j].count, - pageset->pcp[j].low, pageset->pcp[j].high, pageset->pcp[j].batch); } -- cgit v1.2.3 From a86b1f53166a260ced8f3c8c526945bf496f2e78 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:00 -0800 Subject: [PATCH] mm: page_state fixes read_page_state and __get_page_state only traverse online CPUs, which will cause results to fluctuate when CPUs are plugged in or out. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7cff958e7813..379618747deb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1169,12 +1169,11 @@ EXPORT_SYMBOL(nr_pagecache); DEFINE_PER_CPU(long, nr_pagecache_local) = 0; #endif -void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) +static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { int cpu = 0; memset(ret, 0, sizeof(*ret)); - cpus_and(*cpumask, *cpumask, cpu_online_map); cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { @@ -1227,7 +1226,7 @@ unsigned long __read_page_state(unsigned long offset) unsigned long ret = 0; int cpu; - for_each_online_cpu(cpu) { + for_each_cpu(cpu) { unsigned long in; in = (unsigned long)&per_cpu(page_states, cpu) + offset; -- cgit v1.2.3 From 085cc7d5de3cc662da7ea78296464a0d52f3f01f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:01 -0800 Subject: [PATCH] mm: page_alloc cleanups Small cleanups that does not change generated code with the gcc's I've tested with. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 379618747deb..925b0b985f79 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -447,8 +447,7 @@ void __free_pages_ok(struct page *page, unsigned int order) * * -- wli */ -static inline struct page * -expand(struct zone *zone, struct page *page, +static inline void expand(struct zone *zone, struct page *page, int low, int high, struct free_area *area) { unsigned long size = 1 << high; @@ -462,7 +461,6 @@ expand(struct zone *zone, struct page *page, area->nr_free++; set_page_order(&page[size], high); } - return page; } /* @@ -522,7 +520,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) rmv_page_order(page); area->nr_free--; zone->free_pages -= 1UL << order; - return expand(zone, page, order, current_order, area); + expand(zone, page, order, current_order, area); + return page; } return NULL; @@ -537,19 +536,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list) { int i; - int allocated = 0; - struct page *page; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + struct page *page = __rmqueue(zone, order); + if (unlikely(page == NULL)) break; - allocated++; list_add_tail(&page->lru, list); } spin_unlock(&zone->lock); - return allocated; + return i; } #ifdef CONFIG_NUMA -- cgit v1.2.3 From a226f6c899799fe2c4919daa0767ac579c88f7bd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Jan 2006 00:11:08 -0800 Subject: [PATCH] FRV: Clean up bootmem allocator's page freeing algorithm The attached patch cleans up the way the bootmem allocator frees pages. A new function, __free_pages_bootmem(), is provided in mm/page_alloc.c that is called from mm/bootmem.c to turn pages over to the main allocator. All the bits of code to initialise pages (clearing PG_reserved and setting the page count) are moved to here. The checks on page validity are removed, on the assumption that the struct page arrays will have been prepared correctly. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 925b0b985f79..cdad3249cf7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,6 +53,8 @@ unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; +static void fastcall free_hot_cold_page(struct page *page, int cold); + /* * results with 256, 32 in the lowmem_reserve sysctl: * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) @@ -432,6 +434,39 @@ void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } +/* + * permit the bootmem allocator to evade page validation on high-order frees + */ +void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) +{ + if (order == 0) { + __ClearPageReserved(page); + set_page_count(page, 0); + + free_hot_cold_page(page, 0); + } else { + LIST_HEAD(list); + int loop; + + for (loop = 0; loop < BITS_PER_LONG; loop++) { + struct page *p = &page[loop]; + + if (loop + 16 < BITS_PER_LONG) + prefetchw(p + 16); + __ClearPageReserved(p); + set_page_count(p, 0); + } + + arch_free_page(page, order); + + mod_page_state(pgfree, 1 << order); + + list_add(&page->lru, &list); + kernel_map_pages(page, 1 << order, 0); + free_pages_bulk(page_zone(page), 1, &list, order); + } +} + /* * The order of subdivision here is critical for the IO subsystem. @@ -671,7 +706,6 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) /* * Free a 0-order page */ -static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); static void fastcall free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); -- cgit v1.2.3 From 9328b8faae922e52073785ed6c1eaa8565648a0e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:10 -0800 Subject: [PATCH] mm: dma32 zone statistics Add dma32 to zone statistics. Also attempt to arrange struct page_state a bit better (visually). Signed-off-by: Nick Piggin Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cdad3249cf7f..e12154d9c4ed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2277,32 +2277,40 @@ static char *vmstat_text[] = { "pgpgout", "pswpin", "pswpout", - "pgalloc_high", + "pgalloc_high", "pgalloc_normal", + "pgalloc_dma32", "pgalloc_dma", + "pgfree", "pgactivate", "pgdeactivate", "pgfault", "pgmajfault", + "pgrefill_high", "pgrefill_normal", + "pgrefill_dma32", "pgrefill_dma", "pgsteal_high", "pgsteal_normal", + "pgsteal_dma32", "pgsteal_dma", + "pgscan_kswapd_high", "pgscan_kswapd_normal", - + "pgscan_kswapd_dma32", "pgscan_kswapd_dma", + "pgscan_direct_high", "pgscan_direct_normal", + "pgscan_direct_dma32", "pgscan_direct_dma", - "pginodesteal", + "pginodesteal", "slabs_scanned", "kswapd_steal", "kswapd_inodesteal", -- cgit v1.2.3 From 224abf92b2f439a9030f21d2926ec8047d1ffcdb Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:11 -0800 Subject: [PATCH] mm: bad_page optimisation Cut down size slightly by not passing bad_page the function name (it should be able to be determined by dump_stack()). And cut down the number of printks in bad_page. Also, cut down some branching in the destroy_compound_page path. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e12154d9c4ed..b9fd2c238f13 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -132,16 +132,16 @@ static inline int bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(const char *function, struct page *page) -{ - printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", - function, current->comm, page); - printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, - page->mapping, page_mapcount(page), page_count(page)); - printk(KERN_EMERG "Backtrace:\n"); +static void bad_page(struct page *page) +{ + printk(KERN_EMERG "Bad page state in process '%s'\n" + "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" + "Trying to fix it up, but a reboot is needed\n" + "Backtrace:\n", + current->comm, page, (int)(2*sizeof(unsigned long)), + (unsigned long)page->flags, page->mapping, + page_mapcount(page), page_count(page)); dump_stack(); - printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); page->flags &= ~(1 << PG_lru | 1 << PG_private | 1 << PG_locked | @@ -194,19 +194,15 @@ static void destroy_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - if (!PageCompound(page)) - return; - - if (page[1].index != order) - bad_page(__FUNCTION__, page); + if (unlikely(page[1].index != order)) + bad_page(page); for (i = 0; i < nr_pages; i++) { struct page *p = page + i; - if (!PageCompound(p)) - bad_page(__FUNCTION__, page); - if (page_private(p) != (unsigned long)page) - bad_page(__FUNCTION__, page); + if (unlikely(!PageCompound(p) | + (page_private(p) != (unsigned long)page))) + bad_page(page); ClearPageCompound(p); } } @@ -316,7 +312,7 @@ static inline void __free_pages_bulk (struct page *page, unsigned long page_idx; int order_size = 1 << order; - if (unlikely(order)) + if (unlikely(PageCompound(page))) destroy_compound_page(page, order); page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); @@ -348,7 +344,7 @@ static inline void __free_pages_bulk (struct page *page, zone->free_area[order].nr_free++; } -static inline int free_pages_check(const char *function, struct page *page) +static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -363,7 +359,7 @@ static inline int free_pages_check(const char *function, struct page *page) 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved )))) - bad_page(function, page); + bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); /* @@ -422,7 +418,7 @@ void __free_pages_ok(struct page *page, unsigned int order) #endif for (i = 0 ; i < (1 << order) ; ++i) - reserved += free_pages_check(__FUNCTION__, page + i); + reserved += free_pages_check(page + i); if (reserved) return; @@ -517,7 +513,7 @@ static int prep_new_page(struct page *page, int order) 1 << PG_swapcache | 1 << PG_writeback | 1 << PG_reserved )))) - bad_page(__FUNCTION__, page); + bad_page(page); /* * For now, we report if PG_reserved was found set, but do not @@ -716,7 +712,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (PageAnon(page)) page->mapping = NULL; - if (free_pages_check(__FUNCTION__, page)) + if (free_pages_check(page)) return; inc_page_state(pgfree); -- cgit v1.2.3 From f3fe65122da05e1cd4c9140340d96ea2f95d0c49 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 6 Jan 2006 00:11:15 -0800 Subject: [PATCH] mm: add populated_zone() helper There are numerous places we check whether a zone is populated or not. Provide a helper function to check for populated zones and convert all checks for zone->present_pages. Signed-off-by: Con Kolivas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b9fd2c238f13..8f3de5af92dd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1358,7 +1358,7 @@ void show_free_areas(void) show_node(zone); printk("%s per-cpu:", zone->name); - if (!zone->present_pages) { + if (!populated_zone(zone)) { printk(" empty\n"); continue; } else @@ -1435,7 +1435,7 @@ void show_free_areas(void) show_node(zone); printk("%s: ", zone->name); - if (!zone->present_pages) { + if (!populated_zone(zone)) { printk("empty\n"); continue; } @@ -2134,7 +2134,7 @@ static int frag_show(struct seq_file *m, void *arg) int order; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!zone->present_pages) + if (!populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -2167,7 +2167,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { int i; - if (!zone->present_pages) + if (!populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); -- cgit v1.2.3 From 1a93205bdffd9d7278d4a66081cdb48452522a58 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:11:16 -0800 Subject: [PATCH] mm: simplify build_zonelists_node by removing the case statement. Simplify build_zonelists_node by removing the case statement. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f3de5af92dd..7adc9526d329 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1455,35 +1455,23 @@ void show_free_areas(void) /* * Builds allocation fallback zone lists. + * + * Add all populated zones of a node to the zonelist. */ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) -{ - switch (k) { - struct zone *zone; - default: - BUG(); - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { +static int __init build_zonelists_node(pg_data_t *pgdat, + struct zonelist *zonelist, int j, int k) +{ + struct zone *zone; + + BUG_ON(k > ZONE_HIGHMEM); + for (zone = pgdat->node_zones + k; zone >= pgdat->node_zones; zone--) { + if (populated_zone(zone)) { #ifndef CONFIG_HIGHMEM - BUG(); + BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL); #endif zonelist->zones[j++] = zone; } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA32: - zone = pgdat->node_zones + ZONE_DMA32; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->present_pages) - zonelist->zones[j++] = zone; } - return j; } -- cgit v1.2.3 From 4be38e351c5f455f6f490f5aff29053e33ab4f99 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:11:17 -0800 Subject: [PATCH] mm: move determination of policy_zone into page allocator Currently the function to build a zonelist for a BIND policy has the side effect to set the policy_zone. This seems to be a bit strange. policy zone seems to not be initialized elsewhere and therefore 0. Do we police ZONE_DMA if no bind policy has been used yet? This patch moves the determination of the zone to apply policies to into the page allocator. We determine the zone while building the zonelist for nodes. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7adc9526d329..512e3f4d4963 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1470,6 +1471,7 @@ static int __init build_zonelists_node(pg_data_t *pgdat, BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL); #endif zonelist->zones[j++] = zone; + check_highest_zone(k); } } return j; -- cgit v1.2.3 From 02a68a5ebc7dd823da7496116f42290103e1e4a9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:11:18 -0800 Subject: [PATCH] Fix zone policy determination The use k in the inner loop means that the highest zone nr is always used if any zone of a node is populated. This means that the policy zone is not correctly determined on arches that do no use HIGHMEM like ia64. Change the loop to decrement k which also simplifies the BUG_ON. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 512e3f4d4963..ca978992c898 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1465,15 +1465,19 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zone *zone; BUG_ON(k > ZONE_HIGHMEM); - for (zone = pgdat->node_zones + k; zone >= pgdat->node_zones; zone--) { + + do { + zone = pgdat->node_zones + k; if (populated_zone(zone)) { #ifndef CONFIG_HIGHMEM - BUG_ON(zone - pgdat->node_zones > ZONE_NORMAL); + BUG_ON(k > ZONE_NORMAL); #endif zonelist->zones[j++] = zone; check_highest_zone(k); } - } + k--; + + } while (k >= 0); return j; } -- cgit v1.2.3 From 070f80326a215d8e6c4fd6f175e28eb446c492bc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 6 Jan 2006 00:11:19 -0800 Subject: [PATCH] build_zonelists_node(): rename args Give j and r meaningful names. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ca978992c898..7f580779abdb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1460,25 +1460,25 @@ void show_free_areas(void) * Add all populated zones of a node to the zonelist. */ static int __init build_zonelists_node(pg_data_t *pgdat, - struct zonelist *zonelist, int j, int k) + struct zonelist *zonelist, int nr_zones, int zone_type) { struct zone *zone; - BUG_ON(k > ZONE_HIGHMEM); + BUG_ON(zone_type > ZONE_HIGHMEM); do { - zone = pgdat->node_zones + k; + zone = pgdat->node_zones + zone_type; if (populated_zone(zone)) { #ifndef CONFIG_HIGHMEM - BUG_ON(k > ZONE_NORMAL); + BUG_ON(zone_type > ZONE_NORMAL); #endif - zonelist->zones[j++] = zone; - check_highest_zone(k); + zonelist->zones[nr_zones++] = zone; + check_highest_zone(zone_type); } - k--; + zone_type--; - } while (k >= 0); - return j; + } while (zone_type >= 0); + return nr_zones; } static inline int highest_zone(int zone_bits) -- cgit v1.2.3 From a74609fafa2e5cc31d558012abaaa55ec9ad9da4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 6 Jan 2006 00:11:20 -0800 Subject: [PATCH] mm: page_state opt Optimise page_state manipulations by introducing interrupt unsafe accessors to page_state fields. Callers must provide their own locking (either disable interrupts or not update from interrupt context). Switch over the hot callsites that can easily be moved under interrupts off sections. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 89 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 39 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7f580779abdb..fd47494cb989 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -424,9 +424,9 @@ void __free_pages_ok(struct page *page, unsigned int order) return; list_add(&page->lru, &list); - mod_page_state(pgfree, 1 << order); kernel_map_pages(page, 1<zone_pgdat; pg_data_t *orig = zonelist->zones[0]->zone_pgdat; struct per_cpu_pageset *p; - local_irq_save(flags); - cpu = smp_processor_id(); - p = zone_pcp(z,cpu); + p = zone_pcp(z, cpu); if (pg == orig) { p->numa_hit++; } else { @@ -696,7 +692,6 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) p->local_node++; else p->other_node++; - local_irq_restore(flags); #endif } @@ -716,11 +711,11 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (free_pages_check(page)) return; - inc_page_state(pgfree); kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); + __inc_page_state(pgfree); list_add(&page->lru, &pcp->list); pcp->count++; if (pcp->count >= pcp->high) @@ -753,49 +748,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. */ -static struct page * -buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) +static struct page *buffered_rmqueue(struct zonelist *zonelist, + struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); + int cpu; again: + cpu = get_cpu(); if (order == 0) { struct per_cpu_pages *pcp; - page = NULL; - pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; + pcp = &zone_pcp(zone, cpu)->pcp[cold]; local_irq_save(flags); - if (!pcp->count) + if (!pcp->count) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); - if (likely(pcp->count)) { - page = list_entry(pcp->list.next, struct page, lru); - list_del(&page->lru); - pcp->count--; + if (unlikely(!pcp->count)) + goto failed; } - local_irq_restore(flags); - put_cpu(); + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); + pcp->count--; } else { spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); + if (!page) + goto failed; } - if (page != NULL) { - BUG_ON(bad_range(zone, page)); - mod_page_state_zone(zone, pgalloc, 1 << order); - if (prep_new_page(page, order)) - goto again; + __mod_page_state_zone(zone, pgalloc, 1 << order); + zone_statistics(zonelist, zone, cpu); + local_irq_restore(flags); + put_cpu(); - if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); + BUG_ON(bad_range(zone, page)); + if (prep_new_page(page, order)) + goto again; - if (order && (gfp_flags & __GFP_COMP)) - prep_compound_page(page, order); - } + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); return page; + +failed: + local_irq_restore(flags); + put_cpu(); + return NULL; } #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ @@ -871,9 +875,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, continue; } - page = buffered_rmqueue(*z, order, gfp_mask); + page = buffered_rmqueue(zonelist, *z, order, gfp_mask); if (page) { - zone_statistics(zonelist, *z); break; } } while (*(++z) != NULL); @@ -1248,7 +1251,7 @@ void get_full_page_state(struct page_state *ret) __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); } -unsigned long __read_page_state(unsigned long offset) +unsigned long read_page_state_offset(unsigned long offset) { unsigned long ret = 0; int cpu; @@ -1262,18 +1265,26 @@ unsigned long __read_page_state(unsigned long offset) return ret; } -void __mod_page_state(unsigned long offset, unsigned long delta) +void __mod_page_state_offset(unsigned long offset, unsigned long delta) +{ + void *ptr; + + ptr = &__get_cpu_var(page_states); + *(unsigned long *)(ptr + offset) += delta; +} +EXPORT_SYMBOL(__mod_page_state_offset); + +void mod_page_state_offset(unsigned long offset, unsigned long delta) { unsigned long flags; - void* ptr; + void *ptr; local_irq_save(flags); ptr = &__get_cpu_var(page_states); - *(unsigned long*)(ptr + offset) += delta; + *(unsigned long *)(ptr + offset) += delta; local_irq_restore(flags); } - -EXPORT_SYMBOL(__mod_page_state); +EXPORT_SYMBOL(mod_page_state_offset); void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) -- cgit v1.2.3 From 84c2008af01132c4ca257ed9b595693c611df15d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 8 Jan 2006 01:00:28 -0800 Subject: [PATCH] revert "mm: page_state fixes" Hugh says: page_alloc_cpu_notify() specifically contains code to /* Add dead cpu's page_states to our own. */ which handles this more efficiently. Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd47494cb989..0b98f428b07b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1204,6 +1204,7 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) int cpu = 0; memset(ret, 0, sizeof(*ret)); + cpus_and(*cpumask, *cpumask, cpu_online_map); cpu = first_cpu(*cpumask); while (cpu < NR_CPUS) { @@ -1256,7 +1257,7 @@ unsigned long read_page_state_offset(unsigned long offset) unsigned long ret = 0; int cpu; - for_each_cpu(cpu) { + for_each_online_cpu(cpu) { unsigned long in; in = (unsigned long)&per_cpu(page_states, cpu) + offset; -- cgit v1.2.3 From bec6b0c89b234090681a4516e20ac5debe3e7c59 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 8 Jan 2006 01:00:38 -0800 Subject: [PATCH] slab: remove nested #ifdef CONFIG_NUMA For some reason there is an #ifdef CONFIG_NUMA within another #ifdef CONFIG_NUMA in the page allocator. Remove innermost #ifdef CONFIG_NUMA Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b98f428b07b..5eeeadd9f66a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1883,7 +1883,6 @@ bad: static inline void free_zone_pagesets(int cpu) { -#ifdef CONFIG_NUMA struct zone *zone; for_each_zone(zone) { @@ -1892,7 +1891,6 @@ static inline void free_zone_pagesets(int cpu) zone_pcp(zone, cpu) = NULL; kfree(pset); } -#endif } static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, -- cgit v1.2.3 From 8ad4b1fb8205340dba16b63467bb23efc27264d6 Mon Sep 17 00:00:00 2001 From: Rohit Seth Date: Sun, 8 Jan 2006 01:00:40 -0800 Subject: [PATCH] Make high and batch sizes of per_cpu_pagelists configurable As recently there has been lot of traffic on the right values for batch and high water marks for per_cpu_pagelists. This patch makes these two variables configurable through /proc interface. A new tunable /proc/sys/vm/percpu_pagelist_fraction is added. This entry controls the fraction of pages at most in each zone that are allocated for each per cpu page list. The min value for this is 8. It means that we don't allow more than 1/8th of pages in each zone to be allocated in any single per_cpu_pagelist. The batch value of each per cpu pagelist is also updated as a result. It is set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) Signed-off-by: Rohit Seth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5eeeadd9f66a..2c46f697e8ff 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; +int percpu_pagelist_fraction; static void fastcall free_hot_cold_page(struct page *page, int cold); @@ -1831,6 +1832,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) INIT_LIST_HEAD(&pcp->list); } +/* + * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * to the value high for the pageset p. + */ + +static void setup_pagelist_highmark(struct per_cpu_pageset *p, + unsigned long high) +{ + struct per_cpu_pages *pcp; + + pcp = &p->pcp[0]; /* hot list */ + pcp->high = high; + pcp->batch = max(1UL, high/4); + if ((high/4) > (PAGE_SHIFT * 8)) + pcp->batch = PAGE_SHIFT * 8; +} + + #ifdef CONFIG_NUMA /* * Boot pageset table. One per cpu which is going to be used for all @@ -1868,6 +1887,10 @@ static int __devinit process_zones(int cpu) goto bad; setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); + + if (percpu_pagelist_fraction) + setup_pagelist_highmark(zone_pcp(zone, cpu), + (zone->present_pages / percpu_pagelist_fraction)); } return 0; @@ -2567,6 +2590,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, return 0; } +/* + * percpu_pagelist_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist + * can have before it gets flushed back to buddy allocator. + */ + +int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + unsigned int cpu; + int ret; + + ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (!write || (ret == -EINVAL)) + return ret; + for_each_zone(zone) { + for_each_online_cpu(cpu) { + unsigned long high; + high = zone->present_pages / percpu_pagelist_fraction; + setup_pagelist_highmark(zone_pcp(zone, cpu), high); + } + } + return 0; +} + __initdata int hashdist = HASHDIST_DEFAULT; #ifdef CONFIG_NUMA -- cgit v1.2.3 From 23316bc86fd31c5d644a71c398ec41d9fecacec4 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:00:41 -0800 Subject: [PATCH] mm: cleanup zone_pcp Use zone_pcp everywhere even though NUMA code "knows" the internal details of the zone. Stop other people trying to copy, and it looks nicer. Also, only print the pagesets of online cpus in zoneinfo. Signed-off-by: Nick Piggin Cc: "Seth, Rohit" Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2c46f697e8ff..6b92a945ae6b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -597,7 +597,7 @@ void drain_remote_pages(void) if (zone->zone_pgdat->node_id == numa_node_id()) continue; - pset = zone->pageset[smp_processor_id()]; + pset = zone_pcp(zone, smp_processor_id()); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; @@ -1881,12 +1881,12 @@ static int __devinit process_zones(int cpu) for_each_zone(zone) { - zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), + zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, cpu_to_node(cpu)); - if (!zone->pageset[cpu]) + if (!zone_pcp(zone, cpu)) goto bad; - setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); + setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); if (percpu_pagelist_fraction) setup_pagelist_highmark(zone_pcp(zone, cpu), @@ -1898,8 +1898,8 @@ bad: for_each_zone(dzone) { if (dzone == zone) break; - kfree(dzone->pageset[cpu]); - dzone->pageset[cpu] = NULL; + kfree(zone_pcp(dzone, cpu)); + zone_pcp(dzone, cpu) = NULL; } return -ENOMEM; } @@ -1984,7 +1984,7 @@ static __devinit void zone_pcp_init(struct zone *zone) for (cpu = 0; cpu < NR_CPUS; cpu++) { #ifdef CONFIG_NUMA /* Early boot. Slab allocator not functional yet */ - zone->pageset[cpu] = &boot_pageset[cpu]; + zone_pcp(zone, cpu) = &boot_pageset[cpu]; setup_pageset(&boot_pageset[cpu],0); #else setup_pageset(zone_pcp(zone,cpu), batch); @@ -2227,7 +2227,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, ")" "\n pagesets"); - for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { + for_each_online_cpu(i) { struct per_cpu_pageset *pageset; int j; -- cgit v1.2.3 From 48db57f8ff10eb09ab887ccb6150b0da0c7be24e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 8 Jan 2006 01:00:42 -0800 Subject: [PATCH] mm: free_pages opt Try to streamline free_pages_bulk by ensuring callers don't pass in a 'count' that exceeds the list size. Some cleanups: Rename __free_pages_bulk to __free_one_page. Put the page list manipulation from __free_pages_ok into free_one_page. Make __free_pages_ok static. Signed-off-by: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 58 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 28 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6b92a945ae6b..ad3d0202cdef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -308,7 +308,7 @@ static inline int page_is_buddy(struct page *page, int order) * -- wli */ -static inline void __free_pages_bulk (struct page *page, +static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order) { unsigned long page_idx; @@ -383,40 +383,42 @@ static inline int free_pages_check(struct page *page) * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) +static void free_pages_bulk(struct zone *zone, int count, + struct list_head *list, int order) { - struct page *page = NULL; - int ret = 0; - spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - while (!list_empty(list) && count--) { + while (count--) { + struct page *page; + + BUG_ON(list_empty(list)); page = list_entry(list->prev, struct page, lru); - /* have to delete it as __free_pages_bulk list manipulates */ + /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); - __free_pages_bulk(page, zone, order); - ret++; + __free_one_page(page, zone, order); } spin_unlock(&zone->lock); - return ret; } -void __free_pages_ok(struct page *page, unsigned int order) +static void free_one_page(struct zone *zone, struct page *page, int order) { - unsigned long flags; LIST_HEAD(list); + list_add(&page->lru, &list); + free_pages_bulk(zone, 1, &list, order); +} + +static void __free_pages_ok(struct page *page, unsigned int order) +{ + unsigned long flags; int i; int reserved = 0; arch_free_page(page, order); #ifndef CONFIG_MMU - if (order > 0) - for (i = 1 ; i < (1 << order) ; ++i) - __put_page(page + i); + for (i = 1 ; i < (1 << order) ; ++i) + __put_page(page + i); #endif for (i = 0 ; i < (1 << order) ; ++i) @@ -424,11 +426,10 @@ void __free_pages_ok(struct page *page, unsigned int order) if (reserved) return; - list_add(&page->lru, &list); - kernel_map_pages(page, 1<pcp[i]; - if (pcp->count) - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; } } local_irq_restore(flags); @@ -627,8 +627,8 @@ static void __drain_pages(unsigned int cpu) pcp = &pset->pcp[i]; local_irq_save(flags); - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; local_irq_restore(flags); } } @@ -719,8 +719,10 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) __inc_page_state(pgfree); list_add(&page->lru, &pcp->list); pcp->count++; - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + if (pcp->count >= pcp->high) { + free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + pcp->count -= pcp->batch; + } local_irq_restore(flags); put_cpu(); } @@ -759,7 +761,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist, again: cpu = get_cpu(); - if (order == 0) { + if (likely(order == 0)) { struct per_cpu_pages *pcp; pcp = &zone_pcp(zone, cpu)->pcp[cold]; -- cgit v1.2.3 From 3e0d98b9f1eb757fc98efc84e74e54a08308aa73 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 8 Jan 2006 01:01:49 -0800 Subject: [PATCH] cpuset: memory pressure meter Provide a simple per-cpuset metric of memory pressure, tracking the -rate- that the tasks in a cpuset call try_to_free_pages(), the synchronous (direct) memory reclaim code. This enables batch managers monitoring jobs running in dedicated cpusets to efficiently detect what level of memory pressure that job is causing. This is useful both on tightly managed systems running a wide mix of submitted jobs, which may choose to terminate or reprioritize jobs that are trying to use more memory than allowed on the nodes assigned them, and with tightly coupled, long running, massively parallel scientific computing jobs that will dramatically fail to meet required performance goals if they start to use more memory than allowed to them. This patch just provides a very economical way for the batch manager to monitor a cpuset for signs of memory pressure. It's up to the batch manager or other user code to decide what to do about it and take action. ==> Unless this feature is enabled by writing "1" to the special file /dev/cpuset/memory_pressure_enabled, the hook in the rebalance code of __alloc_pages() for this metric reduces to simply noticing that the cpuset_memory_pressure_enabled flag is zero. So only systems that enable this feature will compute the metric. Why a per-cpuset, running average: Because this meter is per-cpuset, rather than per-task or mm, the system load imposed by a batch scheduler monitoring this metric is sharply reduced on large systems, because a scan of the tasklist can be avoided on each set of queries. Because this meter is a running average, instead of an accumulating counter, a batch scheduler can detect memory pressure with a single read, instead of having to read and accumulate results for a period of time. Because this meter is per-cpuset rather than per-task or mm, the batch scheduler can obtain the key information, memory pressure in a cpuset, with a single read, rather than having to query and accumulate results over all the (dynamically changing) set of tasks in the cpuset. A per-cpuset simple digital filter (requires a spinlock and 3 words of data per-cpuset) is kept, and updated by any task attached to that cpuset, if it enters the synchronous (direct) page reclaim code. A per-cpuset file provides an integer number representing the recent (half-life of 10 seconds) rate of direct page reclaims caused by the tasks in the cpuset, in units of reclaims attempted per second, times 1000. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ad3d0202cdef..e0e84924171b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -976,6 +976,7 @@ rebalance: cond_resched(); /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); p->flags |= PF_MEMALLOC; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; -- cgit v1.2.3 From de5097c2e73f826302cd8957c225b3725e0c7553 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jan 2006 15:59:21 -0800 Subject: [PATCH] mutex subsystem, more debugging code more mutex debugging: check for held locks during memory freeing, task exit, enable sysrq printouts, etc. Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e0e84924171b..a5e6891f7bb6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -415,6 +415,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) int reserved = 0; arch_free_page(page, order); + if (!PageHighMem(page)) + mutex_debug_check_no_locks_freed(page_address(page), + page_address(page+(1< Date: Wed, 11 Jan 2006 14:41:26 +0000 Subject: [PATCH] fix/simplify mutex debugging code Let's switch mutex_debug_check_no_locks_freed() to take (addr, len) as arguments instead, since all its callers were just calculating the 'to' address for themselves anyway... (and sometimes doing so badly). Signed-off-by: David Woodhouse Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a5e6891f7bb6..8e363536e2da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -417,7 +417,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) arch_free_page(page, order); if (!PageHighMem(page)) mutex_debug_check_no_locks_freed(page_address(page), - page_address(page+(1< Date: Wed, 11 Jan 2006 12:17:18 -0800 Subject: [PATCH] Restore KERN_EMERG to each line printed by bad_page Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e363536e2da..ce991b173aa9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -137,9 +137,9 @@ static inline int bad_range(struct zone *zone, struct page *page) static void bad_page(struct page *page) { printk(KERN_EMERG "Bad page state in process '%s'\n" - "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" - "Trying to fix it up, but a reboot is needed\n" - "Backtrace:\n", + KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" + KERN_EMERG "Trying to fix it up, but a reboot is needed\n" + KERN_EMERG "Backtrace:\n", current->comm, page, (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, page->mapping, page_mapcount(page), page_count(page)); -- cgit v1.2.3 From 4eac915d02453e81a32595cd7423492c81337a26 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 11 Jan 2006 12:17:19 -0800 Subject: [PATCH] mm: gfp_atomic comments Clarify in comments that GFP_ATOMIC means both "don't sleep" and "use emergency pools", hence both ALLOC_HARDER and ALLOC_HIGH. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce991b173aa9..d41a0662d4da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -931,7 +931,8 @@ restart: * * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling - * policy. + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ alloc_flags = ALLOC_WMARK_MIN; if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) -- cgit v1.2.3 From cbe8dd4af2967ee1c2d54ec9d4db35cf3ecc98d3 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 12 Jan 2006 01:05:24 -0800 Subject: [PATCH] memmap_init_zone(): remove uneccesary page++ Remove unecessary page++ from memmap_init_zone loop. Signed-off-by: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d41a0662d4da..8c960b469593 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1742,7 +1742,7 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long end_pfn = start_pfn + size; unsigned long pfn; - for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { + for (pfn = start_pfn; pfn < end_pfn; pfn++) { if (!early_pfn_valid(pfn)) continue; page = pfn_to_page(pfn); -- cgit v1.2.3