From 66918dcdf91ad101194c749c18099e836ba3de2b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 30 Jun 2009 11:41:37 -0700 Subject: x86: only clear node_states for 64bit Nathan reported that | commit 73d60b7f747176dbdff826c4127d22e1fd3f9f74 | Author: Yinghai Lu | Date: Tue Jun 16 15:33:00 2009 -0700 | | page-allocator: clear N_HIGH_MEMORY map before we set it again | | SRAT tables may contains nodes of very small size. The arch code may | decide to not activate such a node. However, currently the early boot | code sets N_HIGH_MEMORY for such nodes. These nodes therefore seem to be | active although these nodes have no present pages. | | For 64bit N_HIGH_MEMORY == N_NORMAL_MEMORY, so that works for 64 bit too unintentionally and incorrectly clears the cpuset.mems cgroup attribute on an i386 kvm guest, meaning that cpuset.mems can not be used. Fix this by only clearing node_states[N_NORMAL_MEMORY] for 64bit only. and need to do save/restore for that in find_zone_movable_pfn Reported-by: Nathan Lynch Tested-by: Nathan Lynch Signed-off-by: Yinghai Lu Cc: Christoph Lameter Cc: Ingo Molnar , Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5d714f8fb303..e0f2cdf9d8b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4032,6 +4032,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) int i, nid; unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; + /* save the state before borrow the nodemask */ + nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); @@ -4059,7 +4061,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) /* If kernelcore was not specified, there is no ZONE_MOVABLE */ if (!required_kernelcore) - return; + goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ find_usable_zone_for_movable(); @@ -4158,6 +4160,10 @@ restart: for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + +out: + /* restore the node_state */ + node_states[N_HIGH_MEMORY] = saved_node_state; } /* Any regular memory on that node ? */ @@ -4242,11 +4248,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) early_node_map[i].start_pfn, early_node_map[i].end_pfn); - /* - * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init - * that node_mask, clear it at first - */ - nodes_clear(node_states[N_HIGH_MEMORY]); /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); -- cgit v1.2.3 From 5bfd7560979062ad75c9805c1719cec990b5db29 Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sun, 5 Jul 2009 12:08:19 -0700 Subject: Fix virt_to_phys() warnings These warnings were observed on MIPS32 using 2.6.31-rc1 and gcc-4.2.0: mm/page_alloc.c: In function 'alloc_pages_exact': mm/page_alloc.c:1986: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast drivers/usb/mon/mon_bin.c: In function 'mon_alloc_buff': drivers/usb/mon/mon_bin.c:1264: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast [akpm@linux-foundation.org: fix kernel/perf_counter.c too] Signed-off-by: Kevin Cernekee Cc: Andi Kleen Cc: Ralf Baechle Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e0f2cdf9d8b1..ad7cd1c56b07 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1983,7 +1983,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned long alloc_end = addr + (PAGE_SIZE << order); unsigned long used = addr + PAGE_ALIGN(size); - split_page(virt_to_page(addr), order); + split_page(virt_to_page((void *)addr), order); while (used < alloc_end) { free_page(used); used += PAGE_SIZE; -- cgit v1.2.3 From 264ef8a904943ed7d0b04fa958894d7a5c2b2c61 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 7 Jul 2009 10:33:01 +0100 Subject: kmemleak: Remove alloc_bootmem annotations introduced in the past kmemleak_alloc() calls were added in some places where alloc_bootmem was called. Since now kmemleak tracks bootmem allocations, these explicit calls should be run. Signed-off-by: Catalin Marinas Cc: Ingo Molnar Acked-by: Pekka Enberg --- mm/page_alloc.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ad7cd1c56b07..3ef628845f07 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4745,8 +4745,10 @@ void *__init alloc_large_system_hash(const char *tablename, * some pages at the end of hash table which * alloc_pages_exact() automatically does */ - if (get_order(size) < MAX_ORDER) + if (get_order(size) < MAX_ORDER) { table = alloc_pages_exact(size, GFP_ATOMIC); + kmemleak_alloc(table, size, 1, GFP_ATOMIC); + } } } while (!table && size > PAGE_SIZE && --log2qty); @@ -4764,16 +4766,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (_hash_mask) *_hash_mask = (1 << log2qty) - 1; - /* - * If hashdist is set, the table allocation is done with __vmalloc() - * which invokes the kmemleak_alloc() callback. This function may also - * be called before the slab and kmemleak are initialised when - * kmemleak simply buffers the request to be executed later - * (GFP_ATOMIC flag ignored in this case). - */ - if (!hashdist) - kmemleak_alloc(table, size, 1, GFP_ATOMIC); - return table; } -- cgit v1.2.3 From 8aa7e847d834ed937a9ad37a0f2ad5b8584c1ab0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Jul 2009 14:52:32 +0200 Subject: Fix congestion_wait() sync/async vs read/write confusion Commit 1faa16d22877f4839bd433547d770c676d1d964c accidentally broke the bdi congestion wait queue logic, causing us to wait on congestion for WRITE (== 1) when we really wanted BLK_RW_ASYNC (== 0) instead. Signed-off-by: Jens Axboe --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ad7cd1c56b07..a35eeab2724c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1666,7 +1666,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -1831,7 +1831,7 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto rebalance; } -- cgit v1.2.3 From e084b2d95e48b31aa45f9c49ffc6cdae8bdb21d4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 29 Jul 2009 15:02:04 -0700 Subject: page-allocator: preserve PFN ordering when __GFP_COLD is set Fix a post-2.6.24 performace regression caused by 3dfa5721f12c3d5a441448086bee156887daa961 ("page-allocator: preserve PFN ordering when __GFP_COLD is set"). Narayanan reports "The regression is around 15%. There is no disk controller as our setup is based on Samsung OneNAND used as a memory mapped device on a OMAP2430 based board." The page allocator tries to preserve contiguous PFN ordering when returning pages such that repeated callers to the allocator have a strong chance of getting physically contiguous pages, particularly when external fragmentation is low. However, of the bulk of the allocations have __GFP_COLD set as they are due to aio_read() for example, then the PFNs are in reverse PFN order. This can cause performance degration when used with IO controllers that could have merged the requests. This patch attempts to preserve the contiguous ordering of PFNs for users of __GFP_COLD. Signed-off-by: Mel Gorman Reported-by: Narayananu Gopalakrishnan Tested-by: Narayanan Gopalakrishnan Cc: KAMEZAWA Hiroyuki Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index caa92689aac9..ae28c22a7fdb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -882,7 +882,7 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype) + int migratetype, int cold) { int i; @@ -901,7 +901,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - list_add(&page->lru, list); + if (likely(cold == 0)) + list_add(&page->lru, list); + else + list_add_tail(&page->lru, list); set_page_private(page, migratetype); list = &page->lru; } @@ -1119,7 +1122,8 @@ again: local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); + pcp->batch, &pcp->list, + migratetype, cold); if (unlikely(!pcp->count)) goto failed; } @@ -1138,7 +1142,8 @@ again: /* Allocate more to the pcp list if necessary */ if (unlikely(&page->lru == &pcp->list)) { pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); + pcp->batch, &pcp->list, + migratetype, cold); page = list_entry(pcp->list.next, struct page, lru); } -- cgit v1.2.3 From 6583bb64fc370842b32a87c67750c26f6d559af0 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 29 Jul 2009 15:02:06 -0700 Subject: mm: avoid endless looping for oom killed tasks If a task is oom killed and still cannot find memory when trying with no watermarks, it's better to fail the allocation attempt than to loop endlessly. Direct reclaim has already failed and the oom killer will be a no-op since current has yet to die, so there is no other alternative for allocations that are not __GFP_NOFAIL. Acked-by: Mel Gorman Signed-off-by: David Rientjes Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ae28c22a7fdb..2dbb2fc68837 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1794,6 +1794,10 @@ rebalance: if (p->flags & PF_MEMALLOC) goto nopage; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, -- cgit v1.2.3 From 1fc28b70fe2dbf87e061b6ce5091a1f8e4e5d4e7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 29 Jul 2009 15:04:08 -0700 Subject: page-allocator: allow too high-order warning messages to be suppressed with __GFP_NOWARN The page allocator warns once when an order >= MAX_ORDER is specified. This is to catch callers of the allocator that are always falling back to their worst-case when it was not expected. However, there are cases where the caller is behaving correctly but cannot suppress the warning. This patch allows the warning to be suppressed by the callers by specifying __GFP_NOWARN. Signed-off-by: Mel Gorman Acked-by: David Rientjes Cc: Arnaldo Carvalho de Melo Cc: "David S. Miller" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2dbb2fc68837..d052abbe3063 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1745,8 +1745,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * be using allocators in order of preference for an area that is * too large. */ - if (WARN_ON_ONCE(order >= MAX_ORDER)) + if (order >= MAX_ORDER) { + WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); return NULL; + } /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and -- cgit v1.2.3