From 6d6a4360876f1e758e215570ccb04518db7cec3a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: mm: use performance variant for_each_cpu_mask_nr Change references from for_each_cpu_mask to for_each_cpu_mask_nr where appropriate Reviewed-by: Paul Jackson Reviewed-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- mm/allocpercpu.c | 4 ++-- mm/vmstat.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index f4026bae6eed..b5411c2c47d4 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -35,7 +35,7 @@ EXPORT_SYMBOL_GPL(percpu_depopulate); void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) { int cpu; - for_each_cpu_mask(cpu, *mask) + for_each_cpu_mask_nr(cpu, *mask) percpu_depopulate(__pdata, cpu); } EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); @@ -86,7 +86,7 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, int cpu; cpus_clear(populated); - for_each_cpu_mask(cpu, *mask) + for_each_cpu_mask_nr(cpu, *mask) if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { __percpu_depopulate_mask(__pdata, &populated); return -ENOMEM; diff --git a/mm/vmstat.c b/mm/vmstat.c index db9eabb2c5b3..c3d4a781802f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -26,7 +26,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); - for_each_cpu_mask(cpu, *cpumask) { + for_each_cpu_mask_nr(cpu, *cpumask) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); for (i = 0; i < NR_VM_EVENT_ITEMS; i++) -- cgit v1.2.3 From 3eefae994d9224fb7771a3ddb683868363c23510 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:04 +0200 Subject: ftrace: limit trace entries Currently there is no protection against the root user using up all of memory for trace buffers. If the root user allocates too many entries, the OOM killer might start killing off all tasks. This patch adds an algorithm to check the following condition: pages_requested > (freeable_memory + current_trace_buffer_pages) / 4 If the above is met then the allocation fails. The above prevents more than 1/4th of freeable memory from being used by trace buffers. To determine the freeable_memory, I made determine_dirtyable_memory in mm/page-writeback.c global. Special thanks goes to Peter Zijlstra for suggesting the above calculation. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- mm/page-writeback.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 789b6adbef37..b38f700825fc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -126,8 +126,6 @@ static void background_writeout(unsigned long _min_pages); static struct prop_descriptor vm_completions; static struct prop_descriptor vm_dirties; -static unsigned long determine_dirtyable_memory(void); - /* * couple the period to the dirty_ratio: * @@ -347,7 +345,13 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) #endif } -static unsigned long determine_dirtyable_memory(void) +/** + * determine_dirtyable_memory - amount of memory that may be used + * + * Returns the numebr of pages that can currently be freed and used + * by the kernel for direct mappings. 
+ */ +unsigned long determine_dirtyable_memory(void) { unsigned long x; -- cgit v1.2.3 From e8c27ac9191ab9e6506ae5cbe70d87ac50f8e960 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 1 Jun 2008 13:15:22 -0700 Subject: x86, numa, 32-bit: print out debug info on all kvas Also fix the printout of node_remap_end_vaddr. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 63835579323a..502223c3c2c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3486,6 +3486,11 @@ void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, calculate_node_totalpages(pgdat, zones_size, zholes_size); alloc_node_mem_map(pgdat); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", + nid, (unsigned long)pgdat, + (unsigned long)pgdat->node_mem_map); +#endif free_area_init_core(pgdat, zones_size, zholes_size); } -- cgit v1.2.3 From cc1a9d86ce989083703c4bdc11b75a87e1cc404a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 8 Jun 2008 19:39:16 -0700 Subject: mm, x86: shrink_active_range() should check all Now we are using register_e820_active_regions() instead of add_active_range() directly, so the end_pfn recorded in early_node_map can differ from node_end_pfn. So we need to make shrink_active_range() smarter. shrink_active_range() is a generic MM function in mm/page_alloc.c but it is only used on 32-bit x86. Should we move it back to some file in arch/x86? Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 502223c3c2c6..215408684076 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3579,25 +3579,49 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, /** * shrink_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @old_end_pfn: The old end PFN of the range * @new_end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. - * The map is kept at the end physical page range that has already been - * registered with add_active_range(). This function allows an arch to shrink - * an existing registered range. + * The map is kept near the end physical page range that has already been + * registered. This function allows an arch to shrink an existing registered + * range. 
*/ -void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, - unsigned long new_end_pfn) +void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) { - int i; + int i, j; + int removed = 0; /* Find the old active region end and shrink */ - for_each_active_range_index_in_nid(i, nid) - if (early_node_map[i].end_pfn == old_end_pfn) { + for_each_active_range_index_in_nid(i, nid) { + if (early_node_map[i].start_pfn >= new_end_pfn) { + /* clear it */ + early_node_map[i].end_pfn = 0; + removed = 1; + continue; + } + if (early_node_map[i].end_pfn > new_end_pfn) { early_node_map[i].end_pfn = new_end_pfn; - break; + continue; } + } + + if (!removed) + return; + + /* remove the blank ones */ + for (i = nr_nodemap_entries - 1; i > 0; i--) { + if (early_node_map[i].nid != nid) + continue; + if (early_node_map[i].end_pfn) + continue; + /* we found it, get rid of it */ + for (j = i; j < nr_nodemap_entries - 1; j++) + memcpy(&early_node_map[j], &early_node_map[j+1], + sizeof(early_node_map[j])); + j = nr_nodemap_entries - 1; + memset(&early_node_map[j], 0, sizeof(early_node_map[j])); + nr_nodemap_entries--; + } } /** -- cgit v1.2.3 From 1ea0704e0da65b2b46f9142ff1391163aac24060 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 04:30:00 -0700 Subject: mm: add a ptep_modify_prot transaction abstraction This patch adds an API for doing read-modify-write updates to a pte's protection bits which may race against hardware updates to the pte. After reading the pte, the hardware may asynchronously set the accessed or dirty bits on a pte, which would be lost when writing back the modified pte value. The existing technique to handle this race is to use ptep_get_and_clear() to atomically fetch the old pte value and clear it in memory. This has the effect of marking the pte as non-present, which will prevent the hardware from updating its state. When the new value is written back, the pte will be present again, and the hardware can resume updating the access/dirty flags. When running in a virtualized environment, pagetable updates are relatively expensive, since they generally involve some trap into the hypervisor. To mitigate the cost of these updates, we tend to batch them. However, because of the atomic nature of ptep_get_and_clear(), it is inherently non-batchable. This new interface allows batching by giving the underlying implementation enough information to open a transaction between the read and write phases: ptep_modify_prot_start() returns the current pte value, and puts the pte entry into a state where either the hardware will not update the pte, or if it does, the updates will be preserved on commit. ptep_modify_prot_commit() writes back the updated pte and makes sure that any hardware updates made since ptep_modify_prot_start() are preserved. ptep_modify_prot_start() and _commit() must be exactly paired, and used while holding the appropriate pte lock. They do not protect against other software updates of the pte in any way. The current implementations of ptep_modify_prot_start and _commit are functionally unchanged from before: _start() uses ptep_get_and_clear() to fetch the pte and zero the entry, preventing any hardware updates. _commit() simply writes the new pte value back, knowing that the hardware has not updated the pte in the meantime. 
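As a rough sketch (not quoted from this series; the exact names and signatures of the generic fallback are assumptions), the pairing described above amounts to:

static inline pte_t ptep_modify_prot_start(struct mm_struct *mm,
					   unsigned long addr, pte_t *ptep)
{
	/* Clearing the entry stops the hardware from setting A/D bits behind us. */
	return ptep_get_and_clear(mm, addr, ptep);
}

static inline void ptep_modify_prot_commit(struct mm_struct *mm,
					   unsigned long addr, pte_t *ptep,
					   pte_t pte)
{
	/* The entry was non-present in between, so a plain store is sufficient. */
	set_pte_at(mm, addr, ptep, pte);
}
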
The only current user of this interface is mprotect Signed-off-by: Jeremy Fitzhardinge Acked-by: Linus Torvalds Acked-by: Hugh Dickins Signed-off-by: Ingo Molnar --- mm/mprotect.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index a5bf31c27375..acfe7c8d72fc 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -47,19 +47,17 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, if (pte_present(oldpte)) { pte_t ptent; - /* Avoid an SMP race with hardware updated dirty/clean - * bits by wiping the pte and then setting the new pte - * into place. - */ - ptent = ptep_get_and_clear(mm, addr, pte); + ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); + /* * Avoid taking write faults for pages we know to be * dirty. */ if (dirty_accountable && pte_dirty(ptent)) ptent = pte_mkwrite(ptent); - set_pte_at(mm, addr, pte, ptent); + + ptep_modify_prot_commit(mm, addr, pte, ptent); #ifdef CONFIG_MIGRATION } else if (!pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); -- cgit v1.2.3 From 15c8b6c1aaaf1c4edd67e2f02e4d8e1bd1a51c0d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 May 2008 09:39:44 +0200 Subject: on_each_cpu(): kill unused 'retry' parameter It's not even passed on to smp_call_function() anymore, since that was removed. So kill it. Acked-by: Jeremy Fitzhardinge Reviewed-by: Paul E. McKenney Signed-off-by: Jens Axboe --- mm/page_alloc.c | 2 +- mm/slab.c | 4 ++-- mm/slub.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2f552955a02f..53242344a774 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -918,7 +918,7 @@ void drain_local_pages(void *arg) */ void drain_all_pages(void) { - on_each_cpu(drain_local_pages, NULL, 0, 1); + on_each_cpu(drain_local_pages, NULL, 1); } #ifdef CONFIG_HIBERNATION diff --git a/mm/slab.c b/mm/slab.c index 046607f05f3e..0772abb412b9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2454,7 +2454,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep) struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1, 1); + on_each_cpu(do_drain, cachep, 1); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -3939,7 +3939,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, } new->cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); + on_each_cpu(do_ccupdate_local, (void *)new, 1); check_irq_on(); cachep->batchcount = batchcount; diff --git a/mm/slub.c b/mm/slub.c index 0987d1cd943c..44715eb70c06 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1497,7 +1497,7 @@ static void flush_cpu_slab(void *d) static void flush_all(struct kmem_cache *s) { #ifdef CONFIG_SMP - on_each_cpu(flush_cpu_slab, s, 1, 1); + on_each_cpu(flush_cpu_slab, s, 1); #else unsigned long flags; -- cgit v1.2.3 From 38510754a50192a072210e24fdc4ae65592182f0 Mon Sep 17 00:00:00 2001 From: Haavard Skinnemoen Date: Mon, 14 Jan 2008 23:35:32 +0100 Subject: avr32: Use a quicklist for PTE allocation as well Using a quicklist to allocate PTEs might be slightly faster than using the page allocator directly since we might avoid zeroing the page after each allocation. 
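A hedged sketch of the kind of allocator this enables (the helper name, the QUICK_PT index and the GFP flags below are assumptions, not part of this diff):

static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr)
{
	/* Pages recycled through the per-cpu quicklist are kept zeroed, so this
	 * can often skip the clear_page() a freshly allocated page would need. */
	return quicklist_alloc(QUICK_PT, GFP_KERNEL | __GFP_REPEAT, NULL);
}
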
Signed-off-by: Haavard Skinnemoen --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 3aa819d628c1..60b261952783 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -199,7 +199,7 @@ config BOUNCE config NR_QUICK int depends on QUICKLIST - default "2" if SUPERH + default "2" if SUPERH || AVR32 default "1" config VIRT_TO_BUS -- cgit v1.2.3 From cc1050bafebfb1d7935331282e948b5016318192 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Jun 2008 19:08:52 -0700 Subject: x86: replace shrink_active_range() with remove_active_range() In case we have kva before the ramdisk on a node, we still need to use those ranges. v2: reserve_early the kva ram area, in case there are holes in highmem, so that those areas cannot be treated as free high pages. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee5ba7509c1..d80e1868e570 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3552,30 +3552,47 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, } /** - * shrink_active_range - Shrink an existing registered range of PFNs + * remove_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @new_end_pfn: The new PFN of the range + * @start_pfn: The new PFN of the range + * @end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. * The map is kept near the end physical page range that has already been * registered. This function allows an arch to shrink an existing registered * range. */ -void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) +void __init remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) { int i, j; int removed = 0; + printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", + nid, start_pfn, end_pfn); + /* Find the old active region end and shrink */ for_each_active_range_index_in_nid(i, nid) { - if (early_node_map[i].start_pfn >= new_end_pfn) { + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn <= end_pfn) { /* clear it */ + early_node_map[i].start_pfn = 0; early_node_map[i].end_pfn = 0; removed = 1; continue; } - if (early_node_map[i].end_pfn > new_end_pfn) { - early_node_map[i].end_pfn = new_end_pfn; + if (early_node_map[i].start_pfn < start_pfn && + early_node_map[i].end_pfn > start_pfn) { + unsigned long temp_end_pfn = early_node_map[i].end_pfn; + early_node_map[i].end_pfn = start_pfn; + if (temp_end_pfn > end_pfn) + add_active_range(nid, end_pfn, temp_end_pfn); + continue; + } + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn > end_pfn && + early_node_map[i].start_pfn < end_pfn) { + early_node_map[i].start_pfn = end_pfn; continue; } } -- cgit v1.2.3 From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 14 Jun 2008 18:32:52 -0700 Subject: x86, mm: use add_highpages_with_active_regions() for high pages init v2 Use early_node_map to init high pages, so we can remove page_is_ram() and page_is_reserved_early() from the big loop with add_one_highpage. Also remove page_is_reserved_early(); it is not needed anymore. 
v2: fix the build of other platforms Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d80e1868e570..41c6e3aa059f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2929,6 +2929,14 @@ void __init free_bootmem_with_active_regions(int nid, } } +void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, + data); +} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. -- cgit v1.2.3 From d52d53b8a5b258bfaab9223a5e7284fcfdd48577 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 20:10:55 -0700 Subject: RFC x86: try to remove arch_get_ram_range We want to remove arch_get_ram_range() and use early_node_map instead. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 41c6e3aa059f..e25b6b24f844 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2932,10 +2932,14 @@ void __init free_bootmem_with_active_regions(int nid, void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; + int ret; - for_each_active_range_index_in_nid(i, nid) - work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, - data); + for_each_active_range_index_in_nid(i, nid) { + ret = work_fn(early_node_map[i].start_pfn, + early_node_map[i].end_pfn, data); + if (ret) + break; + } } /** * sparse_memory_present_with_active_regions - Call memory_present for each active range -- cgit v1.2.3 From e2fc252e0ce695b4c4abe27bb073c35bd0d73252 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 22 Jun 2008 07:22:12 -0700 Subject: x86 boot: show pfn addresses in hex not decimal in some kernel info printks Page frame numbers (the portion of physical addresses above the low-order page offsets) are displayed in several kernel debug and info prints in decimal, not hex. Decimal addresses are unreadable. Use hex. 
Signed-off-by: Paul Jackson Cc: "Yinghai Lu" Cc: "Jack Steiner" Cc: "Mike Travis" Cc: "Huang Cc: Ying" Cc: "Andi Kleen" Cc: "Andrew Morton" Cc: Paul Jackson Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e25b6b24f844..c09c2c8d2c6a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3520,7 +3520,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, { int i; - printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " + printk(KERN_DEBUG "Entering add_active_range(%d, 0x%lx, 0x%lx) " "%d entries of %d used\n", nid, start_pfn, end_pfn, nr_nodemap_entries, MAX_ACTIVE_REGIONS); @@ -3936,7 +3936,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(" %-8s %8lu -> %8lu\n", + printk(" %-8s 0x%8lx -> 0x%8lx\n", zone_names[i], arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); @@ -3952,7 +3952,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Print out the early_node_map[] */ printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); for (i = 0; i < nr_nodemap_entries; i++) - printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, + printk(" %3d: 0x%8lx -> 0x%8lx\n", early_node_map[i].nid, early_node_map[i].start_pfn, early_node_map[i].end_pfn); -- cgit v1.2.3 From 2bc0d2615a15a93d344abbe8cb1b9056122bce9d Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sun, 22 Jun 2008 07:22:17 -0700 Subject: x86 boot: more consistently use type int for node ids Everywhere I look, node id's are of type 'int', except in this one case, which has 'unsigned long'. Change this one to 'int' as well. There is nothing special about the way this variable 'nid' is used in this routine to justify using an unusual type here. Signed-off-by: Paul Jackson Cc: "Yinghai Lu" Cc: "Jack Steiner" Cc: "Mike Travis" Cc: "Huang Cc: Ying" Cc: "Andi Kleen" Cc: "Andrew Morton" Cc: Paul Jackson Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c09c2c8d2c6a..b604c64a0337 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3669,7 +3669,7 @@ static void __init sort_node_map(void) } /* Find the lowest pfn for a node */ -unsigned long __init find_min_pfn_for_node(unsigned long nid) +unsigned long __init find_min_pfn_for_node(int nid) { int i; unsigned long min_pfn = ULONG_MAX; @@ -3680,7 +3680,7 @@ unsigned long __init find_min_pfn_for_node(unsigned long nid) if (min_pfn == ULONG_MAX) { printk(KERN_WARNING - "Could not find start_pfn for node %lu\n", nid); + "Could not find start_pfn for node %d\n", nid); return 0; } -- cgit v1.2.3 From 5dab8ec139be215fbaba216fb4aea914d0f4dac5 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 25 Jun 2008 05:44:40 -0700 Subject: mm, generic, x86 boot: more tweaks to hex prints of some pfn addresses Fix some problems with (and applies on top of) a previous patch: x86 boot: show pfn addresses in hex not decimal in some kernel info printks Primarily change "0x%8lx" format, which displays with a right aligned space filled hex number (spaces between the "0x" prefix and the number), into "%0#10lx" format, which zero fills instead of space fills, and which uses the printf flag '#' to request the "0x" prefix instead of hard coding it. 
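For illustration (the pfn value below is made up), the two formats render the same number like this:

	printk("0x%8lx\n",  0x37bfeUL);		/* previous patch: "0x   37bfe" - space filled */
	printk("%0#10lx\n", 0x37bfeUL);		/* this patch:     "0x00037bfe" - zero filled  */
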
Also replace some other "0x%lx" formats with "%#lx", making use of the '#' printf flag again. Signed-off-by: Paul Jackson Cc: "Yinghai Lu" Cc: "Jack Steiner" Cc: "Mike Travis" Cc: "Huang Cc: Ying" Cc: "Andi Kleen" Cc: "Andrew Morton" Cc: Paul Jackson Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b604c64a0337..f024b9b3a2a6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3520,7 +3520,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, { int i; - printk(KERN_DEBUG "Entering add_active_range(%d, 0x%lx, 0x%lx) " + printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " "%d entries of %d used\n", nid, start_pfn, end_pfn, nr_nodemap_entries, MAX_ACTIVE_REGIONS); @@ -3936,7 +3936,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(" %-8s 0x%8lx -> 0x%8lx\n", + printk(" %-8s %0#10lx -> %0#10lx\n", zone_names[i], arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); @@ -3952,7 +3952,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Print out the early_node_map[] */ printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); for (i = 0; i < nr_nodemap_entries; i++) - printk(" %3d: 0x%8lx -> 0x%8lx\n", early_node_map[i].nid, + printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, early_node_map[i].start_pfn, early_node_map[i].end_pfn); -- cgit v1.2.3 From b845f313d78e4e259ec449909e3bbadf77b53a6d Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Tue, 8 Jul 2008 00:28:51 +1000 Subject: mm: Allow architectures to define additional protection bits This patch allows architectures to define functions to deal with additional protection bits for mmap() and mprotect(). arch_calc_vm_prot_bits() maps additional protection bits to vm_flags. arch_vm_get_page_prot() maps additional vm_flags to the vma's vm_page_prot. arch_validate_prot() checks for valid values of the protection bits. Note: vm_get_page_prot() is now pretty ugly, but the generated code should be identical for architectures that don't define additional protection bits. 
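A hedged sketch of the default, no-op hooks an architecture would override (the exact spelling and header placement are assumptions; only the mm/ side appears in the diff below):

#ifndef arch_calc_vm_prot_bits
#define arch_calc_vm_prot_bits(prot)	0
#endif

#ifndef arch_vm_get_page_prot
#define arch_vm_get_page_prot(vm_flags)	__pgprot(0)
#endif

#ifndef arch_validate_prot
static inline int arch_validate_prot(unsigned long prot)
{
	/* Same check sys_mprotect() used to open-code before this patch. */
	return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0;
}
#endif
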
Signed-off-by: Dave Kleikamp Acked-by: Andrew Morton Acked-by: Hugh Dickins Signed-off-by: Benjamin Herrenschmidt --- mm/mmap.c | 5 +++-- mm/mprotect.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 3354fdd83d4b..1d102b956fd8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -72,8 +72,9 @@ pgprot_t protection_map[16] = { pgprot_t vm_get_page_prot(unsigned long vm_flags) { - return protection_map[vm_flags & - (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; + return __pgprot(pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | + pgprot_val(arch_vm_get_page_prot(vm_flags))); } EXPORT_SYMBOL(vm_get_page_prot); diff --git a/mm/mprotect.c b/mm/mprotect.c index a5bf31c27375..ecfaa5844b5f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -239,7 +239,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) end = start + len; if (end <= start) return -ENOMEM; - if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) + if (!arch_validate_prot(prot)) return -EINVAL; reqprot = prot; -- cgit v1.2.3 From f4c0a0fdfae708f7aa438c27a380ed4071294e11 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: vfs: export filemap_fdatawrite_range() Make the filemap_fdatawrite_range() function public, so that it can later be used in the ordered mode rewrite by JBD/JBD2. Signed-off-by: Jan Kara --- mm/filemap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 1e6a7d34874f..65d9d9e2b755 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawrite); -static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } +EXPORT_SYMBOL(filemap_fdatawrite_range); /** * filemap_flush - mostly a non-blocking flush -- cgit v1.2.3 From 06d6cf6959d22037fcec598f4f954db5db3d7356 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: mm: Add range_cont mode for writeback Filesystems like ext4 need to start a new transaction in writepages for block allocation. This happens with delayed allocation and there is a limit to how many credits we can request from the journal layer. So we call write_cache_pages multiple times with wbc->nr_to_write set to the maximum possible value limited by the max journal credits available. Add a new mode to writeback that enables us to handle this behaviour. In the new mode we update wbc->range_start to point to the new offset to be written. The next call to write_cache_pages will start writeout from the specified range_start offset. In the new mode we also limit writing to the specified wbc->range_end. 
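A minimal sketch of the caller pattern this enables (the writepage callback and the per-transaction page limit are placeholders, not from this patch):

	int ret = 0;

	wbc->range_cont = 1;			/* new mode added below */
	while (ret == 0 && wbc->nr_to_write > 0) {
		/* size each pass to what one journal transaction can cover */
		wbc->nr_to_write = pages_per_transaction;
		ret = write_cache_pages(mapping, wbc, my_writepage, mapping);
		/* wbc->range_start has been advanced for the next pass */
	}
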
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Acked-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- mm/page-writeback.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 789b6adbef37..ded57d528060 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -956,6 +956,9 @@ retry: } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; + + if (wbc->range_cont) + wbc->range_start = index << PAGE_CACHE_SHIFT; return ret; } EXPORT_SYMBOL(write_cache_pages); -- cgit v1.2.3 From 421c175c4d609864350df495b34d3e99f9fb1bdd Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 14 Jul 2008 09:59:18 +0200 Subject: [S390] Add support for memory hot-add. Cc: Gerald Schaefer Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 3aa819d628c1..4242743b981b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -129,7 +129,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG - depends on (IA64 || X86 || PPC64 || SUPERH) + depends on (IA64 || X86 || PPC64 || SUPERH || S390) comment "Memory hotplug is currently incompatible with Software Suspend" depends on SPARSEMEM && HOTPLUG && HIBERNATION -- cgit v1.2.3 From 7daf705f362e349983e92037a198b8821db198af Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 14 Jul 2008 12:12:53 -0700 Subject: Start using the new '%pS' infrastructure to print symbols This simplifies the code significantly, and was the whole point of the exercise. Signed-off-by: Linus Torvalds --- mm/slub.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 315c392253c7..5f6e2c4a2ba7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -431,9 +431,8 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "INFO: %s in ", s); - __print_symbol("%s", (unsigned long)t->addr); - printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); + printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, t->addr, jiffies - t->when, t->cpu, t->pid); } static void print_tracking(struct kmem_cache *s, void *object) -- cgit v1.2.3 From 0937502af7c9b648ed4e884ccb7f504b01a005a1 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 28 May 2008 10:32:22 -0700 Subject: slub: Add check for kfree() of non slab objects. We can detect kfree()s on non slab objects by checking for PageCompound(). Works in the same way as for ksize. This helped me catch an invalid kfree(). 
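For illustration, the kind of bogus call the new check turns into an immediate BUG instead of silent corruption (example is made up):

	static char not_a_slab_object[64];

	kfree(not_a_slab_object);	/* never came from kmalloc(): PageSlab() is false and the
					 * page is not compound, so BUG_ON() now fires instead of
					 * put_page() corrupting the page's reference count */
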
Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5f6e2c4a2ba7..b3f2e713cdf1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2766,6 +2766,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { + BUG_ON(!PageCompound(page)); put_page(page); return; } -- cgit v1.2.3 From 88e4ccf294ca62c2da998012a83533ce150c8dce Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 23 Jun 2008 02:58:37 +0400 Subject: slub: current is always valid Acked-by: Christoph Lameter Signed-off-by: Alexey Dobriyan Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index b3f2e713cdf1..488400d10700 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -411,7 +411,7 @@ static void set_track(struct kmem_cache *s, void *object, if (addr) { p->addr = addr; p->cpu = smp_processor_id(); - p->pid = current ? current->pid : -1; + p->pid = current->pid; p->when = jiffies; } else memset(p, 0, sizeof(struct track)); -- cgit v1.2.3 From e79aec291da55aa322ddb5d8f3bb04cdf69470d5 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Fri, 4 Jul 2008 00:40:32 +0530 Subject: slab: rename slab_destroy_objs With the removal of destructors, slab_destroy_objs no longer actually destroys any objects, making the kernel doc incorrect and the function name misleading. In keeping with the other debug functions, rename it to slab_destroy_debugcheck and drop the kernel doc. Signed-off-by: Rabin Vincent Signed-off-by: Pekka Enberg --- mm/slab.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 046607f05f3e..b4aa4c88250e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1901,15 +1901,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) #endif #if DEBUG -/** - * slab_destroy_objs - destroy a slab and its objects - * @cachep: cache pointer being destroyed - * @slabp: slab pointer being destroyed - * - * Call the registered destructor for each object in a slab that is being - * destroyed. - */ -static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { @@ -1938,7 +1930,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) } } #else -static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) { } #endif @@ -1956,7 +1948,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) { void *addr = slabp->s_mem - slabp->colouroff; - slab_destroy_objs(cachep, slabp); + slab_destroy_debugcheck(cachep, slabp); if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { struct slab_rcu *slab_rcu; -- cgit v1.2.3 From 41ab8592ca35a20580665cae18c172816236b21e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 16 Jul 2008 21:29:02 +0400 Subject: SLUB: simplify re on_each_cpu() on_each_cpu() expands to function call on UP, too. 
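Roughly what the UP definition in <linux/smp.h> looks like at this point (reproduced from memory, so treat it as an approximation rather than a quote):

	#define on_each_cpu(func, info, wait)	\
		({				\
			local_irq_disable();	\
			(func)(info);		\
			local_irq_enable();	\
			0;			\
		})
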
Acked-by: Christoph Lameter Signed-off-by: Alexey Dobriyan Signed-off-by: Pekka Enberg --- mm/slub.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 35ab38a94b46..6cd9fec18f92 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1495,15 +1495,7 @@ static void flush_cpu_slab(void *d) static void flush_all(struct kmem_cache *s) { -#ifdef CONFIG_SMP on_each_cpu(flush_cpu_slab, s, 1); -#else - unsigned long flags; - - local_irq_save(flags); - flush_cpu_slab(s); - local_irq_restore(flags); -#endif } /* -- cgit v1.2.3 From 0ebd652b35e988c0be3390e49b39cc064ba1cfce Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sat, 19 Jul 2008 14:17:22 +0300 Subject: slub: dump more data on slab corruption The limit of 128 bytes is too small when debugging slab corruption of the skb cache, for example. So increase the limit to PAGE_SIZE to make debugging corruptions easier. Acked-by: Ingo Molnar Acked-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 6cd9fec18f92..6d4a49c1ff2f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -492,7 +492,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (p > addr + 16) print_section("Bytes b4", p - 16, 16); - print_section("Object", p, min(s->objsize, 128)); + print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) print_section("Redzone", p + s->objsize, -- cgit v1.2.3 From db7a94d60f871ce6a52e97d82dea476cee0c4ea0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 19 Jul 2008 22:39:46 -0700 Subject: highmem: Export totalhigh_pages. Hash et al. sizing code in SCTP wants to make the calculation totalram_pages - totalhigh_pages, just like TCP. But this requires an export for the CONFIG_HIGHMEM case to work. Signed-off-by: David S. Miller --- mm/highmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index 7da4a7b6af11..e16e1523b688 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -40,6 +40,7 @@ #ifdef CONFIG_HIGHMEM unsigned long totalhigh_pages __read_mostly; +EXPORT_SYMBOL(totalhigh_pages); unsigned int nr_free_highpages (void) { -- cgit v1.2.3
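A hedged sketch of the kind of sizing calculation SCTP wants to do (the scale factor and variable names are placeholders; only the export itself comes from this patch):

	/* Only lowmem is usable for the hash tables, so size them from it. */
	unsigned long lowmem_pages = totalram_pages - totalhigh_pages;
	unsigned long goal = lowmem_pages >> 9;		/* placeholder fraction of lowmem */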