From 076422d2af7e3d8e72c6e70843f6ea377714b082 Mon Sep 17 00:00:00 2001 From: Amul Shah Date: Tue, 13 Feb 2007 13:26:19 +0100 Subject: [PATCH] x86-64: Allocate the NUMA hash function nodemap dynamically Remove the statically allocated memory to NUMA node hash map in favor of a dynamically allocated memory to node hash map (it is cache aligned). This patch has the nice side effect in that it allows the hash map to grow for systems with large amounts of memory (256GB - 1TB), but suffer from having small PCI space tacked onto the boot node (which is somewhere between 192MB to 512MB on the ES7000). Signed-off-by: Amul Shah Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Rohit Seth Signed-off-by: Andrew Morton --- arch/x86_64/mm/numa.c | 74 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 9 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 2ee2e003606c..7d9c428f4094 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; int numa_off __initdata; +unsigned long __initdata nodemap_addr; +unsigned long __initdata nodemap_size; /* @@ -52,34 +54,87 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) int res = -1; unsigned long addr, end; - if (shift >= 64) - return -1; - memset(memnodemap, 0xff, sizeof(memnodemap)); + memset(memnodemap, 0xff, memnodemapsize); for (i = 0; i < numnodes; i++) { addr = nodes[i].start; end = nodes[i].end; if (addr >= end) continue; - if ((end >> shift) >= NODEMAPSIZE) + if ((end >> shift) >= memnodemapsize) return 0; do { if (memnodemap[addr >> shift] != 0xff) return -1; memnodemap[addr >> shift] = i; - addr += (1UL << shift); + addr += (1UL << shift); } while (addr < end); res = 1; } return res; } -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +static int __init allocate_cachealigned_memnodemap(void) +{ + unsigned long pad, pad_addr; + + memnodemap = memnode.embedded_map; + if (memnodemapsize <= 48) { + printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", + nodemap_addr, nodemap_addr + nodemap_size); + return 0; + } + + pad = L1_CACHE_BYTES - 1; + pad_addr = 0x8000; + nodemap_size = pad + memnodemapsize; + nodemap_addr = find_e820_area(pad_addr, end_pfn<= 0) - shift++; + for (i = 0; i < numnodes; i++) { + start = nodes[i].start; + end = nodes[i].end; + if (start >= end) + continue; + bitfield |= start | end; + if (end > memtop) + memtop = end; + } + i = find_first_bit(&bitfield, sizeof(unsigned long)*8); + memnodemapsize = (memtop >> i)+1; + return i; +} + +int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +{ + int shift; + shift = extract_lsb_from_nodes(nodes, numnodes); + if (allocate_cachealigned_memnodemap()) + return -1; printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); @@ -290,6 +345,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) end_pfn << PAGE_SHIFT); /* setup dummy node covering all memory */ memnode_shift = 63; + memnodemap = memnode.embedded_map; memnodemap[0] = 0; nodes_clear(node_online_map); node_set_online(0); -- cgit v1.2.3 From 54413927f022292aeccadd268fbf1c0b42129945 Mon Sep 17 00:00:00 2001 From: Amul Shah Date: Tue, 13 Feb 2007 13:26:20 +0100 Subject: [PATCH] x86-64: x86_64-make-the-numa-hash-function-nodemap-allocation fix fix - Removed an extraneous debug message from allocate_cachealigned_map - Changed extract_lsb_from_nodes to return 63 for the case where there was only one memory node. The prevents the creation of the dynamic hashmap. - Changed extract_lsb_from_nodes to use only the starting memory address of a node. On an ES7000, our nodes overlap the starting and ending address, meaning, that we see nodes like 00000 - 10000 10000 - 20000 But other systems have nodes whose start and end addresses do not overlap. For example: 00000 - 0FFFF 10000 - 1FFFF In this case, using the ending address will result in an LSB much lower than what is possible. In this case an LSB of 1 when in reality it should be 16. Cc: Andi Kleen Cc: Rohit Seth Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/mm/numa.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 7d9c428f4094..1ec16ea97519 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -78,11 +78,8 @@ static int __init allocate_cachealigned_memnodemap(void) unsigned long pad, pad_addr; memnodemap = memnode.embedded_map; - if (memnodemapsize <= 48) { - printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", - nodemap_addr, nodemap_addr + nodemap_size); + if (memnodemapsize <= 48) return 0; - } pad = L1_CACHE_BYTES - 1; pad_addr = 0x8000; @@ -110,7 +107,7 @@ static int __init allocate_cachealigned_memnodemap(void) static int __init extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) { - int i; + int i, nodes_used = 0; unsigned long start, end; unsigned long bitfield = 0, memtop = 0; @@ -119,11 +116,15 @@ extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) end = nodes[i].end; if (start >= end) continue; - bitfield |= start | end; + bitfield |= start; + nodes_used++; if (end > memtop) memtop = end; } - i = find_first_bit(&bitfield, sizeof(unsigned long)*8); + if (nodes_used <= 1) + i = 63; + else + i = find_first_bit(&bitfield, sizeof(unsigned long)*8); memnodemapsize = (memtop >> i)+1; return i; } -- cgit v1.2.3 From 53fee04f318222a3179ca5933d8bda82c1eef17a Mon Sep 17 00:00:00 2001 From: Rohit Seth Date: Tue, 13 Feb 2007 13:26:22 +0100 Subject: [PATCH] x86-64: Fix fake numa for x86_64 machines with big IO hole This patch resolves the issue of running with numa=fake=X on kernel command line on x86_64 machines that have big IO hole. While calculating the size of each node now we look at the total hole size in that range. Previously there were nodes that only had IO holes in them causing kernel boot problems. We now use the NODE_MIN_SIZE (64MB) as the minimum size of memory that any node must have. We reduce the number of allocated nodes if the number of nodes specified on kernel command line results in any node getting memory smaller than NODE_MIN_SIZE. This change allows the extra memory to be incremented in NODE_MIN_SIZE granule and uniformly distribute among as many nodes (called big nodes) as possible. [akpm@osdl.org: build fix] Signed-off-by: David Rientjes Signed-off-by: Paul Menage Signed-off-by: Rohit Seth Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/mm/numa.c | 110 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 96 insertions(+), 14 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 1ec16ea97519..d3f747dd61d3 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -272,31 +272,113 @@ void __init numa_init_array(void) } #ifdef CONFIG_NUMA_EMU +/* Numa emulation */ int numa_fake __initdata = 0; -/* Numa emulation */ +/* + * This function is used to find out if the start and end correspond to + * different zones. + */ +int zone_cross_over(unsigned long start, unsigned long end) +{ + if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && + (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) + return 1; + return 0; +} + static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { - int i; + int i, big; struct bootnode nodes[MAX_NUMNODES]; - unsigned long sz = ((end_pfn - start_pfn)< 1) { - unsigned long x = 1; - while ((x << 1) < sz) - x <<= 1; - if (x < sz/2) - printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); - sz = x; - } + old_sz = sz; + /* + * Round down to the nearest FAKE_NODE_MIN_SIZE. + */ + sz &= FAKE_NODE_MIN_HASH_MASK; + + /* + * We ensure that each node is at least 64MB big. Smaller than this + * size can cause VM hiccups. + */ + if (sz == 0) { + printk(KERN_INFO "Not enough memory for %d nodes. Reducing " + "the number of nodes\n", numa_fake); + numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; + printk(KERN_INFO "Number of fake nodes will be = %d\n", + numa_fake); + sz = FAKE_NODE_MIN_SIZE; + } + /* + * Find out how many nodes can get an extra NODE_MIN_SIZE granule. + * This logic ensures the extra memory gets distributed among as many + * nodes as possible (as compared to one single node getting all that + * extra memory. + */ + big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; + printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " + "%d\n", + (sz >> 20), (hole_size >> 20), big); memset(&nodes,0,sizeof(nodes)); + end = start; for (i = 0; i < numa_fake; i++) { - nodes[i].start = (start_pfn<= max_addr) { + numa_fake = i - 1; + break; + } + start = nodes[i].start = end; + /* + * Final node can have all the remaining memory. + */ if (i == numa_fake-1) - sz = (end_pfn<= max_addr) + break; + } + /* + * Look at the next node to make sure there is some real memory + * to map. Bad things happen when the only memory present + * in a zone on a fake node is IO hole. + */ + while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { + if (zone_cross_over(start, end + sz)) { + end = (MAX_DMA32_PFN << PAGE_SHIFT); + break; + } + if (end >= max_addr) + break; + end += FAKE_NODE_MIN_SIZE; + } + if (end > max_addr) + end = max_addr; + nodes[i].end = end; printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", i, nodes[i].start, nodes[i].end, -- cgit v1.2.3 From 9b355897562fe2291248a7aec8e479c2c98cf117 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 13 Feb 2007 13:26:23 +0100 Subject: [PATCH] x86: simplify notify_page_fault() Remove all parameters from this function that aren't really variable. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/mm/fault.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 49e8cf2e06f8..6ada7231f3ab 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -56,17 +56,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) +static inline int notify_page_fault(struct pt_regs *regs, long err) { struct die_args args = { .regs = regs, - .str = str, + .str = "page fault", .err = err, - .trapnr = trap, - .signr = sig + .trapnr = 14, + .signr = SIGSEGV }; - return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); + return atomic_notifier_call_chain(¬ify_page_fault_chain, + DIE_PAGE_FAULT, &args); } /* Sometimes the CPU reports invalid exceptions on prefetch. @@ -355,8 +355,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (vmalloc_fault(address) >= 0) return; } - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) return; /* * Don't take the mm semaphore here. If we fixup a prefetch @@ -365,8 +364,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) return; if (likely(regs->eflags & X86_EFLAGS_IF)) -- cgit v1.2.3 From f0a5a58aa812b31fd9f197c4ba48245942364eae Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Tue, 13 Feb 2007 13:26:25 +0100 Subject: [PATCH] x86-64: clean up sparsemem memory_present call Eliminate arch specific memory_present call x86_64 NUMA by utilizing sparse_memory_present_with_active_regions. Acked-by: Mel Gorman Signed-off-by: Bob Picco Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/mm/numa.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index d3f747dd61d3..41b8fb069924 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -460,20 +460,6 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -#ifdef CONFIG_SPARSEMEM -static void __init arch_sparse_init(void) -{ - int i; - - for_each_online_node(i) - memory_present(i, node_start_pfn(i), node_end_pfn(i)); - - sparse_init(); -} -#else -#define arch_sparse_init() do {} while (0) -#endif - void __init paging_init(void) { int i; @@ -483,7 +469,8 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = end_pfn; - arch_sparse_init(); + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); for_each_online_node(i) { setup_node_zones(i); -- cgit v1.2.3 From 126b1922367fbe5513daa675a2abd13ed3917f4e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 13 Feb 2007 13:26:26 +0100 Subject: [PATCH] x86-64: Remove mk_pte_phys() - Convert last user to pfn_pte - Remove mk_pte_phys Suggested by Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/mm/pageattr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index ccb91dd996a9..65c5eaa59905 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -107,6 +107,7 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) pud_t *pud; pmd_t *pmd; pte_t large_pte; + unsigned long pfn; pgd = pgd_offset_k(address); BUG_ON(pgd_none(*pgd)); @@ -114,7 +115,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); + pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; + large_pte = pfn_pte(pfn, ref_prot); large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); } -- cgit v1.2.3 From f3854517f3b4cf372d3d5a2a71969c9b26090b7a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 14 Feb 2007 00:33:49 -0800 Subject: [PATCH] sysctl: x86_64: remove unnecessary use of insert_at_head The only sysctl x86_64 provides are not provided elsewhere, so insert_at_head is unnecessary. Signed-off-by: Eric W. Biederman Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 2968b90ef8ad..65aa66c35295 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -724,7 +724,7 @@ static ctl_table debug_root_table2[] = { static __init int x8664_sysctl_init(void) { - register_sysctl_table(debug_root_table2, 1); + register_sysctl_table(debug_root_table2, 0); return 0; } __initcall(x8664_sysctl_init); -- cgit v1.2.3 From c37ce0324962010e768f2570e2603553263ff219 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 14 Feb 2007 00:33:51 -0800 Subject: [PATCH] sysctl: C99 convert ctl_tables in arch/x86_64/mm/init.c Signed-off-by: Eric W. Biederman Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/mm/init.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 65aa66c35295..a5c12ea57683 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -711,15 +711,25 @@ int kern_addr_valid(unsigned long addr) extern int exception_trace, page_fault_trace; static ctl_table debug_table2[] = { - { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL, - proc_dointvec }, - { 0, } + { + .ctl_name = 99, + .procname = "exception-trace", + .data = &exception_trace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + {} }; static ctl_table debug_root_table2[] = { - { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, - .child = debug_table2 }, - { 0 }, + { + .ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 0555, + .child = debug_table2 + }, + {} }; static __init int x8664_sysctl_init(void) -- cgit v1.2.3 From 0b4d414714f0d2f922d39424b0c5c82ad900a381 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 14 Feb 2007 00:34:09 -0800 Subject: [PATCH] sysctl: remove insert_at_head from register_sysctl The semantic effect of insert_at_head is that it would allow new registered sysctl entries to override existing sysctl entries of the same name. Which is pain for caching and the proc interface never implemented. I have done an audit and discovered that none of the current users of register_sysctl care as (excpet for directories) they do not register duplicate sysctl entries. So this patch simply removes the support for overriding existing entries in the sys_sysctl interface since no one uses it or cares and it makes future enhancments harder. Signed-off-by: Eric W. Biederman Acked-by: Ralf Baechle Acked-by: Martin Schwidefsky Cc: Russell King Cc: David Howells Cc: "Luck, Tony" Cc: Ralf Baechle Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Andi Kleen Cc: Jens Axboe Cc: Corey Minyard Cc: Neil Brown Cc: "John W. Linville" Cc: James Bottomley Cc: Jan Kara Cc: Trond Myklebust Cc: Mark Fasheh Cc: David Chinner Cc: "David S. Miller" Cc: Patrick McHardy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/mm') diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index a5c12ea57683..ec31534eb104 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -734,7 +734,7 @@ static ctl_table debug_root_table2[] = { static __init int x8664_sysctl_init(void) { - register_sysctl_table(debug_root_table2, 0); + register_sysctl_table(debug_root_table2); return 0; } __initcall(x8664_sysctl_init); -- cgit v1.2.3