From 068790334cececc3d2d945617ccc585477da2e38 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 10 Jan 2009 12:17:37 +0530 Subject: x86: smp.h move cpu_callin_mask and cpu_callin_map declartion to cpumask.h Impact: cleanup Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 55c46074eba0..bf63de72b643 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_X86_LOCAL_APIC unsigned int num_processors; -- cgit v1.2.3 From c90aa894f0240084f2c6e42e2333b211d6cfe2b2 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 13 Jan 2009 20:41:34 +0900 Subject: x86: cleanup early setup_percpu references [ Based on original patch from Christoph Lameter and Mike Travis. ] * Ruggedize some calls in setup_percpu.c to prevent mishaps in early calls, particularly for non-critical functions. * Cleanup DEBUG_PER_CPU_MAPS usages and some comments. Signed-off-by: Mike Travis Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 56 ++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index bf63de72b643..56c63ac62b10 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -15,6 +15,12 @@ #include #include +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +# define DBG(x...) printk(KERN_DEBUG x) +#else +# define DBG(x...) +#endif + #ifdef CONFIG_X86_LOCAL_APIC unsigned int num_processors; unsigned disabled_cpus __cpuinitdata; @@ -27,31 +33,39 @@ unsigned int max_physical_apicid; physid_mask_t phys_cpu_present_map; #endif -/* map cpu index to physical APIC ID */ +/* + * Map cpu index to physical APIC ID + */ DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) -#define X86_64_NUMA 1 +#define X86_64_NUMA 1 /* (used later) */ -/* map cpu index to node index */ +/* + * Map cpu index to node index + */ DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); -/* which logical CPUs are on which nodes */ +/* + * Which logical CPUs are on which nodes + */ cpumask_t *node_to_cpumask_map; EXPORT_SYMBOL(node_to_cpumask_map); -/* setup node_to_cpumask_map */ +/* + * Setup node_to_cpumask_map + */ static void __init setup_node_to_cpumask_map(void); #else static inline void setup_node_to_cpumask_map(void) { } #endif -#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) +#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA /* * Copy data used in early init routines from the initial arrays to the * per cpu data areas. These arrays then become expendable and the @@ -200,6 +214,8 @@ void __init setup_per_cpu_areas(void) #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + + DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } /* Setup percpu data maps */ @@ -221,6 +237,7 @@ void __init setup_per_cpu_areas(void) * Requires node_possible_map to be valid. * * Note: node_to_cpumask() is not valid until after this is done. + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) */ static void __init setup_node_to_cpumask_map(void) { @@ -236,6 +253,7 @@ static void __init setup_node_to_cpumask_map(void) /* allocate the map */ map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); + DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids); pr_debug("Node to cpumask map at %p for %d nodes\n", map, nr_node_ids); @@ -248,17 +266,23 @@ void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - if (cpu_pda(cpu) && node != NUMA_NO_NODE) - cpu_pda(cpu)->nodenumber = node; - - if (cpu_to_node_map) + /* early setting, no percpu area yet */ + if (cpu_to_node_map) { cpu_to_node_map[cpu] = node; + return; + } - else if (per_cpu_offset(cpu)) - per_cpu(x86_cpu_to_node_map, cpu) = node; +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) { + printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); + dump_stack(); + return; + } +#endif + per_cpu(x86_cpu_to_node_map, cpu) = node; - else - pr_debug("Setting node for non-present cpu %d\n", cpu); + if (node != NUMA_NO_NODE) + cpu_pda(cpu)->nodenumber = node; } void __cpuinit numa_clear_node(int cpu) @@ -275,7 +299,7 @@ void __cpuinit numa_add_cpu(int cpu) void __cpuinit numa_remove_cpu(int cpu) { - cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); + cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } #else /* CONFIG_DEBUG_PER_CPU_MAPS */ @@ -285,7 +309,7 @@ void __cpuinit numa_remove_cpu(int cpu) */ static void __cpuinit numa_set_cpumask(int cpu, int enable) { - int node = cpu_to_node(cpu); + int node = early_cpu_to_node(cpu); cpumask_t *mask; char buf[64]; -- cgit v1.2.3 From 3e5d8f978435bb9ba4dfe3f4514e65e7885db1a9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Jan 2009 20:41:35 +0900 Subject: x86: make percpu symbols zerobased on SMP [ Based on original patch from Christoph Lameter and Mike Travis. ] This patch makes percpu symbols zerobased on x86_64 SMP by adding PERCPU_VADDR() to vmlinux.lds.h which helps setting explicit vaddr on the percpu output section and using it in vmlinux_64.lds.S. A new PHDR is added as existing ones cannot contain sections near address zero. PERCPU_VADDR() also adds a new symbol __per_cpu_load which always points to the vaddr of the loaded percpu data.init region. The following adjustments have been made to accomodate the address change. * code to locate percpu gdt_page in head_64.S is updated to add the load address to the gdt_page offset. * __per_cpu_load is used in places where access to the init data area is necessary. * pda->data_offset is initialized soon after C code is entered as zero value doesn't work anymore. This patch is mostly taken from Mike Travis' "x86_64: Base percpu variables at zero" patch. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 56c63ac62b10..44845842e722 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -213,7 +213,7 @@ void __init setup_per_cpu_areas(void) } #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } -- cgit v1.2.3 From c8f3329a0ddd751241e96b4100df7eda14b2cbc6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Jan 2009 20:41:35 +0900 Subject: x86: use static _cpu_pda array _cpu_pda array first uses statically allocated storage in data.init and then switches to allocated bootmem to conserve space. However, after folding pda area into percpu area, _cpu_pda array will be removed completely. Drop the reallocation part to simplify the code for soon-to-follow changes. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 44845842e722..73ab01b297c5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -114,7 +114,6 @@ static inline void setup_cpu_pda_map(void) { } static void __init setup_cpu_pda_map(void) { char *pda; - struct x8664_pda **new_cpu_pda; unsigned long size; int cpu; @@ -122,28 +121,21 @@ static void __init setup_cpu_pda_map(void) /* allocate cpu_pda array and pointer table */ { - unsigned long tsize = nr_cpu_ids * sizeof(void *); unsigned long asize = size * (nr_cpu_ids - 1); - tsize = roundup(tsize, cache_line_size()); - new_cpu_pda = alloc_bootmem(tsize + asize); - pda = (char *)new_cpu_pda + tsize; + pda = alloc_bootmem(asize); } /* initialize pointer table to static pda's */ for_each_possible_cpu(cpu) { if (cpu == 0) { /* leave boot cpu pda in place */ - new_cpu_pda[0] = cpu_pda(0); continue; } - new_cpu_pda[cpu] = (struct x8664_pda *)pda; - new_cpu_pda[cpu]->in_bootmem = 1; + cpu_pda(cpu) = (struct x8664_pda *)pda; + cpu_pda(cpu)->in_bootmem = 1; pda += size; } - - /* point to new pointer table */ - _cpu_pda = new_cpu_pda; } #endif /* CONFIG_SMP && CONFIG_X86_64 */ -- cgit v1.2.3 From 1a51e3a0aed18767cf2762e95456ecfeb0bca5e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Jan 2009 20:41:35 +0900 Subject: x86: fold pda into percpu area on SMP [ Based on original patch from Christoph Lameter and Mike Travis. ] Currently pdas and percpu areas are allocated separately. %gs points to local pda and percpu area can be reached using pda->data_offset. This patch folds pda into percpu area. Due to strange gcc requirement, pda needs to be at the beginning of the percpu area so that pda->stack_canary is at %gs:40. To achieve this, a new percpu output section macro - PERCPU_VADDR_PREALLOC() - is added and used to reserve pda sized chunk at the start of the percpu area. After this change, for boot cpu, %gs first points to pda in the data.init area and later during setup_per_cpu_areas() gets updated to point to the actual pda. This means that setup_per_cpu_areas() need to reload %gs for CPU0 while clearing pda area for other cpus as cpu0 already has modified it when control reaches setup_per_cpu_areas(). This patch also removes now unnecessary get_local_pda() and its call sites. A lot of this patch is taken from Mike Travis' "x86_64: Fold pda into per cpu area" patch. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 107 ++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 61 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 73ab01b297c5..63d462802272 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #ifdef CONFIG_DEBUG_PER_CPU_MAPS @@ -65,6 +66,36 @@ static void __init setup_node_to_cpumask_map(void); static inline void setup_node_to_cpumask_map(void) { } #endif +#ifdef CONFIG_X86_64 +void __cpuinit load_pda_offset(int cpu) +{ + /* Memory clobbers used to order pda/percpu accesses */ + mb(); + wrmsrl(MSR_GS_BASE, cpu_pda(cpu)); + mb(); +} + +#endif /* CONFIG_SMP && CONFIG_X86_64 */ + +#ifdef CONFIG_X86_64 + +/* correctly size the local cpu masks */ +static void setup_cpu_local_masks(void) +{ + alloc_bootmem_cpumask_var(&cpu_initialized_mask); + alloc_bootmem_cpumask_var(&cpu_callin_mask); + alloc_bootmem_cpumask_var(&cpu_callout_mask); + alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); +} + +#else /* CONFIG_X86_32 */ + +static inline void setup_cpu_local_masks(void) +{ +} + +#endif /* CONFIG_X86_32 */ + #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA /* * Copy data used in early init routines from the initial arrays to the @@ -101,63 +132,7 @@ static void __init setup_per_cpu_maps(void) */ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); -static inline void setup_cpu_pda_map(void) { } - -#elif !defined(CONFIG_SMP) -static inline void setup_cpu_pda_map(void) { } - -#else /* CONFIG_SMP && CONFIG_X86_64 */ - -/* - * Allocate cpu_pda pointer table and array via alloc_bootmem. - */ -static void __init setup_cpu_pda_map(void) -{ - char *pda; - unsigned long size; - int cpu; - - size = roundup(sizeof(struct x8664_pda), cache_line_size()); - - /* allocate cpu_pda array and pointer table */ - { - unsigned long asize = size * (nr_cpu_ids - 1); - - pda = alloc_bootmem(asize); - } - - /* initialize pointer table to static pda's */ - for_each_possible_cpu(cpu) { - if (cpu == 0) { - /* leave boot cpu pda in place */ - continue; - } - cpu_pda(cpu) = (struct x8664_pda *)pda; - cpu_pda(cpu)->in_bootmem = 1; - pda += size; - } -} - -#endif /* CONFIG_SMP && CONFIG_X86_64 */ - -#ifdef CONFIG_X86_64 - -/* correctly size the local cpu masks */ -static void setup_cpu_local_masks(void) -{ - alloc_bootmem_cpumask_var(&cpu_initialized_mask); - alloc_bootmem_cpumask_var(&cpu_callin_mask); - alloc_bootmem_cpumask_var(&cpu_callout_mask); - alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); -} - -#else /* CONFIG_X86_32 */ - -static inline void setup_cpu_local_masks(void) -{ -} - -#endif /* CONFIG_X86_32 */ +#endif /* * Great future plan: @@ -171,9 +146,6 @@ void __init setup_per_cpu_areas(void) int cpu; unsigned long align = 1; - /* Setup cpu_pda map */ - setup_cpu_pda_map(); - /* Copy section for each CPU (we discard the original) */ old_size = PERCPU_ENOUGH_ROOM; align = max_t(unsigned long, PAGE_SIZE, align); @@ -204,8 +176,21 @@ void __init setup_per_cpu_areas(void) cpu, node, __pa(ptr)); } #endif - per_cpu_offset(cpu) = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); +#ifdef CONFIG_X86_64 + cpu_pda(cpu) = (void *)ptr; + + /* + * CPU0 modified pda in the init data area, reload pda + * offset for CPU0 and clear the area for others. + */ + if (cpu == 0) + load_pda_offset(0); + else + memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu))); +#endif + per_cpu_offset(cpu) = ptr - __per_cpu_start; DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } -- cgit v1.2.3 From 9939ddaff52787b2a7c1adf1b2afc95421aa0884 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Jan 2009 20:41:35 +0900 Subject: x86: merge 64 and 32 SMP percpu handling Now that pda is allocated as part of percpu, percpu doesn't need to be accessed through pda. Unify x86_64 SMP percpu access with x86_32 SMP one. Other than the segment register, operand size and the base of percpu symbols, they behave identical now. This patch replaces now unnecessary pda->data_offset with a dummy field which is necessary to keep stack_canary at its place. This patch also moves per_cpu_offset initialization out of init_gdt() into setup_per_cpu_areas(). Note that this change also necessitates explicit per_cpu_offset initializations in voyager_smp.c. With this change, x86_OP_percpu()'s are as efficient on x86_64 as on x86_32 and also x86_64 can use assembly PER_CPU macros. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 63d462802272..be1ff34db112 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -125,14 +125,14 @@ static void __init setup_per_cpu_maps(void) #endif } -#ifdef CONFIG_X86_32 -/* - * Great future not-so-futuristic plan: make i386 and x86_64 do it - * the same way - */ +#ifdef CONFIG_X86_64 +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { + [0] = (unsigned long)__per_cpu_load, +}; +#else unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(__per_cpu_offset); #endif +EXPORT_SYMBOL(__per_cpu_offset); /* * Great future plan: @@ -178,6 +178,7 @@ void __init setup_per_cpu_areas(void) #endif memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); + per_cpu_offset(cpu) = ptr - __per_cpu_start; #ifdef CONFIG_X86_64 cpu_pda(cpu) = (void *)ptr; @@ -190,7 +191,7 @@ void __init setup_per_cpu_areas(void) else memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu))); #endif - per_cpu_offset(cpu) = ptr - __per_cpu_start; + per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } -- cgit v1.2.3 From b12d8db8fbfaed1e8222a15333a3645599636854 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Jan 2009 20:41:35 +0900 Subject: x86: make pda a percpu variable [ Based on original patch from Christoph Lameter and Mike Travis. ] As pda is now allocated in percpu area, it can easily be made a proper percpu variable. Make it so by defining per cpu symbol from linker script and declaring it in C code for SMP and simply defining it for UP. This change cleans up code and brings SMP and UP closer a bit. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index be1ff34db112..daeedf82c15f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -66,6 +66,16 @@ static void __init setup_node_to_cpumask_map(void); static inline void setup_node_to_cpumask_map(void) { } #endif +/* + * Define load_pda_offset() and per-cpu __pda for x86_64. + * load_pda_offset() is responsible for loading the offset of pda into + * %gs. + * + * On SMP, pda offset also duals as percpu base address and thus it + * should be at the start of per-cpu area. To achieve this, it's + * preallocated in vmlinux_64.lds.S directly instead of using + * DEFINE_PER_CPU(). + */ #ifdef CONFIG_X86_64 void __cpuinit load_pda_offset(int cpu) { @@ -74,6 +84,10 @@ void __cpuinit load_pda_offset(int cpu) wrmsrl(MSR_GS_BASE, cpu_pda(cpu)); mb(); } +#ifndef CONFIG_SMP +DEFINE_PER_CPU(struct x8664_pda, __pda); +EXPORT_PER_CPU_SYMBOL(__pda); +#endif #endif /* CONFIG_SMP && CONFIG_X86_64 */ @@ -180,8 +194,6 @@ void __init setup_per_cpu_areas(void) memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); per_cpu_offset(cpu) = ptr - __per_cpu_start; #ifdef CONFIG_X86_64 - cpu_pda(cpu) = (void *)ptr; - /* * CPU0 modified pda in the init data area, reload pda * offset for CPU0 and clear the area for others. -- cgit v1.2.3 From a338af2c648f5e07c582154745a6c60cd2d8bf12 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 Jan 2009 11:19:03 +0900 Subject: x86: fix build bug introduced during merge EXPORT_PER_CPU_SYMBOL() got misplaced during merge leading to build failure. Fix it. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index daeedf82c15f..b5c35af2011d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -86,9 +86,8 @@ void __cpuinit load_pda_offset(int cpu) } #ifndef CONFIG_SMP DEFINE_PER_CPU(struct x8664_pda, __pda); -EXPORT_PER_CPU_SYMBOL(__pda); #endif - +EXPORT_PER_CPU_SYMBOL(__pda); #endif /* CONFIG_SMP && CONFIG_X86_64 */ #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 26f80bd6a9ab17bc8a60b6092e7c0d05c5927ce5 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 19 Jan 2009 00:38:58 +0900 Subject: x86-64: Convert irqstacks to per-cpu Move the irqstackptr variable from the PDA to per-cpu. Make the stacks themselves per-cpu, removing some specific allocation code. Add a seperate flag (is_boot_cpu) to simplify the per-cpu boot adjustments. tj: * sprinkle some underbars around. * irq_stack_ptr is not used till traps_init(), no reason to initialize it early. On SMP, just leaving it NULL till proper initialization in setup_per_cpu_areas() works. Dropped is_boot_cpu and early irq_stack_ptr initialization. * do DECLARE/DEFINE_PER_CPU(char[IRQ_STACK_SIZE], irq_stack) instead of (char, irq_stack[IRQ_STACK_SIZE]). Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index b5c35af2011d..8b53ef83c611 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -192,7 +192,10 @@ void __init setup_per_cpu_areas(void) memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); per_cpu_offset(cpu) = ptr - __per_cpu_start; + per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); #ifdef CONFIG_X86_64 + per_cpu(irq_stack_ptr, cpu) = + (char *)per_cpu(irq_stack, cpu) + IRQ_STACK_SIZE - 64; /* * CPU0 modified pda in the init data area, reload pda * offset for CPU0 and clear the area for others. @@ -202,7 +205,6 @@ void __init setup_per_cpu_areas(void) else memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu))); #endif - per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } -- cgit v1.2.3 From ea9279066de44053d0c20ea855bc9f4706652d84 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 19 Jan 2009 00:38:58 +0900 Subject: x86-64: Move cpu number from PDA to per-cpu and consolidate with 32-bit. tj: moved cpu_number definition out of CONFIG_HAVE_SETUP_PER_CPU_AREA for voyager. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8b53ef83c611..258497f93f4d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -22,6 +22,15 @@ # define DBG(x...) #endif +/* + * Could be inside CONFIG_HAVE_SETUP_PER_CPU_AREA with other stuff but + * voyager wants cpu_number too. + */ +#ifdef CONFIG_SMP +DEFINE_PER_CPU(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); +#endif + #ifdef CONFIG_X86_LOCAL_APIC unsigned int num_processors; unsigned disabled_cpus __cpuinitdata; @@ -193,6 +202,7 @@ void __init setup_per_cpu_areas(void) memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); per_cpu_offset(cpu) = ptr - __per_cpu_start; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); + per_cpu(cpu_number, cpu) = cpu; #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = (char *)per_cpu(irq_stack, cpu) + IRQ_STACK_SIZE - 64; -- cgit v1.2.3 From e7a22c1ebcc1caa8178df1819d05128bb5b45ab9 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 19 Jan 2009 00:38:59 +0900 Subject: x86-64: Move nodenumber from PDA to per-cpu. tj: * s/nodenumber/node_number/ * removed now unused pda variable from pda_init() Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 258497f93f4d..efbafbbff584 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -53,6 +53,8 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) #define X86_64_NUMA 1 /* (used later) */ +DEFINE_PER_CPU(int, node_number) = 0; +EXPORT_PER_CPU_SYMBOL(node_number); /* * Map cpu index to node index @@ -283,7 +285,7 @@ void __cpuinit numa_set_node(int cpu, int node) per_cpu(x86_cpu_to_node_map, cpu) = node; if (node != NUMA_NO_NODE) - cpu_pda(cpu)->nodenumber = node; + per_cpu(node_number, cpu) = node; } void __cpuinit numa_clear_node(int cpu) -- cgit v1.2.3 From 947e76cdc34c782fc947313d4331380686eebbad Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 19 Jan 2009 12:21:28 +0900 Subject: x86: move stack_canary into irq_stack Impact: x86_64 percpu area layout change, irq_stack now at the beginning Now that the PDA is empty except for the stack canary, it can be removed. The irqstack is moved to the start of the per-cpu section. If the stack protector is enabled, the canary overlaps the bottom 48 bytes of the irqstack. tj: * updated subject * dropped asm relocation of irq_stack_ptr * updated comments a bit * rebased on top of stack canary changes Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index efbafbbff584..90b8e154bb53 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -77,30 +77,6 @@ static void __init setup_node_to_cpumask_map(void); static inline void setup_node_to_cpumask_map(void) { } #endif -/* - * Define load_pda_offset() and per-cpu __pda for x86_64. - * load_pda_offset() is responsible for loading the offset of pda into - * %gs. - * - * On SMP, pda offset also duals as percpu base address and thus it - * should be at the start of per-cpu area. To achieve this, it's - * preallocated in vmlinux_64.lds.S directly instead of using - * DEFINE_PER_CPU(). - */ -#ifdef CONFIG_X86_64 -void __cpuinit load_pda_offset(int cpu) -{ - /* Memory clobbers used to order pda/percpu accesses */ - mb(); - wrmsrl(MSR_GS_BASE, cpu_pda(cpu)); - mb(); -} -#ifndef CONFIG_SMP -DEFINE_PER_CPU(struct x8664_pda, __pda); -#endif -EXPORT_PER_CPU_SYMBOL(__pda); -#endif /* CONFIG_SMP && CONFIG_X86_64 */ - #ifdef CONFIG_X86_64 /* correctly size the local cpu masks */ @@ -207,15 +183,13 @@ void __init setup_per_cpu_areas(void) per_cpu(cpu_number, cpu) = cpu; #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = - (char *)per_cpu(irq_stack, cpu) + IRQ_STACK_SIZE - 64; + per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64; /* - * CPU0 modified pda in the init data area, reload pda - * offset for CPU0 and clear the area for others. + * Up to this point, CPU0 has been using .data.init + * area. Reload %gs offset for CPU0. */ if (cpu == 0) - load_pda_offset(0); - else - memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu))); + load_gs_base(cpu); #endif DBG("PERCPU: cpu %4d %p\n", cpu, ptr); -- cgit v1.2.3 From 0d77e7f04d5da160307f4f5c030a171e004f602b Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:47 +0900 Subject: x86: merge setup_per_cpu_maps() into setup_per_cpu_areas() Impact: minor optimization Eliminates the need for two loops over possible cpus. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 48 +++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 90b8e154bb53..d0b1476490a7 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -97,33 +97,6 @@ static inline void setup_cpu_local_masks(void) #endif /* CONFIG_X86_32 */ #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA -/* - * Copy data used in early init routines from the initial arrays to the - * per cpu data areas. These arrays then become expendable and the - * *_early_ptr's are zeroed indicating that the static arrays are gone. - */ -static void __init setup_per_cpu_maps(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - per_cpu(x86_cpu_to_apicid, cpu) = - early_per_cpu_map(x86_cpu_to_apicid, cpu); - per_cpu(x86_bios_cpu_apicid, cpu) = - early_per_cpu_map(x86_bios_cpu_apicid, cpu); -#ifdef X86_64_NUMA - per_cpu(x86_cpu_to_node_map, cpu) = - early_per_cpu_map(x86_cpu_to_node_map, cpu); -#endif - } - - /* indicate the early static arrays will soon be gone */ - early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; - early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; -#ifdef X86_64_NUMA - early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; -#endif -} #ifdef CONFIG_X86_64 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { @@ -181,6 +154,19 @@ void __init setup_per_cpu_areas(void) per_cpu_offset(cpu) = ptr - __per_cpu_start; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; + /* + * Copy data used in early init routines from the initial arrays to the + * per cpu data areas. These arrays then become expendable and the + * *_early_ptr's are zeroed indicating that the static arrays are gone. + */ + per_cpu(x86_cpu_to_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_apicid, cpu); + per_cpu(x86_bios_cpu_apicid, cpu) = + early_per_cpu_map(x86_bios_cpu_apicid, cpu); +#ifdef X86_64_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + early_per_cpu_map(x86_cpu_to_node_map, cpu); +#endif #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64; @@ -195,8 +181,12 @@ void __init setup_per_cpu_areas(void) DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } - /* Setup percpu data maps */ - setup_per_cpu_maps(); + /* indicate the early static arrays will soon be gone */ + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; +#ifdef X86_64_NUMA + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; +#endif /* Setup node to cpumask map */ setup_node_to_cpumask_map(); -- cgit v1.2.3 From 6470aff619fbb9dff8dfe8afa5033084cd55ca20 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:47 +0900 Subject: x86: move 64-bit NUMA code Impact: Code movement, no functional change. Move the 64-bit NUMA code from setup_percpu.c to numa_64.c Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 237 +---------------------------------------- 1 file changed, 5 insertions(+), 232 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d0b1476490a7..cb6d622520be 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -51,32 +51,6 @@ DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) -#define X86_64_NUMA 1 /* (used later) */ -DEFINE_PER_CPU(int, node_number) = 0; -EXPORT_PER_CPU_SYMBOL(node_number); - -/* - * Map cpu index to node index - */ -DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); - -/* - * Which logical CPUs are on which nodes - */ -cpumask_t *node_to_cpumask_map; -EXPORT_SYMBOL(node_to_cpumask_map); - -/* - * Setup node_to_cpumask_map - */ -static void __init setup_node_to_cpumask_map(void); - -#else -static inline void setup_node_to_cpumask_map(void) { } -#endif - #ifdef CONFIG_X86_64 /* correctly size the local cpu masks */ @@ -163,13 +137,13 @@ void __init setup_per_cpu_areas(void) early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = early_per_cpu_map(x86_bios_cpu_apicid, cpu); -#ifdef X86_64_NUMA - per_cpu(x86_cpu_to_node_map, cpu) = - early_per_cpu_map(x86_cpu_to_node_map, cpu); -#endif #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64; +#ifdef CONFIG_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + early_per_cpu_map(x86_cpu_to_node_map, cpu); +#endif /* * Up to this point, CPU0 has been using .data.init * area. Reload %gs offset for CPU0. @@ -184,7 +158,7 @@ void __init setup_per_cpu_areas(void) /* indicate the early static arrays will soon be gone */ early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; -#ifdef X86_64_NUMA +#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif @@ -197,204 +171,3 @@ void __init setup_per_cpu_areas(void) #endif -#ifdef X86_64_NUMA - -/* - * Allocate node_to_cpumask_map based on number of available nodes - * Requires node_possible_map to be valid. - * - * Note: node_to_cpumask() is not valid until after this is done. - * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) - */ -static void __init setup_node_to_cpumask_map(void) -{ - unsigned int node, num = 0; - cpumask_t *map; - - /* setup nr_node_ids if not done yet */ - if (nr_node_ids == MAX_NUMNODES) { - for_each_node_mask(node, node_possible_map) - num = node; - nr_node_ids = num + 1; - } - - /* allocate the map */ - map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); - DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids); - - pr_debug("Node to cpumask map at %p for %d nodes\n", - map, nr_node_ids); - - /* node_to_cpumask() will now work */ - node_to_cpumask_map = map; -} - -void __cpuinit numa_set_node(int cpu, int node) -{ - int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - - /* early setting, no percpu area yet */ - if (cpu_to_node_map) { - cpu_to_node_map[cpu] = node; - return; - } - -#ifdef CONFIG_DEBUG_PER_CPU_MAPS - if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) { - printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); - dump_stack(); - return; - } -#endif - per_cpu(x86_cpu_to_node_map, cpu) = node; - - if (node != NUMA_NO_NODE) - per_cpu(node_number, cpu) = node; -} - -void __cpuinit numa_clear_node(int cpu) -{ - numa_set_node(cpu, NUMA_NO_NODE); -} - -#ifndef CONFIG_DEBUG_PER_CPU_MAPS - -void __cpuinit numa_add_cpu(int cpu) -{ - cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); -} - -#else /* CONFIG_DEBUG_PER_CPU_MAPS */ - -/* - * --------- debug versions of the numa functions --------- - */ -static void __cpuinit numa_set_cpumask(int cpu, int enable) -{ - int node = early_cpu_to_node(cpu); - cpumask_t *mask; - char buf[64]; - - if (node_to_cpumask_map == NULL) { - printk(KERN_ERR "node_to_cpumask_map NULL\n"); - dump_stack(); - return; - } - - mask = &node_to_cpumask_map[node]; - if (enable) - cpu_set(cpu, *mask); - else - cpu_clear(cpu, *mask); - - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", - enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); -} - -void __cpuinit numa_add_cpu(int cpu) -{ - numa_set_cpumask(cpu, 1); -} - -void __cpuinit numa_remove_cpu(int cpu) -{ - numa_set_cpumask(cpu, 0); -} - -int cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) { - printk(KERN_WARNING - "cpu_to_node(%d): usage too early!\n", cpu); - dump_stack(); - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} -EXPORT_SYMBOL(cpu_to_node); - -/* - * Same function as cpu_to_node() but used if called before the - * per_cpu areas are setup. - */ -int early_cpu_to_node(int cpu) -{ - if (early_per_cpu_ptr(x86_cpu_to_node_map)) - return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - - if (!per_cpu_offset(cpu)) { - printk(KERN_WARNING - "early_cpu_to_node(%d): no per_cpu area!\n", cpu); - dump_stack(); - return NUMA_NO_NODE; - } - return per_cpu(x86_cpu_to_node_map, cpu); -} - - -/* empty cpumask */ -static const cpumask_t cpu_mask_none; - -/* - * Returns a pointer to the bitmask of CPUs on Node 'node'. - */ -const cpumask_t *cpumask_of_node(int node) -{ - if (node_to_cpumask_map == NULL) { - printk(KERN_WARNING - "cpumask_of_node(%d): no node_to_cpumask_map!\n", - node); - dump_stack(); - return (const cpumask_t *)&cpu_online_map; - } - if (node >= nr_node_ids) { - printk(KERN_WARNING - "cpumask_of_node(%d): node > nr_node_ids(%d)\n", - node, nr_node_ids); - dump_stack(); - return &cpu_mask_none; - } - return &node_to_cpumask_map[node]; -} -EXPORT_SYMBOL(cpumask_of_node); - -/* - * Returns a bitmask of CPUs on Node 'node'. - * - * Side note: this function creates the returned cpumask on the stack - * so with a high NR_CPUS count, excessive stack space is used. The - * node_to_cpumask_ptr function should be used whenever possible. - */ -cpumask_t node_to_cpumask(int node) -{ - if (node_to_cpumask_map == NULL) { - printk(KERN_WARNING - "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); - dump_stack(); - return cpu_online_map; - } - if (node >= nr_node_ids) { - printk(KERN_WARNING - "node_to_cpumask(%d): node > nr_node_ids(%d)\n", - node, nr_node_ids); - dump_stack(); - return cpu_mask_none; - } - return node_to_cpumask_map[node]; -} -EXPORT_SYMBOL(node_to_cpumask); - -/* - * --------- end of debug versions of the numa functions --------- - */ - -#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ - -#endif /* X86_64_NUMA */ - -- cgit v1.2.3 From 2f2f52bad72f5e1ca5d1b9ad00a7b57a8cbd9159 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:47 +0900 Subject: x86: move setup_cpu_local_masks() Impact: Code movement, no functional change. Move setup_cpu_local_masks() to kernel/cpu/common.c, where the masks are defined. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index cb6d622520be..7bebdba8eb89 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -51,25 +51,6 @@ DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); -#ifdef CONFIG_X86_64 - -/* correctly size the local cpu masks */ -static void setup_cpu_local_masks(void) -{ - alloc_bootmem_cpumask_var(&cpu_initialized_mask); - alloc_bootmem_cpumask_var(&cpu_callin_mask); - alloc_bootmem_cpumask_var(&cpu_callout_mask); - alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); -} - -#else /* CONFIG_X86_32 */ - -static inline void setup_cpu_local_masks(void) -{ -} - -#endif /* CONFIG_X86_32 */ - #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 74631a248dc2c2129a96f6b8b706ed54bb5c3d3c Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:47 +0900 Subject: x86: always page-align per-cpu area start and size Impact: cleanup The way the code is written, align is always PAGE_SIZE. Simplify the code by removing the align variable. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7bebdba8eb89..5d4a4964a8b3 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -69,15 +69,12 @@ EXPORT_SYMBOL(__per_cpu_offset); */ void __init setup_per_cpu_areas(void) { - ssize_t size, old_size; + ssize_t size; char *ptr; int cpu; - unsigned long align = 1; /* Copy section for each CPU (we discard the original) */ - old_size = PERCPU_ENOUGH_ROOM; - align = max_t(unsigned long, PAGE_SIZE, align); - size = roundup(old_size, align); + size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE); pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); @@ -86,20 +83,17 @@ void __init setup_per_cpu_areas(void) for_each_possible_cpu(cpu) { #ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = __alloc_bootmem(size, align, - __pa(MAX_DMA_ADDRESS)); + ptr = alloc_bootmem_pages(size); #else int node = early_cpu_to_node(cpu); if (!node_online(node) || !NODE_DATA(node)) { - ptr = __alloc_bootmem(size, align, - __pa(MAX_DMA_ADDRESS)); + ptr = alloc_bootmem_pages(size); pr_info("cpu %d has no node %d or node-local memory\n", cpu, node); pr_debug("per cpu data for cpu%d at %016lx\n", cpu, __pa(ptr)); } else { - ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, - __pa(MAX_DMA_ADDRESS)); + ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); pr_debug("per cpu data for cpu%d on node%d at %016lx\n", cpu, node, __pa(ptr)); } -- cgit v1.2.3 From ec70de8b04bf37213982a5e8f303bc38679f3f8e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:47 +0900 Subject: x86: move apic variables to apic.c Impact: Code movement Move the variable definitions to apic.c. Ifdef the copying of the two early per-cpu variables, since Voyager doesn't use them. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5d4a4964a8b3..d367996693e6 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -31,26 +31,6 @@ DEFINE_PER_CPU(int, cpu_number); EXPORT_PER_CPU_SYMBOL(cpu_number); #endif -#ifdef CONFIG_X86_LOCAL_APIC -unsigned int num_processors; -unsigned disabled_cpus __cpuinitdata; -/* Processor that is doing the boot up */ -unsigned int boot_cpu_physical_apicid = -1U; -EXPORT_SYMBOL(boot_cpu_physical_apicid); -unsigned int max_physical_apicid; - -/* Bitmask of physically existing CPUs */ -physid_mask_t phys_cpu_present_map; -#endif - -/* - * Map cpu index to physical APIC ID - */ -DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); - #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA #ifdef CONFIG_X86_64 @@ -108,10 +88,12 @@ void __init setup_per_cpu_areas(void) * per cpu data areas. These arrays then become expendable and the * *_early_ptr's are zeroed indicating that the static arrays are gone. */ +#ifdef CONFIG_X86_LOCAL_APIC per_cpu(x86_cpu_to_apicid, cpu) = early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = early_per_cpu_map(x86_bios_cpu_apicid, cpu); +#endif #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64; -- cgit v1.2.3 From 996db817e3d1529d711e55b938d72ae4060b39fd Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:47 +0900 Subject: x86: only compile setup_percpu.o on SMP Impact: Minor build optimization Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d367996693e6..f30ff691c34d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -22,14 +22,8 @@ # define DBG(x...) #endif -/* - * Could be inside CONFIG_HAVE_SETUP_PER_CPU_AREA with other stuff but - * voyager wants cpu_number too. - */ -#ifdef CONFIG_SMP DEFINE_PER_CPU(int, cpu_number); EXPORT_PER_CPU_SYMBOL(cpu_number); -#endif #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA -- cgit v1.2.3 From 1688401a0fddba8991aa5c0943b8ae9583998d60 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:48 +0900 Subject: x86: move this_cpu_offset Impact: Small cleanup Define BOOT_PERCPU_OFFSET and use it for this_cpu_offset and __per_cpu_offset initializers. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index f30ff691c34d..36c2e81dfc3c 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -25,15 +25,20 @@ DEFINE_PER_CPU(int, cpu_number); EXPORT_PER_CPU_SYMBOL(cpu_number); +#ifdef CONFIG_X86_64 +#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) +#else +#define BOOT_PERCPU_OFFSET 0 +#endif + +DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; +EXPORT_PER_CPU_SYMBOL(this_cpu_off); + #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA -#ifdef CONFIG_X86_64 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { - [0] = (unsigned long)__per_cpu_load, + [0] = BOOT_PERCPU_OFFSET, }; -#else -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; -#endif EXPORT_SYMBOL(__per_cpu_offset); /* -- cgit v1.2.3 From 34019be1cd2941128b5de6d7c0fbdb51f967d268 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:48 +0900 Subject: x86: don't assume boot cpu is #0 Impact: minor cleanup Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 36c2e81dfc3c..be77f1a1231d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_DEBUG_PER_CPU_MAPS # define DBG(x...) printk(KERN_DEBUG x) @@ -37,7 +38,7 @@ EXPORT_PER_CPU_SYMBOL(this_cpu_off); #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { - [0] = BOOT_PERCPU_OFFSET, + [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, }; EXPORT_SYMBOL(__per_cpu_offset); @@ -101,10 +102,10 @@ void __init setup_per_cpu_areas(void) early_per_cpu_map(x86_cpu_to_node_map, cpu); #endif /* - * Up to this point, CPU0 has been using .data.init - * area. Reload %gs offset for CPU0. + * Up to this point, the boot CPU has been using .data.init + * area. Reload %gs offset for the boot CPU. */ - if (cpu == 0) + if (cpu == boot_cpu_id) load_gs_base(cpu); #endif -- cgit v1.2.3 From 89c9c4c58ee86e6e8802597271f23679e0c46647 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:48 +0900 Subject: x86: make Voyager use x86 per-cpu setup. Impact: standardize all x86 platforms on same setup code With the preceding changes, Voyager can use the same per-cpu setup code as all the other x86 platforms. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index be77f1a1231d..599dc1cc1da8 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -35,8 +35,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; EXPORT_PER_CPU_SYMBOL(this_cpu_off); -#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA - unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, }; @@ -125,6 +123,3 @@ void __init setup_per_cpu_areas(void) /* Setup cpu initialized, callin, callout masks */ setup_cpu_local_masks(); } - -#endif - -- cgit v1.2.3 From b2d2f4312b117a6cc647c8521e2643a88771f757 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:48 +0900 Subject: x86: initialize per-cpu GDT segment in per-cpu setup Impact: cleanup Rename init_gdt() to setup_percpu_segment(), and move it to setup_percpu.c. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 599dc1cc1da8..bcca3a7b3748 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -40,6 +40,19 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +static inline void setup_percpu_segment(int cpu) +{ +#ifdef CONFIG_X86_32 + struct desc_struct gdt; + + pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, + 0x2 | DESCTYPE_S, 0x8); + gdt.s = 1; + write_gdt_entry(get_cpu_gdt_table(cpu), + GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); +#endif +} + /* * Great future plan: * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. @@ -81,6 +94,7 @@ void __init setup_per_cpu_areas(void) per_cpu_offset(cpu) = ptr - __per_cpu_start; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; + setup_percpu_segment(cpu); /* * Copy data used in early init routines from the initial arrays to the * per cpu data areas. These arrays then become expendable and the -- cgit v1.2.3 From 2697fbd5faf19c84c17441b1752bdcbdcfd1248c Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:48 +0900 Subject: x86: load new GDT after setting up boot cpu per-cpu area Impact: sync 32 and 64-bit code Merge load_gs_base() into switch_to_new_gdt(). Load the GDT and per-cpu state for the boot cpu when its new area is set up. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index bcca3a7b3748..4caa78d7cb15 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -112,14 +112,14 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); +#endif #endif /* * Up to this point, the boot CPU has been using .data.init - * area. Reload %gs offset for the boot CPU. + * area. Reload any changed state for the boot CPU. */ if (cpu == boot_cpu_id) - load_gs_base(cpu); -#endif + switch_to_new_gdt(); DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } -- cgit v1.2.3 From 22f25138c345ec46a13744c93c093ff822cd98d1 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Tue, 27 Jan 2009 14:21:37 +0900 Subject: x86: fix build breakage on voyage Impact: build fix x86_cpu_to_apicid and x86_bios_cpu_apicid aren't defined for voyage. Earlier patch forgot to conditionalize early percpu clearing. Fix it. Signed-off-by: James Bottomley Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4caa78d7cb15..c7458ead22de 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -125,8 +125,10 @@ void __init setup_per_cpu_areas(void) } /* indicate the early static arrays will soon be gone */ +#ifdef CONFIG_X86_LOCAL_APIC early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; +#endif #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif -- cgit v1.2.3 From cf3997f507624757f149fcc42b76fb03c151fb65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Jan 2009 14:25:05 +0900 Subject: x86: clean up indentation in setup_per_cpu_areas() Impact: cosmetic cleanup Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index c7458ead22de..0d1e7ac439f4 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -96,22 +96,25 @@ void __init setup_per_cpu_areas(void) per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); /* - * Copy data used in early init routines from the initial arrays to the - * per cpu data areas. These arrays then become expendable and the - * *_early_ptr's are zeroed indicating that the static arrays are gone. + * Copy data used in early init routines from the + * initial arrays to the per cpu data areas. These + * arrays then become expendable and the *_early_ptr's + * are zeroed indicating that the static arrays are + * gone. */ #ifdef CONFIG_X86_LOCAL_APIC per_cpu(x86_cpu_to_apicid, cpu) = - early_per_cpu_map(x86_cpu_to_apicid, cpu); + early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = - early_per_cpu_map(x86_bios_cpu_apicid, cpu); + early_per_cpu_map(x86_bios_cpu_apicid, cpu); #endif #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = - per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64; + per_cpu(irq_stack_union.irq_stack, cpu) + + IRQ_STACK_SIZE - 64; #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = - early_per_cpu_map(x86_cpu_to_node_map, cpu); + early_per_cpu_map(x86_cpu_to_node_map, cpu); #endif #endif /* -- cgit v1.2.3 From 552be871e67ff577ed36beb2f53d078b42304739 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 30 Jan 2009 17:47:53 +0900 Subject: x86: pass in cpu number to switch_to_new_gdt() Impact: cleanup, prepare for xen boot fix. Xen needs to call this function very early to setup the GDT and per-cpu segments. Remove the call to smp_processor_id() and just pass in the cpu number. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 0d1e7ac439f4..ef91747bbed5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -122,7 +122,7 @@ void __init setup_per_cpu_areas(void) * area. Reload any changed state for the boot CPU. */ if (cpu == boot_cpu_id) - switch_to_new_gdt(); + switch_to_new_gdt(cpu); DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } -- cgit v1.2.3 From 60a5317ff0f42dd313094b88f809f63041568b08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Feb 2009 22:17:40 +0900 Subject: x86: implement x86_32 stack protector Impact: stack protector for x86_32 Implement stack protector for x86_32. GDT entry 28 is used for it. It's set to point to stack_canary-20 and have the length of 24 bytes. CONFIG_CC_STACKPROTECTOR turns off CONFIG_X86_32_LAZY_GS and sets %gs to the stack canary segment on entry. As %gs is otherwise unused by the kernel, the canary can be anywhere. It's defined as a percpu variable. x86_32 exception handlers take register frame on stack directly as struct pt_regs. With -fstack-protector turned on, gcc copies the whole structure after the stack canary and (of course) doesn't copy back on return thus losing all changed. For now, -fno-stack-protector is added to all files which contain those functions. We definitely need something better. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ef91747bbed5..d992e6cff730 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_DEBUG_PER_CPU_MAPS # define DBG(x...) printk(KERN_DEBUG x) @@ -95,6 +96,7 @@ void __init setup_per_cpu_areas(void) per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); + setup_stack_canary_segment(cpu); /* * Copy data used in early init routines from the * initial arrays to the per cpu data areas. These -- cgit v1.2.3 From 11124411aa95827404d6bfdfc14c908e1b54513c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:09 +0900 Subject: x86: convert to the new dynamic percpu allocator Impact: use new dynamic allocator, unified access to static/dynamic percpu memory Convert to the new dynamic percpu allocator. * implement populate_extra_pte() for both 32 and 64 * update setup_per_cpu_areas() to use pcpu_setup_static() * define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() * define config HAVE_DYNAMIC_PER_CPU_AREA Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 62 +++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d992e6cff730..2dce43558217 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -61,38 +61,56 @@ static inline void setup_percpu_segment(int cpu) */ void __init setup_per_cpu_areas(void) { - ssize_t size; - char *ptr; - int cpu; - - /* Copy section for each CPU (we discard the original) */ - size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ssize_t size = __per_cpu_end - __per_cpu_start; + unsigned int nr_cpu_pages = DIV_ROUND_UP(size, PAGE_SIZE); + static struct page **pages; + size_t pages_size; + unsigned int cpu, i, j; + unsigned long delta; + size_t pcpu_unit_size; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); + pr_info("PERCPU: Allocating %zd bytes for static per cpu data\n", size); - pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); + pages_size = nr_cpu_pages * num_possible_cpus() * sizeof(pages[0]); + pages = alloc_bootmem(pages_size); + j = 0; for_each_possible_cpu(cpu) { + void *ptr; + + for (i = 0; i < nr_cpu_pages; i++) { #ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); + ptr = alloc_bootmem_pages(PAGE_SIZE); #else - int node = early_cpu_to_node(cpu); - if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d at %016lx\n", - cpu, __pa(ptr)); - } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); - pr_debug("per cpu data for cpu%d on node%d at %016lx\n", - cpu, node, __pa(ptr)); - } + int node = early_cpu_to_node(cpu); + + if (!node_online(node) || !NODE_DATA(node)) { + ptr = alloc_bootmem_pages(PAGE_SIZE); + pr_info("cpu %d has no node %d or node-local " + "memory\n", cpu, node); + pr_debug("per cpu data for cpu%d at %016lx\n", + cpu, __pa(ptr)); + } else { + ptr = alloc_bootmem_pages_node(NODE_DATA(node), + PAGE_SIZE); + pr_debug("per cpu data for cpu%d on node%d " + "at %016lx\n", cpu, node, __pa(ptr)); + } #endif + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pages[j++] = virt_to_page(ptr); + } + } + + pcpu_unit_size = pcpu_setup_static(populate_extra_pte, pages, size); - memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); - per_cpu_offset(cpu) = ptr - __per_cpu_start; + free_bootmem(__pa(pages), pages_size); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) { + per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); -- cgit v1.2.3 From 458a3e644c3327be529393982e24277eda8f1ac7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: x86: update populate_extra_pte() and add populate_extra_pmd() Impact: minor change to populate_extra_pte() and addition of pmd flavor Update populate_extra_pte() to return pointer to the pte_t for the specified address and add populate_extra_pmd() which only populates till the pmd and returns pointer to the pmd entry for the address. For 64bit, pud/pmd/pte fill functions are separated out from set_pte_vaddr[_pud]() and used for set_pte_vaddr[_pud]() and populate_extra_{pte|pmd}(). Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2dce43558217..671e6528a82d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -41,6 +41,11 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +static void __init pcpu4k_populate_pte(unsigned long addr) +{ + populate_extra_pte(addr); +} + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -104,7 +109,7 @@ void __init setup_per_cpu_areas(void) } } - pcpu_unit_size = pcpu_setup_static(populate_extra_pte, pages, size); + pcpu_unit_size = pcpu_setup_static(pcpu4k_populate_pte, pages, size); free_bootmem(__pa(pages), pages_size); -- cgit v1.2.3 From 8d408b4be37bc49c9086531f2ebe411cf5731746 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: percpu: give more latitude to arch specific first chunk initialization Impact: more latitude for first percpu chunk allocation The first percpu chunk serves the kernel static percpu area and may or may not contain extra room for further dynamic allocation. Initialization of the first chunk needs to be done before normal memory allocation service is up, so it has its own init path - pcpu_setup_static(). It seems archs need more latitude while initializing the first chunk for example to take advantage of large page mapping. This patch makes the following changes to allow this. * Define PERCPU_DYNAMIC_RESERVE to give arch hint about how much space to reserve in the first chunk for further dynamic allocation. * Rename pcpu_setup_static() to pcpu_setup_first_chunk(). * Make pcpu_setup_first_chunk() much more flexible by fetching page pointer by callback and adding optional @unit_size, @free_size and @base_addr arguments which allow archs to selectively part of chunk initialization to their likings. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 671e6528a82d..d928e8887201 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -41,6 +41,16 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + static void __init pcpu4k_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -109,7 +119,10 @@ void __init setup_per_cpu_areas(void) } } - pcpu_unit_size = pcpu_setup_static(pcpu4k_populate_pte, pages, size); + pcpu4k_pages = pages; + pcpu4k_nr_static_pages = nr_cpu_pages; + pcpu_unit_size = pcpu_setup_first_chunk(pcpu4k_get_page, size, 0, 0, + NULL, pcpu4k_populate_pte); free_bootmem(__pa(pages), pages_size); -- cgit v1.2.3 From 5f5d8405d1c50f5cf7e1dbfe9c9b44e2f015c8fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: x86: separate out setup_pcpu_4k() from setup_per_cpu_areas() Impact: modularize percpu first chunk allocation x86 is gonna have a few different strategies for the first chunk allocation. Modularize it by separating out the current allocation mechanism into pcpu_alloc_bootmem() and setup_pcpu_4k(). Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 144 +++++++++++++++++++++++++++++------------ 1 file changed, 102 insertions(+), 42 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d928e8887201..4a17c96f4f6c 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,52 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/** + * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu + * @cpu: cpu to allocate for + * @size: size allocation in bytes + * @align: alignment + * + * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper + * does the right thing for NUMA regardless of the current + * configuration. + * + * RETURNS: + * Pointer to the allocated area on success, NULL on failure. + */ +static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, + unsigned long align) +{ + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NEED_MULTIPLE_NODES + int node = early_cpu_to_node(cpu); + void *ptr; + + if (!node_online(node) || !NODE_DATA(node)) { + ptr = __alloc_bootmem_nopanic(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", + cpu, size, __pa(ptr)); + } else { + ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), + size, align, goal); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at " + "%016lx\n", cpu, size, node, __pa(ptr)); + } + return ptr; +#else + return __alloc_bootmem_nopanic(size, align, goal); +#endif +} + +/* + * 4k page allocator + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page and most of initialization is done by the generic + * setup function. + */ static struct page **pcpu4k_pages __initdata; static int pcpu4k_nr_static_pages __initdata; @@ -56,6 +103,51 @@ static void __init pcpu4k_populate_pte(unsigned long addr) populate_extra_pte(addr); } +static ssize_t __init setup_pcpu_4k(size_t static_size) +{ + size_t pages_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + pcpu4k_nr_static_pages = PFN_UP(static_size); + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() + * sizeof(pcpu4k_pages[0])); + pcpu4k_pages = alloc_bootmem(pages_size); + + /* allocate and copy */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < pcpu4k_nr_static_pages; i++) { + void *ptr; + + ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); + if (!ptr) + goto enomem; + + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pcpu4k_pages[j++] = virt_to_page(ptr); + } + + /* we're ready, commit */ + pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", + pcpu4k_nr_static_pages, static_size); + + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL, + pcpu4k_populate_pte); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpu4k_pages), pages_size); + return ret; +} + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -76,56 +168,24 @@ static inline void setup_percpu_segment(int cpu) */ void __init setup_per_cpu_areas(void) { - ssize_t size = __per_cpu_end - __per_cpu_start; - unsigned int nr_cpu_pages = DIV_ROUND_UP(size, PAGE_SIZE); - static struct page **pages; - size_t pages_size; - unsigned int cpu, i, j; + size_t static_size = __per_cpu_end - __per_cpu_start; + unsigned int cpu; unsigned long delta; size_t pcpu_unit_size; + ssize_t ret; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); - pr_info("PERCPU: Allocating %zd bytes for static per cpu data\n", size); - - pages_size = nr_cpu_pages * num_possible_cpus() * sizeof(pages[0]); - pages = alloc_bootmem(pages_size); - - j = 0; - for_each_possible_cpu(cpu) { - void *ptr; - - for (i = 0; i < nr_cpu_pages; i++) { -#ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(PAGE_SIZE); -#else - int node = early_cpu_to_node(cpu); - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(PAGE_SIZE); - pr_info("cpu %d has no node %d or node-local " - "memory\n", cpu, node); - pr_debug("per cpu data for cpu%d at %016lx\n", - cpu, __pa(ptr)); - } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), - PAGE_SIZE); - pr_debug("per cpu data for cpu%d on node%d " - "at %016lx\n", cpu, node, __pa(ptr)); - } -#endif - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pages[j++] = virt_to_page(ptr); - } - } - pcpu4k_pages = pages; - pcpu4k_nr_static_pages = nr_cpu_pages; - pcpu_unit_size = pcpu_setup_first_chunk(pcpu4k_get_page, size, 0, 0, - NULL, pcpu4k_populate_pte); + /* allocate percpu area */ + ret = setup_pcpu_4k(static_size); + if (ret < 0) + panic("cannot allocate static percpu area (%zu bytes, err=%zd)", + static_size, ret); - free_bootmem(__pa(pages), pages_size); + pcpu_unit_size = ret; + /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; -- cgit v1.2.3 From 89c9215165ca609096e845926d9a18f1306176a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: x86: add embedding percpu first chunk allocator Impact: add better first percpu allocation for !NUMA On !NUMA, we can simply allocate contiguous memory and use it for the first chunk without mapping it into vmalloc area. As the memory area is covered by the large page physical memory mapping, it allows the dynamic perpcu allocator to not add any TLB overhead for the static percpu area and whatever falls into the first chunk and the implementation is very simple too. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 86 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4a17c96f4f6c..fd4c399675df 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -42,6 +42,35 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/** + * pcpu_need_numa - determine percpu allocation needs to consider NUMA + * + * If NUMA is not configured or there is only one NUMA node available, + * there is no reason to consider NUMA. This function determines + * whether percpu allocation should consider NUMA or not. + * + * RETURNS: + * true if NUMA should be considered; otherwise, false. + */ +static bool __init pcpu_need_numa(void) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + pg_data_t *last = NULL; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + return true; + + last = NODE_DATA(node); + } +#endif + return false; +} + /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu * @cpu: cpu to allocate for @@ -81,6 +110,59 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } +/* + * Embedding allocator + * + * The first chunk is sized to just contain the static area plus + * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using + * bootmem allocator and used as-is without being mapped into vmalloc + * area. This enables the first chunk to piggy back on the linear + * physical PMD mapping and doesn't add any additional pressure to + * TLB. + */ +static void *pcpue_ptr __initdata; +static size_t pcpue_unit_size __initdata; + +static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) +{ + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + + ((size_t)pageno << PAGE_SHIFT)); +} + +static ssize_t __init setup_pcpu_embed(size_t static_size) +{ + unsigned int cpu; + + /* + * If large page isn't supported, there's no benefit in doing + * this. Also, embedding allocation doesn't play well with + * NUMA. + */ + if (!cpu_has_pse || pcpu_need_numa()) + return -EINVAL; + + /* allocate and copy */ + pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE); + pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, + PAGE_SIZE); + if (!pcpue_ptr) + return -ENOMEM; + + for_each_possible_cpu(cpu) + memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load, + static_size); + + /* we're ready, commit */ + pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", + pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size); + + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + pcpue_unit_size, + pcpue_unit_size - static_size, pcpue_ptr, + NULL); +} + /* * 4k page allocator * @@ -178,7 +260,9 @@ void __init setup_per_cpu_areas(void) NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* allocate percpu area */ - ret = setup_pcpu_4k(static_size); + ret = setup_pcpu_embed(static_size); + if (ret < 0) + ret = setup_pcpu_4k(static_size); if (ret < 0) panic("cannot allocate static percpu area (%zu bytes, err=%zd)", static_size, ret); -- cgit v1.2.3 From 8ac837571491e239e64bd87863c1679d8002e8a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:22 +0900 Subject: x86: add remapping percpu first chunk allocator Impact: add better first percpu allocation for NUMA On NUMA, embedding allocator can't be used as different units can't be made to fall in the correct NUMA nodes. To use large page mapping, each unit needs to be remapped. However, percpu areas are usually much smaller than large page size and unused space hurts a lot as the number of cpus grow. This allocator remaps large pages for each chunk but gives back unused part to the bootmem allocator making the large pages mapped twice. This adds slightly to the TLB pressure but is much better than using 4k mappings while still being NUMA-friendly. Ingo suggested that this would be the correct approach for NUMA. Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 137 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index fd4c399675df..2d946a8f78b9 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -110,6 +110,133 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } +/* + * Remap allocator + * + * This allocator uses PMD page as unit. A PMD page is allocated for + * each cpu and each is remapped into vmalloc area using PMD mapping. + * As PMD page is quite large, only part of it is used for the first + * chunk. Unused part is returned to the bootmem allocator. + * + * So, the PMD pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk. The double + * mapping does add one more PMD TLB entry pressure but still is much + * better than only using 4k mappings while still being NUMA friendly. + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES +static size_t pcpur_size __initdata; +static void **pcpur_ptrs __initdata; + +static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpur_size) + return NULL; + + return virt_to_page(pcpur_ptrs[cpu] + off); +} + +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ + static struct vm_struct vm; + pg_data_t *last; + size_t ptrs_size; + unsigned int cpu; + ssize_t ret; + + /* + * If large page isn't supported, there's no benefit in doing + * this. Also, on non-NUMA, embedding is better. + */ + if (!cpu_has_pse || pcpu_need_numa()) + return -EINVAL; + + last = NULL; + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + goto proceed; + + last = NODE_DATA(node); + } + return -EINVAL; + +proceed: + /* + * Currently supports only single page. Supporting multiple + * pages won't be too difficult if it ever becomes necessary. + */ + pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + if (pcpur_size > PMD_SIZE) { + pr_warning("PERCPU: static data is larger than large page, " + "can't use large page\n"); + return -EINVAL; + } + + /* allocate pointer array and alloc large pages */ + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); + pcpur_ptrs = alloc_bootmem(ptrs_size); + + for_each_possible_cpu(cpu) { + pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); + if (!pcpur_ptrs[cpu]) + goto enomem; + + /* + * Only use pcpur_size bytes and give back the rest. + * + * Ingo: The 2MB up-rounding bootmem is needed to make + * sure the partial 2MB page is still fully RAM - it's + * not well-specified to have a PAT-incompatible area + * (unmapped RAM, device memory, etc.) in that hole. + */ + free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), + PMD_SIZE - pcpur_size); + + memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + } + + /* allocate address and map */ + vm.flags = VM_ALLOC; + vm.size = num_possible_cpus() * PMD_SIZE; + vm_area_register_early(&vm, PMD_SIZE); + + for_each_possible_cpu(cpu) { + pmd_t *pmd; + + pmd = populate_extra_pmd((unsigned long)vm.addr + + cpu * PMD_SIZE); + set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), + PAGE_KERNEL_LARGE)); + } + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", vm.addr, static_size); + + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, + pcpur_size - static_size, vm.addr, NULL); + goto out_free_ar; + +enomem: + for_each_possible_cpu(cpu) + if (pcpur_ptrs[cpu]) + free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpur_ptrs), ptrs_size); + return ret; +} +#else +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ + return -EINVAL; +} +#endif + /* * Embedding allocator * @@ -259,8 +386,14 @@ void __init setup_per_cpu_areas(void) pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); - /* allocate percpu area */ - ret = setup_pcpu_embed(static_size); + /* + * Allocate percpu area. If PSE is supported, try to make use + * of large page mappings. Please read comments on top of + * each allocator for details. + */ + ret = setup_pcpu_remap(static_size); + if (ret < 0) + ret = setup_pcpu_embed(static_size); if (ret < 0) ret = setup_pcpu_4k(static_size); if (ret < 0) -- cgit v1.2.3 From 24ff954233ecfd45801383f831626f88937ebe6f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Feb 2009 10:38:10 +0900 Subject: x86, percpu: fix minor bugs in setup_percpu.c Recent changes in setup_percpu.c made a now meaningless DBG() statement fail to compile and introduced a comparison-of-different-types warning. Fix them. Compile failure is reported by Ingo Molnar. Signed-off-by: Tejun Heo Reported-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2d946a8f78b9..c29f301d3885 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -270,7 +270,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) /* allocate and copy */ pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); - pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE); + pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE); pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, PAGE_SIZE); if (!pcpue_ptr) @@ -438,8 +438,6 @@ void __init setup_per_cpu_areas(void) */ if (cpu == boot_cpu_id) switch_to_new_gdt(cpu); - - DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } /* indicate the early static arrays will soon be gone */ -- cgit v1.2.3 From cafe8816b217b98dc3f268d3b77445da498beb4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu: use negative for auto for pcpu_setup_first_chunk() arguments Impact: argument semantic cleanup In pcpu_setup_first_chunk(), zero @unit_size and @dyn_size meant auto-sizing. It's okay for @unit_size as 0 doesn't make sense but 0 dynamic reserve size is valid. Alos, if arch @dyn_size is calculated from other parameters, it might end up passing in 0 @dyn_size and malfunction when the size is automatically adjusted. This patch makes both @unit_size and @dyn_size ssize_t and use -1 for auto sizing. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index c29f301d3885..ef3a2cd3fe64 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -344,7 +344,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL, + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL, pcpu4k_populate_pte); goto out_free_ar; -- cgit v1.2.3 From 9a4f8a878b68d5a5d9ee60908a52cf6a55e1b823 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: x86: make embedding percpu allocator return excessive free space Impact: reduce unnecessary memory usage on certain configurations Embedding percpu allocator allocates unit_size * smp_num_possible_cpus() bytes consecutively and use it for the first chunk. However, if the static area is small, this can result in excessive prellocated free space in the first chunk due to PCPU_MIN_UNIT_SIZE restriction. This patch makes embedding percpu allocator preallocate only what's necessary as described by PERPCU_DYNAMIC_RESERVE and return the leftover to the bootmem allocator. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 44 +++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ef3a2cd3fe64..38e2b2a470a5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -241,24 +241,31 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * Embedding allocator * * The first chunk is sized to just contain the static area plus - * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using - * bootmem allocator and used as-is without being mapped into vmalloc - * area. This enables the first chunk to piggy back on the linear - * physical PMD mapping and doesn't add any additional pressure to - * TLB. + * module and dynamic reserves, and allocated as a contiguous area + * using bootmem allocator and used as-is without being mapped into + * vmalloc area. This enables the first chunk to piggy back on the + * linear physical PMD mapping and doesn't add any additional pressure + * to TLB. Note that if the needed size is smaller than the minimum + * unit size, the leftover is returned to the bootmem allocator. */ static void *pcpue_ptr __initdata; +static size_t pcpue_size __initdata; static size_t pcpue_unit_size __initdata; static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) { - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size - + ((size_t)pageno << PAGE_SHIFT)); + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpue_size) + return NULL; + + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); } static ssize_t __init setup_pcpu_embed(size_t static_size) { unsigned int cpu; + size_t dyn_size; /* * If large page isn't supported, there's no benefit in doing @@ -269,25 +276,30 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) return -EINVAL; /* allocate and copy */ - pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); - pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE); + pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); + dyn_size = pcpue_size - static_size; + pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, PAGE_SIZE); if (!pcpue_ptr) return -ENOMEM; - for_each_possible_cpu(cpu) - memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load, - static_size); + for_each_possible_cpu(cpu) { + void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size); + pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); return pcpu_setup_first_chunk(pcpue_get_page, static_size, - pcpue_unit_size, - pcpue_unit_size - static_size, pcpue_ptr, - NULL); + pcpue_unit_size, dyn_size, + pcpue_ptr, NULL); } /* -- cgit v1.2.3 From edcb463997ed7b2ffa3bac76e3e75957318f2e01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu, module: implement reserved allocation and use it for module percpu variables Impact: add reserved allocation functionality and use it for module percpu variables This patch implements reserved allocation from the first chunk. When setting up the first chunk, arch can ask to set aside certain number of bytes right after the core static area which is available only through a separate reserved allocator. This will be used primarily for module static percpu variables on architectures with limited relocation range to ensure that the module perpcu symbols are inside the relocatable range. If reserved area is requested, the first chunk becomes reserved and isn't available for regular allocation. If the first chunk also includes piggy-back dynamic allocation area, a separate chunk mapping the same region is created to serve dynamic allocation. The first one is called static first chunk and the second dynamic first chunk. Although they share the page map, their different area map initializations guarantee they serve disjoint areas according to their purposes. If arch doesn't setup reserved area, reserved allocation is handled like any other allocation. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 38e2b2a470a5..dd4eabc747c8 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -217,7 +217,7 @@ proceed: pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE, pcpur_size - static_size, vm.addr, NULL); goto out_free_ar; @@ -297,7 +297,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, + return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0, pcpue_unit_size, dyn_size, pcpue_ptr, NULL); } @@ -356,8 +356,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL, - pcpu4k_populate_pte); + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1, + NULL, pcpu4k_populate_pte); goto out_free_ar; enomem: -- cgit v1.2.3 From 6b19b0c2400437a3c10059ede0e59b517092e1bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: x86, percpu: setup reserved percpu area for x86_64 Impact: fix relocation overflow during module load x86_64 uses 32bit relocations for symbol access and static percpu symbols whether in core or modules must be inside 2GB of the percpu segement base which the dynamic percpu allocator doesn't guarantee. This patch makes x86_64 reserve PERCPU_MODULE_RESERVE bytes in the first chunk so that module percpu areas are always allocated from the first chunk which is always inside the relocatable range. This problem exists for any percpu allocator but is easily triggered when using the embedding allocator because the second chunk is located beyond 2GB on it. This patch also changes the meaning of PERCPU_DYNAMIC_RESERVE such that it only indicates the size of the area to reserve for dynamic allocation as static and dynamic areas can be separate. New PERCPU_DYNAMIC_RESERVED is increased by 4k for both 32 and 64bits as the reserved area separation eats away some allocatable space and having slightly more headroom (currently between 4 and 8k after minimal boot sans module area) makes sense for common case performance. x86_32 can address anywhere from anywhere and doesn't need reserving. Mike Galbraith first reported the problem first and bisected it to the embedding percpu allocator commit. Signed-off-by: Tejun Heo Reported-by: Mike Galbraith Reported-by: Jaswinder Singh Rajput --- arch/x86/kernel/setup_percpu.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel/setup_percpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index dd4eabc747c8..efa615f2bf43 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -42,6 +42,19 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/* + * On x86_64 symbols referenced from code should be reachable using + * 32bit relocations. Reserve space for static percpu variables in + * modules so that they are always served from the first chunk which + * is located at the percpu segment base. On x86_32, anything can + * address anywhere. No need to reserve space in the first chunk. + */ +#ifdef CONFIG_X86_64 +#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE +#else +#define PERCPU_FIRST_CHUNK_RESERVE 0 +#endif + /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -141,7 +154,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) { static struct vm_struct vm; pg_data_t *last; - size_t ptrs_size; + size_t ptrs_size, dyn_size; unsigned int cpu; ssize_t ret; @@ -169,12 +182,14 @@ proceed: * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. */ - pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); if (pcpur_size > PMD_SIZE) { pr_warning("PERCPU: static data is larger than large page, " "can't use large page\n"); return -EINVAL; } + dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); @@ -217,8 +232,9 @@ proceed: pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE, - pcpur_size - static_size, vm.addr, NULL); + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, + PMD_SIZE, dyn_size, vm.addr, NULL); goto out_free_ar; enomem: @@ -276,9 +292,10 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) return -EINVAL; /* allocate and copy */ - pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - dyn_size = pcpue_size - static_size; + dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, PAGE_SIZE); @@ -297,7 +314,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0, + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, pcpue_unit_size, dyn_size, pcpue_ptr, NULL); } @@ -356,8 +374,9 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1, - NULL, pcpu4k_populate_pte); + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL, + pcpu4k_populate_pte); goto out_free_ar; enomem: -- cgit v1.2.3