From a1676072558854b95336c8f7db76b0504e909a0a Mon Sep 17 00:00:00 2001 From: Tony Camuso Date: Thu, 15 May 2008 14:40:14 -0400 Subject: PCI: Correct last two HP entries in the bfsort whitelist Greetings. There is a code flaw in the bfsort whitelist, where there are redundant entries for the same two HP systems, DL385 G2 and DL585 G2. This patch replaces those redundant entries with the correct ones. The correct entries are for large-volume systems, the DL360 and DL380. ----------------------------------------------------------------------- commit ec69f0374c3b0ad7ea991b0e9ac00377acfe5b1a Author: Tony Camuso Date: Wed May 14 07:09:28 2008 -0400 Replace Redundant Whitelist Entries with the Correct Ones The ProLiant DL585 G2 and the DL585 G2 are entered reundantly in the dmi_system_id table. What should have been there are the DL360 and DL380. This patch simply replaces the redundant entries with the correct entries. arch/x86/pci/common.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) Signed-off-by: Tony Camuso Signed-off-by: Pat Schoeller Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/pci/common.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 6e64aaf00d1d..940185ecaeda 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -328,18 +328,18 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { #endif { .callback = set_bf_sort, - .ident = "HP ProLiant DL385 G2", + .ident = "HP ProLiant DL360", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "HP"), - DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL385 G2"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL360"), }, }, { .callback = set_bf_sort, - .ident = "HP ProLiant DL585 G2", + .ident = "HP ProLiant DL380", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "HP"), - DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL380"), }, }, {} -- cgit v1.2.3 From a16ffe93c46dfca211434d00453ebb695025978b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 30 May 2008 15:09:42 -0500 Subject: lguest: fix ugly in /proc/interrupts Before: root@ubuntu:~# cat /proc/interrupts CPU0 1: 1672 lguest- virtio0 2: 1 lguest- virtio1 ... After: root@ubuntu:~# cat /proc/interrupts CPU0 1: 2889 lguest-level virtio0 2: 9 lguest-level virtio1 Signed-off-by: Rusty Russell --- arch/x86/lguest/boot.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index af65b2da3ba0..5c7e2fd52075 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -582,8 +582,9 @@ static void __init lguest_init_IRQ(void) int vector = FIRST_EXTERNAL_VECTOR + i; if (vector != SYSCALL_VECTOR) { set_intr_gate(vector, interrupt[i]); - set_irq_chip_and_handler(i, &lguest_irq_controller, - handle_level_irq); + set_irq_chip_and_handler_name(i, &lguest_irq_controller, + handle_level_irq, + "level"); } } /* This call is required to set up for 4k stacks, where we have -- cgit v1.2.3 From 75b19b790bec3ebffbf513405b27500e22270cbc Mon Sep 17 00:00:00 2001 From: Bertram Felgenhauer Date: Fri, 30 May 2008 03:20:05 +0200 Subject: pci, x86: add workaround for bug in ASUS A7V600 BIOS (rev 1005) This BIOS claims the VIA 8237 south bridge to be compatible with VIA 586, which it is not. Without this patch, I get the following warning while booting, among others, | PCI: Using IRQ router VIA [1106/3227] at 0000:00:11.0 | ------------[ cut here ]------------ | WARNING: at arch/x86/pci/irq.c:265 pirq_via586_get+0x4a/0x60() | Modules linked in: | Pid: 1, comm: swapper Not tainted 2.6.26-rc4-00015-g1ec7d99 #1 | [] warn_on_slowpath+0x54/0x70 | [] ? vt_console_print+0x210/0x2b0 | [] ? vt_console_print+0x0/0x2b0 | [] ? __call_console_drivers+0x43/0x60 | [] ? _call_console_drivers+0x52/0x80 | [] ? release_console_sem+0x1c9/0x200 | [] ? raw_pci_read+0x41/0x70 | [] ? pci_read+0x2f/0x40 | [] pirq_via586_get+0x4a/0x60 | [] ? pirq_via586_get+0x0/0x60 | [] pcibios_lookup_irq+0x15d/0x430 | [] pcibios_irq_init+0x17a/0x3e0 | [] ? kernel_init+0x0/0x250 | [] kernel_init+0x73/0x250 | [] ? pcibios_irq_init+0x0/0x3e0 | [] ? schedule_tail+0x10/0x40 | [] ? ret_from_fork+0x6/0x1c | [] ? kernel_init+0x0/0x250 | [] ? kernel_init+0x0/0x250 | [] kernel_thread_helper+0x7/0x1c | ======================= | ---[ end trace 4eaa2a86a8e2da22 ]--- and IRQ trouble later, | irq 10: nobody cared (try booting with the "irqpoll" option) Now that's an VIA 8237 chip, so pirq_via586_get shouldn't be called at all; adding this workaround to via_router_probe() fixes the problem for me. Amazingly I have a 2.6.23.8 kernel that somehow works fine ... I'll never understand why. Signed-off-by: Bertram Felgenhauer Acked-by: Alan Cox Signed-off-by: Ingo Molnar --- arch/x86/pci/irq.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0908fca901bf..ca8df9c260bc 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -621,6 +621,13 @@ static __init int via_router_probe(struct irq_router *r, */ device = PCI_DEVICE_ID_VIA_8235; break; + case PCI_DEVICE_ID_VIA_8237: + /** + * Asus a7v600 bios wrongly reports 8237 + * as 586-compatible + */ + device = PCI_DEVICE_ID_VIA_8237; + break; } } -- cgit v1.2.3 From db9f600b96c16bb3c7f094e294fbdd370226ad86 Mon Sep 17 00:00:00 2001 From: Miquel van Smoorenburg Date: Wed, 28 May 2008 10:31:25 +0200 Subject: x86: pci-dma.c: use __GFP_NO_OOM instead of __GFP_NORETRY On Wed, 2008-05-28 at 04:47 +0200, Andi Kleen wrote: > > So... why not just remove the setting of __GFP_NORETRY? Why is it > > wrong to oom-kill things in this case? > > When the 16MB zone overflows (which can be common in some workloads) > calling the OOM killer is pretty useless because it has barely any > real user data [only exception would be the "only 16MB" case Alan > mentioned]. Killing random processes in this case is bad. > > I think for 16MB __GFP_NORETRY is ok because there should be > nothing freeable in there so looping is useless. Only exception would be the > "only 16MB total" case again but I'm not sure 2.6 supports that at all > on x86. > > On the other hand d_a_c() does more allocations than just 16MB, especially > on 64bit and the other zones need different strategies. Okay, so how about this then ? Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index c5ef1af8e79d..069e843f0b93 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -397,9 +397,6 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, if (dev->dma_mask == NULL) return NULL; - /* Don't invoke OOM killer */ - gfp |= __GFP_NORETRY; - #ifdef CONFIG_X86_64 /* Why <=? Even when the mask is smaller than 4GB it is often larger than 16MB and in this case we have a chance of @@ -410,7 +407,9 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, #endif again: - page = dma_alloc_pages(dev, gfp, get_order(size)); + /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ + page = dma_alloc_pages(dev, + (gfp & GFP_DMA) ? gfp | __GFP_NORETRY : gfp, get_order(size)); if (page == NULL) return NULL; -- cgit v1.2.3 From f529626a86d61897862aa1bbbb4537773209238e Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 29 May 2008 00:30:21 -0700 Subject: suspend-vs-iommu: prevent suspend if we could not resume iommu/gart support misses suspend/resume code, which can do bad stuff, including memory corruption on resume. Prevent system suspend in case we would be unable to resume. Signed-off-by: Pavel Machek Tested-by: Patrick Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c07455d1695f..aa8ec928caa8 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -548,6 +549,28 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) return aper_base; } +static int gart_resume(struct sys_device *dev) +{ + return 0; +} + +static int gart_suspend(struct sys_device *dev, pm_message_t state) +{ + return -EINVAL; +} + +static struct sysdev_class gart_sysdev_class = { + .name = "gart", + .suspend = gart_suspend, + .resume = gart_resume, + +}; + +static struct sys_device device_gart = { + .id = 0, + .cls = &gart_sysdev_class, +}; + /* * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. @@ -558,7 +581,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - int i; + int i, error; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; @@ -606,6 +629,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info) pci_write_config_dword(dev, 0x90, ctl); } + + error = sysdev_class_register(&gart_sysdev_class); + if (!error) + error = sysdev_register(&device_gart); + if (error) + panic("Could not register gart_sysdev -- would corrupt data on next suspend"); flush_gart(); printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", -- cgit v1.2.3 From fb3bbd6a663fe972611676381adc4c60ddfe61ac Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 22 May 2008 18:22:30 -0700 Subject: x86: fix APIC warning on 32bit v2 for http://bugzilla.kernel.org/show_bug.cgi?id=10613 BIOS bug, APIC version is 0 for CPU#0! fixing up to 0x10. (tell your hw vendor) v2: fix 64 bit compilation Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: "Rafael J. Wysocki" Cc: Gabriel C Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c49ebcc6c41e..33c5216fd3e1 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -242,12 +242,19 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) static void __cpuinit acpi_register_lapic(int id, u8 enabled) { + unsigned int ver = 0; + if (!enabled) { ++disabled_cpus; return; } - generic_processor_info(id, 0); +#ifdef CONFIG_X86_32 + if (boot_cpu_physical_apicid != -1U) + ver = apic_version[boot_cpu_physical_apicid]; +#endif + + generic_processor_info(id, ver); } static int __init @@ -767,8 +774,13 @@ static void __init acpi_register_lapic_address(unsigned long address) mp_lapic_addr = address; set_fixmap_nocache(FIX_APIC_BASE, address); - if (boot_cpu_physical_apicid == -1U) + if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); +#ifdef CONFIG_X86_32 + apic_version[boot_cpu_physical_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); +#endif + } } static int __init early_acpi_parse_madt_lapic_addr_ovr(void) -- cgit v1.2.3 From deef325086c3897393b8f7d6bccd03405244fe18 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 15:44:38 +0200 Subject: x86: disable preemption in native_smp_prepare_cpus Priit Laes reported the following warning: Call Trace: [] warn_on_slowpath+0x51/0x63 [] sys_ioctl+0x2d/0x5d [] _spin_lock+0xe/0x24 [] task_rq_lock+0x3d/0x73 [] set_cpu_sibling_map+0x336/0x350 [] read_apic_id+0x30/0x62 [] verify_local_APIC+0x90/0x138 [] native_smp_prepare_cpus+0x1f9/0x305 [] kernel_init+0x59/0x2d9 [] _spin_unlock_irq+0x11/0x2b [] child_rip+0xa/0x12 [] kernel_init+0x0/0x2d9 [] child_rip+0x0/0x12 fix this by generally disabling preemption in native_smp_prepare_cpus(). Reported-and-bisected-by: Priit Laes Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 38988491c622..56078d61c793 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1190,6 +1190,7 @@ static void __init smp_cpu_index_default(void) */ void __init native_smp_prepare_cpus(unsigned int max_cpus) { + preempt_disable(); nmi_watchdog_default(); smp_cpu_index_default(); current_cpu_data = boot_cpu_data; @@ -1206,7 +1207,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); disable_smp(); - return; + goto out; } preempt_disable(); @@ -1246,6 +1247,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); setup_boot_clock(); +out: + preempt_enable(); } /* * Early setup to make printk work. -- cgit v1.2.3 From 5c1ea08215f1f830dfaf4819a5f22efca41c3832 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 25 May 2008 11:13:32 -0400 Subject: x86: enable preemption in delay The RT team has been searching for a nasty latency. This latency shows up out of the blue and has been seen to be as big as 5ms! Using ftrace I found the cause of the latency. pcscd-2995 3dNh1 52360300us : irq_exit (smp_apic_timer_interrupt) pcscd-2995 3dN.2 52360301us : idle_cpu (irq_exit) pcscd-2995 3dN.2 52360301us : rcu_irq_exit (irq_exit) pcscd-2995 3dN.1 52360771us : smp_apic_timer_interrupt (apic_timer_interrupt ) pcscd-2995 3dN.1 52360771us : exit_idle (smp_apic_timer_interrupt) Here's an example of a 400 us latency. pcscd took a timer interrupt and returned with "need resched" enabled, but did not reschedule until after the next interrupt came in at 52360771us 400us later! At first I thought we somehow missed a preemption check in entry.S. But I also noticed that this always seemed to happen during a __delay call. pcscd-2995 3dN.2 52360836us : rcu_irq_exit (irq_exit) pcscd-2995 3.N.. 52361265us : preempt_schedule (__delay) Looking at the x86 delay, I found my problem. In git commit 35d5d08a085c56f153458c3f5d8ce24123617faf, Andrew Morton placed preempt_disable around the entire delay due to TSC's not working nicely on SMP. Unfortunately for those that care about latencies this is devastating! Especially when we have callers to mdelay(8). Here I enable preemption during the loop and account for anytime the task migrates to a new CPU. The delay asked for may be extended a bit by the migration, but delay only guarantees that it will delay for that minimum time. Delaying longer should not be an issue. [ Thanks to Thomas Gleixner for spotting that cpu wasn't updated, and to place the rep_nop between preempt_enabled/disable. ] Signed-off-by: Steven Rostedt Cc: akpm@osdl.org Cc: Clark Williams Cc: Peter Zijlstra Cc: "Luis Claudio R. Goncalves" Cc: Gregory Haskins Cc: Linus Torvalds Cc: Andi Kleen Signed-off-by: Thomas Gleixner --- arch/x86/lib/delay_32.c | 31 +++++++++++++++++++++++++++---- arch/x86/lib/delay_64.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c index 4535e6d147ad..d710f2d167bb 100644 --- a/arch/x86/lib/delay_32.c +++ b/arch/x86/lib/delay_32.c @@ -44,13 +44,36 @@ static void delay_loop(unsigned long loops) static void delay_tsc(unsigned long loops) { unsigned long bclock, now; + int cpu; - preempt_disable(); /* TSC's are per-cpu */ + preempt_disable(); + cpu = smp_processor_id(); rdtscl(bclock); - do { - rep_nop(); + for (;;) { rdtscl(now); - } while ((now-bclock) < loops); + if ((now - bclock) >= loops) + break; + + /* Allow RT tasks to run */ + preempt_enable(); + rep_nop(); + preempt_disable(); + + /* + * It is possible that we moved to another CPU, and + * since TSC's are per-cpu we need to calculate + * that. The delay must guarantee that we wait "at + * least" the amount of time. Being moved to another + * CPU could make the wait longer but we just need to + * make sure we waited long enough. Rebalance the + * counter for this CPU. + */ + if (unlikely(cpu != smp_processor_id())) { + loops -= (now - bclock); + cpu = smp_processor_id(); + rdtscl(bclock); + } + } preempt_enable(); } diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c index bbc610518516..4c441be92641 100644 --- a/arch/x86/lib/delay_64.c +++ b/arch/x86/lib/delay_64.c @@ -31,14 +31,36 @@ int __devinit read_current_timer(unsigned long *timer_value) void __delay(unsigned long loops) { unsigned bclock, now; + int cpu; - preempt_disable(); /* TSC's are pre-cpu */ + preempt_disable(); + cpu = smp_processor_id(); rdtscl(bclock); - do { - rep_nop(); + for (;;) { rdtscl(now); + if ((now - bclock) >= loops) + break; + + /* Allow RT tasks to run */ + preempt_enable(); + rep_nop(); + preempt_disable(); + + /* + * It is possible that we moved to another CPU, and + * since TSC's are per-cpu we need to calculate + * that. The delay must guarantee that we wait "at + * least" the amount of time. Being moved to another + * CPU could make the wait longer but we just need to + * make sure we waited long enough. Rebalance the + * counter for this CPU. + */ + if (unlikely(cpu != smp_processor_id())) { + loops -= (now - bclock); + cpu = smp_processor_id(); + rdtscl(bclock); + } } - while ((now-bclock) < loops); preempt_enable(); } EXPORT_SYMBOL(__delay); -- cgit v1.2.3 From e8a496ac8cd00cabbdaa373db4818a9ad19a1c5a Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 23 May 2008 16:26:37 -0700 Subject: x86: fix broken math-emu with lazy allocation of fpu area Fix the math emulation that got broken with the recent lazy allocation of FPU area. init_fpu() need to be added for the math-emulation path aswell for the FPU area allocation. math emulation enabled kernel booted fine with this, in the presence of "no387 nofxsr" boot param. Signed-off-by: Suresh Siddha Cc: hpa@zytor.com Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i387.c | 44 ++++++++++++++++++++++++++++--------------- arch/x86/math-emu/fpu_entry.c | 13 ++++++++----- 2 files changed, 37 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index e03cc952f233..eb9ddd8efb82 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -56,6 +56,11 @@ void __cpuinit mxcsr_feature_mask_init(void) void __init init_thread_xstate(void) { + if (!HAVE_HWFP) { + xstate_size = sizeof(struct i387_soft_struct); + return; + } + if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); #ifdef CONFIG_X86_32 @@ -94,7 +99,7 @@ void __cpuinit fpu_init(void) int init_fpu(struct task_struct *tsk) { if (tsk_used_math(tsk)) { - if (tsk == current) + if (HAVE_HWFP && tsk == current) unlazy_fpu(tsk); return 0; } @@ -109,6 +114,15 @@ int init_fpu(struct task_struct *tsk) return -ENOMEM; } +#ifdef CONFIG_X86_32 + if (!HAVE_HWFP) { + memset(tsk->thread.xstate, 0, xstate_size); + finit(); + set_stopped_child_used_math(tsk); + return 0; + } +#endif + if (cpu_has_fxsr) { struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; @@ -330,13 +344,13 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - if (!HAVE_HWFP) - return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - ret = init_fpu(target); if (ret) return ret; + if (!HAVE_HWFP) + return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + if (!cpu_has_fxsr) { return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fsave, 0, @@ -360,15 +374,15 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - if (!HAVE_HWFP) - return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - ret = init_fpu(target); if (ret) return ret; set_stopped_child_used_math(target); + if (!HAVE_HWFP) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + if (!cpu_has_fxsr) { return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fsave, 0, -1); @@ -474,18 +488,18 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) int restore_i387_ia32(struct _fpstate_ia32 __user *buf) { int err; + struct task_struct *tsk = current; - if (HAVE_HWFP) { - struct task_struct *tsk = current; - + if (HAVE_HWFP) clear_fpu(tsk); - if (!used_math()) { - err = init_fpu(tsk); - if (err) - return err; - } + if (!used_math()) { + err = init_fpu(tsk); + if (err) + return err; + } + if (HAVE_HWFP) { if (cpu_has_fxsr) err = restore_i387_fxsave(buf); else diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 6e38d877ea77..c7b06feb139b 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "fpu_system.h" #include "fpu_emu.h" @@ -146,6 +147,13 @@ asmlinkage void math_emulate(long arg) unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ struct desc_struct code_descriptor; + if (!used_math()) { + if (init_fpu(current)) { + do_group_exit(SIGKILL); + return; + } + } + #ifdef RE_ENTRANT_CHECKING if (emulating) { printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n"); @@ -153,11 +161,6 @@ asmlinkage void math_emulate(long arg) RE_ENTRANT_CHECK_ON; #endif /* RE_ENTRANT_CHECKING */ - if (!used_math()) { - finit(); - set_used_math(); - } - SETUP_DATA_AREA(arg); FPU_ORIG_EIP = FPU_EIP; -- cgit v1.2.3 From 226e9a93a253b7d8811b5ed9ac671c6c5a728022 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 May 2008 09:56:49 +0200 Subject: x86: ioremap fix failing nesting check Mika Kukkonen noticed that the nesting check in early_iounmap() is not actually done. Reported-by: Mika Kukkonen Signed-off-by: Ingo Molnar Cc: torvalds@linux-foundation.org Cc: arjan@linux.intel.com Cc: mikukkon@iki.fi Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 71bb3159031a..2b2bb3f9b683 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -593,10 +593,11 @@ void __init early_iounmap(void *addr, unsigned long size) unsigned long offset; unsigned int nrpages; enum fixed_addresses idx; - unsigned int nesting; + int nesting; nesting = --early_ioremap_nested; - WARN_ON(nesting < 0); + if (WARN_ON(nesting < 0)) + return; if (early_ioremap_debug) { printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, -- cgit v1.2.3 From 2884f110d5409714f3a04eeb6d2ecd77da66b242 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 28 May 2008 19:36:07 +0100 Subject: x86: fix bad pmd ffff810000207xxx(9090909090909090) OGAWA Hirofumi and Fede have reported rare pmd_ERROR messages: mm/memory.c:127: bad pmd ffff810000207xxx(9090909090909090). Initialization's cleanup_highmap was leaving alignment filler behind in the pmd for MODULES_VADDR: when vmalloc's guard page would occupy a new page table, it's not allocated, and then module unload's vfree hits the bad 9090 pmd entry left over. Signed-off-by: Hugh Dickins Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 32ba13b0f818..998a06ea5f7d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -206,7 +206,7 @@ void __init cleanup_highmap(void) pmd_t *last_pmd = pmd + PTRS_PER_PMD; for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { - if (!pmd_present(*pmd)) + if (pmd_none(*pmd)) continue; if (vaddr < (unsigned long) _text || vaddr > end) set_pmd(pmd, __pmd(0)); -- cgit v1.2.3 From 511631011d39706ac81ee5e4c9084d61e5b4fd34 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Thu, 29 May 2008 21:14:35 -0300 Subject: x86: fix pointer type warning in arch/x86/mm/init_64.c:early_memtest Changed the call to find_e820_area_size to pass u64 instead of unsigned long. Signed-off-by: Kevin Winchester Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 998a06ea5f7d..156e6d7b0e32 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -506,7 +506,7 @@ early_param("memtest", parse_memtest); static void __init early_memtest(unsigned long start, unsigned long end) { - unsigned long t_start, t_size; + u64 t_start, t_size; unsigned pattern; if (!memtest_pattern) @@ -525,7 +525,7 @@ static void __init early_memtest(unsigned long start, unsigned long end) if (t_start + t_size > end) t_size = end - t_start; - printk(KERN_CONT "\n %016lx - %016lx pattern %d", + printk(KERN_CONT "\n %016llx - %016llx pattern %d", t_start, t_start + t_size, pattern); memtest(t_start, t_size, pattern); -- cgit v1.2.3 From 282c454cd3a7041f59a37112bb2f82263bc38f6c Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Thu, 29 May 2008 12:01:44 -0700 Subject: x86: fix Xorg crash with xf86MapVidMem error Clarify the usage of mtrr_lookup() in PAT code, and to make PAT code resilient to mtrr lookup problems. Specifically, pat_x_mtrr_type() is restructured to highlight, under what conditions we look for mtrr hint. pat_x_mtrr_type() uses a default type when there are any errors in mtrr lookup (still maintaining the pat consistency). And, reserve_memtype() highlights its usage ot mtrr_lookup for request type of '-1' and also defaults in a sane way on any mtrr lookup failure. pat.c looks at mtrr type of a range to get a hint on what mapping type to request when user/API: (1) hasn't specified any type (/dev/mem mapping) and we do not want to take performance hit by always mapping UC_MINUS. This will be the case for /dev/mem mappings used to map BIOS area or ACPI region which are WB'able. In this case, as long as MTRR is not WB, PAT will request UC_MINUS for such mappings. (2) user/API requests WB mapping while in reality MTRR may have UC or WC. In this case, PAT can map as WB (without checking MTRR) and still effective type will be UC or WC. But, a subsequent request to map same region as UC or WC may fail, as the region will get trackked as WB in PAT list. Looking at MTRR hint helps us to track based on effective type rather than what user requested. Again, here mtrr_lookup is only used as hint and we fallback to WB mapping (as requested by user) as default. In both cases, after using the mtrr hint, we still go through the memtype list to make sure there are no inconsistencies among multiple users. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Tested-by: Rufus & Azrael Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index de3a99812450..183fdd36d3bd 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -151,32 +151,33 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, unsigned long pat_type; u8 mtrr_type; - mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == 0xFF) { /* MTRR not enabled */ - *ret_prot = prot; - return 0; - } - if (mtrr_type == 0xFE) { /* MTRR match error */ - *ret_prot = _PAGE_CACHE_UC; - return -1; - } - if (mtrr_type != MTRR_TYPE_UNCACHABLE && - mtrr_type != MTRR_TYPE_WRBACK && - mtrr_type != MTRR_TYPE_WRCOMB) { /* MTRR type unhandled */ - *ret_prot = _PAGE_CACHE_UC; - return -1; - } - pat_type = prot & _PAGE_CACHE_MASK; prot &= (~_PAGE_CACHE_MASK); - /* Currently doing intersection by hand. Optimize it later. */ + /* + * We return the PAT request directly for types where PAT takes + * precedence with respect to MTRR and for UC_MINUS. + * Consistency checks with other PAT requests is done later + * while going through memtype list. + */ if (pat_type == _PAGE_CACHE_WC) { *ret_prot = prot | _PAGE_CACHE_WC; + return 0; } else if (pat_type == _PAGE_CACHE_UC_MINUS) { *ret_prot = prot | _PAGE_CACHE_UC_MINUS; - } else if (pat_type == _PAGE_CACHE_UC || - mtrr_type == MTRR_TYPE_UNCACHABLE) { + return 0; + } else if (pat_type == _PAGE_CACHE_UC) { + *ret_prot = prot | _PAGE_CACHE_UC; + return 0; + } + + /* + * Look for MTRR hint to get the effective type in case where PAT + * request is for WB. + */ + mtrr_type = mtrr_type_lookup(start, end); + + if (mtrr_type == MTRR_TYPE_UNCACHABLE) { *ret_prot = prot | _PAGE_CACHE_UC; } else if (mtrr_type == MTRR_TYPE_WRCOMB) { *ret_prot = prot | _PAGE_CACHE_WC; @@ -233,14 +234,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (req_type == -1) { /* - * Special case where caller wants to inherit from mtrr or - * existing pat mapping, defaulting to UC_MINUS in case of - * no match. + * Call mtrr_lookup to get the type hint. This is an + * optimization for /dev/mem mmap'ers into WB memory (BIOS + * tools and ACPI tools). Use WB request for WB memory and use + * UC_MINUS otherwise. */ u8 mtrr_type = mtrr_type_lookup(start, end); - if (mtrr_type == 0xFE) { /* MTRR match error */ - err = -1; - } if (mtrr_type == MTRR_TYPE_WRBACK) { req_type = _PAGE_CACHE_WB; -- cgit v1.2.3 From be524fb96081e9e511d993ebf39b05a32b19476e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 29 May 2008 00:01:28 -0700 Subject: x86: section mismatch fix Fix this: WARNING: vmlinux.o(.text+0x114bb): Section mismatch in reference from the function nopat() to the function .cpuinit.text:pat_disable() The function nopat() references the function __cpuinit pat_disable(). This is often because nopat lacks a __cpuinit annotation or the annotation of pat_disable is wrong. Reported-by: "Fabio Comolli" Cc: Sam Ravnborg Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 183fdd36d3bd..06b7a1c90fb8 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -34,7 +34,7 @@ void __cpuinit pat_disable(char *reason) printk(KERN_INFO "%s\n", reason); } -static int nopat(char *str) +static int __init nopat(char *str) { pat_disable("PAT support disabled."); return 0; -- cgit v1.2.3 From cd76374e9de4501acc74f833dc6cb5e7a5dca115 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 29 May 2008 00:30:21 -0700 Subject: suspend-vs-iommu: prevent suspend if we could not resume iommu/gart support misses suspend/resume code, which can do bad stuff, including memory corruption on resume. Prevent system suspend in case we would be unable to resume. Signed-off-by: Pavel Machek Tested-by: Patrick Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c07455d1695f..aa8ec928caa8 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -548,6 +549,28 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) return aper_base; } +static int gart_resume(struct sys_device *dev) +{ + return 0; +} + +static int gart_suspend(struct sys_device *dev, pm_message_t state) +{ + return -EINVAL; +} + +static struct sysdev_class gart_sysdev_class = { + .name = "gart", + .suspend = gart_suspend, + .resume = gart_resume, + +}; + +static struct sys_device device_gart = { + .id = 0, + .cls = &gart_sysdev_class, +}; + /* * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. @@ -558,7 +581,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - int i; + int i, error; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; @@ -606,6 +629,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info) pci_write_config_dword(dev, 0x90, ctl); } + + error = sysdev_class_register(&gart_sysdev_class); + if (!error) + error = sysdev_register(&device_gart); + if (error) + panic("Could not register gart_sysdev -- would corrupt data on next suspend"); flush_gart(); printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", -- cgit v1.2.3 From 870568b39064cab2dd971fe57969916036982862 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 2 Jun 2008 15:57:27 -0700 Subject: x86, fpu: fix CONFIG_PREEMPT=y corruption of application's FPU stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jürgen Mell reported an FPU state corruption bug under CONFIG_PREEMPT, and bisected it to commit v2.6.19-1363-gacc2076, "i386: add sleazy FPU optimization". Add tsk_used_math() checks to prevent calling math_state_restore() which can sleep in the case of !tsk_used_math(). This prevents making a blocking call in __switch_to(). Apparently "fpu_counter > 5" check is not enough, as in some signal handling and fork/exec scenarios, fpu_counter > 5 and !tsk_used_math() is possible. It's a side effect though. This is the failing scenario: process 'A' in save_i387_ia32() just after clear_used_math() Got an interrupt and pre-empted out. At the next context switch to process 'A' again, kernel tries to restore the math state proactively and sees a fpu_counter > 0 and !tsk_used_math() This results in init_fpu() during the __switch_to()'s math_state_restore() And resulting in fpu corruption which will be saved/restored (save_i387_fxsave and restore_i387_fxsave) during the remaining part of the signal handling after the context switch. Bisected-by: Jürgen Mell Signed-off-by: Suresh Siddha Tested-by: Jürgen Mell Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- arch/x86/kernel/process_32.c | 5 ++++- arch/x86/kernel/process_64.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8476dfbb60d..6d5483356e74 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -649,8 +649,11 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now + * + * tsk_used_math() checks prevent calling math_state_restore(), + * which can sleep in the case of !tsk_used_math() */ - if (next_p->fpu_counter > 5) + if (tsk_used_math(next_p) && next_p->fpu_counter > 5) math_state_restore(); /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e2319f39988b..ac54ff56df80 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -658,8 +658,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now + * + * tsk_used_math() checks prevent calling math_state_restore(), + * which can sleep in the case of !tsk_used_math() */ - if (next_p->fpu_counter>5) + if (tsk_used_math(next_p) && next_p->fpu_counter > 5) math_state_restore(); return prev_p; } -- cgit v1.2.3 From 16104b5504fa8be130f7f127a5a1c7dd774efc44 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Thu, 5 Jun 2008 22:47:13 +0200 Subject: x86: fix CONFIG_NONPROMISC_DEVMEM prompt and help text Here is an attempt to translate the prompt and help text into something which is legible and, as a bonus, correct. Signed-off-by: Stefan Richter Signed-off-by: Linus Torvalds --- arch/x86/Kconfig.debug | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index ac1e31ba4795..18363374d51a 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -6,15 +6,19 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" config NONPROMISC_DEVMEM - bool "Disable promiscuous /dev/mem" + bool "Filter access to /dev/mem" help - The /dev/mem file by default only allows userspace access to PCI - space and the BIOS code and data regions. This is sufficient for - dosemu and X and all common users of /dev/mem. With this config - option, you allow userspace access to all of memory, including - kernel and userspace memory. Accidental access to this is - obviously disasterous, but specific access can be used by people - debugging the kernel. + If this option is left off, you allow userspace access to all + of memory, including kernel and userspace memory. Accidental + access to this is obviously disastrous, but specific access can + be used by people debugging the kernel. + + If this option is switched on, the /dev/mem file only allows + userspace access to PCI space and the BIOS code and data regions. + This is sufficient for dosemu and X and all common users of + /dev/mem. + + If in doubt, say Y. config EARLY_PRINTK bool "Early printk" if EMBEDDED -- cgit v1.2.3 From 2bdd1b031b200d55c2512c8d7e0e9bdcf85d011f Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Thu, 5 Jun 2008 14:14:41 -0700 Subject: PCI/x86: fix up PCI stuff so that PCI_GOANY supports OLPC Previously, one would have to specifically choose CONFIG_OLPC and CONFIG_PCI_GOOLPC in order to enable PCI_OLPC. That doesn't really work for distro kernels, so this patch allows one to choose CONFIG_OLPC and CONFIG_PCI_GOANY in order to build in OLPC support in a generic kernel (as requested by Robert Millan). This also moves GOOLPC before GOANY in the menuconfig list. Finally, make pci_access_init return early if we detect OLPC hardware. There's no need to continue probing stuff, and pci_pcbios_init specifically trashes our settings (we didn't run into that before because PCI_GOANY wasn't supported). Signed-off-by: Andres Salomon Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Jesse Barnes --- arch/x86/Kconfig | 11 +++++------ arch/x86/pci/init.c | 3 ++- arch/x86/pci/olpc.c | 5 +++-- arch/x86/pci/pci.h | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dcbec34154cf..52e18e6d2ba0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1508,13 +1508,13 @@ config PCI_GOMMCONFIG config PCI_GODIRECT bool "Direct" -config PCI_GOANY - bool "Any" - config PCI_GOOLPC bool "OLPC" depends on OLPC +config PCI_GOANY + bool "Any" + endchoice config PCI_BIOS @@ -1531,9 +1531,8 @@ config PCI_MMCONFIG depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) config PCI_OLPC - bool - depends on PCI && PCI_GOOLPC - default y + def_bool y + depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY) config PCI_DOMAINS def_bool y diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index e70b9c57b88e..b821f4462d99 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -15,7 +15,8 @@ static __init int pci_access_init(void) pci_mmcfg_early_init(); #ifdef CONFIG_PCI_OLPC - pci_olpc_init(); + if (!pci_olpc_init()) + return 0; /* skip additional checks if it's an XO */ #endif #ifdef CONFIG_PCI_BIOS pci_pcbios_init(); diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index 5e7636558c02..e11e9e803d5f 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -302,12 +302,13 @@ static struct pci_raw_ops pci_olpc_conf = { .write = pci_olpc_write, }; -void __init pci_olpc_init(void) +int __init pci_olpc_init(void) { if (!machine_is_olpc() || olpc_has_vsa()) - return; + return -ENODEV; printk(KERN_INFO "PCI: Using configuration type OLPC\n"); raw_pci_ops = &pci_olpc_conf; is_lx = is_geode_lx(); + return 0; } diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index f3972b12c60a..720c4c554534 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -101,7 +101,7 @@ extern struct pci_raw_ops pci_direct_conf1; extern int pci_direct_probe(void); extern void pci_direct_init(int type); extern void pci_pcbios_init(void); -extern void pci_olpc_init(void); +extern int pci_olpc_init(void); /* pci-mmconfig.c */ -- cgit v1.2.3 From 9f67fd5db50566728996b0115a08c83d4f902cb3 Mon Sep 17 00:00:00 2001 From: Bertram Felgenhauer Date: Thu, 5 Jun 2008 15:31:22 -0700 Subject: x86/PCI: add workaround for bug in ASUS A7V600 BIOS (rev 1005) This BIOS claims the VIA 8237 south bridge to be compatible with VIA 586, which it is not. Without this patch, I get the following warning while booting, among others, | PCI: Using IRQ router VIA [1106/3227] at 0000:00:11.0 | ------------[ cut here ]------------ | WARNING: at arch/x86/pci/irq.c:265 pirq_via586_get+0x4a/0x60() | Modules linked in: | Pid: 1, comm: swapper Not tainted 2.6.26-rc4-00015-g1ec7d99 #1 | [] warn_on_slowpath+0x54/0x70 | [] ? vt_console_print+0x210/0x2b0 | [] ? vt_console_print+0x0/0x2b0 | [] ? __call_console_drivers+0x43/0x60 | [] ? _call_console_drivers+0x52/0x80 | [] ? release_console_sem+0x1c9/0x200 | [] ? raw_pci_read+0x41/0x70 | [] ? pci_read+0x2f/0x40 | [] pirq_via586_get+0x4a/0x60 | [] ? pirq_via586_get+0x0/0x60 | [] pcibios_lookup_irq+0x15d/0x430 | [] pcibios_irq_init+0x17a/0x3e0 | [] ? kernel_init+0x0/0x250 | [] kernel_init+0x73/0x250 | [] ? pcibios_irq_init+0x0/0x3e0 | [] ? schedule_tail+0x10/0x40 | [] ? ret_from_fork+0x6/0x1c | [] ? kernel_init+0x0/0x250 | [] ? kernel_init+0x0/0x250 | [] kernel_thread_helper+0x7/0x1c | ======================= | ---[ end trace 4eaa2a86a8e2da22 ]--- and IRQ trouble later, | irq 10: nobody cared (try booting with the "irqpoll" option) Now that's an VIA 8237 chip, so pirq_via586_get shouldn't be called at all; adding this workaround to via_router_probe() fixes the problem for me. Amazingly I have a 2.6.23.8 kernel that somehow works fine ... I'll never understand why. Signed-off-by: Bertram Felgenhauer Cc: Jesse Barnes Acked-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0908fca901bf..ca8df9c260bc 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -621,6 +621,13 @@ static __init int via_router_probe(struct irq_router *r, */ device = PCI_DEVICE_ID_VIA_8235; break; + case PCI_DEVICE_ID_VIA_8237: + /** + * Asus a7v600 bios wrongly reports 8237 + * as 586-compatible + */ + device = PCI_DEVICE_ID_VIA_8237; + break; } } -- cgit v1.2.3 From 33e3885de25148e00595c4dd808d6eb15db2edcf Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 21 May 2008 15:34:25 +0300 Subject: KVM: x86 emulator: fix hypercall return value on AMD The hypercall instructions on Intel and AMD are different. KVM allows the guest to choose one or the other (the default is Intel), and if the guest chooses incorrectly, KVM will patch it at runtime to select the correct instruction. This allows live migration between Intel and AMD machines. This patching occurs in the x86 emulator. The current code also executes the hypercall. Unfortunately, the tail end of the x86 emulator code also executes, overwriting the return value of the hypercall with the original contents of rax (which happens to be the hypercall number). Fix not by executing the hypercall in the emulator context; instead let the guest reissue the patched instruction and execute the hypercall via the normal path. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 8a96320ab071..932f216d890c 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1727,7 +1727,8 @@ twobyte_insn: if (rc) goto done; - kvm_emulate_hypercall(ctxt->vcpu); + /* Let the processor re-execute the fixed hypercall */ + c->eip = ctxt->vcpu->arch.rip; /* Disable writeback. */ c->dst.type = OP_NONE; break; -- cgit v1.2.3 From 2f5997140f22f68f6390c49941150d3fa8a95cb7 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 27 May 2008 12:10:20 -0300 Subject: KVM: migrate PIT timer Migrate the PIT timer to the physical CPU which vcpu0 is scheduled on, similarly to what is done for the LAPIC timers, otherwise PIT interrupts will be delayed until an unrelated event causes an exit. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 14 +++++++++++++- arch/x86/kvm/irq.c | 6 ++++++ arch/x86/kvm/irq.h | 2 ++ arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 6 files changed, 24 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 7c077a9d9777..f2f5d260874e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,7 +200,6 @@ int __pit_timer_fn(struct kvm_kpit_state *ps) atomic_inc(&pt->pending); smp_mb__after_atomic_inc(); - /* FIXME: handle case where the guest is in guest mode */ if (vcpu0 && waitqueue_active(&vcpu0->wq)) { vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; wake_up_interruptible(&vcpu0->wq); @@ -237,6 +236,19 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } +void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_pit *pit = vcpu->kvm->arch.vpit; + struct hrtimer *timer; + + if (vcpu->vcpu_id != 0 || !pit) + return; + + timer = &pit->pit_state.pit_timer.timer; + if (hrtimer_cancel(timer)) + hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); +} + static void destroy_pit_timer(struct kvm_kpit_timer *pt) { pr_debug("pit: execute del timer!\n"); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index ce1f583459b1..76d736b5f664 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -94,3 +94,9 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_timer_intr_post); + +void __kvm_migrate_timers(struct kvm_vcpu *vcpu) +{ + __kvm_migrate_apic_timer(vcpu); + __kvm_migrate_pit_timer(vcpu); +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 1802134b836f..2a15be2275c0 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -84,6 +84,8 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); +void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); +void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int pit_has_pending_timer(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ab22615eee89..6b0d5fa5bab3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -688,7 +688,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) delta = vcpu->arch.host_tsc - tsc_this; svm->vmcb->control.tsc_offset += delta; vcpu->cpu = cpu; - kvm_migrate_apic_timer(vcpu); + kvm_migrate_timers(vcpu); } for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bfe4db11989c..96445f341519 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -608,7 +608,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->cpu != cpu) { vcpu_clear(vmx); - kvm_migrate_apic_timer(vcpu); + kvm_migrate_timers(vcpu); vpid_sync_vcpu_all(vmx); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 21338bdb28ff..00acf1301a15 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2758,7 +2758,7 @@ again: if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) - __kvm_migrate_apic_timer(vcpu); + __kvm_migrate_timers(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests)) { kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; -- cgit v1.2.3 From e693d71b46e64536581bf4884434fc1b8797e96f Mon Sep 17 00:00:00 2001 From: Eli Collins Date: Sun, 1 Jun 2008 20:24:40 -0700 Subject: KVM: VMX: Clear CR4.VMXE in hardware_disable Clear CR4.VMXE in hardware_disable. There's no reason to leave it set after doing a VMXOFF. VMware Workstation 6.5 checks CR4.VMXE as a proxy for whether the CPU is in VMX mode, so leaving VMXE set means we'll refuse to power on. With this change the user can power on after unloading the kvm-intel module. I tested on kvm-67 and kvm-69. Signed-off-by: Eli Collins Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 96445f341519..02efbe75f317 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1036,6 +1036,7 @@ static void hardware_enable(void *garbage) static void hardware_disable(void *garbage) { asm volatile (ASM_VMX_VMXOFF : : : "cc"); + write_cr4(read_cr4() & ~X86_CR4_VMXE); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, -- cgit v1.2.3 From 8d2d73b9a5c35f2c6abf427afba7888cfc4cc65d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 4 Jun 2008 18:42:24 +0300 Subject: KVM: MMU: reschedule during shadow teardown Shadows for large guests can take a long time to tear down, so reschedule occasionally to avoid softlockup warnings. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7246b60afb96..c2fd6a4a58c3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1858,6 +1858,7 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu) sp = container_of(vcpu->kvm->arch.active_mmu_pages.next, struct kvm_mmu_page, link); kvm_mmu_zap_page(vcpu->kvm, sp); + cond_resched(); } free_page((unsigned long)vcpu->arch.mmu.pae_root); } -- cgit v1.2.3 From ebb0e6264c7a65c51feb3575e9edb58eab0cf469 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 May 2008 16:21:58 +0300 Subject: KVM: MMU: Fix printk() format string Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 156fe10288ae..934c7b619396 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -418,7 +418,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* mmio */ if (is_error_pfn(pfn)) { - pgprintk("gfn %x is mmio\n", walker.gfn); + pgprintk("gfn %lx is mmio\n", walker.gfn); kvm_release_pfn_clean(pfn); return 1; } -- cgit v1.2.3 From 3c9155106d589584f67b026ec444e69c4a68d7dc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 May 2008 16:21:13 +0300 Subject: KVM: MMU: Fix is_empty_shadow_page() check The check is only looking at one of two possible empty ptes. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c2fd6a4a58c3..ee3f53098f0c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -658,7 +658,7 @@ static int is_empty_shadow_page(u64 *spt) u64 *end; for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if (*pos != shadow_trap_nonpresent_pte) { + if (is_shadow_present_pte(*pos)) { printk(KERN_ERR "%s: %p %llx\n", __func__, pos, *pos); return 0; -- cgit v1.2.3 From b7f09ae583c49d28b2796d2fa5893dcf822e3a10 Mon Sep 17 00:00:00 2001 From: Miquel van Smoorenburg Date: Thu, 5 Jun 2008 18:14:44 +0200 Subject: x86, pci-dma.c: don't always add __GFP_NORETRY to gfp Currently arch/x86/kernel/pci-dma.c always adds __GFP_NORETRY to the allocation flags, because it wants to be reasonably sure not to deadlock when calling alloc_pages(). But really that should only be done in two cases: - when allocating memory in the lower 16 MB DMA zone. If there's no free memory there, waiting or OOM killing is of no use - when optimistically trying an allocation in the DMA32 zone when dma_mask < DMA_32BIT_MASK hoping that the allocation happens to fall within the limits of the dma_mask Also blindly adding __GFP_NORETRY to the the gfp variable might not be a good idea since we then also use it when calling dma_ops->alloc_coherent(). Clearing it might also not be a good idea, dma_alloc_coherent()'s caller might have set it on purpose. The gfp variable should not be clobbered. [ mingo@elte.hu: converted to delta patch ontop of previous version. ] Signed-off-by: Miquel van Smoorenburg Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 069e843f0b93..dc00a1331ace 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -378,6 +378,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, struct page *page; unsigned long dma_mask = 0; dma_addr_t bus; + int noretry = 0; /* ignore region specifiers */ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); @@ -397,19 +398,25 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, if (dev->dma_mask == NULL) return NULL; + /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ + if (gfp & __GFP_DMA) + noretry = 1; + #ifdef CONFIG_X86_64 /* Why <=? Even when the mask is smaller than 4GB it is often larger than 16MB and in this case we have a chance of finding fitting memory in the next higher zone first. If not retry with true GFP_DMA. -AK */ - if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) + if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) { gfp |= GFP_DMA32; + if (dma_mask < DMA_32BIT_MASK) + noretry = 1; + } #endif again: - /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ page = dma_alloc_pages(dev, - (gfp & GFP_DMA) ? gfp | __GFP_NORETRY : gfp, get_order(size)); + noretry ? gfp | __GFP_NORETRY : gfp, get_order(size)); if (page == NULL) return NULL; -- cgit v1.2.3 From 39b8931b5cad9a7cbcd2394a40a088311e783a82 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 9 Jun 2008 16:48:18 -0700 Subject: ACPI: handle invalid ACPI SLIT table This is a SLIT sanity checking patch. It moves slit_valid() function to generic ACPI code and does sanity checking for both x86 and ia64. It sets up node_distance with LOCAL_DISTANCE and REMOTE_DISTANCE when hitting invalid SLIT table on ia64. It also cleans up unused variable localities in acpi_parse_slit() on x86. Signed-off-by: Fenghua Yu Signed-off-by: Andrew Morton Signed-off-by: Len Brown --- arch/x86/mm/srat_64.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 3890234e5b26..99649dccad28 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -97,36 +97,9 @@ static __init inline int srat_disabled(void) return numa_off || acpi_numa < 0; } -/* - * A lot of BIOS fill in 10 (= no distance) everywhere. This messes - * up the NUMA heuristics which wants the local node to have a smaller - * distance than the others. - * Do some quick checks here and only use the SLIT if it passes. - */ -static __init int slit_valid(struct acpi_table_slit *slit) -{ - int i, j; - int d = slit->locality_count; - for (i = 0; i < d; i++) { - for (j = 0; j < d; j++) { - u8 val = slit->entry[d*i + j]; - if (i == j) { - if (val != LOCAL_DISTANCE) - return 0; - } else if (val <= LOCAL_DISTANCE) - return 0; - } - } - return 1; -} - /* Callback for SLIT parsing */ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { - if (!slit_valid(slit)) { - printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); - return; - } acpi_slit = slit; } -- cgit v1.2.3 From f595ec964daf7f99668039d7303ddedd09a75142 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Jun 2008 10:47:56 +0200 Subject: common implementation of iterative div/mod We have a few instances of the open-coded iterative div/mod loop, used when we don't expcet the dividend to be much bigger than the divisor. Unfortunately modern gcc's have the tendency to strength "reduce" this into a full mod operation, which isn't necessarily any faster, and even if it were, doesn't exist if gcc implements it in libgcc. The workaround is to put a dummy asm statement in the loop to prevent gcc from performing the transformation. This patch creates a single implementation of this loop, and uses it to replace the open-coded versions I know about. Signed-off-by: Jeremy Fitzhardinge Cc: Andrew Morton Cc: john stultz Cc: Segher Boessenkool Cc: Christian Kujau Cc: Robert Hancock Signed-off-by: Ingo Molnar --- arch/x86/xen/time.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c39e1a5aa241..52b2e3856980 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -150,11 +151,7 @@ static void do_stolen_accounting(void) if (stolen < 0) stolen = 0; - ticks = 0; - while (stolen >= NS_PER_TICK) { - ticks++; - stolen -= NS_PER_TICK; - } + ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); __get_cpu_var(residual_stolen) = stolen; account_steal_time(NULL, ticks); @@ -166,11 +163,7 @@ static void do_stolen_accounting(void) if (blocked < 0) blocked = 0; - ticks = 0; - while (blocked >= NS_PER_TICK) { - ticks++; - blocked -= NS_PER_TICK; - } + ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); __get_cpu_var(residual_blocked) = blocked; account_steal_time(idle_task(smp_processor_id()), ticks); } -- cgit v1.2.3 From 3703f39965a197ebd91743fc38d0f640606b8da3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 4 Jun 2008 18:13:37 +0200 Subject: geode: fix modular build -tip testing found this build bug: MODPOST 331 modules ERROR: "geode_mfgpt_toggle_event" [drivers/watchdog/geodewdt.ko] undefined! ERROR: "geode_mfgpt_alloc_timer" [drivers/watchdog/geodewdt.ko] undefined! make[1]: *** [__modpost] Error 1 make: *** [modules] Error 2 with this config: http://redhat.com/~mingo/misc/config-Wed_Jun__4_18_01_59_CEST_2008.bad export those symbols. Signed-off-by: Ingo Molnar --- arch/x86/kernel/mfgpt_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 3cad17fe026b..07c0f828f488 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -155,6 +155,7 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) wrmsr(msr, value, dummy); return 0; } +EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) { @@ -222,6 +223,7 @@ int geode_mfgpt_alloc_timer(int timer, int domain) /* No timers available - too bad */ return -1; } +EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer); #ifdef CONFIG_GEODE_MFGPT_TIMER -- cgit v1.2.3 From b29c701deacd5d24453127c37ed77ef851c53b8b Mon Sep 17 00:00:00 2001 From: Henry Nestler Date: Mon, 12 May 2008 15:44:39 +0200 Subject: x86: fix endless page faults in mount_block_root for Linux 2.6 Page faults in kernel address space between PAGE_OFFSET up to VMALLOC_START should not try to map as vmalloc. Fix rarely endless page faults inside mount_block_root for root filesystem at boot time. All 32bit kernels up to 2.6.25 can fail into this hole. I can not present this under native linux kernel. I see, that the 64bit has fixed the problem. I copied the same lines into 32bit part. Recorded debugs are from coLinux kernel 2.6.22.18 (virtualisation): http://www.henrynestler.com/colinux/testing/pfn-check-0.7.3/20080410-antinx/bug16-recursive-page-fault-endless.txt The physicaly memory was trimmed down to 192MB to better catch the bug. More memory gets the bug more rarely. Details, how every x86 32bit system can fail: Start from "mount_block_root", http://lxr.linux.no/linux/init/do_mounts.c#L297 There the variable "fs_names" got one memory page with 4096 bytes. Variable "p" walks through the existing file system types. The first string is no problem. But, with the second loop in mount_block_root the offset of "p" is not at beginning of page, the offset is for example +9, if "reiserfs" is the first in list. Than calls do_mount_root, and lands in sys_mount. Remember: Variable "type_page" contains now "fs_type+9" and not contains a full page. The sys_mount copies 4096 bytes with function "exact_copy_from_user()": http://lxr.linux.no/linux/fs/namespace.c#L1540 Mostly exist pages after the buffer "fs_names+4096+9" and the page fault handler was not called. No problem. In the case, if the page after "fs_names+4096" is not mapped, the page fault handler was called from http://lxr.linux.no/linux/fs/namespace.c#L1320 The do_page_fault gots an address 0xc03b4000. It's kernel address, address >= TASK_SIZE, but not from vmalloc! It's from "__getname()" alias "kmem_cache_alloc". The "error_code" is 0. "vmalloc_fault" will be call: http://lxr.linux.no/linux/arch/i386/mm/fault.c#L332 "vmalloc_fault" tryed to find the physical page for a non existing virtual memory area. The macro "pte_present" in vmalloc_fault() got a next page fault for 0xc0000ed0 at: http://lxr.linux.no/linux/arch/i386/mm/fault.c#L282 No PTE exist for such virtual address. The page fault handler was trying to sync the physical page for the PTE lockup. This called vmalloc_fault() again for address 0xc000000, and that also was not existing. The endless began... In normal case the cpu would still loop with disabled interrrupts. Under coLinux this was catched by a stack overflow inside printk debugs. Signed-off-by: Henry Nestler Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/fault.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fd7e1798c75a..8bcb6f40ccb6 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -497,6 +497,11 @@ static int vmalloc_fault(unsigned long address) unsigned long pgd_paddr; pmd_t *pmd_k; pte_t *pte_k; + + /* Make sure we are in vmalloc area */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + /* * Synchronize this task's top level page-table * with the 'reference' page table. -- cgit v1.2.3 From 86b2b70e156203149c3861455feec54bc4906e6d Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Mon, 2 Jun 2008 17:21:06 -0400 Subject: x86: fix asm warning in head_32.S On Mon, May 19, 2008 at 04:10:02PM -0700, Linus Torvalds wrote: > It also causes these warnings on 32-bit PAE: > > AS arch/x86/kernel/head_32.o > arch/x86/kernel/head_32.S: Assembler messages: > arch/x86/kernel/head_32.S:225: Warning: left operand is a bignum; integer 0 assumed > arch/x86/kernel/head_32.S:609: Warning: left operand is a bignum; integer 0 assumed > > and I do not see why (the end result seems to be identical). Fix head_32.S gcc bignum warnings when CONFIG_PAE=y. arch/x86/kernel/head_32.S: Assembler messages: arch/x86/kernel/head_32.S:225: Warning: left operand is a bignum; integer 0 assumed arch/x86/kernel/head_32.S:609: Warning: left operand is a bignum; integer 0 assumed The assembler was stumbling over the 64-bit constant 0x100000000 in the KPMDS #define. Testing: a cmp(1) on head_32.o before and after shows the binary is unchanged. Signed-off-by: Joe Korty Cc: Theodore Tso Cc: Gabriel C Cc: Keith Packard Cc: "Pallipadi Venkatesh" Cc: Eric Anholt Cc: "Siddha Suresh B" Cc: bugme-daemon@bugzilla.kernel.org Cc: airlied@linux.ie Cc: "Barnes Jesse" Cc: Jeremy Fitzhardinge Cc: Andrew Morton Cc: "Rafael J. Wysocki" Cc: Linus Torvalds Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b2cc73768a9d..f7357cc0162c 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -189,7 +189,7 @@ default_entry: * this stage. */ -#define KPMDS ((0x100000000-__PAGE_OFFSET) >> 30) /* Number of kernel PMDs */ +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ xorl %ebx,%ebx /* %ebx is kept at zero */ -- cgit v1.2.3 From 0b6a39f7ebcb1c82587ce35b401c513eed41ac5c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jun 2008 13:29:43 +0200 Subject: Revert "x86: fix ioapic bug again" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 6e908947b4995bc0e551a8257c586d5c3e428201. Németh Márton reported: | there is a problem in 2.6.26-rc3 which was not there in case of | 2.6.25: the CPU wakes up ~90,000 times per sec instead of ~60 per sec. | | I also "git bisected" the problem, the result is: | | 6e908947b4995bc0e551a8257c586d5c3e428201 is first bad commit | commit 6e908947b4995bc0e551a8257c586d5c3e428201 | Author: Ingo Molnar | Date: Fri Mar 21 14:32:36 2008 +0100 | | x86: fix ioapic bug again the original problem is fixed by Maciej W. Rozycki in the tip/x86/apic branch (confirmed by Márton), but those changes are too intrusive for v2.6.26 so we'll go for the less intrusive (repeated) revert now. Reported-and-bisected-by: Németh Márton Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 12 ++---------- arch/x86/kernel/nmi_32.c | 9 ++------- 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a40d54fc1fdd..4dc8600d9d20 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2130,14 +2130,10 @@ static inline void __init check_timer(void) { int apic1, pin1, apic2, pin2; int vector; - unsigned int ver; unsigned long flags; local_irq_save(flags); - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - /* * get/set the timer IRQ vector: */ @@ -2150,15 +2146,11 @@ static inline void __init check_timer(void) * mode for the 8259A whenever interrupts are routed * through I/O APICs. Also IRQ0 has to be enabled in * the 8259A which implies the virtual wire has to be - * disabled in the local APIC. Finally timer interrupts - * need to be acknowledged manually in the 8259A for - * timer_interrupt() and for the i82489DX when using - * the NMI watchdog. + * disabled in the local APIC. */ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); - timer_ack = !cpu_has_tsc; - timer_ack |= (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); + timer_ack = 1; if (timer_over_8254 > 0) enable_8259A_irq(0); diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 11b14bbaa61e..84160f74eeb0 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -26,7 +26,6 @@ #include #include -#include #include "mach_traps.h" @@ -82,7 +81,7 @@ int __init check_nmi_watchdog(void) prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); if (!prev_nmi_count) - goto error; + return -1; printk(KERN_INFO "Testing NMI watchdog ... "); @@ -119,7 +118,7 @@ int __init check_nmi_watchdog(void) if (!atomic_read(&nmi_active)) { kfree(prev_nmi_count); atomic_set(&nmi_active, -1); - goto error; + return -1; } printk("OK.\n"); @@ -130,10 +129,6 @@ int __init check_nmi_watchdog(void) kfree(prev_nmi_count); return 0; -error: - timer_ack = !cpu_has_tsc; - - return -1; } static int __init setup_nmi_watchdog(char *str) -- cgit v1.2.3 From 52aaa12fbe786c90396f1b11ec39c924ccdd8fd5 Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Thu, 5 Jun 2008 19:14:15 +0530 Subject: x86: fix unused variable 'loops' warning in arch/x86/boot/a20.c Following patch fixes the below warning message : arch/x86/boot/a20.c:118: warning: unused variable 'loops' Signed-off-by : Manish Katiyar Signed-off-by: Ingo Molnar --- arch/x86/boot/a20.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c index 90943f83e84d..e01aafd03bde 100644 --- a/arch/x86/boot/a20.c +++ b/arch/x86/boot/a20.c @@ -115,8 +115,6 @@ static void enable_a20_fast(void) int enable_a20(void) { - int loops = A20_ENABLE_LOOPS; - #if defined(CONFIG_X86_ELAN) /* Elan croaks if we try to touch the KBC */ enable_a20_fast(); @@ -128,6 +126,7 @@ int enable_a20(void) enable_a20_kbc(); return 0; #else + int loops = A20_ENABLE_LOOPS; while (loops--) { /* First, check to see if A20 is already enabled (legacy free, etc.) */ -- cgit v1.2.3 From e32e58a96de4ac35a03349db2ab69f263ded958f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 Jun 2008 10:14:08 +0200 Subject: x86: fix lockdep warning during suspend-to-ram Andrew Morton wrote: > I've been seeing the below for a long time during suspend-to-ram on the Vaio. > > > PM: Syncing filesystems ... done. > PM: Preparing system for mem sleep > Freezing user space processes ... <4>------------[ cut here ]------------ > WARNING: at kernel/lockdep.c:2658 check_flags+0x4c/0x127() > Modules linked in: i915 drm ipw2200 sonypi ipv6 autofs4 hidp l2cap bluetooth sunrpc nf_conntrack_netbios_ns ipt_REJECT nf_conntrack_ipv4 xt_state nf_conntrack xt_tcpudp iptable_filter ip_tables x_tables acpi_cpufreq nvram ohci1394 ieee1394 ehci_hcd uhci_hcd sg joydev snd_hda_intel snd_seq_dummy sr_mod snd_seq_oss cdrom snd_seq_midi_event snd_seq snd_seq_device snd_pcm_oss snd_mixer_oss ieee80211 pcspkr ieee80211_crypt snd_pcm i2c_i801 snd_timer i2c_core ide_pci_generic piix snd soundcore snd_page_alloc button ext3 jbd ide_disk ide_core [last unloaded: ipw2200] > Pid: 3250, comm: zsh Not tainted 2.6.26-rc5 #1 > [] warn_on_slowpath+0x41/0x6d > [] ? native_sched_clock+0x82/0x96 > [] ? mark_held_locks+0x41/0x5c > [] ? _spin_unlock_irqrestore+0x36/0x58 > [] ? trace_hardirqs_on+0xe6/0x10d > [] ? __lock_acquire+0xae3/0xb2b > [] ? schedule+0x39b/0x3b4 > [] check_flags+0x4c/0x127 > [] lock_acquire+0x3a/0x86 > [] _spin_lock+0x26/0x53 > [] ? refrigerator+0x13/0xc3 > [] refrigerator+0x13/0xc3 > [] get_signal_to_deliver+0x3c/0x31e > [] do_notify_resume+0x91/0x6ee > [] ? lock_release_holdtime+0x50/0x56 > [] ? _spin_unlock_irqrestore+0x36/0x58 > [] ? read_chan+0x0/0x58c > [] ? trace_hardirqs_on+0xe6/0x10d > [] ? _spin_unlock_irqrestore+0x42/0x58 > [] ? tty_ldisc_deref+0x5c/0x63 > [] ? tty_read+0x66/0x98 > [] ? audit_syscall_exit+0x2aa/0x2c5 > [] ? do_syscall_trace+0x6b/0x16f > [] work_notifysig+0x13/0x1b > ======================= > ---[ end trace 25b49fe59a25afa5 ]--- > possible reason: unannotated irqs-off. > irq event stamp: 58919 > hardirqs last enabled at (58919): [] syscall_exit_work+0x11/0x26 Joy - I so love entry.S Best I can make of it: syscall_exit_work resume_userspace DISABLE_INTERRUPTS (no TRACE_IRQS_OFF) work_pending work_notifysig do_notify_resume() do_signal() get_signal_to_deliver() try_to_freeze() refrigerator() task_lock() -> check_flags() -> BANG The normal path is: syscall_exit_work resume_userspace DISABLE_INTERRUPTS restore_all TRACE_IRQS_IRET iret No idea why that would not warn.. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2a609dc3271c..c778e4fa55a2 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -248,6 +248,7 @@ ENTRY(resume_userspace) DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done on # int/exception return? -- cgit v1.2.3 From eb53e9f3ea859a6d59c37b500593b970aa8562e6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 7 Jun 2008 17:18:40 +0100 Subject: x86: fix an incompatible pointer type warning on 64-bit compilations Fix an incompatible pointer type warning on x86_64 compilations. early_memtest() is passing a u64* to find_e820_area_size() which is expecting an unsigned long. Change t_start and t_size to unsigned long as those are also 64-bit types on x88_64. Signed-off-by: David Howells Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 156e6d7b0e32..998a06ea5f7d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -506,7 +506,7 @@ early_param("memtest", parse_memtest); static void __init early_memtest(unsigned long start, unsigned long end) { - u64 t_start, t_size; + unsigned long t_start, t_size; unsigned pattern; if (!memtest_pattern) @@ -525,7 +525,7 @@ static void __init early_memtest(unsigned long start, unsigned long end) if (t_start + t_size > end) t_size = end - t_start; - printk(KERN_CONT "\n %016llx - %016llx pattern %d", + printk(KERN_CONT "\n %016lx - %016lx pattern %d", t_start, t_start + t_size, pattern); memtest(t_start, t_size, pattern); -- cgit v1.2.3 From 4461145ef1be92851c230f858f6b6f457c99670f Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 12 Jun 2008 08:55:59 +0200 Subject: x86, lockdep: fix "WARNING: at kernel/lockdep.c:2658 check_flags+0x4c/0x128()" Alessandro Suardi reported: > Recently upgraded my FC6 desktop to Fedora 9; with the > latest nautilus RPM updates my VNC session went nuts > with nautilus pegging the CPU for everything that breathed. > > I now reverted to an earlier nautilus package, but during > the peak CPU period my kernel spat this: > > [314185.623294] ------------[ cut here ]------------ > [314185.623414] WARNING: at kernel/lockdep.c:2658 check_flags+0x4c/0x128() > [314185.623514] Modules linked in: iptable_filter ip_tables x_tables > sunrpc ipv6 fuse snd_via82xx snd_ac97_codec ac97_bus snd_mpu401_uart > snd_rawmidi via686a hwmon parport_pc sg parport uhci_hcd ehci_hcd > [314185.623924] Pid: 12314, comm: nautilus Not tainted 2.6.26-rc5-git2 #4 > [314185.624021] [] warn_on_slowpath+0x41/0x7b > [314185.624021] [] ? do_page_fault+0x2c1/0x5fd > [314185.624021] [] ? up_read+0x16/0x28 > [314185.624021] [] ? do_page_fault+0x2c1/0x5fd > [314185.624021] [] ? __lock_acquire+0xbb4/0xbc3 > [314185.624021] [] check_flags+0x4c/0x128 > [314185.624021] [] lock_acquire+0x31/0x7d > [314185.624021] [] __atomic_notifier_call_chain+0x30/0x80 > [314185.624021] [] ? __atomic_notifier_call_chain+0x0/0x80 > [314185.624021] [] atomic_notifier_call_chain+0xc/0xe > [314185.624021] [] notify_die+0x2d/0x2f > [314185.624021] [] do_int3+0x1f/0x4d > [314185.624021] [] int3+0x27/0x2c > [314185.624021] ======================= > [314185.624021] ---[ end trace 1923f65a2d7bb246 ]--- > [314185.624021] possible reason: unannotated irqs-off. > [314185.624021] irq event stamp: 488879 > [314185.624021] hardirqs last enabled at (488879): [] > restore_nocheck+0x12/0x15 > [314185.624021] hardirqs last disabled at (488878): [] > work_resched+0x19/0x30 > [314185.624021] softirqs last enabled at (488876): [] > __do_softirq+0xa6/0xac > [314185.624021] softirqs last disabled at (488865): [] > do_softirq+0x57/0xa6 > > I didn't seem to find it with some googling, so here it is. > > I was incidentally ltracing that process to try and find out > what was gulping down that much CPU (sorry, no idea > whether ltrace and the WARNING happened at the same > time or which came first) and: Yeah, this is extremely likely to be the source of the warning. The warning should be harmless, however. > Box is my trusty noname K7-800, 512MB RAM; if there's > anything else useful I might be able to provide, just ask. It would be interesting to see where the int3 comes from. Too bad, lockdep doesn't provide the register dump. The stacktrace also doesn't go further than the int3(), I wonder if this int3 came from userspace? The ltrace readme says "software breakpoints, like gdb", so I guess this is the case. Yep, seems like it. This looks relevant: | commit fb1dac909d94ff807cd833d340c6827c3a957159 | Author: Peter Zijlstra | Date: Wed Jan 16 09:51:59 2008 +0100 | | lockdep: more hardirq annotations for notify_die() I'm attaching a similarly-looking patch for this case (DO_VM86_ERROR), though I suspect it might be missing for the other cases (DO_ERROR/DO_ERROR_INFO) as well. Reported-by: Alessandro Suardi Signed-off-by: Vegard Nossum Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index bde6f63e15d5..08d752de4eee 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -544,6 +544,7 @@ vm86_trap: #define DO_ERROR(trapnr, signr, str, name) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ + trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ -- cgit v1.2.3 From f8a45704f5bd5f037c8e4a75172cab1476fc0447 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Thu, 29 May 2008 21:14:35 -0300 Subject: x86: fix pointer type warning in arch/x86/mm/init_64.c:early_memtest Changed the call to find_e820_area_size to pass u64 instead of unsigned long. Signed-off-by: Kevin Winchester Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 998a06ea5f7d..156e6d7b0e32 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -506,7 +506,7 @@ early_param("memtest", parse_memtest); static void __init early_memtest(unsigned long start, unsigned long end) { - unsigned long t_start, t_size; + u64 t_start, t_size; unsigned pattern; if (!memtest_pattern) @@ -525,7 +525,7 @@ static void __init early_memtest(unsigned long start, unsigned long end) if (t_start + t_size > end) t_size = end - t_start; - printk(KERN_CONT "\n %016lx - %016lx pattern %d", + printk(KERN_CONT "\n %016llx - %016llx pattern %d", t_start, t_start + t_size, pattern); memtest(t_start, t_size, pattern); -- cgit v1.2.3 From 1da2e3d679a8ea2d9e82040359a706da0bd3bef6 Mon Sep 17 00:00:00 2001 From: Stas Sergeev Date: Thu, 12 Jun 2008 15:21:54 -0700 Subject: provide rtc_cmos platform device Recently (around 2.6.25) I've noticed that RTC no longer works for me. It turned out this is because I use pnpacpi=off kernel option to work around the parport_pc bugs. I always did so, but RTC used to work fine in the past, and now it have regressed. The patch fixes the problem by creating the platform device for the RTC when PNP is disabled. This may also help running the PNP-enabled kernel on an older PCs. Signed-off-by: Stas Sergeev Cc: David Brownell Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Bjorn Helgaas Cc: Adam Belay Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/rtc.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 9615eee9b775..05191bbc68b8 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include @@ -197,3 +199,35 @@ unsigned long long native_read_tsc(void) } EXPORT_SYMBOL(native_read_tsc); + +static struct resource rtc_resources[] = { + [0] = { + .start = RTC_PORT(0), + .end = RTC_PORT(1), + .flags = IORESOURCE_IO, + }, + [1] = { + .start = RTC_IRQ, + .end = RTC_IRQ, + .flags = IORESOURCE_IRQ, + } +}; + +static struct platform_device rtc_device = { + .name = "rtc_cmos", + .id = -1, + .resource = rtc_resources, + .num_resources = ARRAY_SIZE(rtc_resources), +}; + +static __init int add_rtc_cmos(void) +{ +#ifdef CONFIG_PNP + if (!pnp_platform_devices) + platform_device_register(&rtc_device); +#else + platform_device_register(&rtc_device); +#endif /* CONFIG_PNP */ + return 0; +} +device_initcall(add_rtc_cmos); -- cgit v1.2.3