From ee7e5516be4f2107535ad5a3d47d9c79f93661a2 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 29 Jun 2008 14:18:46 +0400 Subject: generic: per-device coherent dma allocator Currently x86_32, sh and cris-v32 provide per-device coherent dma memory allocator. However their implementation is nearly identical. Refactor out common code to be reused by them. Signed-off-by: Dmitry Baryshkov Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/dma-coherent.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 kernel/dma-coherent.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9d..9e287d8ab766 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c new file mode 100644 index 000000000000..89a554cfd936 --- /dev/null +++ b/kernel/dma-coherent.c @@ -0,0 +1,127 @@ +/* + * Coherent per-device memory handling. + * Borrowed from i386 + */ +#include +#include + +struct dma_coherent_mem { + void *virt_base; + u32 device_base; + int size; + int flags; + unsigned long *bitmap; +}; + +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + void __iomem *mem_base = NULL; + int pages = size >> PAGE_SHIFT; + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) + goto out; + if (!size) + goto out; + if (dev->dma_mem) + goto out; + + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ + + mem_base = ioremap(bus_addr, size); + if (!mem_base) + goto out; + + dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + if (!dev->dma_mem) + goto out; + dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dev->dma_mem->bitmap) + goto free1_out; + + dev->dma_mem->virt_base = mem_base; + dev->dma_mem->device_base = device_addr; + dev->dma_mem->size = pages; + dev->dma_mem->flags = flags; + + if (flags & DMA_MEMORY_MAP) + return DMA_MEMORY_MAP; + + return DMA_MEMORY_IO; + + free1_out: + kfree(dev->dma_mem); + out: + if (mem_base) + iounmap(mem_base); + return 0; +} +EXPORT_SYMBOL(dma_declare_coherent_memory); + +void dma_release_declared_memory(struct device *dev) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + + if (!mem) + return; + dev->dma_mem = NULL; + iounmap(mem->virt_base); + kfree(mem->bitmap); + kfree(mem); +} +EXPORT_SYMBOL(dma_release_declared_memory); + +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + int pos, err; + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); + + pages >>= PAGE_SHIFT; + + if (!mem) + return ERR_PTR(-EINVAL); + + pos = (device_addr - mem->device_base) >> PAGE_SHIFT; + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); + if (err != 0) + return ERR_PTR(err); + return mem->virt_base + (pos << PAGE_SHIFT); +} +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); + +int dma_alloc_from_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret) +{ + struct 
dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + int order = get_order(size); + + if (mem) { + int page = bitmap_find_free_region(mem->bitmap, mem->size, + order); + if (page >= 0) { + *dma_handle = mem->device_base + (page << PAGE_SHIFT); + *ret = mem->virt_base + (page << PAGE_SHIFT); + memset(*ret, 0, size); + } + if (mem->flags & DMA_MEMORY_EXCLUSIVE) + *ret = NULL; + } + return (mem != NULL); +} + +int dma_release_from_coherent(struct device *dev, int order, void *vaddr) +{ + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + + if (mem && vaddr >= mem->virt_base && vaddr < + (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + + bitmap_release_region(mem->bitmap, page, order); + return 1; + } + return 0; +} -- cgit v1.2.3 From 538c29d43ebdac2edcef96ac07982d2296a63077 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 18 Jul 2008 13:29:57 +0400 Subject: Generic dma-coherent: fix DMA_MEMORY_EXCLUSIVE Don't rewrite successfull allocation return values in case the memory was marked with DMA_MEMORY_EXCLUSIVE. Signed-off-by: Dmitry Baryshkov Cc: Jesse Barnes Signed-off-by: Ingo Molnar --- kernel/dma-coherent.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 89a554cfd936..56dff5cb0f29 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -105,8 +105,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, *dma_handle = mem->device_base + (page << PAGE_SHIFT); *ret = mem->virt_base + (page << PAGE_SHIFT); memset(*ret, 0, size); - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) + } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) *ret = NULL; } return (mem != NULL); -- cgit v1.2.3 From b6d4f7e3ef25beb8c658c97867d98883e69dc544 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 20 Jul 2008 15:01:10 +0400 Subject: dma-coherent: add documentation to new interfaces Signed-off-by: Dmitry Baryshkov Cc: Jesse Barnes Signed-off-by: Ingo Molnar --- kernel/dma-coherent.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 56dff5cb0f29..7517115a8cce 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -92,6 +92,21 @@ void *dma_mark_declared_memory_occupied(struct device *dev, } EXPORT_SYMBOL(dma_mark_declared_memory_occupied); +/** + * Try to allocate memory from the per-device coherent area. + * + * @dev: device from which we allocate memory + * @size: size of requested memory area + * @dma_handle: This will be filled with the correct dma handle + * @ret: This pointer will be filled with the virtual address + * to allocated area. + * + * This function should be only called from per-arch %dma_alloc_coherent() + * to support allocation from per-device coherent memory pools. + * + * Returns 0 if dma_alloc_coherent should continue with allocating from + * generic memory areas, or !0 if dma_alloc_coherent should return %ret. + */ int dma_alloc_from_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret) { @@ -111,6 +126,19 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, return (mem != NULL); } +/** + * Try to free the memory allocated from per-device coherent memory pool. 
+ * @dev: device from which the memory was allocated + * @order: the order of pages allocated + * @vaddr: virtual address of allocated pages + * + * This checks whether the memory was allocated from the per-device + * coherent memory pool and if so, releases that memory. + * + * Returns 1 if we correctly released the memory, or 0 if + * %dma_release_coherent() should proceed with releasing memory from + * generic pools. + */ int dma_release_from_coherent(struct device *dev, int order, void *vaddr) { struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; -- cgit v1.2.3 From 54da1174922cddd4be83d5a364b2e0fdd693f513 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Jul 2008 20:52:05 +0400 Subject: posix-timers: do_schedule_next_timer: fix the setting of ->si_overrun do_schedule_next_timer() sets info->si_overrun = timr->it_overrun_last, this discards the already accumulated overruns. Signed-off-by: Oleg Nesterov Cc: Mark McLoughlin Cc: Oliver Pinter Cc: Roland McGrath Cc: stable@kernel.org Cc: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index dbd8398ddb0b..814a2bb8d2ea 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -289,7 +289,7 @@ void do_schedule_next_timer(struct siginfo *info) else schedule_next_timer(timr); - info->si_overrun = timr->it_overrun_last; + info->si_overrun += timr->it_overrun_last; } if (timr) -- cgit v1.2.3 From ba661292a2bc6ddd305a212b0526e5dc22195fe7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Jul 2008 20:52:05 +0400 Subject: posix-timers: fix posix_timer_event() vs dequeue_signal() race The bug was reported and analysed by Mark McLoughlin , the patch is based on his and Roland's suggestions. posix_timer_event() always rewrites the pre-allocated siginfo before sending the signal. Most of the written info is the same all the time, but memset(0) is very wrong. If ->sigq is queued we can race with collect_signal() which can fail to find this siginfo looking at .si_signo, or copy_siginfo() can copy the wrong .si_code/si_tid/etc. In short, sys_timer_settime() can in fact stop the active timer, or the user can receive the siginfo with the wrong .si_xxx values. Move "memset(->info, 0)" from posix_timer_event() to alloc_posix_timer(), change send_sigqueue() to set .si_overrun = 0 when ->sigq is not queued. It would be nice to move the whole sigq->info initialization from send to create path, but this is not easy to do without uglifying timer_create() further. As Roland rightly pointed out, we need more cleanups/fixes here, see the "FIXME" comment in the patch. Hopefully this patch makes sense anyway, and it can mask the most bad implications. 
Reported-by: Mark McLoughlin Signed-off-by: Oleg Nesterov Cc: Mark McLoughlin Cc: Oliver Pinter Cc: Roland McGrath Cc: stable@kernel.org Cc: Andrew Morton Signed-off-by: Thomas Gleixner kernel/posix-timers.c | 17 +++++++++++++---- kernel/signal.c | 1 + 2 files changed, 14 insertions(+), 4 deletions(-) --- kernel/posix-timers.c | 17 +++++++++++++---- kernel/signal.c | 1 + 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 814a2bb8d2ea..0ffaeb075e35 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -296,14 +296,22 @@ void do_schedule_next_timer(struct siginfo *info) unlock_timer(timr, flags); } -int posix_timer_event(struct k_itimer *timr,int si_private) +int posix_timer_event(struct k_itimer *timr, int si_private) { - memset(&timr->sigq->info, 0, sizeof(siginfo_t)); + /* + * FIXME: if ->sigq is queued we can race with + * dequeue_signal()->do_schedule_next_timer(). + * + * If dequeue_signal() sees the "right" value of + * si_sys_private it calls do_schedule_next_timer(). + * We re-queue ->sigq and drop ->it_lock(). + * do_schedule_next_timer() locks the timer + * and re-schedules it while ->sigq is pending. + * Not really bad, but not that we want. + */ timr->sigq->info.si_sys_private = si_private; - /* Send signal to the process that owns this timer.*/ timr->sigq->info.si_signo = timr->it_sigev_signo; - timr->sigq->info.si_errno = 0; timr->sigq->info.si_code = SI_TIMER; timr->sigq->info.si_tid = timr->it_id; timr->sigq->info.si_value = timr->it_sigev_value; @@ -435,6 +443,7 @@ static struct k_itimer * alloc_posix_timer(void) kmem_cache_free(posix_timers_cache, tmr); tmr = NULL; } + memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); return tmr; } diff --git a/kernel/signal.c b/kernel/signal.c index 72bb4f51f963..13fab9838354 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1280,6 +1280,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) q->info.si_overrun++; goto out; } + q->info.si_overrun = 0; signalfd_notify(t, sig); pending = group ? &t->signal->shared_pending : &t->pending; -- cgit v1.2.3 From b8d317d10cca76cabe6b03ebfeb23cc99118b731 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:29 -0700 Subject: cpumask: make cpumask_of_cpu_map generic If an arch doesn't define cpumask_of_cpu_map, create a generic statically-initialized one for them. This allows removal of the buggy cpumask_of_cpu() macro (&cpumask_of_cpu() gives address of out-of-scope var). An arch with NR_CPUS of 4096 probably wants to allocate this itself based on the actual number of CPUs, since otherwise they're using 2MB of rodata (1024 cpus means 128k). That's what CONFIG_HAVE_CPUMASK_OF_CPU_MAP is for (only x86/64 does so at the moment). In future as we support more CPUs, we'll need to resort to a get_cpu_map()/put_cpu_map() allocation scheme. Signed-off-by: Mike Travis Signed-off-by: Rusty Russell Cc: Andrew Morton Cc: Jack Steiner Signed-off-by: Ingo Molnar --- kernel/cpu.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 10ba5f1004a5..fe31ff3d3809 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -461,3 +461,112 @@ out: #endif /* CONFIG_PM_SLEEP_SMP */ #endif /* CONFIG_SMP */ + +#ifndef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +/* 64 bits of zeros, for initializers. 
*/ +#if BITS_PER_LONG == 32 +#define Z64 0, 0 +#else +#define Z64 0 +#endif + +/* Initializer macros. */ +#define CMI0(n) { .bits = { 1UL << (n) } } +#define CMI(n, ...) { .bits = { __VA_ARGS__, 1UL << ((n) % BITS_PER_LONG) } } + +#define CMI8(n, ...) \ + CMI((n), __VA_ARGS__), CMI((n)+1, __VA_ARGS__), \ + CMI((n)+2, __VA_ARGS__), CMI((n)+3, __VA_ARGS__), \ + CMI((n)+4, __VA_ARGS__), CMI((n)+5, __VA_ARGS__), \ + CMI((n)+6, __VA_ARGS__), CMI((n)+7, __VA_ARGS__) + +#if BITS_PER_LONG == 32 +#define CMI64(n, ...) \ + CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__), \ + CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__), \ + CMI8((n)+32, 0, __VA_ARGS__), CMI8((n)+40, 0, __VA_ARGS__), \ + CMI8((n)+48, 0, __VA_ARGS__), CMI8((n)+56, 0, __VA_ARGS__) +#else +#define CMI64(n, ...) \ + CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__), \ + CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__), \ + CMI8((n)+32, __VA_ARGS__), CMI8((n)+40, __VA_ARGS__), \ + CMI8((n)+48, __VA_ARGS__), CMI8((n)+56, __VA_ARGS__) +#endif + +#define CMI256(n, ...) \ + CMI64((n), __VA_ARGS__), CMI64((n)+64, Z64, __VA_ARGS__), \ + CMI64((n)+128, Z64, Z64, __VA_ARGS__), \ + CMI64((n)+192, Z64, Z64, Z64, __VA_ARGS__) +#define Z256 Z64, Z64, Z64, Z64 + +#define CMI1024(n, ...) \ + CMI256((n), __VA_ARGS__), \ + CMI256((n)+256, Z256, __VA_ARGS__), \ + CMI256((n)+512, Z256, Z256, __VA_ARGS__), \ + CMI256((n)+768, Z256, Z256, Z256, __VA_ARGS__) +#define Z1024 Z256, Z256, Z256, Z256 + +/* We want this statically initialized, just to be safe. We try not + * to waste too much space, either. */ +static const cpumask_t cpumask_map[] = { + CMI0(0), CMI0(1), CMI0(2), CMI0(3), +#if NR_CPUS > 4 + CMI0(4), CMI0(5), CMI0(6), CMI0(7), +#endif +#if NR_CPUS > 8 + CMI0(8), CMI0(9), CMI0(10), CMI0(11), + CMI0(12), CMI0(13), CMI0(14), CMI0(15), +#endif +#if NR_CPUS > 16 + CMI0(16), CMI0(17), CMI0(18), CMI0(19), + CMI0(20), CMI0(21), CMI0(22), CMI0(23), + CMI0(24), CMI0(25), CMI0(26), CMI0(27), + CMI0(28), CMI0(29), CMI0(30), CMI0(31), +#endif +#if NR_CPUS > 32 +#if BITS_PER_LONG == 32 + CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0), + CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0), + CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0), + CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0), + CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0), + CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0), + CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0), + CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0), +#else + CMI0(32), CMI0(33), CMI0(34), CMI0(35), + CMI0(36), CMI0(37), CMI0(38), CMI0(39), + CMI0(40), CMI0(41), CMI0(42), CMI0(43), + CMI0(44), CMI0(45), CMI0(46), CMI0(47), + CMI0(48), CMI0(49), CMI0(50), CMI0(51), + CMI0(52), CMI0(53), CMI0(54), CMI0(55), + CMI0(56), CMI0(57), CMI0(58), CMI0(59), + CMI0(60), CMI0(61), CMI0(62), CMI0(63), +#endif /* BITS_PER_LONG == 64 */ +#endif +#if NR_CPUS > 64 + CMI64(64, Z64), +#endif +#if NR_CPUS > 128 + CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64), +#endif +#if NR_CPUS > 256 + CMI256(256, Z256), +#endif +#if NR_CPUS > 512 + CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256), +#endif +#if NR_CPUS > 1024 + CMI1024(1024, Z1024), +#endif +#if NR_CPUS > 2048 + CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024), +#endif +#if NR_CPUS > 4096 +#error NR_CPUS too big. 
Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP +#endif +}; + +const cpumask_t *cpumask_of_cpu_map = cpumask_map; +#endif /* !CONFIG_HAVE_CPUMASK_OF_CPU_MAP */ -- cgit v1.2.3 From 6524d938b3360504b43a1278b5a8403e85383d1a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:30 -0700 Subject: cpumask: put cpumask_of_cpu_map in the initdata section * Create the cpumask_of_cpu_map statically in the init data section using NR_CPUS but replace it during boot up with one sized by nr_cpu_ids (num possible cpus). Signed-off-by: Mike Travis Cc: Andrew Morton Cc: Jack Steiner Cc: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/cpu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index fe31ff3d3809..9d4e1c28c053 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -462,7 +462,6 @@ out: #endif /* CONFIG_SMP */ -#ifndef CONFIG_HAVE_CPUMASK_OF_CPU_MAP /* 64 bits of zeros, for initializers. */ #if BITS_PER_LONG == 32 #define Z64 0, 0 @@ -509,7 +508,11 @@ out: /* We want this statically initialized, just to be safe. We try not * to waste too much space, either. */ -static const cpumask_t cpumask_map[] = { +static const cpumask_t cpumask_map[] +#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +__initdata +#endif += { CMI0(0), CMI0(1), CMI0(2), CMI0(3), #if NR_CPUS > 4 CMI0(4), CMI0(5), CMI0(6), CMI0(7), @@ -569,4 +572,3 @@ static const cpumask_t cpumask_map[] = { }; const cpumask_t *cpumask_of_cpu_map = cpumask_map; -#endif /* !CONFIG_HAVE_CPUMASK_OF_CPU_MAP */ -- cgit v1.2.3 From 0bc3cc03fa6e1c20aecb5a33356bcaae410640b9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:31 -0700 Subject: cpumask: change cpumask_of_cpu_ptr to use new cpumask_of_cpu * Replace previous instances of the cpumask_of_cpu_ptr* macros with a the new (lvalue capable) generic cpumask_of_cpu(). Signed-off-by: Mike Travis Cc: Andrew Morton Cc: Jack Steiner Cc: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 3 +-- kernel/time/tick-common.c | 8 +++----- kernel/trace/trace_sysprof.c | 4 +--- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 738b411ff2d3..ba9b2054ecbd 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -33,9 +33,8 @@ static int stopmachine(void *cpu) { int irqs_disabled = 0; int prepared = 0; - cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); - set_cpus_allowed_ptr(current, cpumask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index bf43284d6855..80c4336f4188 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -196,12 +196,10 @@ static int tick_check_new_device(struct clock_event_device *newdev) struct tick_device *td; int cpu, ret = NOTIFY_OK; unsigned long flags; - cpumask_of_cpu_ptr_declare(cpumask); spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); - cpumask_of_cpu_ptr_next(cpumask, cpu); if (!cpu_isset(cpu, newdev->cpumask)) goto out_bc; @@ -209,7 +207,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = td->evtdev; /* cpu local device ? 
*/ - if (!cpus_equal(newdev->cpumask, *cpumask)) { + if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { /* * If the cpu affinity of the device interrupt can not @@ -222,7 +220,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * If we have a cpu local device already, do not replace it * by a non cpu local device */ - if (curdev && cpus_equal(curdev->cpumask, *cpumask)) + if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) goto out_bc; } @@ -254,7 +252,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = NULL; } clockevents_exchange_device(curdev, newdev); - tick_setup_device(td, newdev, cpu, cpumask); + tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index ce2d723c10e1..bb948e52ce20 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -213,9 +213,7 @@ static void start_stack_timers(void) int cpu; for_each_online_cpu(cpu) { - cpumask_of_cpu_ptr(new_mask, cpu); - - set_cpus_allowed_ptr(current, new_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); start_stack_timer(cpu); } set_cpus_allowed_ptr(current, &saved_mask); -- cgit v1.2.3 From 5a7a201c51c324876d00a54e7208af6af12d1ca4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 26 Jul 2008 16:50:47 +0200 Subject: cpumask: export cpumask_of_cpu_map fix: ERROR: "cpumask_of_cpu_map" [drivers/acpi/processor.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/microcode.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/cpu/cpufreq/speedstep-ich.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9d4e1c28c053..a35d8995dc8c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -572,3 +572,5 @@ __initdata }; const cpumask_t *cpumask_of_cpu_map = cpumask_map; + +EXPORT_SYMBOL_GPL(cpumask_of_cpu_map); -- cgit v1.2.3 From 15bba37d62351749c3915add81f673b256952ee1 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 24 Jul 2008 15:41:48 +0100 Subject: module: fix build warning with !CONFIG_KALLSYMS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixed the warning: CC kernel/module.o /home/wangcong/Projects/linux-2.6/kernel/module.c:332: warning: ‘lookup_symbol’ defined but not used Signed-off-by: WANG Cong Signed-off-by: Rusty Russell --- kernel/module.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d8b5605132a0..d861bd5b8c10 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -325,18 +325,6 @@ static unsigned long find_symbol(const char *name, return -ENOENT; } -/* lookup symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - const struct kernel_symbol *ks = start; - for (; ks < stop; ks++) - if (strcmp(ks->name, name) == 0) - return ks; - return NULL; -} - /* Search for module by name: must hold module_mutex. 
*/ static struct module *find_module(const char *name) { @@ -1703,6 +1691,19 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } #ifdef CONFIG_KALLSYMS + +/* lookup symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + const struct kernel_symbol *ks = start; + for (; ks < stop; ks++) + if (strcmp(ks->name, name) == 0) + return ks; + return NULL; +} + static int is_exported(const char *name, const struct module *mod) { if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) -- cgit v1.2.3 From 5c2aed622571ac7c3c6ec182d6d3c318e4b45c8b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 28 Feb 2008 11:33:03 -0500 Subject: stop_machine: add ALL_CPUS option -allow stop_mahcine_run() to call a function on all cpus. Calling stop_machine_run() with a 'ALL_CPUS' invokes this new behavior. stop_machine_run() proceeds as normal until the calling cpu has invoked 'fn'. Then, we tell all the other cpus to call 'fn'. Signed-off-by: Jason Baron Signed-off-by: Mathieu Desnoyers Signed-off-by: Rusty Russell CC: Adrian Bunk CC: Andi Kleen CC: Alexey Dobriyan CC: Christoph Hellwig CC: mingo@elte.hu CC: akpm@osdl.org --- kernel/stop_machine.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 738b411ff2d3..a473bd0cb71b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -22,9 +22,17 @@ enum stopmachine_state { STOPMACHINE_WAIT, STOPMACHINE_PREPARE, STOPMACHINE_DISABLE_IRQ, + STOPMACHINE_RUN, STOPMACHINE_EXIT, }; +struct stop_machine_data { + int (*fn)(void *); + void *data; + struct completion done; + int run_all; +} smdata; + static enum stopmachine_state stopmachine_state; static unsigned int stopmachine_num_threads; static atomic_t stopmachine_thread_ack; @@ -33,6 +41,7 @@ static int stopmachine(void *cpu) { int irqs_disabled = 0; int prepared = 0; + int ran = 0; cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); set_cpus_allowed_ptr(current, cpumask); @@ -58,6 +67,11 @@ static int stopmachine(void *cpu) prepared = 1; smp_mb(); /* Must read state first. */ atomic_inc(&stopmachine_thread_ack); + } else if (stopmachine_state == STOPMACHINE_RUN && !ran) { + smdata.fn(smdata.data); + ran = 1; + smp_mb(); /* Must read state first. */ + atomic_inc(&stopmachine_thread_ack); } /* Yield in first stage: migration threads need to * help our sisters onto their CPUs. */ @@ -136,11 +150,10 @@ static void restart_machine(void) preempt_enable_no_resched(); } -struct stop_machine_data { - int (*fn)(void *); - void *data; - struct completion done; -}; +static void run_other_cpus(void) +{ + stopmachine_set_state(STOPMACHINE_RUN); +} static int do_stop(void *_smdata) { @@ -150,6 +163,8 @@ static int do_stop(void *_smdata) ret = stop_machine(); if (ret == 0) { ret = smdata->fn(smdata->data); + if (smdata->run_all) + run_other_cpus(); restart_machine(); } @@ -173,14 +188,17 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, struct stop_machine_data smdata; struct task_struct *p; + mutex_lock(&stopmachine_mutex); + smdata.fn = fn; smdata.data = data; + smdata.run_all = (cpu == ALL_CPUS) ? 1 : 0; init_completion(&smdata.done); - mutex_lock(&stopmachine_mutex); + smp_wmb(); /* make sure other cpus see smdata updates */ /* If they don't care which CPU fn runs on, bind to any online one. 
*/ - if (cpu == NR_CPUS) + if (cpu == NR_CPUS || cpu == ALL_CPUS) cpu = raw_smp_processor_id(); p = kthread_create(do_stop, &smdata, "kstopmachine"); -- cgit v1.2.3 From ffdb5976c47609c862917d4c186ecbb5706d2dda Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:28 -0500 Subject: Simplify stop_machine stop_machine creates a kthread which creates kernel threads. We can create those threads directly and simplify things a little. Some care must be taken with CPU hotunplug, which has special needs, but that code seems more robust than it was in the past. Signed-off-by: Rusty Russell Acked-by: Christian Borntraeger --- kernel/cpu.c | 13 +-- kernel/stop_machine.c | 293 +++++++++++++++++++++----------------------------- 2 files changed, 128 insertions(+), 178 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 10ba5f1004a5..cf79bb911371 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -216,7 +216,6 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - struct task_struct *p; cpumask_t old_allowed, tmp; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; @@ -250,19 +249,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); - p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); + err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - if (IS_ERR(p) || cpu_online(cpu)) { + if (err || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) BUG(); - if (IS_ERR(p)) { - err = PTR_ERR(p); - goto out_allowed; - } - goto out_thread; + goto out_allowed; } /* Wait for it to sleep (leaving idle task). */ @@ -279,8 +274,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); -out_thread: - err = kthread_stop(p); out_allowed: set_cpus_allowed_ptr(current, &old_allowed); out_release: diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a473bd0cb71b..35882dccc943 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,4 +1,4 @@ -/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. +/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. * GPL v2 and any later version. */ #include @@ -13,220 +13,177 @@ #include #include -/* Since we effect priority and affinity (both of which are visible - * to, and settable by outside processes) we do indirection via a - * kthread. */ - -/* Thread to stop each CPU in user context. */ +/* This controls the threads on each CPU. */ enum stopmachine_state { - STOPMACHINE_WAIT, + /* Dummy starting state for thread. */ + STOPMACHINE_NONE, + /* Awaiting everyone to be scheduled. */ STOPMACHINE_PREPARE, + /* Disable interrupts. */ STOPMACHINE_DISABLE_IRQ, + /* Run the function */ STOPMACHINE_RUN, + /* Exit */ STOPMACHINE_EXIT, }; +static enum stopmachine_state state; struct stop_machine_data { int (*fn)(void *); void *data; - struct completion done; - int run_all; -} smdata; + int fnret; +}; -static enum stopmachine_state stopmachine_state; -static unsigned int stopmachine_num_threads; -static atomic_t stopmachine_thread_ack; +/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. 
*/ +static unsigned int num_threads; +static atomic_t thread_ack; +static struct completion finished; +static DEFINE_MUTEX(lock); -static int stopmachine(void *cpu) +static void set_state(enum stopmachine_state newstate) { - int irqs_disabled = 0; - int prepared = 0; - int ran = 0; - cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); - - set_cpus_allowed_ptr(current, cpumask); - - /* Ack: we are alive */ - smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ - atomic_inc(&stopmachine_thread_ack); - - /* Simple state machine */ - while (stopmachine_state != STOPMACHINE_EXIT) { - if (stopmachine_state == STOPMACHINE_DISABLE_IRQ - && !irqs_disabled) { - local_irq_disable(); - hard_irq_disable(); - irqs_disabled = 1; - /* Ack: irqs disabled. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } else if (stopmachine_state == STOPMACHINE_PREPARE - && !prepared) { - /* Everyone is in place, hold CPU. */ - preempt_disable(); - prepared = 1; - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } else if (stopmachine_state == STOPMACHINE_RUN && !ran) { - smdata.fn(smdata.data); - ran = 1; - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } - /* Yield in first stage: migration threads need to - * help our sisters onto their CPUs. */ - if (!prepared && !irqs_disabled) - yield(); - cpu_relax(); - } - - /* Ack: we are exiting. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - - if (irqs_disabled) - local_irq_enable(); - if (prepared) - preempt_enable(); - - return 0; + /* Reset ack counter. */ + atomic_set(&thread_ack, num_threads); + smp_wmb(); + state = newstate; } -/* Change the thread state */ -static void stopmachine_set_state(enum stopmachine_state state) +/* Last one to ack a state moves to the next state. */ +static void ack_state(void) { - atomic_set(&stopmachine_thread_ack, 0); - smp_wmb(); - stopmachine_state = state; - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - cpu_relax(); + if (atomic_dec_and_test(&thread_ack)) { + /* If we're the last one to ack the EXIT, we're finished. */ + if (state == STOPMACHINE_EXIT) + complete(&finished); + else + set_state(state + 1); + } } -static int stop_machine(void) +/* This is the actual thread which stops the CPU. It exits by itself rather + * than waiting for kthread_stop(), because it's easier for hotplug CPU. */ +static int stop_cpu(struct stop_machine_data *smdata) { - int i, ret = 0; - - atomic_set(&stopmachine_thread_ack, 0); - stopmachine_num_threads = 0; - stopmachine_state = STOPMACHINE_WAIT; + enum stopmachine_state curstate = STOPMACHINE_NONE; + int uninitialized_var(ret); - for_each_online_cpu(i) { - if (i == raw_smp_processor_id()) - continue; - ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); - if (ret < 0) - break; - stopmachine_num_threads++; - } - - /* Wait for them all to come to life. */ - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) { - yield(); + /* Simple state machine */ + do { + /* Chill out and ensure we re-read stopmachine_state. */ cpu_relax(); - } - - /* If some failed, kill them all. */ - if (ret < 0) { - stopmachine_set_state(STOPMACHINE_EXIT); - return ret; - } - - /* Now they are all started, make them hold the CPUs, ready. */ - preempt_disable(); - stopmachine_set_state(STOPMACHINE_PREPARE); - - /* Make them disable irqs. 
*/ - local_irq_disable(); - hard_irq_disable(); - stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); - - return 0; -} + if (state != curstate) { + curstate = state; + switch (curstate) { + case STOPMACHINE_DISABLE_IRQ: + local_irq_disable(); + hard_irq_disable(); + break; + case STOPMACHINE_RUN: + /* |= allows error detection if functions on + * multiple CPUs. */ + smdata->fnret |= smdata->fn(smdata->data); + break; + default: + break; + } + ack_state(); + } + } while (curstate != STOPMACHINE_EXIT); -static void restart_machine(void) -{ - stopmachine_set_state(STOPMACHINE_EXIT); local_irq_enable(); - preempt_enable_no_resched(); + do_exit(0); } -static void run_other_cpus(void) +/* Callback for CPUs which aren't supposed to do anything. */ +static int chill(void *unused) { - stopmachine_set_state(STOPMACHINE_RUN); + return 0; } -static int do_stop(void *_smdata) +int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { - struct stop_machine_data *smdata = _smdata; - int ret; + int i, err; + struct stop_machine_data active, idle; + struct task_struct **threads; + + active.fn = fn; + active.data = data; + active.fnret = 0; + idle.fn = chill; + idle.data = NULL; + + /* If they don't care which cpu fn runs on, just pick one. */ + if (cpu == NR_CPUS) + cpu = any_online_cpu(cpu_online_map); + + /* This could be too big for stack on large machines. */ + threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); + if (!threads) + return -ENOMEM; + + /* Set up initial state. */ + mutex_lock(&lock); + init_completion(&finished); + num_threads = num_online_cpus(); + set_state(STOPMACHINE_PREPARE); - ret = stop_machine(); - if (ret == 0) { - ret = smdata->fn(smdata->data); - if (smdata->run_all) - run_other_cpus(); - restart_machine(); - } + for_each_online_cpu(i) { + struct stop_machine_data *smdata; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - /* We're done: you can kthread_stop us now */ - complete(&smdata->done); + if (cpu == ALL_CPUS || i == cpu) + smdata = &active; + else + smdata = &idle; + + threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", + i); + if (IS_ERR(threads[i])) { + err = PTR_ERR(threads[i]); + threads[i] = NULL; + goto kill_threads; + } - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return ret; -} + /* Place it onto correct cpu. */ + kthread_bind(threads[i], i); -struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu) -{ - static DEFINE_MUTEX(stopmachine_mutex); - struct stop_machine_data smdata; - struct task_struct *p; + /* Make it highest prio. */ + if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, ¶m)) + BUG(); + } - mutex_lock(&stopmachine_mutex); + /* We've created all the threads. Wake them all: hold this CPU so one + * doesn't hit this CPU until we're ready. */ + cpu = get_cpu(); + for_each_online_cpu(i) + wake_up_process(threads[i]); - smdata.fn = fn; - smdata.data = data; - smdata.run_all = (cpu == ALL_CPUS) ? 1 : 0; - init_completion(&smdata.done); + /* This will release the thread on our CPU. */ + put_cpu(); + wait_for_completion(&finished); + mutex_unlock(&lock); - smp_wmb(); /* make sure other cpus see smdata updates */ + kfree(threads); - /* If they don't care which CPU fn runs on, bind to any online one. 
*/ - if (cpu == NR_CPUS || cpu == ALL_CPUS) - cpu = raw_smp_processor_id(); + return active.fnret; - p = kthread_create(do_stop, &smdata, "kstopmachine"); - if (!IS_ERR(p)) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; +kill_threads: + for_each_online_cpu(i) + if (threads[i]) + kthread_stop(threads[i]); + mutex_unlock(&lock); - /* One high-prio thread per cpu. We'll do this one. */ - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_bind(p, cpu); - wake_up_process(p); - wait_for_completion(&smdata.done); - } - mutex_unlock(&stopmachine_mutex); - return p; + kfree(threads); + return err; } int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { - struct task_struct *p; int ret; /* No CPUs can come up or down during this. */ get_online_cpus(); - p = __stop_machine_run(fn, data, cpu); - if (!IS_ERR(p)) - ret = kthread_stop(p); - else - ret = PTR_ERR(p); + ret = __stop_machine_run(fn, data, cpu); put_online_cpus(); return ret; -- cgit v1.2.3 From 04321587584272f4e8b9818f319f40caf8eeee13 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:29 -0500 Subject: Hotplug CPU: don't check cpu_online after take_cpu_down Akinobu points out that if take_cpu_down() succeeds, the cpu must be offline. Remove the cpu_online() check, and put a BUG_ON(). Quoting Akinobu Mita: Actually the cpu_online() check was necessary before appling this stop_machine: simplify patch. With old __stop_machine_run(), __stop_machine_run() could succeed (return !IS_ERR(p) value) even if take_cpu_down() returned non-zero value. The return value of take_cpu_down() was obtained through kthread_stop().. Signed-off-by: Rusty Russell Cc: "Akinobu Mita" --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index cf79bb911371..53cf508f975a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -250,8 +250,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) set_cpus_allowed_ptr(current, &tmp); err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - - if (err || cpu_online(cpu)) { + if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) @@ -259,6 +258,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) goto out_allowed; } + BUG_ON(cpu_online(cpu)); /* Wait for it to sleep (leaving idle task). */ while (!idle_cpu(cpu)) -- cgit v1.2.3 From eeec4fad963490821348a331cca6102ae1c4a7a3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:30 -0500 Subject: stop_machine(): stop_machine_run() changed to use cpu mask Instead of a "cpu" arg with magic values NR_CPUS (any cpu) and ~0 (all cpus), pass a cpumask_t. Allow NULL for the common case (where we don't care which CPU the function is run on): temporary cpumask_t's are usually considered bad for stack space. This deprecates stop_machine_run, to be removed soon when all the callers are dead. 
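For callers, the conversion amounts to the following sketch (illustrative only, not part of this patch; it simply mirrors the call-site updates in the follow-up commits below):

	/* Old interface: magic cpu values picked where fn ran. */
	ret = stop_machine_run(fn, data, NR_CPUS);		/* any online cpu */

	/* New interface: pass a cpumask, or NULL for "don't care". */
	ret = stop_machine(fn, data, NULL);			/* any online cpu */
	ret = __stop_machine(fn, data, &cpumask_of_cpu(cpu));	/* one specific cpu */
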
Signed-off-by: Rusty Russell --- kernel/cpu.c | 3 ++- kernel/stop_machine.c | 27 +++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 53cf508f975a..29510d68338a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -248,8 +248,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpus_setall(tmp); cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); + tmp = cpumask_of_cpu(cpu); - err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); + err = __stop_machine(take_cpu_down, &tcd_param, &tmp); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 35882dccc943..e446c7c7d6a9 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -100,7 +100,7 @@ static int chill(void *unused) return 0; } -int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { int i, err; struct stop_machine_data active, idle; @@ -112,10 +112,6 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) idle.fn = chill; idle.data = NULL; - /* If they don't care which cpu fn runs on, just pick one. */ - if (cpu == NR_CPUS) - cpu = any_online_cpu(cpu_online_map); - /* This could be too big for stack on large machines. */ threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); if (!threads) @@ -128,13 +124,16 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) set_state(STOPMACHINE_PREPARE); for_each_online_cpu(i) { - struct stop_machine_data *smdata; + struct stop_machine_data *smdata = &idle; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - if (cpu == ALL_CPUS || i == cpu) - smdata = &active; - else - smdata = &idle; + if (!cpus) { + if (i == first_cpu(cpu_online_map)) + smdata = &active; + } else { + if (cpu_isset(i, *cpus)) + smdata = &active; + } threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", i); @@ -154,7 +153,7 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) /* We've created all the threads. Wake them all: hold this CPU so one * doesn't hit this CPU until we're ready. */ - cpu = get_cpu(); + get_cpu(); for_each_online_cpu(i) wake_up_process(threads[i]); @@ -177,15 +176,15 @@ kill_threads: return err; } -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { int ret; /* No CPUs can come up or down during this. 
*/ get_online_cpus(); - ret = __stop_machine_run(fn, data, cpu); + ret = __stop_machine(fn, data, cpus); put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(stop_machine_run); +EXPORT_SYMBOL_GPL(stop_machine); -- cgit v1.2.3 From 9b1a4d38373a5581a4e01032a3ccdd94cd93477b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:30 -0500 Subject: stop_machine: Wean existing callers off stop_machine_run() Signed-off-by: Rusty Russell --- kernel/module.c | 8 ++++---- kernel/rcuclassic.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d861bd5b8c10..61d212120df4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -678,7 +678,7 @@ static int try_stop_module(struct module *mod, int flags, int *forced) if (flags & O_NONBLOCK) { struct stopref sref = { mod, flags, forced }; - return stop_machine_run(__try_stop_module, &sref, NR_CPUS); + return stop_machine(__try_stop_module, &sref, NULL); } else { /* We don't need to stop the machine for this. */ mod->state = MODULE_STATE_GOING; @@ -1416,7 +1416,7 @@ static int __unlink_module(void *_mod) static void free_module(struct module *mod) { /* Delete from various lists */ - stop_machine_run(__unlink_module, mod, NR_CPUS); + stop_machine(__unlink_module, mod, NULL); remove_notes_attrs(mod); remove_sect_attrs(mod); mod_kobject_remove(mod); @@ -2197,7 +2197,7 @@ static struct module *load_module(void __user *umod, /* Now sew it into the lists so we can get lockdep and oops * info during argument parsing. Noone should access us, since * strong_try_module_get() will fail. */ - stop_machine_run(__link_module, mod, NR_CPUS); + stop_machine(__link_module, mod, NULL); /* Size of section 0 is 0, so this works well if no params */ err = parse_args(mod->name, mod->args, @@ -2231,7 +2231,7 @@ static struct module *load_module(void __user *umod, return mod; unlink: - stop_machine_run(__unlink_module, mod, NR_CPUS); + stop_machine(__unlink_module, mod, NULL); module_arch_cleanup(mod); cleanup: kobject_del(&mod->mkobj.kobj); diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 6f8696c502f4..aad93cdc9f68 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -91,8 +91,8 @@ static void force_quiescent_state(struct rcu_data *rdp, * rdp->cpu is the current cpu. * * cpu_online_map is updated by the _cpu_down() - * using stop_machine_run(). Since we're in irqs disabled - * section, stop_machine_run() is not exectuting, hence + * using __stop_machine(). Since we're in irqs disabled + * section, __stop_machine() is not exectuting, hence * the cpu_online_map is stable. * * However, a cpu might have been offlined _just_ before -- cgit v1.2.3 From 784e2d76007f90d69341b95967160c4fb7829299 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:31 -0500 Subject: stop_machine: fix up ftrace.c Simple conversion. 
Signed-off-by: Rusty Russell Cc: Abhishek Sagar Cc: Ingo Molnar Cc: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4231a3dc224a..f6e3af31b403 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -587,7 +587,7 @@ static int __ftrace_modify_code(void *data) static void ftrace_run_update_code(int command) { - stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); + stop_machine(__ftrace_modify_code, &command, NULL); } void ftrace_disable_daemon(void) @@ -787,7 +787,7 @@ static int ftrace_update_code(void) !ftrace_enabled || !ftraced_trigger) return 0; - stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); + stop_machine(__ftrace_update_code, NULL, NULL); return 1; } @@ -1564,7 +1564,7 @@ static int __init ftrace_dynamic_init(void) addr = (unsigned long)ftrace_record_ip; - stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + stop_machine(ftrace_dyn_arch_init, &addr, NULL); /* ftrace_dyn_arch_init places the return code in addr */ if (addr) { -- cgit v1.2.3 From 157124c11f4217733691223ecf5ee47558ae9495 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Jul 2008 11:53:11 +0200 Subject: sched: fix warning in hrtick_start_fair() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Benjamin Herrenschmidt reported: > I get that on ppc64 ... > > In file included from kernel/sched.c:1595: > kernel/sched_fair.c: In function ‘hrtick_start_fair’: > kernel/sched_fair.c:902: warning: comparison of distinct pointer types lacks a cast > > Probably harmless but annoying. s64 delta = slice - ran; --> delta = max(10000LL, delta); Probably ppc64's s64 is long vs long long.. I think hpa was looking at sanitizing all these 64bit types across the architectures. Use max_t with an explicit type meanwhile. Reported-by: Benjamin Herrenschmid Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cf2cd6ce4cb2..0fe94ea43f32 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -899,7 +899,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) * doesn't make sense. Rely on vruntime for fairness. */ if (rq->curr != p) - delta = max(10000LL, delta); + delta = max_t(s64, 10000LL, delta); hrtick_start(rq, delta); } -- cgit v1.2.3 From 94f565598827e2015dce97f4c1ac4871ab84407b Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 27 Jul 2008 20:27:06 +0900 Subject: sched: fix SCHED_HRTICK dependency Currently, it seems SCHED_HRTICK allowed for !SMP. But, it seems to have no dependency of it. Fix it. 
Signed-off-by: OGAWA Hirofumi Signed-off-by: Ingo Molnar --- kernel/Kconfig.hz | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 382dd5a8b2d7..94fabd534b03 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS + def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) -- cgit v1.2.3 From e26873bb10f722f10b1af9de05119a3d7cbc07b2 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Tue, 22 Jul 2008 16:51:15 -0400 Subject: sched: test runtime rather than period in global_rt_runtime() Test runtime rather than period Signed-off-by: Roel Kluin Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0236958addcb..0d1717b00225 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -834,7 +834,7 @@ static inline u64 global_rt_period(void) static inline u64 global_rt_runtime(void) { - if (sysctl_sched_rt_period < 0) + if (sysctl_sched_rt_runtime < 0) return RUNTIME_INF; return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; -- cgit v1.2.3 From 2c3d103ba90827cfb478bf10464d9b5b9cea369c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 25 Jul 2008 19:45:00 +0100 Subject: sched: move sched_clock before first use Move sched_clock() up to stop warning: weak declaration of `sched_clock' after first use results in unspecified behavior (if -fno-unit-at-a-time). Signed-off-by: Hugh Dickins Cc: Mike Travis Cc: Ben Herrenschmidt Cc: Linuxppc-dev@ozlabs.org Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 22ed55d1167f..5a2dc7d8fd98 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -32,6 +32,15 @@ #include #include +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); +} #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK @@ -321,16 +330,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #endif -/* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) -{ - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); -} - unsigned long long cpu_clock(int cpu) { unsigned long long clock; -- cgit v1.2.3 From 0e241ffd306c0896bb9959be7faa4d4cfcb706d9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 24 Jul 2008 16:58:42 -0700 Subject: locking: fix mutex @key parameter kernel-doc notation Fix @key parameter to mutex_init() and one of its callers. 
Warning(linux-2.6.26-git11//drivers/base/class.c:210): No description found for parameter 'key' Signed-off-by: Randy Dunlap Acked-by: Greg Kroah-Hartman Signed-off-by: Ingo Molnar --- kernel/mutex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index bcdc9ac8ef60..12c779dc65d4 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -34,6 +34,7 @@ /*** * mutex_init - initialize the mutex * @lock: the mutex to be initialized + * @key: the lock_class_key for the class; used by mutex lock debugging * * Initialize the mutex to unlocked state. * -- cgit v1.2.3 From e56b3bc7942982ac2589c942fb345e38bc7a341a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Jul 2008 11:32:33 -0700 Subject: cpu masks: optimize and clean up cpumask_of_cpu() Clean up and optimize cpumask_of_cpu(), by sharing all the zero words. Instead of stupidly generating all possible i=0...NR_CPUS 2^i patterns creating a huge array of constant bitmasks, realize that the zero words can be shared. In other words, on a 64-bit architecture, we only ever need 64 of these arrays - with a different bit set in one single world (with enough zero words around it so that we can create any bitmask by just offsetting in that big array). And then we just put enough zeroes around it that we can point every single cpumask to be one of those things. So when we have 4k CPU's, instead of having 4k arrays (of 4k bits each, with one bit set in each array - 2MB memory total), we have exactly 64 arrays instead, each 8k bits in size (64kB total). And then we just point cpumask(n) to the right position (which we can calculate dynamically). Once we have the right arrays, getting "cpumask(n)" ends up being: static inline const cpumask_t *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return (const cpumask_t *)p; } This brings other advantages and simplifications as well: - we are not wasting memory that is just filled with a single bit in various different places - we don't need all those games to re-create the arrays in some dense format, because they're already going to be dense enough. if we compile a kernel for up to 4k CPU's, "wasting" that 64kB of memory is a non-issue (especially since by doing this "overlapping" trick we probably get better cache behaviour anyway). [ mingo@elte.hu: Converted Linus's mails into a commit. See: http://lkml.org/lkml/2008/7/27/156 http://lkml.org/lkml/2008/7/28/320 Also applied a family filter - which also has the side-effect of leaving out the bits where Linus calls me an idio... Oh, never mind ;-) ] Signed-off-by: Ingo Molnar Cc: Rusty Russell Cc: Andrew Morton Cc: Al Viro Cc: Mike Travis Signed-off-by: Ingo Molnar --- kernel/cpu.c | 128 ++++++++++------------------------------------------------- 1 file changed, 20 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a35d8995dc8c..06a8358bb418 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -462,115 +462,27 @@ out: #endif /* CONFIG_SMP */ -/* 64 bits of zeros, for initializers. 
*/ -#if BITS_PER_LONG == 32 -#define Z64 0, 0 -#else -#define Z64 0 -#endif +/* + * cpu_bit_bitmap[] is a special, "compressed" data structure that + * represents all NR_CPUS bits binary values of 1< 4 - CMI0(4), CMI0(5), CMI0(6), CMI0(7), -#endif -#if NR_CPUS > 8 - CMI0(8), CMI0(9), CMI0(10), CMI0(11), - CMI0(12), CMI0(13), CMI0(14), CMI0(15), -#endif -#if NR_CPUS > 16 - CMI0(16), CMI0(17), CMI0(18), CMI0(19), - CMI0(20), CMI0(21), CMI0(22), CMI0(23), - CMI0(24), CMI0(25), CMI0(26), CMI0(27), - CMI0(28), CMI0(29), CMI0(30), CMI0(31), -#endif -#if NR_CPUS > 32 -#if BITS_PER_LONG == 32 - CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0), - CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0), - CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0), - CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0), - CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0), - CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0), - CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0), - CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0), -#else - CMI0(32), CMI0(33), CMI0(34), CMI0(35), - CMI0(36), CMI0(37), CMI0(38), CMI0(39), - CMI0(40), CMI0(41), CMI0(42), CMI0(43), - CMI0(44), CMI0(45), CMI0(46), CMI0(47), - CMI0(48), CMI0(49), CMI0(50), CMI0(51), - CMI0(52), CMI0(53), CMI0(54), CMI0(55), - CMI0(56), CMI0(57), CMI0(58), CMI0(59), - CMI0(60), CMI0(61), CMI0(62), CMI0(63), -#endif /* BITS_PER_LONG == 64 */ -#endif -#if NR_CPUS > 64 - CMI64(64, Z64), -#endif -#if NR_CPUS > 128 - CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64), -#endif -#if NR_CPUS > 256 - CMI256(256, Z256), -#endif -#if NR_CPUS > 512 - CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256), -#endif -#if NR_CPUS > 1024 - CMI1024(1024, Z1024), -#endif -#if NR_CPUS > 2048 - CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024), -#endif -#if NR_CPUS > 4096 -#error NR_CPUS too big. Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP +const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { + + MASK_DECLARE_8(0), MASK_DECLARE_8(8), + MASK_DECLARE_8(16), MASK_DECLARE_8(24), +#if BITS_PER_LONG > 32 + MASK_DECLARE_8(32), MASK_DECLARE_8(40), + MASK_DECLARE_8(48), MASK_DECLARE_8(56), #endif }; - -const cpumask_t *cpumask_of_cpu_map = cpumask_map; - -EXPORT_SYMBOL_GPL(cpumask_of_cpu_map); +EXPORT_SYMBOL_GPL(cpu_bit_bitmap); -- cgit v1.2.3 From cddb8a5c14aa89810b40495d94d3d2a0faee6619 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Jul 2008 15:46:29 -0700 Subject: mmu-notifiers: core With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages. There are secondary MMUs (with secondary sptes and secondary tlbs) too. sptes in the kvm case are shadow pagetables, but when I say spte in mmu-notifier context, I mean "secondary pte". In GRU case there's no actual secondary pte and there's only a secondary tlb because the GRU secondary MMU has no knowledge about sptes and every secondary tlb miss event in the MMU always generates a page fault that has to be resolved by the CPU (this is not the case of KVM where the a secondary tlb miss will walk sptes in hardware and it will refill the secondary tlb transparently to software if the corresponding spte is present). The same way zap_page_range has to invalidate the pte before freeing the page, the spte (and secondary tlb) must also be invalidated before any page is freed and reused. Currently we take a page_count pin on every page mapped by sptes, but that means the pages can't be swapped whenever they're mapped by any spte because they're part of the guest working set. 
Furthermore a spte unmap event can immediately lead to a page to be freed when the pin is released (so requiring the same complex and relatively slow tlb_gather smp safe logic we have in zap_page_range and that can be avoided completely if the spte unmap event doesn't require an unpin of the page previously mapped in the secondary MMU). The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know when the VM is swapping or freeing or doing anything on the primary MMU so that the secondary MMU code can drop sptes before the pages are freed, avoiding all page pinning and allowing 100% reliable swapping of guest physical address space. Furthermore it avoids the code that teardown the mappings of the secondary MMU, to implement a logic like tlb_gather in zap_page_range that would require many IPI to flush other cpu tlbs, for each fixed number of spte unmapped. To make an example: if what happens on the primary MMU is a protection downgrade (from writeable to wrprotect) the secondary MMU mappings will be invalidated, and the next secondary-mmu-page-fault will call get_user_pages and trigger a do_wp_page through get_user_pages if it called get_user_pages with write=1, and it'll re-establishing an updated spte or secondary-tlb-mapping on the copied page. Or it will setup a readonly spte or readonly tlb mapping if it's a guest-read, if it calls get_user_pages with write=0. This is just an example. This allows to map any page pointed by any pte (and in turn visible in the primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an full MMU with both sptes and secondary-tlb like the shadow-pagetable layer with kvm), or a remote DMA in software like XPMEM (hence needing of schedule in XPMEM code to send the invalidate to the remote node, while no need to schedule in kvm/gru as it's an immediate event like invalidating primary-mmu pte). At least for KVM without this patch it's impossible to swap guests reliably. And having this feature and removing the page pin allows several other optimizations that simplify life considerably. Dependencies: 1) mm_take_all_locks() to register the mmu notifier when the whole VM isn't doing anything with "mm". This allows mmu notifier users to keep track if the VM is in the middle of the invalidate_range_begin/end critical section with an atomic counter incraese in range_begin and decreased in range_end. No secondary MMU page fault is allowed to map any spte or secondary tlb reference, while the VM is in the middle of range_begin/end as any page returned by get_user_pages in that critical section could later immediately be freed without any further ->invalidate_page notification (invalidate_range_begin/end works on ranges and ->invalidate_page isn't called immediately before freeing the page). To stop all page freeing and pagetable overwrites the mmap_sem must be taken in write mode and all other anon_vma/i_mmap locks must be taken too. 2) It'd be a waste to add branches in the VM if nobody could possibly run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of mmu notifiers, but this already allows to compile a KVM external module against a kernel with mmu notifiers enabled and from the next pull from kvm.git we'll start using them. And GRU/XPMEM will also be able to continue the development by enabling KVM=m in their config, until they submit all GRU/XPMEM GPLv2 code to the mainline kernel. 
Then they can also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n). This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM are all =n. The mmu_notifier_register call can fail because mm_take_all_locks may be interrupted by a signal and return -EINTR. Because mmu_notifier_reigster is used when a driver startup, a failure can be gracefully handled. Here an example of the change applied to kvm to register the mmu notifiers. Usually when a driver startups other allocations are required anyway and -ENOMEM failure paths exists already. struct kvm *kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + int err; if (!kvm) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops; + err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm); + if (err) { + kfree(kvm); + return ERR_PTR(err); + } + return kvm; } mmu_notifier_unregister returns void and it's reliable. The patch also adds a few needed but missing includes that would prevent kernel to compile after these changes on non-x86 archs (x86 didn't need them by luck). [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix mm/filemap_xip.c build] [akpm@linux-foundation.org: fix mm/mmu_notifier.c build] Signed-off-by: Andrea Arcangeli Signed-off-by: Nick Piggin Signed-off-by: Christoph Lameter Cc: Jack Steiner Cc: Robin Holt Cc: Nick Piggin Cc: Peter Zijlstra Cc: Kanoj Sarcar Cc: Roland Dreier Cc: Steve Wise Cc: Avi Kivity Cc: Hugh Dickins Cc: Rusty Russell Cc: Anthony Liguori Cc: Chris Wright Cc: Marcelo Tosatti Cc: Eric Dumazet Cc: "Paul E. McKenney" Cc: Izik Eidus Cc: Anthony Liguori Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8214ba7c8bb1..7ce2ebe84796 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -414,6 +415,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; + mmu_notifier_mm_init(mm); return mm; } @@ -446,6 +448,7 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + mmu_notifier_mm_destroy(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); -- cgit v1.2.3 From 1a4e564b7db999fbe5d88318c96ac8747699d417 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 29 Jul 2008 22:32:57 -0700 Subject: resource: add resource_size() Avoid one-off errors by introducing a resource_size() function. 
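For reference, the helper is presumably the usual one-liner added to include/linux/ioport.h; that hunk lies outside the kernel/ diff below, so take this as a sketch rather than a quote of the patch:

    static inline resource_size_t resource_size(const struct resource *res)
    {
            /* resources are inclusive ranges, hence the +1 */
            return res->end - res->start + 1;
    }

Call sites such as the one in kernel/resource.c below then stop open-coding the easy-to-botch "end - start + 1" arithmetic.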
Signed-off-by: Magnus Damm Cc: Ben Dooks Cc: Jean Delvare Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 74af2d7cb5a1..f5b518eabefe 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -490,7 +490,7 @@ resource_size_t resource_alignment(struct resource *res) { switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { case IORESOURCE_SIZEALIGN: - return res->end - res->start + 1; + return resource_size(res); case IORESOURCE_STARTALIGN: return res->start; default: -- cgit v1.2.3 From 5a3eb9f6b7c598529f832b8baa6458ab1cbab2c6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:18 -0700 Subject: cgroup: fix possible memory leak There's a leak if copy_from_user() returns failure. Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 657f8f8d93a5..28debe4e1488 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1424,14 +1424,17 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, if (buffer == NULL) return -ENOMEM; } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; + if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { + retval = -EFAULT; + goto out; + } buffer[nbytes] = 0; /* nul-terminate */ strstrip(buffer); retval = cft->write_string(cgrp, cft, buffer); if (!retval) retval = nbytes; +out: if (buffer != local_buffer) kfree(buffer); return retval; -- cgit v1.2.3 From 36553434f475a84b653e25e74490ee8df43b86d5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:19 -0700 Subject: cgroup: remove duplicate code in allocate_cg_link() - just call free_cg_links() in allocate_cg_links() - the list will get initialized in allocate_cg_links(), so don't init it twice Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 28debe4e1488..249a517591b8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -355,6 +355,17 @@ static struct css_set *find_existing_css_set( return NULL; } +static void free_cg_links(struct list_head *tmp) +{ + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + + list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { + list_del(&link->cgrp_link_list); + kfree(link); + } +} + /* * allocate_cg_links() allocates "count" cg_cgroup_link structures * and chains them on tmp through their cgrp_link_list fields. 
Returns 0 on @@ -363,17 +374,12 @@ static struct css_set *find_existing_css_set( static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; int i; INIT_LIST_HEAD(tmp); for (i = 0; i < count; i++) { link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { - list_for_each_entry_safe(link, saved_link, tmp, - cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } + free_cg_links(tmp); return -ENOMEM; } list_add(&link->cgrp_link_list, tmp); @@ -381,17 +387,6 @@ static int allocate_cg_links(int count, struct list_head *tmp) return 0; } -static void free_cg_links(struct list_head *tmp) -{ - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; - - list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } -} - /* * find_css_set() takes an existing cgroup group and a * cgroup object, and returns a css_set object that's @@ -956,7 +951,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct cgroupfs_root *root; struct list_head tmp_cg_links; - INIT_LIST_HEAD(&tmp_cg_links); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); -- cgit v1.2.3 From 55b6fd0162ace1e0f1b52c8c092565c115127ef6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:20 -0700 Subject: cgroup: uninline cgroup_has_css_refs() It's not small enough, and has 2 call sites. text data bss dec hex filename 12813 1676 4832 19321 4b79 cgroup.o.orig 12775 1676 4832 19283 4b53 cgroup.o Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 249a517591b8..13932abde159 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2368,7 +2368,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -static inline int cgroup_has_css_refs(struct cgroup *cgrp) +static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. 
Since we * already established that there are no tasks in the -- cgit v1.2.3 From 8d1e6266f512b3a94ef6d33528ff385f1aea0392 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:21 -0700 Subject: cpuset: a bit cleanup for scan_for_empty_cpusets() clean up hierarchy traversal code Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Paul Jackson Cc: Cliff Wickman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 91cf85b36dd5..3624dc0e95ed 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1833,24 +1833,21 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) */ static void scan_for_empty_cpusets(const struct cpuset *root) { + LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ - struct list_head queue; struct cgroup *cont; nodemask_t oldmems; - INIT_LIST_HEAD(&queue); - list_add_tail((struct list_head *)&root->stack_list, &queue); while (!list_empty(&queue)) { - cp = container_of(queue.next, struct cpuset, stack_list); + cp = list_first_entry(&queue, struct cpuset, stack_list); list_del(queue.next); list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); list_add_tail(&child->stack_list, &queue); } - cont = cp->css.cgroup; /* Continue past cpusets with all cpus, mems online */ if (cpus_subset(cp->cpus_allowed, cpu_online_map) && -- cgit v1.2.3 From f5393693e96393131a4a2e2743f883986d508503 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 29 Jul 2008 22:33:22 -0700 Subject: cpuset: speed up sched domain partition All child cpusets contain a subset of the parent's cpus, so we can skip them when partitioning sched domains. This decreases 'csa' greately for cpusets with multi-level hierarchy. 
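As a worked example: if the top cpuset has sched_load_balance enabled and there are, say, 100 descendant cpusets under it, the scan now records only the top cpuset in 'csa' and never even queues the descendants, where it previously walked all 100 and added every load-balancing one of them.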
Signed-off-by: Lai Jiangshan Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Reviewed-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3624dc0e95ed..c818c36b0c5f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -486,13 +486,38 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) static void update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) { - if (!dattr) - return; if (dattr->relax_domain_level < c->relax_domain_level) dattr->relax_domain_level = c->relax_domain_level; return; } +static void +update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) +{ + LIST_HEAD(q); + + list_add(&c->stack_list, &q); + while (!list_empty(&q)) { + struct cpuset *cp; + struct cgroup *cont; + struct cpuset *child; + + cp = list_first_entry(&q, struct cpuset, stack_list); + list_del(q.next); + + if (cpus_empty(cp->cpus_allowed)) + continue; + + if (is_sched_load_balance(cp)) + update_domain_attr(dattr, cp); + + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, &q); + } + } +} + /* * rebuild_sched_domains() * @@ -614,8 +639,16 @@ void rebuild_sched_domains(void) if (cpus_empty(cp->cpus_allowed)) continue; - if (is_sched_load_balance(cp)) + /* + * All child cpusets contain a subset of the parent's cpus, so + * just skip them, and then we call update_domain_attr_tree() + * to calc relax_domain_level of the corresponding sched + * domain. + */ + if (is_sched_load_balance(cp)) { csa[csn++] = cp; + continue; + } list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); @@ -686,7 +719,7 @@ restart: cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; if (dattr) - update_domain_attr(dattr + update_domain_attr_tree(dattr + nslot, b); } } -- cgit v1.2.3 From 93a6557558a13f9ff35213efeca483f353c39dd3 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:23 -0700 Subject: cpuset: fix wrong calculation of relax domain level When multiple cpusets are overlapping in their 'cpus' and hence they form a single sched domain, the largest sched_relax_domain_level among those should be used. But when top_cpuset's sched_load_balance is set, its sched_relax_domain_level is used regardless other sub-cpusets'. This patch fixes it by walking the cpuset hierarchy to find the largest sched_relax_domain_level. 
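For instance, if top_cpuset has sched_load_balance set with sched_relax_domain_level 1 while a descendant cpuset (which by definition shares its cpus) requests level 3, the resulting sched domain attribute should be 3; before this fix it silently stayed at top_cpuset's value.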
Signed-off-by: Lai Jiangshan Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Reviewed-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c818c36b0c5f..7a82e9033a7f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -616,7 +616,7 @@ void rebuild_sched_domains(void) dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); if (dattr) { *dattr = SD_ATTR_INIT; - update_domain_attr(dattr, &top_cpuset); + update_domain_attr_tree(dattr, &top_cpuset); } *doms = top_cpuset.cpus_allowed; goto rebuild; -- cgit v1.2.3 From aeed682421a5ebfbf46940e30c3d1caf3bc64304 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:24 -0700 Subject: cpuset: clean up cpuset hierarchy traversal code Use cpuset.stack_list rather than kfifo, so we avoid memory allocation for kfifo. Signed-off-by: Li Zefan Signed-off-by: Lai Jiangshan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7a82e9033a7f..d5ab79cf516d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include @@ -557,7 +556,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * So the reverse nesting would risk an ABBA deadlock. * * The three key local variables below are: - * q - a kfifo queue of cpuset pointers, used to implement a + * q - a linked-list queue of cpuset pointers, used to implement a * top-down scan of all cpusets. This scan loads a pointer * to each cpuset marked is_sched_load_balance into the * array 'csa'. 
For our purposes, rebuilding the schedulers @@ -592,7 +591,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) void rebuild_sched_domains(void) { - struct kfifo *q; /* queue of cpusets to be scanned */ + LIST_HEAD(q); /* queue of cpusets to be scanned*/ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -602,7 +601,6 @@ void rebuild_sched_domains(void) int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ - q = NULL; csa = NULL; doms = NULL; dattr = NULL; @@ -622,20 +620,19 @@ void rebuild_sched_domains(void) goto rebuild; } - q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); - if (IS_ERR(q)) - goto done; csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; - cp = &top_cpuset; - __kfifo_put(q, (void *)&cp, sizeof(cp)); - while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { + list_add(&top_cpuset.stack_list, &q); + while (!list_empty(&q)) { struct cgroup *cont; struct cpuset *child; /* scans child cpusets of cp */ + cp = list_first_entry(&q, struct cpuset, stack_list); + list_del(q.next); + if (cpus_empty(cp->cpus_allowed)) continue; @@ -652,7 +649,7 @@ void rebuild_sched_domains(void) list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); - __kfifo_put(q, (void *)&child, sizeof(cp)); + list_add_tail(&child->stack_list, &q); } } @@ -735,8 +732,6 @@ rebuild: put_online_cpus(); done: - if (q && !IS_ERR(q)) - kfifo_free(q); kfree(csa); /* Don't kfree(doms) -- partition_sched_domains() does that. */ /* Don't kfree(dattr) -- partition_sched_domains() does that. */ -- cgit v1.2.3 From 5def9a3a22e09c99717f41ab7f07ec9e1a1f3ec8 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 29 Jul 2008 22:33:31 -0700 Subject: markers: fix markers read barrier for multiple probes Paul pointed out two incorrect read barriers in the marker handler code in the path where multiple probes are connected. Those are ordering reads of "ptype" (single or multi probe marker), "multi" array pointer, and "multi" array data access. It should be ordered like this : read ptype smp_rmb() read multi array pointer smp_read_barrier_depends() access data referenced by multi array pointer The code with a single probe connected (optimized case, does not have to allocate an array) has correct memory ordering. It applies to kernel 2.6.26.x, 2.6.25.x and linux-next. Signed-off-by: Mathieu Desnoyers Cc: "Paul E. McKenney" Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/marker.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 971da5317903..7d1faecd7a51 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -125,6 +125,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) } else { struct marker_probe_closure *multi; int i; + /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; /* * multi points to an array, therefore accessing the array * depends on reading multi. However, even in this case, @@ -133,7 +138,6 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) * in the fast path, so put the explicit barrier here. 
*/ smp_read_barrier_depends(); - multi = mdata->multi; for (i = 0; multi[i].func; i++) { va_start(args, call_private); multi[i].func(multi[i].probe_private, call_private, @@ -174,6 +178,11 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) } else { struct marker_probe_closure *multi; int i; + /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; /* * multi points to an array, therefore accessing the array * depends on reading multi. However, even in this case, @@ -182,7 +191,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = mdata->multi; for (i = 0; multi[i].func; i++) multi[i].func(multi[i].probe_private, call_private, mdata->format, &args); -- cgit v1.2.3 From 641de9d8f505db055d451b50e6e38117f84e79bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 29 Jul 2008 22:33:38 -0700 Subject: printk: fix comment for printk ratelimiting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment assumed the burst to be one and the ratelimit used to be named printk_ratelimit_jiffies. Signed-off-by: Uwe Kleine-König Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a7f7559c5f6c..b51b1567bb55 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1309,14 +1309,14 @@ void tty_write_message(struct tty_struct *tty, char *msg) #if defined CONFIG_PRINTK -DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); /* * printk rate limiting, lifted from the networking subsystem. * - * This enforces a rate limit: not more than one kernel message - * every printk_ratelimit_jiffies to make a denial-of-service - * attack impossible. + * This enforces a rate limit: not more than 10 kernel messages + * every 5s to make a denial-of-service attack impossible. */ +DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); + int printk_ratelimit(void) { return __ratelimit(&printk_ratelimit_state); -- cgit v1.2.3 From 6af8bf3d86d55c98af6e453cb920ddc30867e5c7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Jul 2008 22:33:49 -0700 Subject: workqueues: add comments to __create_workqueue_key() Dmitry Adamushko pointed out that the error handling in __create_workqueue_key() is not clear, add the comment. Signed-off-by: Oleg Nesterov Cc: Dmitry Adamushko Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ec7e4f62aaff..4a26a1382df0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -830,10 +830,21 @@ struct workqueue_struct *__create_workqueue_key(const char *name, start_workqueue_thread(cwq, -1); } else { cpu_maps_update_begin(); + /* + * We must place this wq on list even if the code below fails. + * cpu_down(cpu) can remove cpu from cpu_populated_map before + * destroy_workqueue() takes the lock, in that case we leak + * cwq[cpu]->thread. + */ spin_lock(&workqueue_lock); list_add(&wq->list, &workqueues); spin_unlock(&workqueue_lock); - + /* + * We must initialize cwqs for each possible cpu even if we + * are going to call destroy_workqueue() finally. 
Otherwise + * cpu_up() can hit the uninitialized cwq once we drop the + * lock. + */ for_each_possible_cpu(cpu) { cwq = init_cpu_workqueue(wq, cpu); if (err || !cpu_online(cpu)) -- cgit v1.2.3 From f718cd4add5aea9d379faff92f162571e356cc5f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 29 Jul 2008 22:33:52 -0700 Subject: sched: make scheduler sysfs attributes sysdev class devices They are really class devices, but were incorrectly declared. This leads to crashes with the recent changes that makes non normal sysdevs use a different prototype. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andi Kleen Cc: Ingo Molnar Cc: Pierre Ossman Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0236958addcb..21f7da94662e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7671,34 +7671,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *page) +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } -static ssize_t sched_mc_power_savings_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); } -static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, - sched_mc_power_savings_store); +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *page) +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } -static ssize_t sched_smt_power_savings_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); } -static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, + sched_smt_power_savings_show, sched_smt_power_savings_store); #endif -- cgit v1.2.3 From e4e4e534faa3c2be4e165ce414f44b76ada7208c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 14 Apr 2008 08:50:02 +0200 Subject: sched clock: revert various sched_clock() changes Found an interactivity problem on a quad core test-system - simple CPU loops would occasionally delay the system un an unacceptable way. After much debugging with Peter Zijlstra it turned out that the problem is caused by the string of sched_clock() changes - they caused the CPU clock to jump backwards a bit - which confuses the scheduler arithmetics. (which is unsigned for performance reasons) So revert: # c300ba2: sched_clock: and multiplier for TSC to gtod drift # c0c8773: sched_clock: only update deltas with local reads. # af52a90: sched_clock: stop maximum check on NO HZ # f7cce27: sched_clock: widen the max and min time This solves the interactivity problems. 
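A minimal illustration (not taken from the patch) of why a backwards jump is fatal for the unsigned arithmetic the scheduler uses:

    u64 prev  = sched_clock();      /* say 1000000 ns */
    u64 now   = sched_clock();      /* clock jumped back to 999000 ns */
    u64 delta = now - prev;         /* wraps to 18446744073709550616, not -1000 */

Any code treating delta as a runtime or wait-time measurement then sees an absurdly large value, which is exactly the confusion described above.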
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mike Galbraith --- kernel/sched_clock.c | 109 ++++++----------------------------------------- kernel/time/tick-sched.c | 2 - 2 files changed, 13 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 5a2dc7d8fd98..9a7844158ae8 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -44,11 +44,6 @@ unsigned long long __attribute__((weak)) sched_clock(void) #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -#define MULTI_SHIFT 15 -/* Max is double, Min is 1/2 */ -#define MAX_MULTI (2LL << MULTI_SHIFT) -#define MIN_MULTI (1LL << (MULTI_SHIFT-1)) - struct sched_clock_data { /* * Raw spinlock - this is a special case: this might be called @@ -62,10 +57,6 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; - s64 multi; -#ifdef CONFIG_NO_HZ - int check_max; -#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); @@ -97,53 +88,18 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; - scd->multi = 1 << MULTI_SHIFT; -#ifdef CONFIG_NO_HZ - scd->check_max = 1; -#endif } sched_clock_running = 1; } -#ifdef CONFIG_NO_HZ -/* - * The dynamic ticks makes the delta jiffies inaccurate. This - * prevents us from checking the maximum time update. - * Disable the maximum check during stopped ticks. - */ -void sched_clock_tick_stop(int cpu) -{ - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->check_max = 0; -} - -void sched_clock_tick_start(int cpu) -{ - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->check_max = 1; -} - -static int check_max(struct sched_clock_data *scd) -{ - return scd->check_max; -} -#else -static int check_max(struct sched_clock_data *scd) -{ - return 1; -} -#endif /* CONFIG_NO_HZ */ - /* * update the percpu scd from the raw @now value * * - filter out backward motion * - use jiffies to generate a min,max window to clip the raw values */ -static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) +static void __update_sched_clock(struct sched_clock_data *scd, u64 now) { unsigned long now_jiffies = jiffies; long delta_jiffies = now_jiffies - scd->tick_jiffies; @@ -152,31 +108,16 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim s64 delta = now - scd->prev_raw; WARN_ON_ONCE(!irqs_disabled()); - - /* - * At schedule tick the clock can be just under the gtod. We don't - * want to push it too prematurely. - */ - min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC); - if (min_clock > TICK_NSEC) - min_clock -= TICK_NSEC / 2; + min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; if (unlikely(delta < 0)) { clock++; goto out; } - /* - * The clock must stay within a jiffie of the gtod. - * But since we may be at the start of a jiffy or the end of one - * we add another jiffy buffer. 
- */ - max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; - - delta *= scd->multi; - delta >>= MULTI_SHIFT; + max_clock = min_clock + TICK_NSEC; - if (unlikely(clock + delta > max_clock) && check_max(scd)) { + if (unlikely(clock + delta > max_clock)) { if (clock < max_clock) clock = max_clock; else @@ -189,12 +130,9 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim if (unlikely(clock < min_clock)) clock = min_clock; - if (time) - *time = clock; - else { - scd->prev_raw = now; - scd->clock = clock; - } + scd->prev_raw = now; + scd->tick_jiffies = now_jiffies; + scd->clock = clock; } static void lock_double_clock(struct sched_clock_data *data1, @@ -238,26 +176,21 @@ u64 sched_clock_cpu(int cpu) now -= scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); - - __update_sched_clock(scd, now, &clock); - - __raw_spin_unlock(&scd->lock); - } else { __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now, NULL); - clock = scd->clock; - __raw_spin_unlock(&scd->lock); } + __update_sched_clock(scd, now); + clock = scd->clock; + + __raw_spin_unlock(&scd->lock); + return clock; } void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); - unsigned long now_jiffies = jiffies; - s64 mult, delta_gtod, delta_raw; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -269,29 +202,14 @@ void sched_clock_tick(void) now = sched_clock(); __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now, NULL); + __update_sched_clock(scd, now); /* * update tick_gtod after __update_sched_clock() because that will * already observe 1 new jiffy; adding a new tick_gtod to that would * increase the clock 2 jiffies. */ - delta_gtod = now_gtod - scd->tick_gtod; - delta_raw = now - scd->tick_raw; - - if ((long)delta_raw > 0) { - mult = delta_gtod << MULTI_SHIFT; - do_div(mult, delta_raw); - scd->multi = mult; - if (scd->multi > MAX_MULTI) - scd->multi = MAX_MULTI; - else if (scd->multi < MIN_MULTI) - scd->multi = MIN_MULTI; - } else - scd->multi = 1 << MULTI_SHIFT; - scd->tick_raw = now; scd->tick_gtod = now_gtod; - scd->tick_jiffies = now_jiffies; __raw_spin_unlock(&scd->lock); } @@ -321,7 +239,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) __raw_spin_lock(&scd->lock); scd->prev_raw = now; scd->clock += delta_ns; - scd->multi = 1 << MULTI_SHIFT; __raw_spin_unlock(&scd->lock); touch_softlockup_watchdog(); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 825b4c00fe44..f5da526424a9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -289,7 +289,6 @@ void tick_nohz_stop_sched_tick(int inidle) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); - sched_clock_tick_stop(cpu); } /* @@ -392,7 +391,6 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - sched_clock_tick_start(cpu); cpu_clear(cpu, nohz_cpu_mask); /* -- cgit v1.2.3 From 50526968e99afbca34924abcb04658b6dd5c5ea5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jul 2008 09:39:48 +0200 Subject: sched clock: clean up sched_clock_cpu() - simplify the remote clock rebasing Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mike Galbraith --- kernel/sched_clock.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 9a7844158ae8..b96559cb96a5 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -169,11 +169,8 @@ u64 sched_clock_cpu(int cpu) 
lock_double_clock(scd, my_scd); - now -= my_scd->tick_raw; - now += scd->tick_raw; - - now += my_scd->tick_gtod; - now -= scd->tick_gtod; + now += scd->tick_raw - my_scd->tick_raw; + now += my_scd->tick_gtod - scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); } else { -- cgit v1.2.3 From 18e4e36c66d6edbdefc639692206cdf01e468713 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jul 2008 10:13:35 +0200 Subject: sched: eliminate scd->prev_raw eliminate prev_raw and use tick_raw instead. It's enough to base the current time on the scheduler tick timestamp alone - the monotonicity and maximum checks will prevent any damage. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mike Galbraith --- kernel/sched_clock.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index b96559cb96a5..4b8474c966dc 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -53,7 +53,6 @@ struct sched_clock_data { raw_spinlock_t lock; unsigned long tick_jiffies; - u64 prev_raw; u64 tick_raw; u64 tick_gtod; u64 clock; @@ -84,7 +83,6 @@ void sched_clock_init(void) scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; scd->tick_jiffies = now_jiffies; - scd->prev_raw = 0; scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; @@ -105,7 +103,7 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) long delta_jiffies = now_jiffies - scd->tick_jiffies; u64 clock = scd->clock; u64 min_clock, max_clock; - s64 delta = now - scd->prev_raw; + s64 delta = now - scd->tick_raw; WARN_ON_ONCE(!irqs_disabled()); min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; @@ -130,7 +128,6 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) if (unlikely(clock < min_clock)) clock = min_clock; - scd->prev_raw = now; scd->tick_jiffies = now_jiffies; scd->clock = clock; } @@ -234,7 +231,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) * rq clock: */ __raw_spin_lock(&scd->lock); - scd->prev_raw = now; scd->clock += delta_ns; __raw_spin_unlock(&scd->lock); -- cgit v1.2.3 From 56b906126d33904d4d67615d0d5b95dbdf1f27ca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jul 2008 10:15:55 +0200 Subject: sched clock: simplify __update_sched_clock() - return the current clock instead of letting callers fetch it from scd->clock Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mike Galbraith --- kernel/sched_clock.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 4b8474c966dc..857a1291fd23 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -97,7 +97,7 @@ void sched_clock_init(void) * - filter out backward motion * - use jiffies to generate a min,max window to clip the raw values */ -static void __update_sched_clock(struct sched_clock_data *scd, u64 now) +static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) { unsigned long now_jiffies = jiffies; long delta_jiffies = now_jiffies - scd->tick_jiffies; @@ -130,6 +130,8 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) scd->tick_jiffies = now_jiffies; scd->clock = clock; + + return clock; } static void lock_double_clock(struct sched_clock_data *data1, @@ -174,8 +176,7 @@ u64 sched_clock_cpu(int cpu) __raw_spin_lock(&scd->lock); } - __update_sched_clock(scd, now); - clock = scd->clock; + clock = __update_sched_clock(scd, now); __raw_spin_unlock(&scd->lock); -- cgit v1.2.3 
From 4a273f209cc95d148f79b4c96d3d03997b44ffda Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 30 Jul 2008 10:22:07 +0200 Subject: sched clock: couple local and remote clocks When taking the time of a remote CPU, use the opportunity to couple (sync) the clocks to each other. (in a monotonic way) Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Mike Galbraith --- kernel/sched_clock.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 857a1291fd23..074edc989379 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -149,7 +149,7 @@ static void lock_double_clock(struct sched_clock_data *data1, u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); - u64 now, clock; + u64 now, clock, this_clock, remote_clock; if (unlikely(!sched_clock_running)) return 0ull; @@ -158,26 +158,36 @@ u64 sched_clock_cpu(int cpu) now = sched_clock(); if (cpu != raw_smp_processor_id()) { - /* - * in order to update a remote cpu's clock based on our - * unstable raw time rebase it against: - * tick_raw (offset between raw counters) - * tick_gotd (tick offset between cpus) - */ struct sched_clock_data *my_scd = this_scd(); lock_double_clock(scd, my_scd); - now += scd->tick_raw - my_scd->tick_raw; - now += my_scd->tick_gtod - scd->tick_gtod; + this_clock = __update_sched_clock(my_scd, now); + remote_clock = scd->clock; + + /* + * Use the opportunity that we have both locks + * taken to couple the two clocks: we take the + * larger time as the latest time for both + * runqueues. (this creates monotonic movement) + */ + if (likely(remote_clock < this_clock)) { + clock = this_clock; + scd->clock = clock; + } else { + /* + * Should be rare, but possible: + */ + clock = remote_clock; + my_scd->clock = remote_clock; + } __raw_spin_unlock(&my_scd->lock); } else { __raw_spin_lock(&scd->lock); + clock = __update_sched_clock(scd, now); } - clock = __update_sched_clock(scd, now); - __raw_spin_unlock(&scd->lock); return clock; @@ -223,7 +233,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); void sched_clock_idle_wakeup_event(u64 delta_ns) { struct sched_clock_data *scd = this_scd(); - u64 now = sched_clock(); /* * Override the previous timestamp and ignore all -- cgit v1.2.3 From 419ca3f13532793b81aff09f80c60af3eacbb43d Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 29 Jul 2008 21:45:03 -0700 Subject: lockdep: fix combinatorial explosion in lock subgraph traversal When we traverse the graph, either forwards or backwards, we are interested in whether a certain property exists somewhere in a node reachable in the graph. Therefore it is never necessary to traverse through a node more than once to get a correct answer to the given query. Take advantage of this property using a global ID counter so that we need not clear all the markers in all the lock_class entries before doing a traversal. A new ID is choosen when we start to traverse, and we continue through a lock_class only if it's ID hasn't been marked with the new value yet. This short-circuiting is essential especially for high CPU count systems. The scheduler has a runqueue per cpu, and needs to take two runqueue locks at a time, which leads to long chains of backwards and forwards subgraphs from these runqueue lock nodes. Without the short-circuit implemented here, a graph traversal on a runqueue lock can take up to (1 << (N - 1)) checks on a system with N cpus. 
For anything more than 16 cpus or so, lockdep will eventually bring the machine to a complete standstill. Signed-off-by: David S. Miller Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/lockdep_internals.h | 3 ++ kernel/lockdep_proc.c | 34 ++---------------- 3 files changed, 92 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index d38a64362973..6999e64fc248 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -372,6 +372,19 @@ unsigned int nr_process_chains; unsigned int max_lockdep_depth; unsigned int max_recursion_depth; +static unsigned int lockdep_dependency_gen_id; + +static bool lockdep_dependency_visit(struct lock_class *source, + unsigned int depth) +{ + if (!depth) + lockdep_dependency_gen_id++; + if (source->dep_gen_id == lockdep_dependency_gen_id) + return true; + source->dep_gen_id = lockdep_dependency_gen_id; + return false; +} + #ifdef CONFIG_DEBUG_LOCKDEP /* * We cannot printk in early bootup code. Not even early_printk() @@ -558,6 +571,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth) { struct lock_list *entry; + if (lockdep_dependency_visit(class, depth)) + return; + if (DEBUG_LOCKS_WARN_ON(depth >= 20)) return; @@ -959,6 +975,67 @@ static int noinline print_infinite_recursion_bug(void) return 0; } +unsigned long __lockdep_count_forward_deps(struct lock_class *class, + unsigned int depth) +{ + struct lock_list *entry; + unsigned long ret = 1; + + if (lockdep_dependency_visit(class, depth)) + return 0; + + /* + * Recurse this class's dependency list: + */ + list_for_each_entry(entry, &class->locks_after, entry) + ret += __lockdep_count_forward_deps(entry->class, depth + 1); + + return ret; +} + +unsigned long lockdep_count_forward_deps(struct lock_class *class) +{ + unsigned long ret, flags; + + local_irq_save(flags); + __raw_spin_lock(&lockdep_lock); + ret = __lockdep_count_forward_deps(class, 0); + __raw_spin_unlock(&lockdep_lock); + local_irq_restore(flags); + + return ret; +} + +unsigned long __lockdep_count_backward_deps(struct lock_class *class, + unsigned int depth) +{ + struct lock_list *entry; + unsigned long ret = 1; + + if (lockdep_dependency_visit(class, depth)) + return 0; + /* + * Recurse this class's dependency list: + */ + list_for_each_entry(entry, &class->locks_before, entry) + ret += __lockdep_count_backward_deps(entry->class, depth + 1); + + return ret; +} + +unsigned long lockdep_count_backward_deps(struct lock_class *class) +{ + unsigned long ret, flags; + + local_irq_save(flags); + __raw_spin_lock(&lockdep_lock); + ret = __lockdep_count_backward_deps(class, 0); + __raw_spin_unlock(&lockdep_lock); + local_irq_restore(flags); + + return ret; +} + /* * Prove that the dependency graph starting at can not * lead to . Print an error and return 0 if it does. 
@@ -968,6 +1045,9 @@ check_noncircular(struct lock_class *source, unsigned int depth) { struct lock_list *entry; + if (lockdep_dependency_visit(source, depth)) + return 1; + debug_atomic_inc(&nr_cyclic_check_recursions); if (depth > max_recursion_depth) max_recursion_depth = depth; @@ -1011,6 +1091,9 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) struct lock_list *entry; int ret; + if (lockdep_dependency_visit(source, depth)) + return 1; + if (depth > max_recursion_depth) max_recursion_depth = depth; if (depth >= RECURSION_LIMIT) @@ -1050,6 +1133,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) struct lock_list *entry; int ret; + if (lockdep_dependency_visit(source, depth)) + return 1; + if (!__raw_spin_is_locked(&lockdep_lock)) return DEBUG_LOCKS_WARN_ON(1); diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index c3600a091a28..68d44ec77ab5 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -53,6 +53,9 @@ extern unsigned int nr_process_chains; extern unsigned int max_lockdep_depth; extern unsigned int max_recursion_depth; +extern unsigned long lockdep_count_forward_deps(struct lock_class *); +extern unsigned long lockdep_count_backward_deps(struct lock_class *); + #ifdef CONFIG_DEBUG_LOCKDEP /* * Various lockdep statistics: diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 9b0e940e2545..6252ff799d19 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -63,34 +63,6 @@ static void l_stop(struct seq_file *m, void *v) { } -static unsigned long count_forward_deps(struct lock_class *class) -{ - struct lock_list *entry; - unsigned long ret = 1; - - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_after, entry) - ret += count_forward_deps(entry->class); - - return ret; -} - -static unsigned long count_backward_deps(struct lock_class *class) -{ - struct lock_list *entry; - unsigned long ret = 1; - - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_before, entry) - ret += count_backward_deps(entry->class); - - return ret; -} - static void print_name(struct seq_file *m, struct lock_class *class) { char str[128]; @@ -124,10 +96,10 @@ static int l_show(struct seq_file *m, void *v) #ifdef CONFIG_DEBUG_LOCKDEP seq_printf(m, " OPS:%8ld", class->ops); #endif - nr_forward_deps = count_forward_deps(class); + nr_forward_deps = lockdep_count_forward_deps(class); seq_printf(m, " FD:%5ld", nr_forward_deps); - nr_backward_deps = count_backward_deps(class); + nr_backward_deps = lockdep_count_backward_deps(class); seq_printf(m, " BD:%5ld", nr_backward_deps); get_usage_chars(class, &c1, &c2, &c3, &c4); @@ -350,7 +322,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) nr_hardirq_read_unsafe++; - sum_forward_deps += count_forward_deps(class); + sum_forward_deps += lockdep_count_forward_deps(class); } #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); -- cgit v1.2.3 From 5e710e37bde120bb069f691bee68e69ef4393173 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Jul 2008 13:26:57 +0200 Subject: lockdep: change scheduler annotation While thinking about David's graph walk lockdep patch it _finally_ dawned on me that there is no reason we have a lock class per cpu ... 
Sorry for being dense :-/ The below changes the annotation from a lock class per cpu, to a single nested lock, as the scheduler never holds more that 2 rq locks at a time anyway. If there was code requiring holding all rq locks this would not work and the original annotation would be the only option, but that not being the case, this is a much lighter one. Compiles and boots on a 2-way x86_64. Signed-off-by: Peter Zijlstra Cc: David Miller Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0236958addcb..655f1db26b12 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -600,7 +600,6 @@ struct rq { /* BKL stats */ unsigned int bkl_count; #endif - struct lock_class_key rq_lock_key; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -2759,10 +2758,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) } else { if (rq1 < rq2) { spin_lock(&rq1->lock); - spin_lock(&rq2->lock); + spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&rq2->lock); - spin_lock(&rq1->lock); + spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); } } update_rq_clock(rq1); @@ -2805,10 +2804,10 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); - spin_lock(&this_rq->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); ret = 1; } else - spin_lock(&busiest->lock); + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); } return ret; } @@ -7998,7 +7997,6 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); - lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); -- cgit v1.2.3 From a9b60bf4c29e07a5a2f26a6f74937972fee9b58b Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 1 Aug 2008 08:39:34 -0500 Subject: kgdb: fix kgdb_validate_break_address to perform a mem write A regression to the kgdb core was found in the case of using the CONFIG_DEBUG_RODATA kernel option. When this option is on, a breakpoint cannot be written into any readonly memory page. When an external debugger requests a breakpoint to get set, the kgdb_validate_break_address() was only checking to see if the address to place the breakpoint was readable and lacked a write check. This patch changes the validate routine to try reading (via the breakpoint set request) and also to try immediately writing the break point. If either fails, an error is correctly returned and the debugger behaves correctly. Then an end user can make the descision to use hardware breakpoints. Also update the documentation to reflect that using CONFIG_DEBUG_RODATA will inhibit the use of software breakpoints. 
Signed-off-by: Jason Wessel --- kernel/kgdb.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 3ec23c3ec97f..c0d45b2c4d79 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -166,13 +166,6 @@ early_param("nokgdbroundup", opt_nokgdbroundup); * Weak aliases for breakpoint management, * can be overriden by architectures when needed: */ -int __weak kgdb_validate_break_address(unsigned long addr) -{ - char tmp_variable[BREAK_INSTR_SIZE]; - - return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE); -} - int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) { int err; @@ -191,6 +184,25 @@ int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) (char *)bundle, BREAK_INSTR_SIZE); } +int __weak kgdb_validate_break_address(unsigned long addr) +{ + char tmp_variable[BREAK_INSTR_SIZE]; + int err; + /* Validate setting the breakpoint and then removing it. In the + * remove fails, the kernel needs to emit a bad message because we + * are deep trouble not being able to put things back the way we + * found them. + */ + err = kgdb_arch_set_breakpoint(addr, tmp_variable); + if (err) + return err; + err = kgdb_arch_remove_breakpoint(addr, tmp_variable); + if (err) + printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " + "memory destroyed at: %lx", addr); + return err; +} + unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) { return instruction_pointer(regs); -- cgit v1.2.3 From 25fc999913839a45cbb48ac7872e67f7521e7ed9 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 1 Aug 2008 08:39:35 -0500 Subject: kgdb: fix gdb serial thread queries The command "info threads" did not work correctly with kgdb. It would result in a silent kernel hang if used. This patach addresses several problems. - Fix use of deprecated NR_CPUS - Fix kgdb to not walk linearly through the pid space - Correctly implement shadow pids - Change the threads per query to a #define - Fix kgdb_hex2long to work with negated values The threads 0 and -1 are reserved to represent the current task. That means that CPU 0 will start with a shadow thread id of -2, and CPU 1 will have a shadow thread id of -3, etc... From the debugger you can switch to a shadow thread to see what one of the other cpus was doing, however it is not possible to execute run control operations on any other cpu execept the cpu executing the kgdb_handle_exception(). 
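In code terms the mapping is (a condensed sketch of the shadow_pid()/getthread() logic in the hunks below):

    shadow_tid = -cpu - 2;          /* CPU0 -> -2, CPU1 -> -3, ... */
    cpu        = -shadow_tid - 2;   /* inverse, used when resolving a query */
    /* tids 0 and -1 still mean "the task currently handling the exception" */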
Signed-off-by: Jason Wessel --- kernel/kgdb.c | 68 +++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index c0d45b2c4d79..eaa21fc9ad1d 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -56,12 +56,14 @@ static int kgdb_break_asap; +#define KGDB_MAX_THREAD_QUERY 17 struct kgdb_state { int ex_vector; int signo; int err_code; int cpu; int pass_exception; + unsigned long thr_query; unsigned long threadid; long kgdb_usethreadid; struct pt_regs *linux_regs; @@ -445,9 +447,14 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) { int hex_val; int num = 0; + int negate = 0; *long_val = 0; + if (**ptr == '-') { + negate = 1; + (*ptr)++; + } while (**ptr) { hex_val = hex(**ptr); if (hex_val < 0) @@ -458,6 +465,9 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) (*ptr)++; } + if (negate) + *long_val = -*long_val; + return num; } @@ -527,10 +537,16 @@ static void int_to_threadref(unsigned char *id, int value) static struct task_struct *getthread(struct pt_regs *regs, int tid) { /* - * Non-positive TIDs are remapped idle tasks: + * Non-positive TIDs are remapped to the cpu shadow information */ - if (tid <= 0) - return idle_task(-tid); + if (tid == 0 || tid == -1) + tid = -atomic_read(&kgdb_active) - 2; + if (tid < 0) { + if (kgdb_info[-tid - 2].task) + return kgdb_info[-tid - 2].task; + else + return idle_task(-tid - 2); + } /* * find_task_by_pid_ns() does not take the tasklist lock anymore @@ -737,14 +753,15 @@ setundefined: } /* - * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: + * Remap normal tasks to their real PID, + * CPU shadow threads are mapped to -CPU - 2 */ static inline int shadow_pid(int realpid) { if (realpid) return realpid; - return -1-raw_smp_processor_id(); + return -raw_smp_processor_id() - 2; } static char gdbmsgbuf[BUFMAX + 1]; @@ -838,7 +855,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; } else { local_debuggerinfo = NULL; - for (i = 0; i < NR_CPUS; i++) { + for_each_online_cpu(i) { /* * Try to find the task on some other * or possibly this node if we do not @@ -972,10 +989,13 @@ static int gdb_cmd_reboot(struct kgdb_state *ks) /* Handle the 'q' query packets */ static void gdb_cmd_query(struct kgdb_state *ks) { - struct task_struct *thread; + struct task_struct *g; + struct task_struct *p; unsigned char thref[8]; char *ptr; int i; + int cpu; + int finished = 0; switch (remcom_in_buffer[1]) { case 's': @@ -985,22 +1005,34 @@ static void gdb_cmd_query(struct kgdb_state *ks) break; } - if (remcom_in_buffer[1] == 'f') - ks->threadid = 1; - + i = 0; remcom_out_buffer[0] = 'm'; ptr = remcom_out_buffer + 1; - - for (i = 0; i < 17; ks->threadid++) { - thread = getthread(ks->linux_regs, ks->threadid); - if (thread) { - int_to_threadref(thref, ks->threadid); + if (remcom_in_buffer[1] == 'f') { + /* Each cpu is a shadow thread */ + for_each_online_cpu(cpu) { + ks->thr_query = 0; + int_to_threadref(thref, -cpu - 2); pack_threadid(ptr, thref); ptr += BUF_THREAD_ID_SIZE; *(ptr++) = ','; i++; } } + + do_each_thread(g, p) { + if (i >= ks->thr_query && !finished) { + int_to_threadref(thref, p->pid); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + ks->thr_query++; + if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) + finished = 1; + } + i++; + } while_each_thread(g, p); + *(--ptr) = '\0'; break; @@ -1023,15 +1055,15 @@ static void 
gdb_cmd_query(struct kgdb_state *ks) error_packet(remcom_out_buffer, -EINVAL); break; } - if (ks->threadid > 0) { + if ((int)ks->threadid > 0) { kgdb_mem2hex(getthread(ks->linux_regs, ks->threadid)->comm, remcom_out_buffer, 16); } else { static char tmpstr[23 + BUF_THREAD_ID_SIZE]; - sprintf(tmpstr, "Shadow task %d for pid 0", - (int)(-ks->threadid-1)); + sprintf(tmpstr, "shadowCPU%d", + (int)(-ks->threadid - 2)); kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); } break; -- cgit v1.2.3 From ee1d315663ee0b494898f813a266d6244b263b4f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 7 Jul 2008 10:49:45 -0400 Subject: [PATCH] Audit: Collect signal info when SIGUSR2 is sent to auditd Makes the kernel audit subsystem collect information about the sending process when that process sends SIGUSR2 to the userspace audit daemon. SIGUSR2 is a new interesting signal to auditd telling auditd that it should try to start logging to disk again and the error condition which caused it to stop logging to disk (usually out of space) has been rectified. Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4699950e65bd..580a5389fd99 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2375,7 +2375,7 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_context *ctx = tsk->audit_context; if (audit_pid && t->tgid == audit_pid) { - if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { + if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { audit_sig_pid = tsk->pid; if (tsk->loginuid != -1) audit_sig_uid = tsk->loginuid; -- cgit v1.2.3 From 1d6c9649e236caa2e93e3647256216e57172b011 Mon Sep 17 00:00:00 2001 From: Vesa-Matti J Kari Date: Wed, 23 Jul 2008 00:06:13 +0300 Subject: kernel/audit.c control character detection is off-by-one Hello, According to my understanding there is an off-by-one bug in the function: audit_string_contains_control() in: kernel/audit.c Patch is included. I do not know from how many places the function is called from, but for example, SELinux Access Vector Cache tries to log untrusted filenames via call path: avc_audit() audit_log_untrustedstring() audit_log_n_untrustedstring() audit_string_contains_control() If audit_string_contains_control() detects control characters, then the string is hex-encoded. But the hex=0x7f dec=127, DEL-character, is not detected. I guess this could have at least some minor security implications, since a user can create a filename with 0x7f in it, causing logged filename to possibly look different when someone reads it on the terminal. 
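The boundary is easier to see with the check pulled out on its own. Below is a small userspace rendering of the test as it reads after the fix (the main() driver and the sample string are illustrative only): DEL, 0x7f, now falls outside the accepted 0x21..0x7e range and forces hex encoding.

#include <stdio.h>
#include <string.h>

/* Printable range accepted after the fix is 0x21..0x7e; a double quote or
 * anything outside that range flags the string for hex encoding. */
static int string_contains_control(const char *string, size_t len)
{
	const unsigned char *p;

	for (p = (const unsigned char *)string;
	     p < (const unsigned char *)string + len && *p; p++) {
		if (*p == '"' || *p < 0x21 || *p > 0x7e)
			return 1;
	}
	return 0;
}

int main(void)
{
	char name[] = { 'f', 'o', 'o', 0x7f, 0 };	/* filename containing DEL */

	printf("%d\n", string_contains_control(name, strlen(name)));	/* 1 after the fix */
	return 0;
}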
Signed-off-by: Vesa-Matti Kari Signed-off-by: Al Viro --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e092f1c0ce30..6d903182c6b7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1366,7 +1366,7 @@ int audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len && *p; p++) { - if (*p == '"' || *p < 0x21 || *p > 0x7f) + if (*p == '"' || *p < 0x21 || *p > 0x7e) return 1; } return 0; -- cgit v1.2.3 From 036bbf76ad9f83781590623111b80ba0b82930ac Mon Sep 17 00:00:00 2001 From: zhangxiliang Date: Fri, 1 Aug 2008 09:47:01 +0800 Subject: Re: [PATCH] the loginuid field should be output in all AUDIT_CONFIG_CHANGE audit messages > shouldn't these be using the "audit_get_loginuid(current)" and if we > are going to output loginuid we also should be outputting sessionid Thanks for your detailed explanation. I have made a new patch for outputing "loginuid" and "sessionid" by audit_get_loginuid(current) and audit_get_sessionid(current). If there are some deficiencies, please give me your indication. Signed-off-by: Zhang Xiliang Signed-off-by: Al Viro --- kernel/auditfilter.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 98c50cc671bb..b7d354e2b0ef 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1022,8 +1022,11 @@ static void audit_update_watch(struct audit_parent *parent, struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "auid=%u ses=%u", + audit_get_loginuid(current), + audit_get_sessionid(current)); audit_log_format(ab, - "op=updated rules specifying path="); + " op=updated rules specifying path="); audit_log_untrustedstring(ab, owatch->path); audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); @@ -1058,7 +1061,10 @@ static void audit_remove_parent_watches(struct audit_parent *parent) struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op=remove rule path="); + audit_log_format(ab, "auid=%u ses=%u", + audit_get_loginuid(current), + audit_get_sessionid(current)); + audit_log_format(ab, " op=remove rule path="); audit_log_untrustedstring(ab, w->path); if (r->filterkey) { audit_log_format(ab, " key="); -- cgit v1.2.3 From 980dfb0db340b95094732d78b55311f2c539c1af Mon Sep 17 00:00:00 2001 From: zhangxiliang Date: Fri, 1 Aug 2008 19:15:47 +0800 Subject: [PATCH] Fix the kernel panic of audit_filter_task when key field is set When calling audit_filter_task(), it calls audit_filter_rules() with audit_context is NULL. If the key field is set, the result in audit_filter_rules() will be set to 1 and ctx->filterkey will be set to key. But the ctx is NULL in this condition, so kernel will panic. 
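A minimal userspace sketch of the failure mode and the guard (the struct and function names here are made up for illustration; only the rule->filterkey && ctx test mirrors the fix): when the filter is evaluated without a context, the key must simply not be copied.

#include <stdio.h>
#include <string.h>

struct audit_ctx  { char *filterkey; };
struct audit_rule { const char *filterkey; };

/* Sketch of the fixed behaviour: skip the key copy when no context exists. */
static void apply_rule(const struct audit_rule *rule, struct audit_ctx *ctx)
{
	if (rule->filterkey && ctx)
		ctx->filterkey = strdup(rule->filterkey);
}

int main(void)
{
	struct audit_rule rule = { .filterkey = "mykey" };

	apply_rule(&rule, NULL);	/* would dereference NULL without the ctx check */
	printf("rule with a key matched against a NULL context, no crash\n");
	return 0;
}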
Signed-off-by: Zhang Xiliang Signed-off-by: Al Viro --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 580a5389fd99..496c3dd37276 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -610,7 +610,7 @@ static int audit_filter_rules(struct task_struct *tsk, if (!result) return 0; } - if (rule->filterkey) + if (rule->filterkey && ctx) ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; -- cgit v1.2.3 From 20c6aaa39ab735c7ed78e4e5a214d250efae0a6e Mon Sep 17 00:00:00 2001 From: zhangxiliang Date: Thu, 31 Jul 2008 10:11:19 +0800 Subject: [PATCH] Fix the bug of using AUDIT_STATUS_RATE_LIMIT when set fail, no error output. When the "status_get->mask" is "AUDIT_STATUS_RATE_LIMIT || AUDIT_STATUS_BACKLOG_LIMIT". If "audit_set_rate_limit" fails and "audit_set_backlog_limit" succeeds, the "err" value will be greater than or equal to 0. It will miss the failure of rate set. Signed-off-by: Zhang Xiliang Acked-by: Eric Paris Signed-off-by: Al Viro --- kernel/audit.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6d903182c6b7..4414e93d8750 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -707,12 +707,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(status_get->enabled, loginuid, sessionid, sid); - if (err < 0) return err; + if (err < 0) + return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(status_get->failure, loginuid, sessionid, sid); - if (err < 0) return err; + if (err < 0) + return err; } if (status_get->mask & AUDIT_STATUS_PID) { int new_pid = status_get->pid; @@ -725,9 +727,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = new_pid; audit_nlk_pid = NETLINK_CB(skb).pid; } - if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) + if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(status_get->rate_limit, loginuid, sessionid, sid); + if (err < 0) + return err; + } if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) err = audit_set_backlog_limit(status_get->backlog_limit, loginuid, sessionid, sid); -- cgit v1.2.3 From 5c7edcd7ee6b77b88252fe4096dce1a46a60c829 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 31 Jul 2008 02:04:09 -0700 Subject: tracehook: fix exit_signal=0 case My commit 2b2a1ff64afbadac842bbc58c5166962cf4f7664 introduced a regression (sorry about that) for the odd case of exit_signal=0 (e.g. clone_flags=0). This is not a normal use, but it's used by a case in the glibc test suite. Dying with exit_signal=0 sends no signal, but it's supposed to wake up a parent's blocked wait*() calls (unlike the delayed_group_leader case). This fixes tracehook_notify_death() and its caller to distinguish a "signal 0" wakeup from the delayed_group_leader case (with no wakeup). 
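The return contract is the subtle part, so here is an illustrative userspace sketch of the three cases (the constants and their values below are assumptions for illustration, not the kernel's definitions; the point is only the three-way split): after the fix, a plain return of 0 still notifies and wakes the parent, and only the reap case skips straight to releasing the task.

#include <stdio.h>

/* Assumed illustrative values for the non-signal return codes. */
enum death_disposition { DEATH_REAP = -1, DEATH_DELAYED_GROUP_LEADER = -2 };

static const char *disposition(int signal)
{
	if (signal >= 0)		/* includes exit_signal == 0 */
		return "call do_notify_parent(): parent's wait*() is woken";
	if (signal == DEATH_REAP)
		return "no notification; task goes EXIT_DEAD and is released";
	return "delayed group leader: no wakeup";
}

int main(void)
{
	printf("signal 0       : %s\n", disposition(0));
	printf("DEATH_REAP     : %s\n", disposition(DEATH_REAP));
	printf("delayed leader : %s\n", disposition(DEATH_DELAYED_GROUP_LEADER));
	return 0;
}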
Signed-off-by: Roland McGrath Tested-by: Serge Hallyn Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index eb4d6470d1d0..38ec40630149 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -911,10 +911,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_signal = SIGCHLD; signal = tracehook_notify_death(tsk, &cookie, group_dead); - if (signal > 0) + if (signal >= 0) signal = do_notify_parent(tsk, signal); - tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE; + tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && @@ -927,7 +927,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tracehook_report_death(tsk, signal, cookie, group_dead); /* If the process is dead, release it - nobody will wait for it */ - if (signal < 0) + if (signal == DEATH_REAP) release_task(tsk); } -- cgit v1.2.3 From 1a61c88defcd611bd148d6c960b498e1b8bbbe00 Mon Sep 17 00:00:00 2001 From: zhangxiliang Date: Sat, 2 Aug 2008 10:56:37 +0800 Subject: Re: [PATCH] Fix the kernel panic of audit_filter_task when key field is set Sorry, I miss a blank between if and "(". And I add "unlikely" to check "ctx" in audit_match_perm() and audit_match_filetype(). This is a new patch for it. Signed-off-by: Zhang Xiliang Signed-off-by: Al Viro --- kernel/auditsc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 496c3dd37276..972f8e61d36a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -243,6 +243,9 @@ static inline int open_arg(int flags, int mask) static int audit_match_perm(struct audit_context *ctx, int mask) { + if (unlikely(!ctx)) + return 0; + unsigned n = ctx->major; switch (audit_classify_syscall(ctx->arch, n)) { case 0: /* native */ @@ -284,6 +287,10 @@ static int audit_match_filetype(struct audit_context *ctx, int which) { unsigned index = which & ~S_IFMT; mode_t mode = which & S_IFMT; + + if (unlikely(!ctx)) + return 0; + if (index >= ctx->name_count) return 0; if (ctx->names[index].ino == -1) -- cgit v1.2.3 From 725aad24c3ba96a7c06448c14c265a466cdbd663 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 3 Aug 2008 09:33:03 -0700 Subject: __sched_setscheduler: don't do any policy checks when not "user" The "user" parameter to __sched_setscheduler indicates whether the change is being done on behalf of a user process or not. If not, we shouldn't apply any permissions checks, so don't call security_task_setscheduler(). Signed-off-by: Jeremy Fitzhardinge Tested-by: Steve Wise Cc: Rusty Russell Cc: "Rafael J. Wysocki" Signed-off-by: Linus Torvalds --- kernel/sched.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 21f7da94662e..04160d277e7a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5004,19 +5004,21 @@ recheck: return -EPERM; } + if (user) { #ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (user - && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. 
+ */ + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + return -EPERM; #endif - retval = security_task_setscheduler(p, policy, param); - if (retval) - return retval; + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; + } + /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: -- cgit v1.2.3 From 32194450330be327f3b25bf6b66298bd122599e9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 5 Aug 2008 13:01:10 -0700 Subject: relay: fix "full buffer with exactly full last subbuffer" accounting problem In relay's current read implementation, if the buffer is completely full but hasn't triggered the buffer-full condition (i.e. the last write didn't cross the subbuffer boundary) and the last subbuffer is exactly full, the subbuffer accounting code erroneously finds nothing available. This patch fixes the problem. Signed-off-by: Tom Zanussi Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Cc: Jens Axboe Cc: Mathieu Desnoyers Cc: Andrea Righi Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 04006ef970b8..8d13a7855c08 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -944,6 +944,10 @@ static void relay_file_read_consume(struct rchan_buf *buf, size_t n_subbufs = buf->chan->n_subbufs; size_t read_subbuf; + if (buf->subbufs_produced == buf->subbufs_consumed && + buf->offset == buf->bytes_consumed) + return; + if (buf->bytes_consumed + bytes_consumed > subbuf_size) { relay_subbufs_consumed(buf->chan, buf->cpu, 1); buf->bytes_consumed = 0; @@ -975,6 +979,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) relay_file_read_consume(buf, read_pos, 0); + consumed = buf->subbufs_consumed; + if (unlikely(buf->offset > subbuf_size)) { if (produced == consumed) return 0; @@ -993,8 +999,12 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) if (consumed > produced) produced += n_subbufs * subbuf_size; - if (consumed == produced) + if (consumed == produced) { + if (buf->offset == subbuf_size && + buf->subbufs_produced > buf->subbufs_consumed) + return 1; return 0; + } return 1; } -- cgit v1.2.3 From 5b2becc8cffdccdd60c63099f592ddd35aa6c34f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 5 Aug 2008 13:01:13 -0700 Subject: semaphore: __down_common: use signal_pending_state() Change __down_common() to use signal_pending_state() instead of open coding. The changes in kernel/semaphore.o are just artifacts, the state checks are optimized away. 
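For reference, a userspace sketch of what the helper folds together (the state bits and the task fields are simplified stand-ins, not the kernel's definitions): an interruptible sleeper is woken by any pending signal, a killable one only by a fatal signal.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins; the real state flags and pending checks differ. */
#define TASK_INTERRUPTIBLE	0x01
#define TASK_KILLABLE		0x02

struct task { bool sig_pending; bool fatal_sig_pending; };

static bool signal_pending(const struct task *t)       { return t->sig_pending; }
static bool fatal_signal_pending(const struct task *t) { return t->fatal_sig_pending; }

/* One check instead of two open-coded ones, per the patch description. */
static bool signal_pending_state(long state, const struct task *t)
{
	if (!(state & (TASK_INTERRUPTIBLE | TASK_KILLABLE)))
		return false;
	if (!signal_pending(t))
		return false;
	return (state & TASK_INTERRUPTIBLE) || fatal_signal_pending(t);
}

int main(void)
{
	struct task t = { .sig_pending = true, .fatal_sig_pending = false };

	printf("%d\n", signal_pending_state(TASK_INTERRUPTIBLE, &t));	/* 1 */
	printf("%d\n", signal_pending_state(TASK_KILLABLE, &t));	/* 0: signal not fatal */
	return 0;
}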
Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/semaphore.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/semaphore.c b/kernel/semaphore.c index aaaeae8244e7..94a62c0d4ade 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -212,9 +212,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state, waiter.up = 0; for (;;) { - if (state == TASK_INTERRUPTIBLE && signal_pending(task)) - goto interrupted; - if (state == TASK_KILLABLE && fatal_signal_pending(task)) + if (signal_pending_state(state, task)) goto interrupted; if (timeout <= 0) goto timed_out; -- cgit v1.2.3 From c69ad71bcdecbaab82cfacb1dc967bd7fd967a3b Mon Sep 17 00:00:00 2001 From: David Brownell Date: Tue, 5 Aug 2008 13:01:14 -0700 Subject: genirq: better warning on irqchip->set_type() failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While I'm glad to finally see the hole fixed whereby passing an invalid IRQ trigger type to request_irq() would be ignored, the current diagnostic isn't quite useful. Fixed by also listing the trigger type which was rejected. Signed-off-by: David Brownell Acked-by: Uwe Kleine-König Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 152abfd3589f..0314074fa232 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -323,7 +323,8 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); if (ret) - pr_err("setting flow type for irq %u failed (%pF)\n", + pr_err("setting trigger mode %d for irq %u failed (%pF)\n", + (int)(flags & IRQF_TRIGGER_MASK), irq, chip->set_type); return ret; -- cgit v1.2.3 From d2dc1f4adb4b5b02d87e49e115e5107f4da790c0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 5 Aug 2008 13:01:31 -0700 Subject: dma: fix order calculation in dma_mark_declared_memory_occupied() get_order() takes byte-sized input, not a page-granular one. Irrespective of this fix I'm inclined to believe that this doesn't work right anyway - bitmap_allocate_region() has an implicit assumption of 'pos' being suitable for 'order', which this function doesn't seem to enforce (and since it's being called with a byte-granular value there's no reason to believe that the callers would make sure device_addr is passed accordingly - it's also not documented that way). Signed-off-by: Jan Beulich Cc: James E.J. 
Bottomley Cc: Ingo Molnar Cc: Dmitry Baryshkov Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma-coherent.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 7517115a8cce..91e96950cd52 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -77,15 +77,14 @@ void *dma_mark_declared_memory_occupied(struct device *dev, { struct dma_coherent_mem *mem = dev->dma_mem; int pos, err; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); - pages >>= PAGE_SHIFT; + size += device_addr & ~PAGE_MASK; if (!mem) return ERR_PTR(-EINVAL); pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); + err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); if (err != 0) return ERR_PTR(err); return mem->virt_base + (pos << PAGE_SHIFT); -- cgit v1.2.3 From bf1db69fbf4ff511e88736ce2e6318846f34492b Mon Sep 17 00:00:00 2001 From: Richard Hughes Date: Tue, 5 Aug 2008 13:01:35 -0700 Subject: pm_qos: spelling fixes A documentation cleanup patch. With a minor tweak to clarify units for kbs. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: mark gross Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pm_qos_params.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 8cb757026386..da9c2dda6a4e 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -24,7 +24,7 @@ * requirement that the application has is cleaned up when closes the file * pointer or exits the pm_qos_object will get an opportunity to clean up. * - * mark gross mgross@linux.intel.com + * Mark Gross */ #include @@ -211,8 +211,8 @@ EXPORT_SYMBOL_GPL(pm_qos_requirement); * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos - * performance charactoistics. It recomputes the agregate QoS expectations for - * the pm_qos_class of parrameters. + * performance characteristics. It recomputes the aggregate QoS expectations + * for the pm_qos_class of parameters. */ int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) { @@ -250,10 +250,10 @@ EXPORT_SYMBOL_GPL(pm_qos_add_requirement); * @name: identifies the request * @value: defines the qos request * - * Updates an existing qos requierement for the pm_qos_class of parameters along + * Updates an existing qos requirement for the pm_qos_class of parameters along * with updating the target pm_qos_class value. * - * If the named request isn't in the lest then no change is made. + * If the named request isn't in the list then no change is made. */ int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) { @@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_requirement); * @pm_qos_class: identifies which list of qos request to us * @name: identifies the request * - * Will remove named qos request from pm_qos_class list of parrameters and + * Will remove named qos request from pm_qos_class list of parameters and * recompute the current target value for the pm_qos_class. */ void pm_qos_remove_requirement(int pm_qos_class, char *name) @@ -319,7 +319,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); * @notifier: notifier block managed by caller. 
* * will register the notifier into a notification chain that gets called - * uppon changes to the pm_qos_class target value. + * upon changes to the pm_qos_class target value. */ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) { @@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(pm_qos_add_notifier); * @notifier: notifier block to be removed. * * will remove the notifier from the notification chain that gets called - * uppon changes to the pm_qos_class target value. + * upon changes to the pm_qos_class target value. */ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) { -- cgit v1.2.3 From cb3952bf7853667a1cb3515e67f27e67f0fce9e8 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Wed, 30 Jul 2008 14:46:50 +0400 Subject: DMA: make dma-coherent.c documentation kdoc-friendly Spotted by Randy. Acked-by: Randy Dunlap Signed-off-by: Dmitry Baryshkov Signed-off-by: Jesse Barnes --- kernel/dma-coherent.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 91e96950cd52..c1d4d5b4c61c 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -92,7 +92,7 @@ void *dma_mark_declared_memory_occupied(struct device *dev, EXPORT_SYMBOL(dma_mark_declared_memory_occupied); /** - * Try to allocate memory from the per-device coherent area. + * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area * * @dev: device from which we allocate memory * @size: size of requested memory area @@ -100,11 +100,11 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied); * @ret: This pointer will be filled with the virtual address * to allocated area. * - * This function should be only called from per-arch %dma_alloc_coherent() + * This function should be only called from per-arch dma_alloc_coherent() * to support allocation from per-device coherent memory pools. * * Returns 0 if dma_alloc_coherent should continue with allocating from - * generic memory areas, or !0 if dma_alloc_coherent should return %ret. + * generic memory areas, or !0 if dma_alloc_coherent should return @ret. */ int dma_alloc_from_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret) @@ -126,7 +126,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, } /** - * Try to free the memory allocated from per-device coherent memory pool. + * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool * @dev: device from which the memory was allocated * @order: the order of pages allocated * @vaddr: virtual address of allocated pages @@ -135,7 +135,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, * coherent memory pool and if so, releases that memory. * * Returns 1 if we correctly released the memory, or 0 if - * %dma_release_coherent() should proceed with releasing memory from + * dma_release_coherent() should proceed with releasing memory from * generic pools. */ int dma_release_from_coherent(struct device *dev, int order, void *vaddr) -- cgit v1.2.3 From c1955a3d4762e7a9bf84035eb3c4886a900f0d15 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 08:59:03 +0200 Subject: sched_clock: delay using sched_clock() Some arch's can't handle sched_clock() being called too early - delay this until sched_clock_init() has been called. 
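A small userspace sketch of the guard this adds (the raw_clock() stand-in and the driver are illustrative; the sched_clock_running/sched_clock_cpu() shape mirrors the diff below): callers that run before sched_clock_init() simply get 0 back instead of touching a clock source that is not ready yet.

#include <stdio.h>

static int sched_clock_running;

static unsigned long long raw_clock(void)
{
	return 123456789ULL;	/* stand-in for the arch sched_clock() */
}

static void sched_clock_init(void)
{
	sched_clock_running = 1;
}

static unsigned long long sched_clock_cpu(int cpu)
{
	if (!sched_clock_running)
		return 0;	/* too early: don't call the clock source */
	return raw_clock();
}

int main(void)
{
	printf("before init: %llu\n", sched_clock_cpu(0));	/* 0 */
	sched_clock_init();
	printf("after init : %llu\n", sched_clock_cpu(0));	/* real value */
	return 0;
}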
Reported-by: Bill Gatliff Signed-off-by: Peter Zijlstra Tested-by: Nishanth Aravamudan CC: Russell King - ARM Linux Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 074edc989379..204991a0bfa7 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -42,6 +42,8 @@ unsigned long long __attribute__((weak)) sched_clock(void) return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); } +static __read_mostly int sched_clock_running; + #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK struct sched_clock_data { @@ -70,8 +72,6 @@ static inline struct sched_clock_data *cpu_sdc(int cpu) return &per_cpu(sched_clock_data, cpu); } -static __read_mostly int sched_clock_running; - void sched_clock_init(void) { u64 ktime_now = ktime_to_ns(ktime_get()); @@ -248,6 +248,21 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ + +void sched_clock_init(void) +{ + sched_clock_running = 1; +} + +u64 sched_clock_cpu(int cpu) +{ + if (unlikely(!sched_clock_running)) + return 0; + + return sched_clock(); +} + #endif unsigned long long cpu_clock(int cpu) -- cgit v1.2.3 From 64aa348edc617dea17bbd01ddee4e47886d5ec8c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 09:30:21 +0200 Subject: lockdep: lock_set_subclass - reset a held lock's subclass this can be used to reset a held lock's subclass, for arbitrary-depth iterated data structures such as trees or lists which have per-node locks. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 6999e64fc248..e14d383dcb0b 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2660,6 +2660,55 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, return 1; } +static int +__lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class *class; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + return print_unlock_inbalance_bug(curr, lock, ip); + +found_it: + class = register_lock_class(lock, subclass, 0); + hlock->class = class; + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + for (; i < depth; i++) { + hlock = curr->held_locks + i; + if (!__lock_acquire(hlock->instance, + hlock->class->subclass, hlock->trylock, + hlock->read, hlock->check, hlock->hardirqs_off, + hlock->acquire_ip)) + return 0; + } + + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + /* * Remove the lock to the list of currently held locks in a * potentially non-nested (out of order) manner. 
This is a @@ -2824,6 +2873,26 @@ static void check_flags(unsigned long flags) #endif } +void +lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_set_subclass(lock, subclass, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(lock_set_subclass); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: -- cgit v1.2.3 From 1b12bbc747560ea68bcc132c3d05699e52271da0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 09:30:22 +0200 Subject: lockdep: re-annotate scheduler runqueues Instead of using a per-rq lock class, use the regular nesting operations. However, take extra care with double_lock_balance() as it can release the already held rq->lock (and therefore change its nesting class). So what can happen is: spin_lock(rq->lock); // this rq subclass 0 double_lock_balance(rq, other_rq); // release rq // acquire other_rq->lock subclass 0 // acquire rq->lock subclass 1 spin_unlock(other_rq->lock); leaving you with rq->lock in subclass 1 So a subsequent double_lock_balance() call can try to nest a subclass 1 lock while already holding a subclass 1 lock. Fix this by introducing double_unlock_balance() which releases the other rq's lock, but also re-sets the subclass for this rq's lock to 0. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++++-- kernel/sched_rt.c | 8 +++++--- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 655f1db26b12..9b2b6a85577d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2812,6 +2812,13 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } +static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + /* * If dest_cpu is allowed for this process, migrate the task to it. 
* This is accomplished by forcing the cpu_allowed mask to only @@ -3636,7 +3643,7 @@ redo: ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, CPU_NEWLY_IDLE, &all_pinned); - spin_unlock(&busiest->lock); + double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), *cpus); @@ -3751,7 +3758,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) else schedstat_inc(sd, alb_failed); } - spin_unlock(&target_rq->lock); + double_unlock_balance(busiest_rq, target_rq); } #ifdef CONFIG_NO_HZ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 908c04f9dad0..6163e4cf885b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -861,6 +861,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) #define RT_MAX_TRIES 3 static int double_lock_balance(struct rq *this_rq, struct rq *busiest); +static void double_unlock_balance(struct rq *this_rq, struct rq *busiest); + static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) @@ -1022,7 +1024,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) break; /* try again */ - spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; } @@ -1091,7 +1093,7 @@ static int push_rt_task(struct rq *rq) resched_task(lowest_rq->curr); - spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); ret = 1; out: @@ -1197,7 +1199,7 @@ static int pull_rt_task(struct rq *this_rq) } skip: - spin_unlock(&src_rq->lock); + double_unlock_balance(this_rq, src_rq); } return ret; -- cgit v1.2.3 From f82b217e3513fe3af342c0f3ee1494e86250c21c Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 11 Aug 2008 09:30:23 +0200 Subject: lockdep: shrink held_lock structure struct held_lock { u64 prev_chain_key; /* 0 8 */ struct lock_class * class; /* 8 8 */ long unsigned int acquire_ip; /* 16 8 */ struct lockdep_map * instance; /* 24 8 */ int irq_context; /* 32 4 */ int trylock; /* 36 4 */ int read; /* 40 4 */ int check; /* 44 4 */ int hardirqs_off; /* 48 4 */ /* size: 56, cachelines: 1 */ /* padding: 4 */ /* last cacheline: 56 bytes */ }; struct held_lock { u64 prev_chain_key; /* 0 8 */ long unsigned int acquire_ip; /* 8 8 */ struct lockdep_map * instance; /* 16 8 */ unsigned int class_idx:11; /* 24:21 4 */ unsigned int irq_context:2; /* 24:19 4 */ unsigned int trylock:1; /* 24:18 4 */ unsigned int read:2; /* 24:16 4 */ unsigned int check:2; /* 24:14 4 */ unsigned int hardirqs_off:1; /* 24:13 4 */ /* size: 32, cachelines: 1 */ /* padding: 4 */ /* bit_padding: 13 bits */ /* last cacheline: 32 bytes */ }; [mingo@elte.hu: shrunk hlock->class too] [peterz@infradead.org: fixup bit sizes] Signed-off-by: Dave Jones Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/lockdep.c | 113 ++++++++++++++++++++++++++------------------- kernel/lockdep_internals.h | 3 -- 2 files changed, 65 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e14d383dcb0b..d3c72ad8d09e 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -124,6 +124,15 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; unsigned long nr_lock_classes; static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +static inline struct lock_class *hlock_class(struct held_lock *hlock) +{ + if (!hlock->class_idx) { + DEBUG_LOCKS_WARN_ON(1); + return NULL; + } + return lock_classes + hlock->class_idx - 1; +} + #ifdef 
CONFIG_LOCK_STAT static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); @@ -222,7 +231,7 @@ static void lock_release_holdtime(struct held_lock *hlock) holdtime = sched_clock() - hlock->holdtime_stamp; - stats = get_lock_stats(hlock->class); + stats = get_lock_stats(hlock_class(hlock)); if (hlock->read) lock_time_inc(&stats->read_holdtime, holdtime); else @@ -518,7 +527,7 @@ static void print_lockdep_cache(struct lockdep_map *lock) static void print_lock(struct held_lock *hlock) { - print_lock_name(hlock->class); + print_lock_name(hlock_class(hlock)); printk(", at: "); print_ip_sym(hlock->acquire_ip); } @@ -948,7 +957,7 @@ static noinline int print_circular_bug_tail(void) if (debug_locks_silent) return 0; - this.class = check_source->class; + this.class = hlock_class(check_source); if (!save_trace(&this.trace)) return 0; @@ -1057,7 +1066,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) * Check this lock's dependency list: */ list_for_each_entry(entry, &source->locks_after, entry) { - if (entry->class == check_target->class) + if (entry->class == hlock_class(check_target)) return print_circular_bug_header(entry, depth+1); debug_atomic_inc(&nr_cyclic_checks); if (!check_noncircular(entry->class, depth+1)) @@ -1150,6 +1159,11 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) return 2; } + if (!source && debug_locks_off_graph_unlock()) { + WARN_ON(1); + return 0; + } + /* * Check this lock's dependency list: */ @@ -1189,9 +1203,9 @@ print_bad_irq_dependency(struct task_struct *curr, printk("\nand this task is already holding:\n"); print_lock(prev); printk("which would create a new lock dependency:\n"); - print_lock_name(prev->class); + print_lock_name(hlock_class(prev)); printk(" ->"); - print_lock_name(next->class); + print_lock_name(hlock_class(next)); printk("\n"); printk("\nbut this new dependency connects a %s-irq-safe lock:\n", @@ -1232,12 +1246,12 @@ check_usage(struct task_struct *curr, struct held_lock *prev, find_usage_bit = bit_backwards; /* fills in */ - ret = find_usage_backwards(prev->class, 0); + ret = find_usage_backwards(hlock_class(prev), 0); if (!ret || ret == 1) return ret; find_usage_bit = bit_forwards; - ret = find_usage_forwards(next->class, 0); + ret = find_usage_forwards(hlock_class(next), 0); if (!ret || ret == 1) return ret; /* ret == 2 */ @@ -1362,7 +1376,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, for (i = 0; i < curr->lockdep_depth; i++) { prev = curr->held_locks + i; - if (prev->class != next->class) + if (hlock_class(prev) != hlock_class(next)) continue; /* * Allow read-after-read recursion of the same @@ -1415,7 +1429,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ check_source = next; check_target = prev; - if (!(check_noncircular(next->class, 0))) + if (!(check_noncircular(hlock_class(next), 0))) return print_circular_bug_tail(); if (!check_prev_add_irq(curr, prev, next)) @@ -1439,8 +1453,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * chains - the second one will be new, but L1 already has * L2 added to its dependency list, due to the first chain.) 
*/ - list_for_each_entry(entry, &prev->class->locks_after, entry) { - if (entry->class == next->class) { + list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) { + if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; return 2; @@ -1451,26 +1465,28 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(prev->class, next->class, - &prev->class->locks_after, next->acquire_ip, distance); + ret = add_lock_to_list(hlock_class(prev), hlock_class(next), + &hlock_class(prev)->locks_after, + next->acquire_ip, distance); if (!ret) return 0; - ret = add_lock_to_list(next->class, prev->class, - &next->class->locks_before, next->acquire_ip, distance); + ret = add_lock_to_list(hlock_class(next), hlock_class(prev), + &hlock_class(next)->locks_before, + next->acquire_ip, distance); if (!ret) return 0; /* * Debugging printouts: */ - if (verbose(prev->class) || verbose(next->class)) { + if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { graph_unlock(); printk("\n new dependency: "); - print_lock_name(prev->class); + print_lock_name(hlock_class(prev)); printk(" => "); - print_lock_name(next->class); + print_lock_name(hlock_class(next)); printk("\n"); dump_stack(); return graph_lock(); @@ -1567,7 +1583,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct held_lock *hlock, u64 chain_key) { - struct lock_class *class = hlock->class; + struct lock_class *class = hlock_class(hlock); struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr, *hlock_next; @@ -1640,7 +1656,7 @@ cache_hit: if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = cn; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class - lock_classes; + int lock_id = curr->held_locks[i].class_idx - 1; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; @@ -1736,7 +1752,7 @@ static void check_chain_key(struct task_struct *curr) WARN_ON(1); return; } - id = hlock->class - lock_classes; + id = hlock->class_idx - 1; if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) return; @@ -1781,7 +1797,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); printk("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(this->class->usage_traces + prev_bit, 1); + print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); printk("\nother info that might help us debug this:\n"); @@ -1800,7 +1816,7 @@ static inline int valid_state(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) { - if (unlikely(this->class->usage_mask & (1 << bad_bit))) + if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) return print_usage_bug(curr, this, bad_bit, new_bit); return 1; } @@ -1839,7 +1855,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, lockdep_print_held_locks(curr); printk("\nthe first lock's dependencies:\n"); - print_lock_dependencies(this->class, 0); + print_lock_dependencies(hlock_class(this), 0); printk("\nthe second lock's dependencies:\n"); print_lock_dependencies(other, 0); @@ -1862,7 +1878,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, find_usage_bit = bit; /* fills in */ - ret = find_usage_forwards(this->class, 
0); + ret = find_usage_forwards(hlock_class(this), 0); if (!ret || ret == 1) return ret; @@ -1881,7 +1897,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, find_usage_bit = bit; /* fills in */ - ret = find_usage_backwards(this->class, 0); + ret = find_usage_backwards(hlock_class(this), 0); if (!ret || ret == 1) return ret; @@ -1947,7 +1963,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_ENABLED_HARDIRQS_READ, "hard-read")) return 0; #endif - if (hardirq_verbose(this->class)) + if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_USED_IN_SOFTIRQ: @@ -1972,7 +1988,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) return 0; #endif - if (softirq_verbose(this->class)) + if (softirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_USED_IN_HARDIRQ_READ: @@ -1985,7 +2001,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (!check_usage_forwards(curr, this, LOCK_ENABLED_HARDIRQS, "hard")) return 0; - if (hardirq_verbose(this->class)) + if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_USED_IN_SOFTIRQ_READ: @@ -1998,7 +2014,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (!check_usage_forwards(curr, this, LOCK_ENABLED_SOFTIRQS, "soft")) return 0; - if (softirq_verbose(this->class)) + if (softirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_ENABLED_HARDIRQS: @@ -2024,7 +2040,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_USED_IN_HARDIRQ_READ, "hard-read")) return 0; #endif - if (hardirq_verbose(this->class)) + if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_ENABLED_SOFTIRQS: @@ -2050,7 +2066,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) return 0; #endif - if (softirq_verbose(this->class)) + if (softirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_ENABLED_HARDIRQS_READ: @@ -2065,7 +2081,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_USED_IN_HARDIRQ, "hard")) return 0; #endif - if (hardirq_verbose(this->class)) + if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_ENABLED_SOFTIRQS_READ: @@ -2080,7 +2096,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_USED_IN_SOFTIRQ, "soft")) return 0; #endif - if (softirq_verbose(this->class)) + if (softirq_verbose(hlock_class(this))) ret = 2; break; default: @@ -2396,7 +2412,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, * If already set then do not dirty the cacheline, * nor do any checks: */ - if (likely(this->class->usage_mask & new_mask)) + if (likely(hlock_class(this)->usage_mask & new_mask)) return 1; if (!graph_lock()) @@ -2404,14 +2420,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Make sure we didnt race: */ - if (unlikely(this->class->usage_mask & new_mask)) { + if (unlikely(hlock_class(this)->usage_mask & new_mask)) { graph_unlock(); return 1; } - this->class->usage_mask |= new_mask; + hlock_class(this)->usage_mask |= new_mask; - if (!save_trace(this->class->usage_traces + new_bit)) + if (!save_trace(hlock_class(this)->usage_traces + new_bit)) return 0; switch (new_bit) { @@ -2545,8 +2561,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 0; hlock = curr->held_locks + depth; - - hlock->class = 
class; + if (DEBUG_LOCKS_WARN_ON(!class)) + return 0; + hlock->class_idx = class - lock_classes + 1; hlock->acquire_ip = ip; hlock->instance = lock; hlock->trylock = trylock; @@ -2690,7 +2707,7 @@ __lock_set_subclass(struct lockdep_map *lock, found_it: class = register_lock_class(lock, subclass, 0); - hlock->class = class; + hlock->class_idx = class - lock_classes + 1; curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; @@ -2698,7 +2715,7 @@ found_it: for (; i < depth; i++) { hlock = curr->held_locks + i; if (!__lock_acquire(hlock->instance, - hlock->class->subclass, hlock->trylock, + hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->acquire_ip)) return 0; @@ -2759,7 +2776,7 @@ found_it: for (i++; i < depth; i++) { hlock = curr->held_locks + i; if (!__lock_acquire(hlock->instance, - hlock->class->subclass, hlock->trylock, + hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->acquire_ip)) return 0; @@ -2804,7 +2821,7 @@ static int lock_release_nested(struct task_struct *curr, #ifdef CONFIG_DEBUG_LOCKDEP hlock->prev_chain_key = 0; - hlock->class = NULL; + hlock->class_idx = 0; hlock->acquire_ip = 0; hlock->irq_context = 0; #endif @@ -3000,9 +3017,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) found_it: hlock->waittime_stamp = sched_clock(); - point = lock_contention_point(hlock->class, ip); + point = lock_contention_point(hlock_class(hlock), ip); - stats = get_lock_stats(hlock->class); + stats = get_lock_stats(hlock_class(hlock)); if (point < ARRAY_SIZE(stats->contention_point)) stats->contention_point[i]++; if (lock->cpu != smp_processor_id()) @@ -3048,7 +3065,7 @@ found_it: hlock->holdtime_stamp = now; } - stats = get_lock_stats(hlock->class); + stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) lock_time_inc(&stats->read_waittime, waittime); diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 68d44ec77ab5..55db193d366d 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -17,9 +17,6 @@ */ #define MAX_LOCKDEP_ENTRIES 8192UL -#define MAX_LOCKDEP_KEYS_BITS 11 -#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) - #define MAX_LOCKDEP_CHAINS_BITS 14 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) -- cgit v1.2.3 From 4f3e7524b2e703d9f8b02ac338153a53dd7ede66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 09:30:23 +0200 Subject: lockdep: map_acquire Most the free-standing lock_acquire() usages look remarkably similar, sweep them into a new helper. 
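The helper definitions live in the lockdep header, which this kernel/-only hunk does not show; inferred from the lock_acquire()/lock_release() calls they replace in workqueue.c below, they are presumably thin wrappers along these lines (assumed, not taken from the patch):

/* Assumed shape of the new helpers, matching the call sites they replace;
 * the real definitions are in include/linux/lockdep.h. */
#define map_acquire(l)		lock_acquire(l, 0, 0, 0, 2, _THIS_IP_)
#define map_release(l)		lock_release(l, 1, _THIS_IP_)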
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ec7e4f62aaff..53564ae894a6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -290,11 +290,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) BUG_ON(get_wq_data(work) != cwq); work_clear_pending(work); - lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_); + map_acquire(&cwq->wq->lockdep_map); + map_acquire(&lockdep_map); f(work); - lock_release(&lockdep_map, 1, _THIS_IP_); - lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); + map_release(&lockdep_map); + map_release(&cwq->wq->lockdep_map); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " @@ -413,8 +413,8 @@ void flush_workqueue(struct workqueue_struct *wq) int cpu; might_sleep(); - lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_release(&wq->lockdep_map, 1, _THIS_IP_); + map_acquire(&wq->lockdep_map); + map_release(&wq->lockdep_map); for_each_cpu_mask_nr(cpu, *cpu_map) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); } @@ -441,8 +441,8 @@ int flush_work(struct work_struct *work) if (!cwq) return 0; - lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); + map_acquire(&cwq->wq->lockdep_map); + map_release(&cwq->wq->lockdep_map); prev = NULL; spin_lock_irq(&cwq->lock); @@ -536,8 +536,8 @@ static void wait_on_work(struct work_struct *work) might_sleep(); - lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_release(&work->lockdep_map, 1, _THIS_IP_); + map_acquire(&work->lockdep_map); + map_release(&work->lockdep_map); cwq = get_wq_data(work); if (!cwq) @@ -861,8 +861,8 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) if (cwq->thread == NULL) return; - lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); + map_acquire(&cwq->wq->lockdep_map); + map_release(&cwq->wq->lockdep_map); flush_cpu_workqueue(cwq); /* -- cgit v1.2.3 From 7531e2f34d1d551b096143f19111139f0dd84c8b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 09:30:24 +0200 Subject: lockdep: lock protection locks On Fri, 2008-08-01 at 16:26 -0700, Linus Torvalds wrote: > On Fri, 1 Aug 2008, David Miller wrote: > > > > Taking more than a few locks of the same class at once is bad > > news and it's better to find an alternative method. > > It's not always wrong. > > If you can guarantee that anybody that takes more than one lock of a > particular class will always take a single top-level lock _first_, then > that's all good. You can obviously screw up and take the same lock _twice_ > (which will deadlock), but at least you cannot get into ABBA situations. > > So maybe the right thing to do is to just teach lockdep about "lock > protection locks". That would have solved the multi-queue issues for > networking too - all the actual network drivers would still have taken > just their single queue lock, but the one case that needs to take all of > them would have taken a separate top-level lock first. 
> > Never mind that the multi-queue locks were always taken in the same order: > it's never wrong to just have some top-level serialization, and anybody > who needs to take locks might as well do , because they sure as > hell aren't going to be on _any_ fastpaths. > > So the simplest solution really sounds like just teaching lockdep about > that one special case. It's not "nesting" exactly, although it's obviously > related to it. Do as Linus suggested. The lock protection lock is called nest_lock. Note that we still have the MAX_LOCK_DEPTH (48) limit to consider, so anything that spills that it still up shit creek. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index d3c72ad8d09e..410c3365ad8f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1372,18 +1372,32 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, struct lockdep_map *next_instance, int read) { struct held_lock *prev; + struct held_lock *nest = NULL; int i; for (i = 0; i < curr->lockdep_depth; i++) { prev = curr->held_locks + i; + + if (prev->instance == next->nest_lock) + nest = prev; + if (hlock_class(prev) != hlock_class(next)) continue; + /* * Allow read-after-read recursion of the same * lock class (i.e. read_lock(lock)+read_lock(lock)): */ if ((read == 2) && prev->read) return 2; + + /* + * We're holding the nest_lock, which serializes this lock's + * nesting behaviour. + */ + if (nest) + return 2; + return print_deadlock_bug(curr, prev, next); } return 1; @@ -2507,7 +2521,7 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); */ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, - unsigned long ip) + struct lockdep_map *nest_lock, unsigned long ip) { struct task_struct *curr = current; struct lock_class *class = NULL; @@ -2566,6 +2580,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->class_idx = class - lock_classes + 1; hlock->acquire_ip = ip; hlock->instance = lock; + hlock->nest_lock = nest_lock; hlock->trylock = trylock; hlock->read = read; hlock->check = check; @@ -2717,7 +2732,7 @@ found_it: if (!__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, - hlock->acquire_ip)) + hlock->nest_lock, hlock->acquire_ip)) return 0; } @@ -2778,7 +2793,7 @@ found_it: if (!__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, - hlock->acquire_ip)) + hlock->nest_lock, hlock->acquire_ip)) return 0; } @@ -2915,7 +2930,8 @@ EXPORT_SYMBOL_GPL(lock_set_subclass); * and also avoid lockdep recursion: */ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, unsigned long ip) + int trylock, int read, int check, + struct lockdep_map *nest_lock, unsigned long ip) { unsigned long flags; @@ -2930,7 +2946,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, current->lockdep_recursion = 1; __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), ip); + irqs_disabled_flags(flags), nest_lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } -- cgit v1.2.3 From b7d39aff91454f2534db2275f55908656ec0470c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 09:30:24 +0200 Subject: 
lockdep: spin_lock_nest_lock() Expose the new lock protection lock. This can be used to annotate places where we take multiple locks of the same class and avoid deadlocks by always taking another (top-level) lock first. NOTE: we're still bound to the MAX_LOCK_DEPTH (48) limit. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/spinlock.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/spinlock.c b/kernel/spinlock.c index a1fb54c93cdd..44baeea94ab9 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -292,6 +292,7 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_spin_lock_nested); + unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) { unsigned long flags; @@ -314,6 +315,16 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas EXPORT_SYMBOL(_spin_lock_irqsave_nested); +void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + preempt_disable(); + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); +} + +EXPORT_SYMBOL(_spin_lock_nest_lock); + #endif void __lockfunc _spin_unlock(spinlock_t *lock) -- cgit v1.2.3 From 8bfe0298f7a04952d19f4a2cf510d7a6311eeed0 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Mon, 11 Aug 2008 09:30:26 +0200 Subject: lockdep: handle chains involving classes defined in modules Solve this by marking the classes as unused and not printing information about the unused classes. Reported-by: Eric Sesterhenn Signed-off-by: Rabin Vincent Acked-by: Huang Ying Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 1 + kernel/lockdep_proc.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 410c3365ad8f..ab933fecd2a1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3176,6 +3176,7 @@ static void zap_class(struct lock_class *class) list_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); + class->key = NULL; } static inline int within(const void *addr, void *start, unsigned long size) diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 6252ff799d19..fa19aee604c2 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -201,6 +201,9 @@ static int lc_show(struct seq_file *m, void *v) for (i = 0; i < chain->depth; i++) { class = lock_chain_get_class(chain, i); + if (!class->key) + continue; + seq_printf(m, "[%p] ", class->key); print_name(m, class); seq_puts(m, "\n"); -- cgit v1.2.3 From 3295f0ef9ff048a4619ede597ad9ec9cab725654 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 11 Aug 2008 10:30:30 +0200 Subject: lockdep: rename map_[acquire|release]() => lock_map_[acquire|release]() the names were too generic: drivers/uio/uio.c:87: error: expected identifier or '(' before 'do' drivers/uio/uio.c:87: error: expected identifier or '(' before 'while' drivers/uio/uio.c:113: error: 'map_release' undeclared here (not in a function) Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 53564ae894a6..8bb5b68fb3a9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -290,11 +290,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) BUG_ON(get_wq_data(work) != cwq); work_clear_pending(work); - 
map_acquire(&cwq->wq->lockdep_map); - map_acquire(&lockdep_map); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire(&lockdep_map); f(work); - map_release(&lockdep_map); - map_release(&cwq->wq->lockdep_map); + lock_map_release(&lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " @@ -413,8 +413,8 @@ void flush_workqueue(struct workqueue_struct *wq) int cpu; might_sleep(); - map_acquire(&wq->lockdep_map); - map_release(&wq->lockdep_map); + lock_map_acquire(&wq->lockdep_map); + lock_map_release(&wq->lockdep_map); for_each_cpu_mask_nr(cpu, *cpu_map) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); } @@ -441,8 +441,8 @@ int flush_work(struct work_struct *work) if (!cwq) return 0; - map_acquire(&cwq->wq->lockdep_map); - map_release(&cwq->wq->lockdep_map); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); prev = NULL; spin_lock_irq(&cwq->lock); @@ -536,8 +536,8 @@ static void wait_on_work(struct work_struct *work) might_sleep(); - map_acquire(&work->lockdep_map); - map_release(&work->lockdep_map); + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); cwq = get_wq_data(work); if (!cwq) @@ -861,8 +861,8 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) if (cwq->thread == NULL) return; - map_acquire(&cwq->wq->lockdep_map); - map_release(&cwq->wq->lockdep_map); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); flush_cpu_workqueue(cwq); /* -- cgit v1.2.3 From 77ae651347bdd46830da8b28b1efc5e4a9d7cbd0 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 11 Aug 2008 13:32:02 +0200 Subject: sched: fix mysql+oltp regression Defer commit 6d299f1b53b84e2665f402d9bcc494800aba6386 to the next release. Testing of the tip/sched/clock tree revealed a mysql+oltp regression which bisection eventually traced back to this commit in mainline. Pertinent test results: Three run sysbench averages, throughput units in read/write requests/sec. clients 1 2 4 8 16 32 64 6e0534f 9646 17876 34774 33868 32230 30767 29441 2.6.26.1 9112 17936 34652 33383 31929 30665 29232 6d299f1 9112 14637 28370 33339 32038 30762 29204 Note: subsequent commits hide the majority of this regression until you apply the clock fixes, at which time it reemerges at full magnitude. We cannot see anything bad about the change itself so we defer it to the next release until this problem is fully analysed. 
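As a side note on the nest_lock annotation introduced by the two lockdep commits earlier in this series: it is intended for code that takes many locks of the same class while holding one serializing top-level lock. A minimal sketch of that usage pattern (struct parent and struct child are made-up illustration types, not from these patches) could look like:

struct parent {
	struct mutex big_lock;		/* top-level serializing lock */
	struct list_head children;
};

struct child {
	spinlock_t lock;		/* many instances of one lock class */
	struct list_head node;
};

/*
 * Take every per-child lock while holding the parent's top-level lock.
 * Without the nest_lock annotation lockdep would flag the second
 * spin_lock() of the same class as a possible recursive deadlock; the
 * outer lock documents why that cannot happen here.
 */
static void lock_all_children(struct parent *p)
{
	struct child *c;

	mutex_lock(&p->big_lock);
	list_for_each_entry(c, &p->children, node)
		spin_lock_nest_lock(&c->lock, &p->big_lock);

	/* ... all children are now locked; release them in reverse later ... */
}

Note that the MAX_LOCK_DEPTH (48) cap mentioned above still applies, so this pattern only helps as long as the number of simultaneously held locks stays modest.
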
Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra Cc: Gregory Haskins Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0fe94ea43f32..fb8994c6d4bb 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1442,18 +1442,23 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) struct task_struct *p = NULL; struct sched_entity *se; - while (next != &cfs_rq->tasks) { + if (next == &cfs_rq->tasks) + return NULL; + + /* Skip over entities that are not tasks */ + do { se = list_entry(next, struct sched_entity, group_node); next = next->next; + } while (next != &cfs_rq->tasks && !entity_is_task(se)); - /* Skip over entities that are not tasks */ - if (entity_is_task(se)) { - p = task_of(se); - break; - } - } + if (next == &cfs_rq->tasks) + return NULL; cfs_rq->balance_iterator = next; + + if (entity_is_task(se)) + p = task_of(se); + return p; } -- cgit v1.2.3 From cc7a486cac78f6fc1a24e8cd63036bae8d2ab431 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 11 Aug 2008 13:49:30 +1000 Subject: generic-ipi: fix stack and rcu interaction bug in smp_call_function_mask() * Venki Pallipadi wrote: > Found a OOPS on a big SMP box during an overnight reboot test with > upstream git. > > Suresh and I looked at the oops and looks like the root cause is in > generic_smp_call_function_interrupt() and smp_call_function_mask() with > wait parameter. > > The actual oops looked like > > [ 11.277260] BUG: unable to handle kernel paging request at ffff8802ffffffff > [ 11.277815] IP: [] 0xffff8802ffffffff > [ 11.278155] PGD 202063 PUD 0 > [ 11.278576] Oops: 0010 [1] SMP > [ 11.279006] CPU 5 > [ 11.279336] Modules linked in: > [ 11.279752] Pid: 0, comm: swapper Not tainted 2.6.27-rc2-00020-g685d87f #290 > [ 11.280039] RIP: 0010:[] [] 0xffff8802ffffffff > [ 11.280692] RSP: 0018:ffff88027f1f7f70 EFLAGS: 00010086 > [ 11.280976] RAX: 00000000ffffffff RBX: 0000000000000000 RCX: 0000000000000000 > [ 11.281264] RDX: 0000000000004f4e RSI: 0000000000000001 RDI: 0000000000000000 > [ 11.281624] RBP: ffff88027f1f7f98 R08: 0000000000000001 R09: ffffffff802509af > [ 11.281925] R10: ffff8800280c2780 R11: 0000000000000000 R12: ffff88027f097d48 > [ 11.282214] R13: ffff88027f097d70 R14: 0000000000000005 R15: ffff88027e571000 > [ 11.282502] FS: 0000000000000000(0000) GS:ffff88027f1c3340(0000) knlGS:0000000000000000 > [ 11.283096] CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b > [ 11.283382] CR2: ffff8802ffffffff CR3: 0000000000201000 CR4: 00000000000006e0 > [ 11.283760] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > [ 11.284048] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > [ 11.284337] Process swapper (pid: 0, threadinfo ffff88027f1f2000, task ffff88027f1f0640) > [ 11.284936] Stack: ffffffff80250963 0000000000000212 0000000000ee8c78 0000000000ee8a66 > [ 11.285802] ffff88027e571550 ffff88027f1f7fa8 ffffffff8021adb5 ffff88027f1f3e40 > [ 11.286599] ffffffff8020bdd6 ffff88027f1f3e40 ffff88027f1f3ef8 0000000000000000 > [ 11.287120] Call Trace: > [ 11.287768] [] ? generic_smp_call_function_interrupt+0x61/0x12c > [ 11.288354] [] smp_call_function_interrupt+0x17/0x27 > [ 11.288744] [] call_function_interrupt+0x66/0x70 > [ 11.289030] [] ? clockevents_notify+0x19/0x73 > [ 11.289380] [] ? acpi_idle_enter_simple+0x18b/0x1fa > [ 11.289760] [] ? acpi_idle_enter_simple+0x181/0x1fa > [ 11.290051] [] ? 
cpuidle_idle_call+0x70/0xa2 > [ 11.290338] [] ? cpu_idle+0x5f/0x7d > [ 11.290723] [] ? start_secondary+0x14d/0x152 > [ 11.291010] > [ 11.291287] > [ 11.291654] Code: Bad RIP value. > [ 11.292041] RIP [] 0xffff8802ffffffff > [ 11.292380] RSP > [ 11.292741] CR2: ffff8802ffffffff > [ 11.310951] ---[ end trace 137c54d525305f1c ]--- > > The problem is with the following sequence of events: > > - CPU A calls smp_call_function_mask() for CPU B with wait parameter > - CPU A sets up the call_function_data on the stack and does an rcu add to > call_function_queue > - CPU A waits until the WAIT flag is cleared > - CPU B gets the call function interrupt and starts going through the > call_function_queue > - CPU C also gets some other call function interrupt and starts going through > the call_function_queue > - CPU C, which is also going through the call_function_queue, starts referencing > CPU A's stack, as that element is still in call_function_queue > - CPU B finishes the function call that CPU A set up and as there are no other > references to it, rcu deletes the call_function_data (which was from CPU A > stack) > - CPU B sees the wait flag and just clears the flag (no call_rcu to free) > - CPU A which was waiting on the flag continues executing and the stack > contents change > > - CPU C is still in rcu_read section accessing the CPU A's stack sees > inconsistent call_funation_data and can try to execute > function with some random pointer, causing stack corruption for A > (by clearing the bits in mask field) and oops. Nice debugging work. I'd suggest something like the attached (boot tested) patch as the simple fix for now. I expect the benefits from the less synchronized, multiple-in-flight-data global queue will still outweigh the costs of dynamic allocations. But if worst comes to worst then we just go back to a globally synchronous one-at-a-time implementation, but that would be pretty sad! Signed-off-by: Ingo Molnar --- kernel/smp.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 96fc7c0edc59..e6084f6efb4d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -260,6 +260,41 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) generic_exec_single(cpu, data); } +/* Dummy function */ +static void quiesce_dummy(void *unused) +{ +} + +/* + * Ensure stack based data used in call function mask is safe to free. + * + * This is needed by smp_call_function_mask when using on-stack data, because + * a single call function queue is shared by all CPUs, and any CPU may pick up + * the data item on the queue at any time before it is deleted. So we need to + * ensure that all CPUs have transitioned through a quiescent state after + * this call. + * + * This is a very slow function, implemented by sending synchronous IPIs to + * all possible CPUs. For this reason, we have to alloc data rather than use + * stack based data even in the case of synchronous calls. The stack based + * data is then just used for deadlock/oom fallback which will be very rare. + * + * If a faster scheme can be made, we could go back to preferring stack based + * data -- the data allocation/free is non-zero cost. 
+ */ +static void smp_call_function_mask_quiesce_stack(cpumask_t mask) +{ + struct call_single_data data; + int cpu; + + data.func = quiesce_dummy; + data.info = NULL; + data.flags = CSD_FLAG_WAIT; + + for_each_cpu_mask(cpu, mask) + generic_exec_single(cpu, &data); +} + /** * smp_call_function_mask(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on. @@ -285,6 +320,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, cpumask_t allbutself; unsigned long flags; int cpu, num_cpus; + int slowpath = 0; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -306,15 +342,16 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, return smp_call_function_single(cpu, func, info, wait); } - if (!wait) { - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) - data->csd.flags = CSD_FLAG_ALLOC; - } - if (!data) { + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (data) { + data->csd.flags = CSD_FLAG_ALLOC; + if (wait) + data->csd.flags |= CSD_FLAG_WAIT; + } else { data = &d; data->csd.flags = CSD_FLAG_WAIT; wait = 1; + slowpath = 1; } spin_lock_init(&data->lock); @@ -331,8 +368,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, arch_send_call_function_ipi(mask); /* optionally wait for the CPUs to complete */ - if (wait) + if (wait) { csd_flag_wait(&data->csd); + if (unlikely(slowpath)) + smp_call_function_mask_quiesce_stack(allbutself); + } return 0; } -- cgit v1.2.3 From 279ef6bbb8308488398c8f33b04c760148428378 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Wed, 30 Jul 2008 12:34:04 +0200 Subject: sched, cpu hotplug: fix set_cpus_allowed() use in hotplug callbacks Mark Langsdorf reported: > One of my co-workers noticed that the powernow-k8 > driver no longer restarts when a CPU core is > hot-disabled and then hot-enabled on AMD quad-core > systems. > > The following comands work fine on 2.6.26 and fail > on 2.6.27-rc1: > > echo 0 > /sys/devices/system/cpu/cpu3/online > echo 1 > /sys/devices/system/cpu/cpu3/online > find /sys -name cpufreq > > For 2.6.26, the find will return a cpufreq > directory for each processor. In 2.6.27-rc1, > the cpu3 directory is missing. > > After digging through the code, the following > logic is failing when the core is hot-enabled > at runtime. The code works during the boot > sequence. > > cpumask_t = current->cpus_allowed; > set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); > if (smp_processor_id() != cpu) > return -ENODEV; So set the CPU active before calling the CPU_ONLINE notifier chain, there are a handful of notifiers that use set_cpus_allowed(). This fix also solves the problem with x86-microcode. I've sent alternative patches for microcode, but as this "rely on set_cpus_allowed_ptr() being workable in cpu-hotplug(CPU_ONLINE, ...)" assumption seems to be more broad than what we thought, perhaps this fix should be applied. With this patch we define that by the moment CPU_ONLINE is being sent, a 'cpu' is online and ready for tasks to be migrated onto it. 
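The pattern that breaks without this change is the one quoted above from powernow-k8: a CPU_ONLINE notifier that migrates itself onto the freshly onlined cpu. A hedged sketch of such a notifier (the function and its body are illustrative, not taken from powernow-k8) is:

static int example_cpu_callback(struct notifier_block *nb,
				unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	cpumask_t saved_mask = current->cpus_allowed;

	if (action != CPU_ONLINE)
		return NOTIFY_OK;

	/*
	 * This migration only succeeds if 'cpu' is already in
	 * cpu_active_map, which is what moving the cpu_set() before the
	 * CPU_ONLINE notifier call guarantees.
	 */
	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
	if (smp_processor_id() != cpu)
		return NOTIFY_BAD;

	/* ... per-cpu setup that must run on the new cpu ... */

	set_cpus_allowed_ptr(current, &saved_mask);
	return NOTIFY_OK;
}
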
Signed-off-by: Dmitry Adamushko Reported-by: Mark Langsdorf Tested-by: Mark Langsdorf Signed-off-by: Ingo Molnar --- kernel/cpu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index e202a68d1cc1..c977c339f559 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -349,6 +349,8 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); + cpu_set(cpu, cpu_active_map); + /* Now call notifier in preparation. */ raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); @@ -383,9 +385,6 @@ int __cpuinit cpu_up(unsigned int cpu) err = _cpu_up(cpu, 0); - if (cpu_online(cpu)) - cpu_set(cpu, cpu_active_map); - out: cpu_maps_update_done(); return err; -- cgit v1.2.3 From 0f2bc27be27ca1dcc66b96131e44bf7648b959c6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Aug 2008 22:45:51 +0200 Subject: lockdep: fix debug_lock_alloc When we enable DEBUG_LOCK_ALLOC but do not enable PROVE_LOCKING and or LOCK_STAT, lock_alloc() and lock_release() turn into nops, even though we should be doing hlock checking (check=1). This causes a false warning and a lockdep self-disable. Rectify this. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ab933fecd2a1..1aa91fd6b06e 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2935,9 +2935,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; - if (unlikely(!lock_stat && !prove_locking)) - return; - if (unlikely(current->lockdep_recursion)) return; @@ -2958,9 +2955,6 @@ void lock_release(struct lockdep_map *lock, int nested, { unsigned long flags; - if (unlikely(!lock_stat && !prove_locking)) - return; - if (unlikely(current->lockdep_recursion)) return; -- cgit v1.2.3 From 59f9415ffb9759e950d775f4c400f747b332cc02 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 30 Jul 2008 12:49:02 -0700 Subject: modules: extend initcall_debug functionality to the module loader The kernel has this really nice facility where if you put "initcall_debug" on the kernel commandline, it'll print which function it's going to execute just before calling an initcall, and then after the call completes it will 1) print if it had an error code 2) checks for a few simple bugs (like leaving irqs off) and 3) print how long the init call took in milliseconds. While trying to optimize the boot speed of my laptop, I have been loving number 3 to figure out what to optimize... ... and then I wished that the same thing was done for module loading. This patch makes the module loader use this exact same functionality; it's a logical extension in my view (since modules are just sort of late binding initcalls anyway) and so far I've found it quite useful in finding where things are too slow in my boot. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Rusty Russell --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 61d212120df4..08864d257eb0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2288,7 +2288,7 @@ sys_init_module(void __user *umod, /* Start the module */ if (mod->init != NULL) - ret = mod->init(); + ret = do_one_initcall(mod->init); if (ret < 0) { /* Init routine failed: abort. Try to protect us from buggy refcounters. 
*/ -- cgit v1.2.3 From ed6d68763b8b589c0ae9d231cbd72bd01f6685c5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 31 Jul 2008 10:31:02 +0800 Subject: stop_machine: remove unused variable Signed-off-by: Li Zefan Signed-off-by: Rusty Russell --- kernel/stop_machine.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e446c7c7d6a9..af3c7cea258b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -65,7 +65,6 @@ static void ack_state(void) static int stop_cpu(struct stop_machine_data *smdata) { enum stopmachine_state curstate = STOPMACHINE_NONE; - int uninitialized_var(ret); /* Simple state machine */ do { -- cgit v1.2.3 From c2fc11985db304572322f1dcdcb0f71337315006 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 12 Aug 2008 18:05:13 +1000 Subject: generic-ipi: fix stack and rcu interaction bug in smp_call_function_mask(), fix > > Nick Piggin (1): > > generic-ipi: fix stack and rcu interaction bug in > > smp_call_function_mask() > > I'm still not 100% sure that I have this patch right... I might have seen > a lockup trace implicating the smp call function path... which may have > been due to some other problem or a different bug in the new call function > code, but if some more people can take a look at it before merging? OK indeed it did have a couple of bugs. Firstly, I wasn't freeing the data properly in the alloc && wait case. Secondly, I wasn't resetting CSD_FLAG_WAIT in the for each cpu loop (so only the first CPU would wait). After those fixes, the patch boots and runs with the kmalloc commented out (so it always executes the slowpath). Signed-off-by: Ingo Molnar --- kernel/smp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index e6084f6efb4d..782e2b93e465 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -135,7 +135,8 @@ void generic_smp_call_function_interrupt(void) */ smp_wmb(); data->csd.flags &= ~CSD_FLAG_WAIT; - } else + } + if (data->csd.flags & CSD_FLAG_ALLOC) call_rcu(&data->rcu_head, rcu_free_call_data); } rcu_read_unlock(); @@ -289,10 +290,11 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask) data.func = quiesce_dummy; data.info = NULL; - data.flags = CSD_FLAG_WAIT; - for_each_cpu_mask(cpu, mask) + for_each_cpu_mask(cpu, mask) { + data.flags = CSD_FLAG_WAIT; generic_exec_single(cpu, &data); + } } /** @@ -371,7 +373,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, if (wait) { csd_flag_wait(&data->csd); if (unlikely(slowpath)) - smp_call_function_mask_quiesce_stack(allbutself); + smp_call_function_mask_quiesce_stack(mask); } return 0; -- cgit v1.2.3 From 3ee1062b4ee82a56294808a065b64f4bc6781a56 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 12 Aug 2008 15:08:40 -0700 Subject: cpu hotplug: s390 doesn't support additional_cpus anymore. s390 doesn't support the additional_cpus kernel parameter anymore since a long time. So we better update the code and documentation to reflect that. 
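For reference, the initcall_debug instrumentation that the module-loader commit above reuses does, schematically, something like the following. This is a simplified sketch, not the actual do_one_initcall() from init/main.c; it only shows why routing mod->init() through that helper gives timing, error reporting and leaked-state checks for free:

/* Simplified sketch of initcall_debug-style instrumentation. */
static int do_one_initcall_sketch(int (*fn)(void))
{
	ktime_t t0, t1;
	int ret;

	if (initcall_debug) {
		printk(KERN_DEBUG "calling initcall %p\n", fn);
		t0 = ktime_get();
	}

	ret = fn();

	if (initcall_debug) {
		t1 = ktime_get();
		printk(KERN_DEBUG "initcall %p returned %d after %lld msecs\n",
		       fn, ret, (long long)ktime_to_ms(ktime_sub(t1, t0)));
	}

	/* The real helper also warns about simple bugs left behind: */
	if (irqs_disabled()) {
		printk(KERN_ERR "initcall %p returned with disabled interrupts\n", fn);
		local_irq_enable();
	}

	return ret;
}
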
Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index c977c339f559..f17e9854c246 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -369,7 +369,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (!cpu_isset(cpu, cpu_possible_map)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390) +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) printk(KERN_ERR "please check additional_cpus= boot " "parameter\n"); #endif -- cgit v1.2.3 From f18e439d1035d059534d261c414af33f89aee89a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 12 Aug 2008 15:09:03 -0700 Subject: genirq: switch /proc/irq/*/smp_affinity et al to seqfiles Switch /proc/irq/*/smp_affinity , /proc/irq/default_smp_affinity to seq_files. cat(1) reads with 1024 chunks by default, with high enough NR_CPUS, there will be -EINVAL. As side effect, there are now two less users of the ->read_proc interface. Signed-off-by: Alexey Dobriyan Cc: Paul Jackson Cc: Mike Travis Cc: Al Viro Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/proc.c | 96 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 51 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6c6d35d68ee9..a09dd29c2fd7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -8,6 +8,7 @@ #include #include +#include #include #include "internals.h" @@ -16,23 +17,18 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -static int irq_affinity_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int irq_affinity_proc_show(struct seq_file *m, void *v) { - struct irq_desc *desc = irq_desc + (long)data; + struct irq_desc *desc = irq_desc + (long)m->private; cpumask_t *mask = &desc->affinity; - int len; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) mask = &desc->pending_mask; #endif - len = cpumask_scnprintf(page, count, *mask); - - if (count - len < 2) - return -EINVAL; - len += sprintf(page + len, "\n"); - return len; + seq_cpumask(m, mask); + seq_putc(m, '\n'); + return 0; } #ifndef is_affinity_mask_valid @@ -40,11 +36,12 @@ static int irq_affinity_read_proc(char *page, char **start, off_t off, #endif int no_irq_affinity; -static int irq_affinity_write_proc(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static ssize_t irq_affinity_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)data, full_count = count, err; + unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; cpumask_t new_value; + int err; if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) @@ -65,28 +62,38 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, if (!cpus_intersects(new_value, cpu_online_map)) /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return irq_select_affinity(irq) ? -EINVAL : full_count; + return irq_select_affinity(irq) ? 
-EINVAL : count; irq_set_affinity(irq, new_value); - return full_count; + return count; } -static int default_affinity_read(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int irq_affinity_proc_open(struct inode *inode, struct file *file) { - int len = cpumask_scnprintf(page, count, irq_default_affinity); - if (count - len < 2) - return -EINVAL; - len += sprintf(page + len, "\n"); - return len; + return single_open(file, irq_affinity_proc_show, PDE(inode)->data); } -static int default_affinity_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static const struct file_operations irq_affinity_proc_fops = { + .open = irq_affinity_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = irq_affinity_proc_write, +}; + +static int default_affinity_show(struct seq_file *m, void *v) +{ + seq_cpumask(m, &irq_default_affinity); + seq_putc(m, '\n'); + return 0; +} + +static ssize_t default_affinity_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { - unsigned int full_count = count, err; cpumask_t new_value; + int err; err = cpumask_parse_user(buffer, count, new_value); if (err) @@ -105,8 +112,21 @@ static int default_affinity_write(struct file *file, const char __user *buffer, irq_default_affinity = new_value; - return full_count; + return count; } + +static int default_affinity_open(struct inode *inode, struct file *file) +{ + return single_open(file, default_affinity_show, NULL); +} + +static const struct file_operations default_affinity_proc_fops = { + .open = default_affinity_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = default_affinity_write, +}; #endif static int irq_spurious_read(char *page, char **start, off_t off, @@ -178,16 +198,9 @@ void register_irq_proc(unsigned int irq) irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); #ifdef CONFIG_SMP - { - /* create /proc/irq//smp_affinity */ - entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); - - if (entry) { - entry->data = (void *)(long)irq; - entry->read_proc = irq_affinity_read_proc; - entry->write_proc = irq_affinity_write_proc; - } - } + /* create /proc/irq//smp_affinity */ + proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, + &irq_affinity_proc_fops, (void *)(long)irq); #endif entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); @@ -208,15 +221,8 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) void register_default_affinity_proc(void) { #ifdef CONFIG_SMP - struct proc_dir_entry *entry; - - /* create /proc/irq/default_smp_affinity */ - entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir); - if (entry) { - entry->data = NULL; - entry->read_proc = default_affinity_read; - entry->write_proc = default_affinity_write; - } + proc_create("irq/default_smp_affinity", 0600, NULL, + &default_affinity_proc_fops); #endif } -- cgit v1.2.3
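
The /proc conversion above follows the standard single_open()/seq_file pattern. A stand-alone sketch of that pattern (the file name "example_status" and its contents are made up, not part of the patch) would be:

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

/* Show callback: seq_file handles buffering and partial reads for us. */
static int example_status_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", "ok");
	return 0;
}

static int example_status_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_status_show, PDE(inode)->data);
}

static const struct file_operations example_status_fops = {
	.owner   = THIS_MODULE,
	.open    = example_status_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init example_init(void)
{
	/* create /proc/example_status */
	proc_create("example_status", 0444, NULL, &example_status_fops);
	return 0;
}
module_init(example_init);

Compared with the old ->read_proc interface, seq_file generates output of arbitrary length and copes with cat(1)'s 1024-byte reads, which is exactly the high-NR_CPUS -EINVAL problem the commit describes.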