From f01114cb59d670e9b4f2c335930dd57db96e9360 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 May 2011 12:26:55 +0200 Subject: sched: Fix cross-cpu clock sync on remote wakeups Markus reported that commit 317f394160e ("sched: Move the second half of ttwu() to the remote cpu") caused some accounting funnies on his AMD Phenom II X4, such as weird 'top' results. It turns out that this is due to non-synced TSC and the queued remote wakeups stopped coupeling the two relevant cpu clocks, which leads to wakeups seeing time jumps, which in turn lead to skewed runtime stats. Add an explicit call to sched_clock_cpu() to couple the per-cpu clocks to restore the normal flow of time. Reported-and-tested-by: Markus Trippelsdorf Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1306835745.2353.3.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cbb3a0eee58e..49cc70b152cf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2600,6 +2600,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; } -- cgit v1.2.3 From f339b9dc1f03591761d5d930800db24bc0eda1e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 May 2011 10:49:20 +0200 Subject: sched: Fix schedstat.nr_wakeups_migrate While looking over the code I found that with the ttwu rework the nr_wakeups_migrate test broke since we now switch cpus prior to calling ttwu_stat(), hence the test is always true. Cure this by passing the migration state in wake_flags. Also move the whole test under CONFIG_SMP, its hard to migrate tasks on UP :-) Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-pwwxl7gdqs5676f1d4cx6pj7@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 49cc70b152cf..2fe98ed474da 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2447,6 +2447,10 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } rcu_read_unlock(); } + + if (wake_flags & WF_MIGRATED) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + #endif /* CONFIG_SMP */ schedstat_inc(rq, ttwu_count); @@ -2455,9 +2459,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_SYNC) schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (cpu != task_cpu(p)) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - #endif /* CONFIG_SCHEDSTATS */ } @@ -2675,8 +2676,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); - if (task_cpu(p) != cpu) + if (task_cpu(p) != cpu) { + wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); + } #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); -- cgit v1.2.3 From 74c355fbdfedd3820046dba4f537876cea54c207 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 May 2011 16:48:06 +0200 Subject: perf, cgroups: Fix up for new API Ben changed the cgroup API in commit f780bdb7c1c (cgroups: add per-thread subsystem callbacks) in an incompatible way, but forgot to convert the perf cgroup bits. Avoid compile warnings and runtime splats and convert perf too ;-) Acked-by: Ben Blum Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1306767651.1200.2990.camel@twins Signed-off-by: Ingo Molnar --- kernel/events/core.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c09767f7db3e..8a15944bf9d2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7394,26 +7394,12 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_move(struct task_struct *task) +static void +perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) { task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task, - bool threadgroup) -{ - perf_cgroup_move(task); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - perf_cgroup_move(c); - } - rcu_read_unlock(); - } -} - static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup *old_cgrp, struct task_struct *task) { @@ -7425,7 +7411,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, if (!(task->flags & PF_EXITING)) return; - perf_cgroup_move(task); + perf_cgroup_attach_task(cgrp, task); } struct cgroup_subsys perf_subsys = { @@ -7434,6 +7420,6 @@ struct cgroup_subsys perf_subsys = { .create = perf_cgroup_create, .destroy = perf_cgroup_destroy, .exit = perf_cgroup_exit, - .attach = perf_cgroup_attach, + .attach_task = perf_cgroup_attach_task, }; #endif /* CONFIG_CGROUP_PERF */ -- cgit v1.2.3 From 1b054b67d3bfc6dca9f634c104780f3f24ff3eec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Jun 2011 11:13:33 +0200 Subject: clockevents: Handle empty cpumask gracefully For UP it's stupid to request an initialized cpumask for the clock event devices. Though we need the mask set even on UP to avoid a horrible ifdeffery especially in the broadcast code. For SMP we can at least try to survive with a warning and set the cpumask of the cpu we're running on. That gives a decent chance to bring the machine up and retrieve the debug info. Signed-off-by: Thomas Gleixner Cc: Linus Walleij Cc: Russell King - ARM Linux Cc: Stephen Boyd --- kernel/time/clockevents.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c027d4f602f1..e4c699dfa4e8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -182,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev) unsigned long flags; BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - BUG_ON(!dev->cpumask); + if (!dev->cpumask) { + WARN_ON(num_possible_cpus() > 1); + dev->cpumask = cpumask_of(smp_processor_id()); + } raw_spin_lock_irqsave(&clockevents_lock, flags); -- cgit v1.2.3 From ef26f20cd117eb3c185038ed7cbf7b235575751d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 31 May 2011 08:56:10 +0200 Subject: genirq: Print threaded handler in spurious debug output In forced threaded mode (or with an explicit threaded handler) we only see the primary handler, but not the threaded handler. Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/1306824972-27067-1-git-send-email-sebastian@breakpoint.cc Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dfbd550401b2..c9a78ba30b6f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -201,10 +201,11 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; while (action) { - printk(KERN_ERR "[<%p>]", action->handler); - print_symbol(" (%s)", - (unsigned long)action->handler); - printk("\n"); + printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); + if (action->thread_fn) + printk(KERN_CONT " threaded [<%p>] %pf", + action->thread_fn, action->thread_fn); + printk(KERN_CONT "\n"); action = action->next; } raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.3 From 3a43e05f4d0600e906fa09f4a65d749288c44592 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 31 May 2011 08:56:11 +0200 Subject: irq: Handle spurios irq detection for threaded irqs The detection of spurios interrupts is currently limited to first level handler. In force-threaded mode we never notice if the threaded irq does not feel responsible. This patch catches the return value of the threaded handler and forwards it to the spurious detector. If the primary handler returns only IRQ_WAKE_THREAD then the spourious detector ignores it because it gets called again from the threaded handler. [ tglx: Report the erroneous return value early and bail out ] Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/1306824972-27067-2-git-send-email-sebastian@breakpoint.cc Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 6 ------ kernel/irq/manage.c | 24 ++++++++++++++++++------ kernel/irq/spurious.c | 22 ++++++++++++++++++---- 3 files changed, 36 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 90cb55f6d7eb..470d08c82bbe 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,12 +132,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) switch (res) { case IRQ_WAKE_THREAD: - /* - * Set result to handled so the spurious check - * does not trigger. - */ - res = IRQ_HANDLED; - /* * Catch drivers which return WAKE_THREAD but * did not set up a thread function diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f7ce0021e1c4..d64bafb1afd0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -723,13 +723,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } * context. So we need to disable bh here to avoid deadlocks and other * side effects. */ -static void +static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) { + irqreturn_t ret; + local_bh_disable(); - action->thread_fn(action->irq, action->dev_id); + ret = action->thread_fn(action->irq, action->dev_id); irq_finalize_oneshot(desc, action, false); local_bh_enable(); + return ret; } /* @@ -737,10 +740,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) * preemtible - many of them need to sleep and wait for slow busses to * complete. */ -static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) +static irqreturn_t irq_thread_fn(struct irq_desc *desc, + struct irqaction *action) { - action->thread_fn(action->irq, action->dev_id); + irqreturn_t ret; + + ret = action->thread_fn(action->irq, action->dev_id); irq_finalize_oneshot(desc, action, false); + return ret; } /* @@ -753,7 +760,8 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); + irqreturn_t (*handler_fn)(struct irq_desc *desc, + struct irqaction *action); int wake; if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, @@ -783,8 +791,12 @@ static int irq_thread(void *data) desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { + irqreturn_t action_ret; + raw_spin_unlock_irq(&desc->lock); - handler_fn(desc, action); + action_ret = handler_fn(desc, action); + if (!noirqdebug) + note_interrupt(action->irq, desc, action_ret); } wake = atomic_dec_and_test(&desc->threads_active); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index c9a78ba30b6f..aa57d5da18c1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -167,6 +167,13 @@ out: jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } +static inline int bad_action_ret(irqreturn_t action_ret) +{ + if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD))) + return 0; + return 1; +} + /* * If 99,900 of the previous 100,000 interrupts have not been handled * then assume that the IRQ is stuck in some manner. Drop a diagnostic @@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *action; unsigned long flags; - if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { + if (bad_action_ret(action_ret)) { printk(KERN_ERR "irq event %d: bogus return value %x\n", irq, action_ret); } else { @@ -263,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (desc->istate & IRQS_POLL_INPROGRESS) return; - if (unlikely(action_ret != IRQ_HANDLED)) { + /* we get here again via the threaded handler */ + if (action_ret == IRQ_WAKE_THREAD) + return; + + if (bad_action_ret(action_ret)) { + report_bad_irq(irq, desc, action_ret); + return; + } + + if (unlikely(action_ret == IRQ_NONE)) { /* * If we are seeing only the odd spurious IRQ caused by * bus asynchronicity then don't eventually trigger an error, @@ -275,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, else desc->irqs_unhandled++; desc->last_unhandled = jiffies; - if (unlikely(action_ret != IRQ_NONE)) - report_bad_irq(irq, desc, action_ret); } if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { -- cgit v1.2.3 From e7fbad300a7a6432238f086e3c9a61538a905858 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 31 May 2011 18:14:39 +0200 Subject: genirq: Fix descriptor init on non-sparse IRQs The genirq changes are initializing descriptors for sparse IRQs quite differently from how non-sparse (stacked?) IRQs are initialized, with the effect that on my platform all IRQs are default-disabled on sparse IRQs and default-enabled if non-sparse IRQs are used, crashing some GPIO driver. Fix this by refactoring the non-sparse IRQs to use the same descriptor init function as the sparse IRQs. Signed-off: Linus Walleij Link: http://lkml.kernel.org/r/1306858479-16622-1-git-send-email-linus.walleij@stericsson.com Cc: stable@kernel.org # 2.6.39 Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 886e80347b32..c7ddc87d6c87 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -257,13 +257,11 @@ int __init early_irq_init(void) count = ARRAY_SIZE(irq_desc); for (i = 0; i < count; i++) { - desc[i].irq_data.irq = i; - desc[i].irq_data.chip = &no_irq_chip; desc[i].kstat_irqs = alloc_percpu(unsigned int); - irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); - alloc_masks(desc + i, GFP_KERNEL, node); - desc_smp_init(desc + i, node); + alloc_masks(&desc[i], GFP_KERNEL, node); + raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + desc_set_defaults(i, &desc[i], node); } return arch_early_irq_init(); } -- cgit v1.2.3 From c5182b8867e189e14a8df5dbfcba1c73f286e061 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 2 Jun 2011 18:55:13 +0100 Subject: genirq: Ensure we locate the passed IRQ in irq_alloc_descs() When irq_alloc_descs() is called with no base IRQ specified then it will search for a range of IRQs starting from a specified base address. In the case where an IRQ is specified it still does this search in order to ensure that none of the requested range is already allocated and it still uses the from parameter to specify the base for the search. This means that in the case where a base is specified but from is zero (which is reasonable as any IRQ number is in the range specified by a zero from) the function will get confused and try to allocate the first suitably sized block of free IRQs it finds. Instead use a specified IRQ as the base address for the search, and insist that any from that is specified can support that IRQ. Signed-off-by: Mark Brown Link: http://lkml.kernel.org/r/1307037313-15733-1-git-send-email-broonie@opensource.wolfsonmicro.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index c7ddc87d6c87..4c60a50e66b2 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -344,6 +344,12 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) if (!cnt) return -EINVAL; + if (irq >= 0) { + if (from > irq) + return -EINVAL; + from = irq; + } + mutex_lock(&sparse_irq_lock); start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, -- cgit v1.2.3 From 1c3cc11602111d1318c2a5743bd2e88c82813927 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 21 May 2011 12:58:28 +0200 Subject: timers: Consider slack value in mod_timer() There is an optimization which does not update the timer if the timer was pending and the expiration time was unchanged. Since commit 3bbb9ec9 ("timers: Introduce the concept of timer slack for legacy timers") this optimization is no longer applied for timers where the expiration time got extended due to the slack value. So we need to check again after the expiration time might have been updated. [ tglx: Made it a single check by applying slack first and sorting out the slack = 0 value (all timeouts < 256 jiffies) early ] Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/20110521105828.GA29442@Chamillionaire.breakpoint.cc Signed-off-by: Thomas Gleixner --- kernel/timer.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index fd6198692b57..8cff36119e4d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -749,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) unsigned long expires_limit, mask; int bit; - expires_limit = expires; - if (timer->slack >= 0) { expires_limit = expires + timer->slack; } else { - unsigned long now = jiffies; + long delta = expires - jiffies; + + if (delta < 256) + return expires; - /* No slack, if already expired else auto slack 0.4% */ - if (time_after(expires, now)) - expires_limit = expires + (expires - now)/256; + expires_limit = expires + delta / 256; } mask = expires ^ expires_limit; if (mask == 0) @@ -795,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) */ int mod_timer(struct timer_list *timer, unsigned long expires) { + expires = apply_slack(timer, expires); + /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -803,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; - expires = apply_slack(timer, expires); - return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer); -- cgit v1.2.3 From aa4a221875873d2a1f9656cb7fd7e545e952b4fa Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 3 Jun 2011 17:54:40 -0400 Subject: perf: Comment /proc/sys/kernel/perf_event_paranoid to be part of user ABI Turns out that distro packages use this file as an indicator of the perf event subsystem - this is easier to check for from scripts than the existence of the system call. This is easy enough to keep around for the kernel, so add a comment to make sure it stays so. Signed-off-by: Vince Weaver Cc: David Ahern Cc: Peter Zijlstra Cc: paulus@samba.org Cc: acme@redhat.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1106031751170.29381@cl320.eecs.utk.edu Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4fc92445a29c..f175d98bd355 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -938,6 +938,12 @@ static struct ctl_table kern_table[] = { }, #endif #ifdef CONFIG_PERF_EVENTS + /* + * User-space scripts rely on the existence of this file + * as a feature check for perf_events being enabled. + * + * So it's an ABI, do not remove! + */ { .procname = "perf_event_paranoid", .data = &sysctl_perf_event_paranoid, -- cgit v1.2.3 From 0aff1c0cef13b34c17e81a502336fad738151c37 Mon Sep 17 00:00:00 2001 From: GuoWen Li Date: Wed, 1 Jun 2011 19:18:47 +0800 Subject: ftrace: Fix possible undefined return code kernel/trace/ftrace.c: In function 'ftrace_regex_write.clone.15': kernel/trace/ftrace.c:2743:6: warning: 'ret' may be used uninitialized in this function Signed-off-by: GuoWen Li Link: http://lkml.kernel.org/r/201106011918.47939.guowen.li.linux@gmail.com Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1ee417fcbfa5..204b3eb4e690 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2740,7 +2740,7 @@ static int ftrace_process_regex(struct ftrace_hash *hash, { char *func, *command, *next = buff; struct ftrace_func_command *p; - int ret; + int ret = -EINVAL; func = strsep(&next, ":"); -- cgit v1.2.3 From f2513cde93f0957d5dc6c09bc24b0cccd27d8e1d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Jun 2011 12:32:43 +0200 Subject: lockdep: Fix lock_is_held() on recursion The main lock_is_held() user is lockdep_assert_held(), avoid false assertions in lockdep_off() sections by unconditionally reporting the lock is taken. [ the reason this is important is a lockdep_assert_held() in ttwu() which triggers a warning under lockdep_off() as in printk() which can trigger another wakeup and lock up due to spinlock recursion, as reported and heroically debugged by Arne Jansen ] Reported-and-tested-by: Arne Jansen Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Link: http://lkml.kernel.org/r/1307398759.2497.966.camel@laptop Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 63437d065ac8..298c9276dfdb 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3426,7 +3426,7 @@ int lock_is_held(struct lockdep_map *lock) int ret = 0; if (unlikely(current->lockdep_recursion)) - return ret; + return 1; /* avoid false negative lockdep_assert_held() */ raw_local_irq_save(flags); check_flags(flags); -- cgit v1.2.3 From 6c6c54e1807faf116724451ef2bd14993780470a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Jun 2011 17:37:07 +0200 Subject: sched: Fix/clarify set_task_cpu() locking rules Sergey reported a CONFIG_PROVE_RCU warning in push_rt_task where set_task_cpu() was called with both relevant rq->locks held, which should be sufficient for running tasks since holding its rq->lock will serialize against sched_move_task(). Update the comments and fix the task_group() lockdep test. Reported-and-tested-by: Sergey Senozhatsky Cc: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1307115427.2353.3456.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2fe98ed474da..3f2e502d609b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -605,10 +605,10 @@ static inline int cpu_of(struct rq *rq) /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification - * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() - * holds that lock for each task it moves into the cgroup. Therefore - * by holding that lock, we pin the task to the current cgroup. + * We use task_subsys_state_check() and extend the RCU verification with + * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each + * task it moves into the cgroup. Therefore by holding either of those locks, + * we pin the task to the current cgroup. */ static inline struct task_group *task_group(struct task_struct *p) { @@ -616,7 +616,8 @@ static inline struct task_group *task_group(struct task_struct *p) struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock)); + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); return autogroup_task_group(p, tg); @@ -2200,6 +2201,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); #ifdef CONFIG_LOCKDEP + /* + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. + * + * sched_move_task() holds both and thus holding either pins the cgroup, + * see set_task_rq(). + * + * Furthermore, all task_rq users should acquire both locks, see + * task_rq_lock(). + */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || lockdep_is_held(&task_rq(p)->lock))); #endif -- cgit v1.2.3 From 265a5b7ee3eb21a4d0e53e17d59ba6eada91af39 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Jun 2011 22:35:13 -0400 Subject: kprobes/trace: Fix kprobe selftest for gcc 4.6 With gcc 4.6, the self test kprobe function: kprobe_trace_selftest_target() is optimized such that kallsyms does not list it. The kprobes test uses this function to insert a probe and test it. But it will fail the test if the function is not listed in kallsyms. Adding a __used annotation keeps the symbol in the kallsyms table. Suggested-by: David Daney Cc: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f925c45f0afa..27d13b36b8be 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1870,8 +1870,12 @@ fs_initcall(init_kprobe_trace); #ifdef CONFIG_FTRACE_STARTUP_TEST -static int kprobe_trace_selftest_target(int a1, int a2, int a3, - int a4, int a5, int a6) +/* + * The "__used" keeps gcc from removing the function symbol + * from the kallsyms table. + */ +static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) { return a1 + a2 + a3 + a4 + a5 + a6; } -- cgit v1.2.3 From a4f18ed11a4ddf327dd91cd19e237278600ad327 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 7 Jun 2011 09:26:46 -0400 Subject: ftrace: Revert 8ab2b7efd ftrace: Remove unnecessary disabling of irqs Revert the commit that removed the disabling of interrupts around the initial modifying of mcount callers to nops, and update the comment. The original comment was outdated and stated that the interrupts were being disabled to prevent kstop machine, which was required with the old ftrace daemon, but was no longer the case. What the comment failed to mention was that interrupts needed to be disabled to keep interrupts from preempting the modifying of the code and then executing the code that was partially modified. Revert the commit and update the comment. Reported-by: Richard W.M. Jones Tested-by: Richard W.M. Jones Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 204b3eb4e690..908038f57440 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3330,6 +3330,7 @@ static int ftrace_process_locs(struct module *mod, { unsigned long *p; unsigned long addr; + unsigned long flags; mutex_lock(&ftrace_lock); p = start; @@ -3346,7 +3347,13 @@ static int ftrace_process_locs(struct module *mod, ftrace_record_ip(addr); } + /* + * Disable interrupts to prevent interrupts from executing + * code that is being modified. + */ + local_irq_save(flags); ftrace_update_code(mod); + local_irq_restore(flags); mutex_unlock(&ftrace_lock); return 0; -- cgit v1.2.3 From db5e7ecc4abc91b9f26f0c0d79ef88a51e987d90 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 9 Jun 2011 08:40:59 -0400 Subject: tracing: Fix regression in printk_formats file The fix to fix the printk_formats of modules broke the printk_formats of trace_printks in the kernel. The update of what to show via the seq_file was only updated if the passed in fmt was NULL, which happens only on the first iteration. The result was showing the first format every time instead of iterating through the available formats. Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index dff763b7baf1..1f06468a10d7 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -240,13 +240,10 @@ static const char **find_next(void *v, loff_t *pos) const char **fmt = v; int start_index; - if (!fmt) - fmt = __start___trace_bprintk_fmt + *pos; - start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; if (*pos < start_index) - return fmt; + return __start___trace_bprintk_fmt + *pos; return find_next_mod_format(start_index, v, fmt, pos); } -- cgit v1.2.3 From 13863a66c9c8a663665445cf05d68de96ff31830 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 9 Jun 2011 23:14:58 +0200 Subject: genirq: Prevent potential NULL dereference in irq_set_irq_wake() In kernel/irq/manage.c::irq_set_irq_wake() we call irq_get_desc_buslock() which may return NULL, but the code dereferences the result unconditionally. irq_set_irq_wake() has lots of callers - I checked a few and I couldn't find anything that guarantees that they won't call it with some input that will cause irq_get_desc_buslock() to return NULL, so I think it's a good thing to test and -EINVAL was the most sane error code in this situation that I could think of. Not all callers test the return value of irq_set_irq_wake(), but those that do take != 0 to mean error as far as I can see, so they should be fine. I guess those that don't test actually should, but that's a different issue. Signed-off-by: Jesper Juhl Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1106092300360.17868@swampdragon.chaosbits.net Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d64bafb1afd0..0a7840aeb0fb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -491,6 +491,9 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); int ret = 0; + if (!desc) + return -EINVAL; + /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ -- cgit v1.2.3 From 9a432736904d386cda28b987b38ba14dae960ecc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 May 2011 20:38:55 -0700 Subject: rcu: Simplify curing of load woes Make the functions creating the kthreads wake them up. Leverage the fact that the per-node and boost kthreads can run anywhere, thus dispensing with the need to wake them up once the incoming CPU has gone fully online. Signed-off-by: Paul E. McKenney Tested-by: Daniel J Blueman --- kernel/rcutree.c | 65 ++++++++++++++++--------------------------------- kernel/rcutree_plugin.h | 11 +-------- 2 files changed, 22 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 89419ff92e99..0a8ec5b2e208 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1635,6 +1635,20 @@ static int rcu_cpu_kthread(void *arg) * to manipulate rcu_cpu_kthread_task. There might be another CPU * attempting to access it during boot, but the locking in kthread_bind() * will enforce sufficient ordering. + * + * Please note that we cannot simply refuse to wake up the per-CPU + * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, + * which can result in softlockup complaints if the task ends up being + * idle for more than a couple of minutes. + * + * However, please note also that we cannot bind the per-CPU kthread to its + * CPU until that CPU is fully online. We also cannot wait until the + * CPU is fully online before we create its per-CPU kthread, as this would + * deadlock the system when CPU notifiers tried waiting for grace + * periods. So we bind the per-CPU kthread to its CPU only if the CPU + * is online. If its CPU is not yet fully online, then the code in + * rcu_cpu_kthread() will wait until it is fully online, and then do + * the binding. */ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) { @@ -1647,12 +1661,14 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); if (IS_ERR(t)) return PTR_ERR(t); - kthread_bind(t, cpu); + if (cpu_online(cpu)) + kthread_bind(t, cpu); per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); - per_cpu(rcu_cpu_kthread_task, cpu) = t; sp.sched_priority = RCU_KTHREAD_PRIO; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + per_cpu(rcu_cpu_kthread_task, cpu) = t; + wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ return 0; } @@ -1759,12 +1775,11 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = 99; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ } return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); } -static void rcu_wake_one_boost_kthread(struct rcu_node *rnp); - /* * Spawn all kthreads -- called as soon as the scheduler is running. */ @@ -1772,30 +1787,18 @@ static int __init rcu_spawn_kthreads(void) { int cpu; struct rcu_node *rnp; - struct task_struct *t; rcu_kthreads_spawnable = 1; for_each_possible_cpu(cpu) { per_cpu(rcu_cpu_has_work, cpu) = 0; - if (cpu_online(cpu)) { + if (cpu_online(cpu)) (void)rcu_spawn_one_cpu_kthread(cpu); - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t) - wake_up_process(t); - } } rnp = rcu_get_root(rcu_state); (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - if (rnp->node_kthread_task) - wake_up_process(rnp->node_kthread_task); if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state, rnp) { + rcu_for_each_leaf_node(rcu_state, rnp) (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - t = rnp->node_kthread_task; - if (t) - wake_up_process(t); - rcu_wake_one_boost_kthread(rnp); - } } return 0; } @@ -2220,31 +2223,6 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) } } -/* - * kthread_create() creates threads in TASK_UNINTERRUPTIBLE state, - * but the RCU threads are woken on demand, and if demand is low this - * could be a while triggering the hung task watchdog. - * - * In order to avoid this, poke all tasks once the CPU is fully - * up and running. - */ -static void __cpuinit rcu_online_kthreads(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); - struct rcu_node *rnp = rdp->mynode; - struct task_struct *t; - - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t) - wake_up_process(t); - - t = rnp->node_kthread_task; - if (t) - wake_up_process(t); - - rcu_wake_one_boost_kthread(rnp); -} - /* * Handle CPU online/offline notification events. */ @@ -2262,7 +2240,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_prepare_kthreads(cpu); break; case CPU_ONLINE: - rcu_online_kthreads(cpu); case CPU_DOWN_FAILED: rcu_node_kthread_setaffinity(rnp, -1); rcu_cpu_kthread_setrt(cpu, 1); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c8bff3099a89..ea2e2fb79e81 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1299,15 +1299,10 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = RCU_KTHREAD_PRIO; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ return 0; } -static void __cpuinit rcu_wake_one_boost_kthread(struct rcu_node *rnp) -{ - if (rnp->boost_kthread_task) - wake_up_process(rnp->boost_kthread_task); -} - #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1331,10 +1326,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } -static void __cpuinit rcu_wake_one_boost_kthread(struct rcu_node *rnp) -{ -} - #endif /* #else #ifdef CONFIG_RCU_BOOST */ #ifndef CONFIG_SMP -- cgit v1.2.3 From 09223371deac67d08ca0b70bd18787920284c967 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 14 Jun 2011 13:26:25 +0800 Subject: rcu: Use softirq to address performance regression Commit a26ac2455ffcf3(rcu: move TREE_RCU from softirq to kthread) introduced performance regression. In an AIM7 test, this commit degraded performance by about 40%. The commit runs rcu callbacks in a kthread instead of softirq. We observed high rate of context switch which is caused by this. Out test system has 64 CPUs and HZ is 1000, so we saw more than 64k context switch per second which is caused by RCU's per-CPU kthread. A trace showed that most of the time the RCU per-CPU kthread doesn't actually handle any callbacks, but instead just does a very small amount of work handling grace periods. This means that RCU's per-CPU kthreads are making the scheduler do quite a bit of work in order to allow a very small amount of RCU-related processing to be done. Alex Shi's analysis determined that this slowdown is due to lock contention within the scheduler. Unfortunately, as Peter Zijlstra points out, the scheduler's real-time semantics require global action, which means that this contention is inherent in real-time scheduling. (Yes, perhaps someone will come up with a workaround -- otherwise, -rt is not going to do well on large SMP systems -- but this patch will work around this issue in the meantime. And "the meantime" might well be forever.) This patch therefore re-introduces softirq processing to RCU, but only for core RCU work. RCU callbacks are still executed in kthread context, so that only a small amount of RCU work runs in softirq context in the common case. This should minimize ksoftirqd execution, allowing us to skip boosting of ksoftirqd for CONFIG_RCU_BOOST=y kernels. Signed-off-by: Shaohua Li Tested-by: "Alex,Shi" Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 23 +++++++++++++++++++---- kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 9 +++++++++ kernel/softirq.c | 2 +- 4 files changed, 30 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0a8ec5b2e208..ae5c9ea68662 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -100,6 +100,7 @@ static char rcu_kthreads_spawnable; static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_cpu_kthread(void); +static void __invoke_rcu_cpu_kthread(void); #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ @@ -1442,13 +1443,21 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* If there are callbacks ready, invoke them. */ - rcu_do_batch(rsp, rdp); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + __invoke_rcu_cpu_kthread(); +} + +static void rcu_kthread_do_work(void) +{ + rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); + rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + rcu_preempt_do_callbacks(); } /* * Do softirq processing for the current CPU. */ -static void rcu_process_callbacks(void) +static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); @@ -1465,7 +1474,7 @@ static void rcu_process_callbacks(void) * the current CPU with interrupts disabled, the rcu_cpu_kthread_task * cannot disappear out from under us. */ -static void invoke_rcu_cpu_kthread(void) +static void __invoke_rcu_cpu_kthread(void) { unsigned long flags; @@ -1479,6 +1488,11 @@ static void invoke_rcu_cpu_kthread(void) local_irq_restore(flags); } +static void invoke_rcu_cpu_kthread(void) +{ + raise_softirq(RCU_SOFTIRQ); +} + /* * Wake up the specified per-rcu_node-structure kthread. * Because the per-rcu_node kthreads are immortal, we don't need @@ -1613,7 +1627,7 @@ static int rcu_cpu_kthread(void *arg) *workp = 0; local_irq_restore(flags); if (work) - rcu_process_callbacks(); + rcu_kthread_do_work(); local_bh_enable(); if (*workp != 0) spincnt++; @@ -2387,6 +2401,7 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7b9a08b4aaea..0fed6b934d2a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -439,6 +439,7 @@ static void rcu_preempt_offline_cpu(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_preempt_check_callbacks(int cpu); static void rcu_preempt_process_callbacks(void); +static void rcu_preempt_do_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index ea2e2fb79e81..38d09c5f2b41 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -602,6 +602,11 @@ static void rcu_preempt_process_callbacks(void) &__get_cpu_var(rcu_preempt_data)); } +static void rcu_preempt_do_callbacks(void) +{ + rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); +} + /* * Queue a preemptible-RCU callback for invocation after a grace period. */ @@ -997,6 +1002,10 @@ static void rcu_preempt_process_callbacks(void) { } +static void rcu_preempt_do_callbacks(void) +{ +} + /* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptible RCU does not exist, map to rcu-sched. diff --git a/kernel/softirq.c b/kernel/softirq.c index 13960170cad4..40cf63ddd4b3 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", - "TASKLET", "SCHED", "HRTIMER" + "TASKLET", "SCHED", "HRTIMER", "RCU" }; /* -- cgit v1.2.3 From ada9c93312f7ec49514c68c211595ce2601cebae Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 14 Jun 2011 15:50:11 -0700 Subject: signal.c: fix kernel-doc notation Fix kernel-doc warnings in signal.c: Warning(kernel/signal.c:2374): No description found for parameter 'nset' Warning(kernel/signal.c:2374): Excess function parameter 'set' description in 'sys_rt_sigprocmask' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 86c32b884f8e..ff7678603328 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2365,7 +2365,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) /** * sys_rt_sigprocmask - change the list of currently blocked signals * @how: whether to add, remove, or set signals - * @set: stores pending signals + * @nset: stores pending signals * @oset: previous value of signal mask if non-null * @sigsetsize: size of sigset_t type */ -- cgit v1.2.3 From 8dd0de8be31b4b966d17750a0b10df2f575c91ac Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 14 Jun 2011 18:36:24 -0400 Subject: sched: Fix need_resched() when checking peempt The RT preempt check tests the wrong task if NEED_RESCHED is set. It currently checks the local CPU task. It is supposed to check the task that is running on the runqueue we are about to wake another task on. Signed-off-by: Hillf Danton Reviewed-by: Yong Zhang Signed-off-by: Steven Rostedt Link: http://lkml.kernel.org/r/20110614223657.450239027@goodmis.org Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 88725c939e0b..9b8d5dce946e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1096,7 +1096,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag * to move current somewhere else, making room for our non-migratable * task. */ - if (p->prio == rq->curr->prio && !need_resched()) + if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); #endif } -- cgit v1.2.3 From 0da938c44921cfb690283d3b0c9c48a10375db2c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Jun 2011 18:36:25 -0400 Subject: sched: Check if lowest_mask is initialized in find_lowest_rq() On system boot up, the lowest_mask is initialized with an early_initcall(). But RT tasks may wake up on other early_initcall() callers before the lowest_mask is initialized, causing a system crash. Commit "d72bce0e67 rcu: Cure load woes" was the first commit to wake up RT tasks in early init. Before this commit this bug should not happen. Reported-by: Andrew Theurer Tested-by: Andrew Theurer Tested-by: Paul E. McKenney Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110614223657.824872966@goodmis.org Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9b8d5dce946e..10d018212bab 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1239,6 +1239,10 @@ static int find_lowest_rq(struct task_struct *task) int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + /* Make sure the mask is initialized first */ + if (unlikely(!lowest_mask)) + return -1; + if (task->rt.nr_cpus_allowed == 1) return -1; /* No other targets possible */ -- cgit v1.2.3 From 733eda7ac316cd4e550fa096e4ed42356dc546e7 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 15 Jun 2011 15:08:43 -0700 Subject: memcg: clear mm->owner when last possible owner leaves The following crash was reported: > Call Trace: > [] mem_cgroup_from_task+0x15/0x17 > [] __mem_cgroup_try_charge+0x148/0x4b4 > [] ? need_resched+0x23/0x2d > [] ? preempt_schedule+0x46/0x4f > [] mem_cgroup_charge_common+0x9a/0xce > [] mem_cgroup_newpage_charge+0x5d/0x5f > [] khugepaged+0x5da/0xfaf > [] ? __init_waitqueue_head+0x4b/0x4b > [] ? add_mm_counter.constprop.5+0x13/0x13 > [] kthread+0xa8/0xb0 > [] ? sub_preempt_count+0xa1/0xb4 > [] kernel_thread_helper+0x4/0x10 > [] ? retint_restore_args+0x13/0x13 > [] ? __init_kthread_worker+0x5a/0x5a What happens is that khugepaged tries to charge a huge page against an mm whose last possible owner has already exited, and the memory controller crashes when the stale mm->owner is used to look up the cgroup to charge. mm->owner has never been set to NULL with the last owner going away, but nobody cared until khugepaged came along. Even then it wasn't a problem because the final mmput() on an mm was forced to acquire and release mmap_sem in write-mode, preventing an exiting owner to go away while the mmap_sem was held, and until "692e0b3 mm: thp: optimize memcg charge in khugepaged", the memory cgroup charge was protected by mmap_sem in read-mode. Instead of going back to relying on the mmap_sem to enforce lifetime of a task, this patch ensures that mm->owner is properly set to NULL when the last possible owner is exiting, which the memory controller can handle just fine. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Hugh Dickins Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Johannes Weiner Reported-by: Hugh Dickins Reported-by: Dave Jones Reviewed-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 20a406471525..f2b321bae440 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -561,29 +561,28 @@ void exit_files(struct task_struct *tsk) #ifdef CONFIG_MM_OWNER /* - * Task p is exiting and it owned mm, lets find a new owner for it + * A task is exiting. If it owned this mm, find a new owner for the mm. */ -static inline int -mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) -{ - /* - * If there are other users of the mm and the owner (us) is exiting - * we need to find a new owner to take on the responsibility. - */ - if (atomic_read(&mm->mm_users) <= 1) - return 0; - if (mm->owner != p) - return 0; - return 1; -} - void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *c, *g, *p = current; retry: - if (!mm_need_new_owner(mm, p)) + /* + * If the exiting or execing task is not the owner, it's + * someone else's problem. + */ + if (mm->owner != p) return; + /* + * The current owner is exiting/execing and there are no other + * candidates. Do not leave the mm pointing to a possibly + * freed task structure. + */ + if (atomic_read(&mm->mm_users) <= 1) { + mm->owner = NULL; + return; + } read_lock(&tasklist_lock); /* -- cgit v1.2.3 From d2c32258798f813dc2be6cbc32f78aa5ac5cb205 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 15 Jun 2011 15:08:47 -0700 Subject: gcov: disable CONFIG_CONSTRUCTORS when not needed by CONFIG_GCOV_KERNEL CONFIG_CONSTRUCTORS controls support for running constructor functions at kernel init time. According to commit b99b87f70c7785ab ("kernel: constructor support"), gcov (CONFIG_GCOV_KERNEL) needs this. However, CONFIG_CONSTRUCTORS currently defaults to y, with no option to disable it, and CONFIG_GCOV_KERNEL depends on it. Instead, default it to n and have CONFIG_GCOV_KERNEL select it, so that the normal case of CONFIG_GCOV_KERNEL=n will result in CONFIG_CONSTRUCTORS=n. Observed in the short list of =y values in a minimal kernel configuration. Signed-off-by: Josh Triplett Acked-by: WANG Cong Acked-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index b8cadf70b1fb..5bf924d80b5c 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" - depends on DEBUG_FS && CONSTRUCTORS + depends on DEBUG_FS + select CONSTRUCTORS default n ---help--- This option enables gcov-based code profiling (e.g. for code coverage -- cgit v1.2.3 From a46e0899eec7a3069bcadd45dfba7bf67c6ed016 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 15 Jun 2011 15:47:09 -0700 Subject: rcu: use softirq instead of kthreads except when RCU_BOOST=y This patch #ifdefs RCU kthreads out of the kernel unless RCU_BOOST=y, thus eliminating context-switch overhead if RCU priority boosting has not been configured. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 59 ++++++++++++++++++++++++++++++++++++------------- kernel/rcutree.h | 8 +++++-- kernel/rcutree_plugin.h | 41 +++++++++++++++++++++------------- kernel/rcutree_trace.c | 32 +++++++++++++++++++-------- 4 files changed, 99 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ae5c9ea68662..429d4949f0eb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -87,6 +87,8 @@ static struct rcu_state *rcu_state; int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#ifdef CONFIG_RCU_BOOST + /* * Control variables for per-CPU and per-rcu_node kthreads. These * handle all flavors of RCU. @@ -98,9 +100,11 @@ DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DEFINE_PER_CPU(char, rcu_cpu_has_work); static char rcu_kthreads_spawnable; +#endif /* #ifdef CONFIG_RCU_BOOST */ + static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); -static void invoke_rcu_cpu_kthread(void); -static void __invoke_rcu_cpu_kthread(void); +static void invoke_rcu_core(void); +static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ @@ -1089,6 +1093,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; +#ifdef CONFIG_RCU_BOOST struct task_struct *t; /* Stop the CPU's kthread. */ @@ -1097,6 +1102,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) per_cpu(rcu_cpu_kthread_task, cpu) = NULL; kthread_stop(t); } +#endif /* #ifdef CONFIG_RCU_BOOST */ /* Exclude any attempts to start a new grace period. */ raw_spin_lock_irqsave(&rsp->onofflock, flags); @@ -1232,7 +1238,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Re-raise the RCU softirq if there are callbacks remaining. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) - invoke_rcu_cpu_kthread(); + invoke_rcu_core(); } /* @@ -1278,7 +1284,7 @@ void rcu_check_callbacks(int cpu, int user) } rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) - invoke_rcu_cpu_kthread(); + invoke_rcu_core(); } #ifdef CONFIG_SMP @@ -1444,9 +1450,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) /* If there are callbacks ready, invoke them. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) - __invoke_rcu_cpu_kthread(); + invoke_rcu_callbacks(rsp, rdp); } +#ifdef CONFIG_RCU_BOOST + static void rcu_kthread_do_work(void) { rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); @@ -1454,6 +1462,8 @@ static void rcu_kthread_do_work(void) rcu_preempt_do_callbacks(); } +#endif /* #ifdef CONFIG_RCU_BOOST */ + /* * Do softirq processing for the current CPU. */ @@ -1474,25 +1484,22 @@ static void rcu_process_callbacks(struct softirq_action *unused) * the current CPU with interrupts disabled, the rcu_cpu_kthread_task * cannot disappear out from under us. */ -static void __invoke_rcu_cpu_kthread(void) +static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { - unsigned long flags; - - local_irq_save(flags); - __this_cpu_write(rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { - local_irq_restore(flags); + if (likely(!rsp->boost)) { + rcu_do_batch(rsp, rdp); return; } - wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); - local_irq_restore(flags); + invoke_rcu_callbacks_kthread(); } -static void invoke_rcu_cpu_kthread(void) +static void invoke_rcu_core(void) { raise_softirq(RCU_SOFTIRQ); } +#ifdef CONFIG_RCU_BOOST + /* * Wake up the specified per-rcu_node-structure kthread. * Because the per-rcu_node kthreads are immortal, we don't need @@ -1818,6 +1825,18 @@ static int __init rcu_spawn_kthreads(void) } early_initcall(rcu_spawn_kthreads); +#else /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +{ +} + +static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_state *rsp) @@ -2224,6 +2243,8 @@ static void __cpuinit rcu_prepare_cpu(int cpu) rcu_preempt_init_percpu_data(cpu); } +#ifdef CONFIG_RCU_BOOST + static void __cpuinit rcu_prepare_kthreads(int cpu) { struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); @@ -2237,6 +2258,14 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) } } +#else /* #ifdef CONFIG_RCU_BOOST */ + +static void __cpuinit rcu_prepare_kthreads(int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + /* * Handle CPU online/offline notification events. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 0fed6b934d2a..434288c7ad88 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -369,6 +369,7 @@ struct rcu_state { /* period because */ /* force_quiescent_state() */ /* was running. */ + u8 boost; /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ @@ -439,7 +440,6 @@ static void rcu_preempt_offline_cpu(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_preempt_check_callbacks(int cpu); static void rcu_preempt_process_callbacks(void); -static void rcu_preempt_do_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); @@ -451,11 +451,15 @@ static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); static void rcu_needs_cpu_flush(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); +static void invoke_rcu_callbacks_kthread(void); +#ifdef CONFIG_RCU_BOOST +static void rcu_preempt_do_callbacks(void); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, cpumask_var_t cm); -static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp, int rnp_index); +#endif /* #ifdef CONFIG_RCU_BOOST */ #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 38d09c5f2b41..2772386c0421 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -602,11 +602,15 @@ static void rcu_preempt_process_callbacks(void) &__get_cpu_var(rcu_preempt_data)); } +#ifdef CONFIG_RCU_BOOST + static void rcu_preempt_do_callbacks(void) { rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); } +#endif /* #ifdef CONFIG_RCU_BOOST */ + /* * Queue a preemptible-RCU callback for invocation after a grace period. */ @@ -1002,10 +1006,6 @@ static void rcu_preempt_process_callbacks(void) { } -static void rcu_preempt_do_callbacks(void) -{ -} - /* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptible RCU does not exist, map to rcu-sched. @@ -1257,6 +1257,23 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) } } +/* + * Wake up the per-CPU kthread to invoke RCU callbacks. + */ +static void invoke_rcu_callbacks_kthread(void) +{ + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_cpu_has_work, 1); + if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { + local_irq_restore(flags); + return; + } + wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); + local_irq_restore(flags); +} + /* * Set the affinity of the boost kthread. The CPU-hotplug locks are * held, so no one should be messing with the existence of the boost @@ -1297,6 +1314,7 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (&rcu_preempt_state != rsp) return 0; + rsp->boost = 1; if (rnp->boost_kthread_task != NULL) return 0; t = kthread_create(rcu_boost_kthread, (void *)rnp, @@ -1319,22 +1337,15 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore(&rnp->lock, flags); } -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, - cpumask_var_t cm) +static void invoke_rcu_callbacks_kthread(void) { + WARN_ON_ONCE(1); } static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } -static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp, - int rnp_index) -{ - return 0; -} - #endif /* #else #ifdef CONFIG_RCU_BOOST */ #ifndef CONFIG_SMP @@ -1509,7 +1520,7 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); * * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a - * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked + * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. */ int rcu_needs_cpu(int cpu) @@ -1560,7 +1571,7 @@ int rcu_needs_cpu(int cpu) /* If RCU callbacks are still pending, RCU still needs this CPU. */ if (c) - invoke_rcu_cpu_kthread(); + invoke_rcu_core(); return c; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 9678cc3650f5..4e144876dc68 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "rcutree.h" +#ifdef CONFIG_RCU_BOOST + DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); @@ -58,6 +60,8 @@ static char convert_kthread_status(unsigned int kthread_status) return "SRWOY"[kthread_status]; } +#endif /* #ifdef CONFIG_RCU_BOOST */ + static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) @@ -76,7 +80,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld", + seq_printf(m, " ql=%ld qs=%c%c%c%c", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], @@ -84,13 +88,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->nxttail[RCU_NEXT_READY_TAIL]], ".W"[rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); +#ifdef CONFIG_RCU_BOOST + seq_printf(m, " kt=%d/%c/%d ktl=%x", per_cpu(rcu_cpu_has_work, rdp->cpu), convert_kthread_status(per_cpu(rcu_cpu_kthread_status, rdp->cpu)), per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), - per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff, - rdp->blimit); + per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); +#endif /* #ifdef CONFIG_RCU_BOOST */ + seq_printf(m, " b=%ld", rdp->blimit); seq_printf(m, " ci=%lu co=%lu ca=%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } @@ -147,18 +154,21 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen, + seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], ".R"[rdp->nxttail[RCU_WAIT_TAIL] != rdp->nxttail[RCU_NEXT_READY_TAIL]], ".W"[rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); +#ifdef CONFIG_RCU_BOOST + seq_printf(m, ",%d,\"%c\"", per_cpu(rcu_cpu_has_work, rdp->cpu), convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu)), - rdp->blimit); + rdp->cpu))); +#endif /* #ifdef CONFIG_RCU_BOOST */ + seq_printf(m, ",%ld", rdp->blimit); seq_printf(m, ",%lu,%lu,%lu\n", rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } @@ -169,7 +179,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) #ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ - seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); + seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); +#ifdef CONFIG_RCU_BOOST + seq_puts(m, "\"kt\",\"ktl\""); +#endif /* #ifdef CONFIG_RCU_BOOST */ + seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); #ifdef CONFIG_TREE_PREEMPT_RCU seq_puts(m, "\"rcu_preempt:\"\n"); PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); -- cgit v1.2.3 From b5199515c25cca622495eb9c6a8a1d275e775088 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Jun 2011 16:22:08 +0200 Subject: clocksource: Make watchdog robust vs. interruption The clocksource watchdog code is interruptible and it has been observed that this can trigger false positives which disable the TSC. The reason is that an interrupt storm or a long running interrupt handler between the read of the watchdog source and the read of the TSC brings the two far enough apart that the delta is larger than the unstable treshold. Move both reads into a short interrupt disabled region to avoid that. Reported-and-tested-by: Vernon Mauery Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/time/clocksource.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1c95fd677328..e0980f0d9a0a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -185,7 +185,6 @@ static struct clocksource *watchdog; static struct timer_list watchdog_timer; static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); -static cycle_t watchdog_last; static int watchdog_running; static int clocksource_watchdog_kthread(void *data); @@ -254,11 +253,6 @@ static void clocksource_watchdog(unsigned long data) if (!watchdog_running) goto out; - wdnow = watchdog->read(watchdog); - wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, - watchdog->mult, watchdog->shift); - watchdog_last = wdnow; - list_for_each_entry(cs, &watchdog_list, wd_list) { /* Clocksource already marked unstable? */ @@ -268,19 +262,28 @@ static void clocksource_watchdog(unsigned long data) continue; } + local_irq_disable(); csnow = cs->read(cs); + wdnow = watchdog->read(watchdog); + local_irq_enable(); /* Clocksource initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { cs->flags |= CLOCK_SOURCE_WATCHDOG; - cs->wd_last = csnow; + cs->wd_last = wdnow; + cs->cs_last = csnow; continue; } - /* Check the deviation from the watchdog clocksource. */ - cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & + wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, + watchdog->mult, watchdog->shift); + + cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & cs->mask, cs->mult, cs->shift); - cs->wd_last = csnow; + cs->cs_last = csnow; + cs->wd_last = wdnow; + + /* Check the deviation from the watchdog clocksource. */ if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { clocksource_unstable(cs, cs_nsec - wd_nsec); continue; @@ -318,7 +321,6 @@ static inline void clocksource_start_watchdog(void) return; init_timer(&watchdog_timer); watchdog_timer.function = clocksource_watchdog; - watchdog_last = watchdog->read(watchdog); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); watchdog_running = 1; -- cgit v1.2.3 From f8b7fc6b514f34a51875dd48dff70d4d17a54f38 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 16 Jun 2011 08:26:32 -0700 Subject: rcu: Move RCU_BOOST #ifdefs to header file The commit "use softirq instead of kthreads except when RCU_BOOST=y" just applied #ifdef in place. This commit is a cleanup that moves the newly #ifdef'ed code to the header file kernel/rcutree_plugin.h. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 383 +---------------------------------------------- kernel/rcutree.h | 5 + kernel/rcutree_plugin.h | 384 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 390 insertions(+), 382 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 429d4949f0eb..7e59ffb3d0ba 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1093,16 +1093,8 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; -#ifdef CONFIG_RCU_BOOST - struct task_struct *t; - /* Stop the CPU's kthread. */ - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t != NULL) { - per_cpu(rcu_cpu_kthread_task, cpu) = NULL; - kthread_stop(t); - } -#endif /* #ifdef CONFIG_RCU_BOOST */ + rcu_stop_cpu_kthread(cpu); /* Exclude any attempts to start a new grace period. */ raw_spin_lock_irqsave(&rsp->onofflock, flags); @@ -1453,17 +1445,6 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) invoke_rcu_callbacks(rsp, rdp); } -#ifdef CONFIG_RCU_BOOST - -static void rcu_kthread_do_work(void) -{ - rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); - rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); - rcu_preempt_do_callbacks(); -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - /* * Do softirq processing for the current CPU. */ @@ -1498,345 +1479,6 @@ static void invoke_rcu_core(void) raise_softirq(RCU_SOFTIRQ); } -#ifdef CONFIG_RCU_BOOST - -/* - * Wake up the specified per-rcu_node-structure kthread. - * Because the per-rcu_node kthreads are immortal, we don't need - * to do anything to keep them alive. - */ -static void invoke_rcu_node_kthread(struct rcu_node *rnp) -{ - struct task_struct *t; - - t = rnp->node_kthread_task; - if (t != NULL) - wake_up_process(t); -} - -/* - * Set the specified CPU's kthread to run RT or not, as specified by - * the to_rt argument. The CPU-hotplug locks are held, so the task - * is not going away. - */ -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) -{ - int policy; - struct sched_param sp; - struct task_struct *t; - - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (t == NULL) - return; - if (to_rt) { - policy = SCHED_FIFO; - sp.sched_priority = RCU_KTHREAD_PRIO; - } else { - policy = SCHED_NORMAL; - sp.sched_priority = 0; - } - sched_setscheduler_nocheck(t, policy, &sp); -} - -/* - * Timer handler to initiate the waking up of per-CPU kthreads that - * have yielded the CPU due to excess numbers of RCU callbacks. - * We wake up the per-rcu_node kthread, which in turn will wake up - * the booster kthread. - */ -static void rcu_cpu_kthread_timer(unsigned long arg) -{ - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); - struct rcu_node *rnp = rdp->mynode; - - atomic_or(rdp->grpmask, &rnp->wakemask); - invoke_rcu_node_kthread(rnp); -} - -/* - * Drop to non-real-time priority and yield, but only after posting a - * timer that will cause us to regain our real-time priority if we - * remain preempted. Either way, we restore our real-time priority - * before returning. - */ -static void rcu_yield(void (*f)(unsigned long), unsigned long arg) -{ - struct sched_param sp; - struct timer_list yield_timer; - - setup_timer_on_stack(&yield_timer, f, arg); - mod_timer(&yield_timer, jiffies + 2); - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); - set_user_nice(current, 19); - schedule(); - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); - del_timer(&yield_timer); -} - -/* - * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. - * This can happen while the corresponding CPU is either coming online - * or going offline. We cannot wait until the CPU is fully online - * before starting the kthread, because the various notifier functions - * can wait for RCU grace periods. So we park rcu_cpu_kthread() until - * the corresponding CPU is online. - * - * Return 1 if the kthread needs to stop, 0 otherwise. - * - * Caller must disable bh. This function can momentarily enable it. - */ -static int rcu_cpu_kthread_should_stop(int cpu) -{ - while (cpu_is_offline(cpu) || - !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || - smp_processor_id() != cpu) { - if (kthread_should_stop()) - return 1; - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; - per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); - local_bh_enable(); - schedule_timeout_uninterruptible(1); - if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - local_bh_disable(); - } - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - return 0; -} - -/* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the - * earlier RCU softirq. - */ -static int rcu_cpu_kthread(void *arg) -{ - int cpu = (int)(long)arg; - unsigned long flags; - int spincnt = 0; - unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); - char work; - char *workp = &per_cpu(rcu_cpu_has_work, cpu); - - for (;;) { - *statusp = RCU_KTHREAD_WAITING; - rcu_wait(*workp != 0 || kthread_should_stop()); - local_bh_disable(); - if (rcu_cpu_kthread_should_stop(cpu)) { - local_bh_enable(); - break; - } - *statusp = RCU_KTHREAD_RUNNING; - per_cpu(rcu_cpu_kthread_loops, cpu)++; - local_irq_save(flags); - work = *workp; - *workp = 0; - local_irq_restore(flags); - if (work) - rcu_kthread_do_work(); - local_bh_enable(); - if (*workp != 0) - spincnt++; - else - spincnt = 0; - if (spincnt > 10) { - *statusp = RCU_KTHREAD_YIELDING; - rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); - spincnt = 0; - } - } - *statusp = RCU_KTHREAD_STOPPED; - return 0; -} - -/* - * Spawn a per-CPU kthread, setting up affinity and priority. - * Because the CPU hotplug lock is held, no other CPU will be attempting - * to manipulate rcu_cpu_kthread_task. There might be another CPU - * attempting to access it during boot, but the locking in kthread_bind() - * will enforce sufficient ordering. - * - * Please note that we cannot simply refuse to wake up the per-CPU - * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, - * which can result in softlockup complaints if the task ends up being - * idle for more than a couple of minutes. - * - * However, please note also that we cannot bind the per-CPU kthread to its - * CPU until that CPU is fully online. We also cannot wait until the - * CPU is fully online before we create its per-CPU kthread, as this would - * deadlock the system when CPU notifiers tried waiting for grace - * periods. So we bind the per-CPU kthread to its CPU only if the CPU - * is online. If its CPU is not yet fully online, then the code in - * rcu_cpu_kthread() will wait until it is fully online, and then do - * the binding. - */ -static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) -{ - struct sched_param sp; - struct task_struct *t; - - if (!rcu_kthreads_spawnable || - per_cpu(rcu_cpu_kthread_task, cpu) != NULL) - return 0; - t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); - if (IS_ERR(t)) - return PTR_ERR(t); - if (cpu_online(cpu)) - kthread_bind(t, cpu); - per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; - WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - per_cpu(rcu_cpu_kthread_task, cpu) = t; - wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ - return 0; -} - -/* - * Per-rcu_node kthread, which is in charge of waking up the per-CPU - * kthreads when needed. We ignore requests to wake up kthreads - * for offline CPUs, which is OK because force_quiescent_state() - * takes care of this case. - */ -static int rcu_node_kthread(void *arg) -{ - int cpu; - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp = (struct rcu_node *)arg; - struct sched_param sp; - struct task_struct *t; - - for (;;) { - rnp->node_kthread_status = RCU_KTHREAD_WAITING; - rcu_wait(atomic_read(&rnp->wakemask) != 0); - rnp->node_kthread_status = RCU_KTHREAD_RUNNING; - raw_spin_lock_irqsave(&rnp->lock, flags); - mask = atomic_xchg(&rnp->wakemask, 0); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { - if ((mask & 0x1) == 0) - continue; - preempt_disable(); - t = per_cpu(rcu_cpu_kthread_task, cpu); - if (!cpu_online(cpu) || t == NULL) { - preempt_enable(); - continue; - } - per_cpu(rcu_cpu_has_work, cpu) = 1; - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - preempt_enable(); - } - } - /* NOTREACHED */ - rnp->node_kthread_status = RCU_KTHREAD_STOPPED; - return 0; -} - -/* - * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. The CPU hotplug lock is still - * held, so the value of rnp->qsmaskinit will be stable. - * - * We don't include outgoingcpu in the affinity set, use -1 if there is - * no outgoing CPU. If there are no CPUs left in the affinity set, - * this function allows the kthread to execute on any CPU. - */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ - cpumask_var_t cm; - int cpu; - unsigned long mask = rnp->qsmaskinit; - - if (rnp->node_kthread_task == NULL) - return; - if (!alloc_cpumask_var(&cm, GFP_KERNEL)) - return; - cpumask_clear(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) - if ((mask & 0x1) && cpu != outgoingcpu) - cpumask_set_cpu(cpu, cm); - if (cpumask_weight(cm) == 0) { - cpumask_setall(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) - cpumask_clear_cpu(cpu, cm); - WARN_ON_ONCE(cpumask_weight(cm) == 0); - } - set_cpus_allowed_ptr(rnp->node_kthread_task, cm); - rcu_boost_kthread_setaffinity(rnp, cm); - free_cpumask_var(cm); -} - -/* - * Spawn a per-rcu_node kthread, setting priority and affinity. - * Called during boot before online/offline can happen, or, if - * during runtime, with the main CPU-hotplug locks held. So only - * one of these can be executing at a time. - */ -static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) -{ - unsigned long flags; - int rnp_index = rnp - &rsp->node[0]; - struct sched_param sp; - struct task_struct *t; - - if (!rcu_kthreads_spawnable || - rnp->qsmaskinit == 0) - return 0; - if (rnp->node_kthread_task == NULL) { - t = kthread_create(rcu_node_kthread, (void *)rnp, - "rcun%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->node_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = 99; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - } - return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); -} - -/* - * Spawn all kthreads -- called as soon as the scheduler is running. - */ -static int __init rcu_spawn_kthreads(void) -{ - int cpu; - struct rcu_node *rnp; - - rcu_kthreads_spawnable = 1; - for_each_possible_cpu(cpu) { - per_cpu(rcu_cpu_has_work, cpu) = 0; - if (cpu_online(cpu)) - (void)rcu_spawn_one_cpu_kthread(cpu); - } - rnp = rcu_get_root(rcu_state); - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state, rnp) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - } - return 0; -} -early_initcall(rcu_spawn_kthreads); - -#else /* #ifdef CONFIG_RCU_BOOST */ - -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ -} - -static void rcu_cpu_kthread_setrt(int cpu, int to_rt) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_state *rsp) @@ -2243,29 +1885,6 @@ static void __cpuinit rcu_prepare_cpu(int cpu) rcu_preempt_init_percpu_data(cpu); } -#ifdef CONFIG_RCU_BOOST - -static void __cpuinit rcu_prepare_kthreads(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); - struct rcu_node *rnp = rdp->mynode; - - /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ - if (rcu_kthreads_spawnable) { - (void)rcu_spawn_one_cpu_kthread(cpu); - if (rnp->node_kthread_task == NULL) - (void)rcu_spawn_one_node_kthread(rcu_state, rnp); - } -} - -#else /* #ifdef CONFIG_RCU_BOOST */ - -static void __cpuinit rcu_prepare_kthreads(int cpu) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - /* * Handle CPU online/offline notification events. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 434288c7ad88..01b2ccda26fb 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -427,6 +427,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); +static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); @@ -460,6 +461,10 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp, int rnp_index); +static void invoke_rcu_node_kthread(struct rcu_node *rnp); +static void rcu_yield(void (*f)(unsigned long), unsigned long arg); #endif /* #ifdef CONFIG_RCU_BOOST */ +static void rcu_cpu_kthread_setrt(int cpu, int to_rt); +static void __cpuinit rcu_prepare_kthreads(int cpu); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 2772386c0421..14dc7dd00902 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1330,6 +1330,370 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Stop the RCU's per-CPU kthread when its CPU goes offline,. + */ +static void rcu_stop_cpu_kthread(int cpu) +{ + struct task_struct *t; + + /* Stop the CPU's kthread. */ + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t != NULL) { + per_cpu(rcu_cpu_kthread_task, cpu) = NULL; + kthread_stop(t); + } +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +static void rcu_kthread_do_work(void) +{ + rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); + rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + rcu_preempt_do_callbacks(); +} + +/* + * Wake up the specified per-rcu_node-structure kthread. + * Because the per-rcu_node kthreads are immortal, we don't need + * to do anything to keep them alive. + */ +static void invoke_rcu_node_kthread(struct rcu_node *rnp) +{ + struct task_struct *t; + + t = rnp->node_kthread_task; + if (t != NULL) + wake_up_process(t); +} + +/* + * Set the specified CPU's kthread to run RT or not, as specified by + * the to_rt argument. The CPU-hotplug locks are held, so the task + * is not going away. + */ +static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +{ + int policy; + struct sched_param sp; + struct task_struct *t; + + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t == NULL) + return; + if (to_rt) { + policy = SCHED_FIFO; + sp.sched_priority = RCU_KTHREAD_PRIO; + } else { + policy = SCHED_NORMAL; + sp.sched_priority = 0; + } + sched_setscheduler_nocheck(t, policy, &sp); +} + +/* + * Timer handler to initiate the waking up of per-CPU kthreads that + * have yielded the CPU due to excess numbers of RCU callbacks. + * We wake up the per-rcu_node kthread, which in turn will wake up + * the booster kthread. + */ +static void rcu_cpu_kthread_timer(unsigned long arg) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); + struct rcu_node *rnp = rdp->mynode; + + atomic_or(rdp->grpmask, &rnp->wakemask); + invoke_rcu_node_kthread(rnp); +} + +/* + * Drop to non-real-time priority and yield, but only after posting a + * timer that will cause us to regain our real-time priority if we + * remain preempted. Either way, we restore our real-time priority + * before returning. + */ +static void rcu_yield(void (*f)(unsigned long), unsigned long arg) +{ + struct sched_param sp; + struct timer_list yield_timer; + + setup_timer_on_stack(&yield_timer, f, arg); + mod_timer(&yield_timer, jiffies + 2); + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); + set_user_nice(current, 19); + schedule(); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + del_timer(&yield_timer); +} + +/* + * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. + * This can happen while the corresponding CPU is either coming online + * or going offline. We cannot wait until the CPU is fully online + * before starting the kthread, because the various notifier functions + * can wait for RCU grace periods. So we park rcu_cpu_kthread() until + * the corresponding CPU is online. + * + * Return 1 if the kthread needs to stop, 0 otherwise. + * + * Caller must disable bh. This function can momentarily enable it. + */ +static int rcu_cpu_kthread_should_stop(int cpu) +{ + while (cpu_is_offline(cpu) || + !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || + smp_processor_id() != cpu) { + if (kthread_should_stop()) + return 1; + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; + per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); + local_bh_enable(); + schedule_timeout_uninterruptible(1); + if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + local_bh_disable(); + } + per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; + return 0; +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the + * earlier RCU softirq. + */ +static int rcu_cpu_kthread(void *arg) +{ + int cpu = (int)(long)arg; + unsigned long flags; + int spincnt = 0; + unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); + char work; + char *workp = &per_cpu(rcu_cpu_has_work, cpu); + + for (;;) { + *statusp = RCU_KTHREAD_WAITING; + rcu_wait(*workp != 0 || kthread_should_stop()); + local_bh_disable(); + if (rcu_cpu_kthread_should_stop(cpu)) { + local_bh_enable(); + break; + } + *statusp = RCU_KTHREAD_RUNNING; + per_cpu(rcu_cpu_kthread_loops, cpu)++; + local_irq_save(flags); + work = *workp; + *workp = 0; + local_irq_restore(flags); + if (work) + rcu_kthread_do_work(); + local_bh_enable(); + if (*workp != 0) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + *statusp = RCU_KTHREAD_YIELDING; + rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); + spincnt = 0; + } + } + *statusp = RCU_KTHREAD_STOPPED; + return 0; +} + +/* + * Spawn a per-CPU kthread, setting up affinity and priority. + * Because the CPU hotplug lock is held, no other CPU will be attempting + * to manipulate rcu_cpu_kthread_task. There might be another CPU + * attempting to access it during boot, but the locking in kthread_bind() + * will enforce sufficient ordering. + * + * Please note that we cannot simply refuse to wake up the per-CPU + * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, + * which can result in softlockup complaints if the task ends up being + * idle for more than a couple of minutes. + * + * However, please note also that we cannot bind the per-CPU kthread to its + * CPU until that CPU is fully online. We also cannot wait until the + * CPU is fully online before we create its per-CPU kthread, as this would + * deadlock the system when CPU notifiers tried waiting for grace + * periods. So we bind the per-CPU kthread to its CPU only if the CPU + * is online. If its CPU is not yet fully online, then the code in + * rcu_cpu_kthread() will wait until it is fully online, and then do + * the binding. + */ +static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) +{ + struct sched_param sp; + struct task_struct *t; + + if (!rcu_kthreads_spawnable || + per_cpu(rcu_cpu_kthread_task, cpu) != NULL) + return 0; + t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); + if (IS_ERR(t)) + return PTR_ERR(t); + if (cpu_online(cpu)) + kthread_bind(t, cpu); + per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; + WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + per_cpu(rcu_cpu_kthread_task, cpu) = t; + wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ + return 0; +} + +/* + * Per-rcu_node kthread, which is in charge of waking up the per-CPU + * kthreads when needed. We ignore requests to wake up kthreads + * for offline CPUs, which is OK because force_quiescent_state() + * takes care of this case. + */ +static int rcu_node_kthread(void *arg) +{ + int cpu; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp = (struct rcu_node *)arg; + struct sched_param sp; + struct task_struct *t; + + for (;;) { + rnp->node_kthread_status = RCU_KTHREAD_WAITING; + rcu_wait(atomic_read(&rnp->wakemask) != 0); + rnp->node_kthread_status = RCU_KTHREAD_RUNNING; + raw_spin_lock_irqsave(&rnp->lock, flags); + mask = atomic_xchg(&rnp->wakemask, 0); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { + if ((mask & 0x1) == 0) + continue; + preempt_disable(); + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (!cpu_online(cpu) || t == NULL) { + preempt_enable(); + continue; + } + per_cpu(rcu_cpu_has_work, cpu) = 1; + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + preempt_enable(); + } + } + /* NOTREACHED */ + rnp->node_kthread_status = RCU_KTHREAD_STOPPED; + return 0; +} + +/* + * Set the per-rcu_node kthread's affinity to cover all CPUs that are + * served by the rcu_node in question. The CPU hotplug lock is still + * held, so the value of rnp->qsmaskinit will be stable. + * + * We don't include outgoingcpu in the affinity set, use -1 if there is + * no outgoing CPU. If there are no CPUs left in the affinity set, + * this function allows the kthread to execute on any CPU. + */ +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +{ + cpumask_var_t cm; + int cpu; + unsigned long mask = rnp->qsmaskinit; + + if (rnp->node_kthread_task == NULL) + return; + if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + return; + cpumask_clear(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) + if ((mask & 0x1) && cpu != outgoingcpu) + cpumask_set_cpu(cpu, cm); + if (cpumask_weight(cm) == 0) { + cpumask_setall(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) + cpumask_clear_cpu(cpu, cm); + WARN_ON_ONCE(cpumask_weight(cm) == 0); + } + set_cpus_allowed_ptr(rnp->node_kthread_task, cm); + rcu_boost_kthread_setaffinity(rnp, cm); + free_cpumask_var(cm); +} + +/* + * Spawn a per-rcu_node kthread, setting priority and affinity. + * Called during boot before online/offline can happen, or, if + * during runtime, with the main CPU-hotplug locks held. So only + * one of these can be executing at a time. + */ +static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + unsigned long flags; + int rnp_index = rnp - &rsp->node[0]; + struct sched_param sp; + struct task_struct *t; + + if (!rcu_kthreads_spawnable || + rnp->qsmaskinit == 0) + return 0; + if (rnp->node_kthread_task == NULL) { + t = kthread_create(rcu_node_kthread, (void *)rnp, + "rcun%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->node_kthread_task = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + sp.sched_priority = 99; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ + } + return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); +} + +/* + * Spawn all kthreads -- called as soon as the scheduler is running. + */ +static int __init rcu_spawn_kthreads(void) +{ + int cpu; + struct rcu_node *rnp; + + rcu_kthreads_spawnable = 1; + for_each_possible_cpu(cpu) { + per_cpu(rcu_cpu_has_work, cpu) = 0; + if (cpu_online(cpu)) + (void)rcu_spawn_one_cpu_kthread(cpu); + } + rnp = rcu_get_root(rcu_state); + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + if (NUM_RCU_NODES > 1) { + rcu_for_each_leaf_node(rcu_state, rnp) + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + } + return 0; +} +early_initcall(rcu_spawn_kthreads); + +static void __cpuinit rcu_prepare_kthreads(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_node *rnp = rdp->mynode; + + /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ + if (rcu_kthreads_spawnable) { + (void)rcu_spawn_one_cpu_kthread(cpu); + if (rnp->node_kthread_task == NULL) + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + } +} + #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1346,6 +1710,26 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } +#ifdef CONFIG_HOTPLUG_CPU + +static void rcu_stop_cpu_kthread(int cpu) +{ +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +{ +} + +static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +{ +} + +static void __cpuinit rcu_prepare_kthreads(int cpu) +{ +} + #endif /* #else #ifdef CONFIG_RCU_BOOST */ #ifndef CONFIG_SMP -- cgit v1.2.3 From d8ad7d1123a960cc9f276bd499f9325c6f5e1bd1 Mon Sep 17 00:00:00 2001 From: Takao Indoh Date: Tue, 29 Mar 2011 12:35:04 -0400 Subject: generic-ipi: Fix kexec boot crash by initializing call_single_queue before enabling interrupts There is a problem that kdump(2nd kernel) sometimes hangs up due to a pending IPI from 1st kernel. Kernel panic occurs because IPI comes before call_single_queue is initialized. To fix the crash, rename init_call_single_data() to call_function_init() and call it in start_kernel() so that call_single_queue can be initialized before enabling interrupts. The details of the crash are: (1) 2nd kernel boots up (2) A pending IPI from 1st kernel comes when irqs are first enabled in start_kernel(). (3) Kernel tries to handle the interrupt, but call_single_queue is not initialized yet at this point. As a result, in the generic_smp_call_function_single_interrupt(), NULL pointer dereference occurs when list_replace_init() tries to access &q->list.next. Therefore this patch changes the name of init_call_single_data() to call_function_init() and calls it before local_irq_enable() in start_kernel(). Signed-off-by: Takao Indoh Reviewed-by: WANG Cong Acked-by: Neil Horman Acked-by: Vivek Goyal Acked-by: Peter Zijlstra Cc: Milton Miller Cc: Jens Axboe Cc: Paul E. McKenney Cc: kexec@lists.infradead.org Link: http://lkml.kernel.org/r/D6CBEE2F420741indou.takao@jp.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/smp.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 73a195193558..fb67dfa8394e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -74,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { .notifier_call = hotplug_cfd, }; -static int __cpuinit init_call_single_data(void) +void __init call_function_init(void) { void *cpu = (void *)(long)smp_processor_id(); int i; @@ -88,10 +88,7 @@ static int __cpuinit init_call_single_data(void) hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); register_cpu_notifier(&hotplug_cfd_notifier); - - return 0; } -early_initcall(init_call_single_data); /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources -- cgit v1.2.3 From 879669961b11e7f40b518784863a259f735a72bf Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 17 Jun 2011 11:25:59 +0100 Subject: KEYS/DNS: Fix ____call_usermodehelper() to not lose the session keyring ____call_usermodehelper() now erases any credentials set by the subprocess_inf::init() function. The problem is that commit 17f60a7da150 ("capabilites: allow the application of capability limits to usermode helpers") creates and commits new credentials with prepare_kernel_cred() after the call to the init() function. This wipes all keyrings after umh_keys_init() is called. The best way to deal with this is to put the init() call just prior to the commit_creds() call, and pass the cred pointer to init(). That means that umh_keys_init() and suchlike can modify the credentials _before_ they are published and potentially in use by the rest of the system. This prevents request_key() from working as it is prevented from passing the session keyring it set up with the authorisation token to /sbin/request-key, and so the latter can't assume the authority to instantiate the key. This causes the in-kernel DNS resolver to fail with ENOKEY unconditionally. Signed-off-by: David Howells Acked-by: Eric Paris Tested-by: Jeff Layton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index ad6a81c58b44..47613dfb7b28 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -156,12 +156,6 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); - if (sub_info->init) { - retval = sub_info->init(sub_info); - if (retval) - goto fail; - } - retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) @@ -173,6 +167,14 @@ static int ____call_usermodehelper(void *data) new->cap_inheritable); spin_unlock(&umh_sysctl_lock); + if (sub_info->init) { + retval = sub_info->init(sub_info, new); + if (retval) { + abort_creds(new); + goto fail; + } + } + commit_creds(new); retval = kernel_execve(sub_info->path, @@ -388,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); * context in which call_usermodehelper_exec is called. */ void call_usermodehelper_setfns(struct subprocess_info *info, - int (*init)(struct subprocess_info *info), + int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), void *data) { -- cgit v1.2.3