From 78e4bc34e5d966cfd95f1238565afc399d56225c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Tue, 24 Sep 2013 15:04:06 -0700
Subject: rcu: Fix and comment ordering around wait_event()

It is all too easy to forget that wait_event() does not necessarily
imply a full memory barrier.  The case where it does not is where the
condition transitions to true just as wait_event() starts execution.
This is actually a feature: The standard use of wait_event() involves
locking, in which case the locks provide the needed ordering (you hold a
lock across the wake_up() and acquire that same lock after wait_event()
returns).

Given that I did forget that wait_event() does not necessarily imply a
full memory barrier in one case, this commit fixes that case.  This
commit also adds comments calling out the placement of existing memory
barriers relied on by wait_event() calls.

Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree_plugin.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel/rcu/tree_plugin.h')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6abb03dff5c0..b023e5407111 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -779,8 +779,10 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 	}
 	if (rnp->parent == NULL) {
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
-		if (wake)
+		if (wake) {
+			smp_mb(); /* EGP done before wake_up(). */
 			wake_up(&sync_rcu_preempt_exp_wq);
+		}
 		break;
 	}
 	mask = rnp->grpmask;
@@ -1852,6 +1854,7 @@ static int rcu_oom_notify(struct notifier_block *self,

 	/* Wait for callbacks from earlier instance to complete. */
 	wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
+	smp_mb(); /* Ensure callback reuse happens after callback invocation. */

 	/*
 	 * Prevent premature wakeup: ensure that all increments happen
@@ -2250,6 +2253,7 @@ static int rcu_nocb_kthread(void *arg)
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("Sleep"));
 			wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
+			/* Memory barrier provided by xchg() below. */
 		} else if (firsttime) {
 			firsttime = 0;
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
--
cgit v1.2.3
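The pattern that these barriers enforce can be shown in isolation.  Below
is a minimal sketch, not code from the patch above; the sketch_* names are
hypothetical.  If the condition is already true when wait_event() is
entered, wait_event() returns immediately without implying any barrier,
so both sides must order their accesses explicitly (or rely on locking,
as the commit message notes):

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(sketch_wq);
static int sketch_done;
static int sketch_result;

/* Producer: publish the result, then announce completion. */
static void sketch_producer(void)
{
	sketch_result = 42;		/* Work done before announcement. */
	smp_mb();			/* Order result before flag and wake_up(). */
	ACCESS_ONCE(sketch_done) = 1;
	wake_up(&sketch_wq);
}

/* Consumer: wait for completion, then consume the result. */
static void sketch_consumer(void)
{
	wait_event(sketch_wq, ACCESS_ONCE(sketch_done));
	smp_mb();			/* Pair with the producer's barrier. */
	/* sketch_result is now guaranteed to read as 42. */
}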
From 96d3fd0d315a949e30adc80f086031c5cdf070d1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 4 Oct 2013 14:33:34 -0700
Subject: rcu: Break call_rcu() deadlock involving scheduler and perf

Dave Jones got the following lockdep splat:

> ======================================================
> [ INFO: possible circular locking dependency detected ]
> 3.12.0-rc3+ #92 Not tainted
> -------------------------------------------------------
> trinity-child2/15191 is trying to acquire lock:
>  (&rdp->nocb_wq){......}, at: [] __wake_up+0x23/0x50
>
> but task is already holding lock:
>  (&ctx->lock){-.-...}, at: [] perf_event_exit_task+0x109/0x230
>
> which lock already depends on the new lock.
>
> the existing dependency chain (in reverse order) is:
>
> -> #3 (&ctx->lock){-.-...}:
>        [] lock_acquire+0x93/0x200
>        [] _raw_spin_lock+0x40/0x80
>        [] __perf_event_task_sched_out+0x2df/0x5e0
>        [] perf_event_task_sched_out+0x93/0xa0
>        [] __schedule+0x1d2/0xa20
>        [] preempt_schedule_irq+0x50/0xb0
>        [] retint_kernel+0x26/0x30
>        [] tty_flip_buffer_push+0x34/0x50
>        [] pty_write+0x54/0x60
>        [] n_tty_write+0x32d/0x4e0
>        [] tty_write+0x158/0x2d0
>        [] vfs_write+0xc0/0x1f0
>        [] SyS_write+0x4c/0xa0
>        [] tracesys+0xdd/0xe2
>
> -> #2 (&rq->lock){-.-.-.}:
>        [] lock_acquire+0x93/0x200
>        [] _raw_spin_lock+0x40/0x80
>        [] wake_up_new_task+0xc2/0x2e0
>        [] do_fork+0x126/0x460
>        [] kernel_thread+0x26/0x30
>        [] rest_init+0x23/0x140
>        [] start_kernel+0x3f6/0x403
>        [] x86_64_start_reservations+0x2a/0x2c
>        [] x86_64_start_kernel+0xf1/0xf4
>
> -> #1 (&p->pi_lock){-.-.-.}:
>        [] lock_acquire+0x93/0x200
>        [] _raw_spin_lock_irqsave+0x4b/0x90
>        [] try_to_wake_up+0x31/0x350
>        [] default_wake_function+0x12/0x20
>        [] autoremove_wake_function+0x18/0x40
>        [] __wake_up_common+0x58/0x90
>        [] __wake_up+0x39/0x50
>        [] __call_rcu_nocb_enqueue+0xa8/0xc0
>        [] __call_rcu+0x140/0x820
>        [] call_rcu+0x1d/0x20
>        [] cpu_attach_domain+0x287/0x360
>        [] build_sched_domains+0xe5e/0x10a0
>        [] sched_init_smp+0x3b7/0x47a
>        [] kernel_init_freeable+0xf6/0x202
>        [] kernel_init+0xe/0x190
>        [] ret_from_fork+0x7c/0xb0
>
> -> #0 (&rdp->nocb_wq){......}:
>        [] __lock_acquire+0x191a/0x1be0
>        [] lock_acquire+0x93/0x200
>        [] _raw_spin_lock_irqsave+0x4b/0x90
>        [] __wake_up+0x23/0x50
>        [] __call_rcu_nocb_enqueue+0xa8/0xc0
>        [] __call_rcu+0x140/0x820
>        [] kfree_call_rcu+0x20/0x30
>        [] put_ctx+0x4f/0x70
>        [] perf_event_exit_task+0x12e/0x230
>        [] do_exit+0x30d/0xcc0
>        [] do_group_exit+0x4c/0xc0
>        [] SyS_exit_group+0x14/0x20
>        [] tracesys+0xdd/0xe2
>
> other info that might help us debug this:
>
> Chain exists of:
>   &rdp->nocb_wq --> &rq->lock --> &ctx->lock
>
> Possible unsafe locking scenario:
>
>        CPU0                    CPU1
>        ----                    ----
>   lock(&ctx->lock);
>                                lock(&rq->lock);
>                                lock(&ctx->lock);
>   lock(&rdp->nocb_wq);
>
>  *** DEADLOCK ***
>
> 1 lock held by trinity-child2/15191:
>  #0:  (&ctx->lock){-.-...}, at: [] perf_event_exit_task+0x109/0x230
>
> stack backtrace:
> CPU: 2 PID: 15191 Comm: trinity-child2 Not tainted 3.12.0-rc3+ #92
>  ffffffff82565b70 ffff880070c2dbf8 ffffffff8172a363 ffffffff824edf40
>  ffff880070c2dc38 ffffffff81726741 ffff880070c2dc90 ffff88022383b1c0
>  ffff88022383aac0 0000000000000000 ffff88022383b188 ffff88022383b1c0
> Call Trace:
>  [] dump_stack+0x4e/0x82
>  [] print_circular_bug+0x200/0x20f
>  [] __lock_acquire+0x191a/0x1be0
>  [] ? get_lock_stats+0x19/0x60
>  [] ? native_sched_clock+0x24/0x80
>  [] lock_acquire+0x93/0x200
>  [] ? __wake_up+0x23/0x50
>  [] _raw_spin_lock_irqsave+0x4b/0x90
>  [] ? __wake_up+0x23/0x50
>  [] __wake_up+0x23/0x50
>  [] __call_rcu_nocb_enqueue+0xa8/0xc0
>  [] __call_rcu+0x140/0x820
>  [] ? local_clock+0x3f/0x50
>  [] kfree_call_rcu+0x20/0x30
>  [] put_ctx+0x4f/0x70
>  [] perf_event_exit_task+0x12e/0x230
>  [] do_exit+0x30d/0xcc0
>  [] ? trace_hardirqs_on_caller+0x115/0x1e0
>  [] ? trace_hardirqs_on+0xd/0x10
>  [] do_group_exit+0x4c/0xc0
>  [] SyS_exit_group+0x14/0x20
>  [] tracesys+0xdd/0xe2

The underlying problem is that perf is invoking call_rcu() with the
scheduler locks held, but in NOCB mode, call_rcu() will with high
probability invoke the scheduler -- which just might want to use its
locks.
The reason that call_rcu() needs to invoke the scheduler is to wake up
the corresponding rcuo callback-offload kthread, which does the job of
starting up a grace period and invoking the callbacks afterwards.

One solution (championed on a related problem by Lai Jiangshan) is to
simply defer the wakeup to some point where scheduler locks are no
longer held.  Since we don't want to unnecessarily incur the cost of
such deferral, the task before us is threefold:

1.	Determine when it is likely that a relevant scheduler lock is held.

2.	Defer the wakeup in such cases.

3.	Ensure that all deferred wakeups eventually happen, preferably
	sooner rather than later.

We use irqs_disabled_flags() as a proxy for relevant scheduler locks
being held.  This works because the relevant locks are always acquired
with interrupts disabled.  We may defer more often than needed, but that
is at least safe.

The wakeup deferral is tracked via a new field in the per-CPU and
per-RCU-flavor rcu_data structure, namely ->nocb_defer_wakeup.  This
flag is checked by the RCU core:  the __rcu_pending() function now tests
it, which causes rcu_check_callbacks() to initiate RCU core processing
at each scheduling-clock interrupt where the flag is set.

Of course this is not sufficient because scheduling-clock interrupts
are often turned off (the things we used to be able to count on!).
So the flags are also checked on entry to any state that RCU considers
to be idle, which includes both NO_HZ_IDLE idle state and NO_HZ_FULL
user-mode-execution state.

This approach should allow call_rcu() to be invoked regardless of what
locks you might be holding, the key word being "should".

Reported-by: Dave Jones
Signed-off-by: Paul E. McKenney
Cc: Peter Zijlstra
---
 kernel/rcu/tree_plugin.h | 55 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 10 deletions(-)

(limited to 'kernel/rcu/tree_plugin.h')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index b023e5407111..752ffaa0d681 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2104,7 +2104,8 @@ bool rcu_is_nocb_cpu(int cpu)
 static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 				    struct rcu_head *rhp,
 				    struct rcu_head **rhtp,
-				    int rhcount, int rhcount_lazy)
+				    int rhcount, int rhcount_lazy,
+				    unsigned long flags)
 {
 	int len;
 	struct rcu_head **old_rhpp;
@@ -2125,9 +2126,16 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 	}
 	len = atomic_long_read(&rdp->nocb_q_count);
 	if (old_rhpp == &rdp->nocb_head) {
-		wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
+		if (!irqs_disabled_flags(flags)) {
+			wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
+			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+					    TPS("WakeEmpty"));
+		} else {
+			rdp->nocb_defer_wakeup = true;
+			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+					    TPS("WakeEmptyIsDeferred"));
+		}
 		rdp->qlen_last_fqs_check = 0;
-		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
 	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
 		wake_up_process(t); /* ... or if many callbacks queued. */
 		rdp->qlen_last_fqs_check = LONG_MAX / 2;
@@ -2148,12 +2156,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 * "rcuo" kthread can find it.
 */
 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
-			    bool lazy)
+			    bool lazy, unsigned long flags)
 {
 	if (!rcu_is_nocb_cpu(rdp->cpu))
 		return 0;
-	__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
+	__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
 	if (__is_kfree_rcu_offset((unsigned long)rhp->func))
 		trace_rcu_kfree_callback(rdp->rsp->name, rhp,
 					 (unsigned long)rhp->func,
@@ -2171,7 +2179,8 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
 * not a no-CBs CPU.
 */
 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
-						     struct rcu_data *rdp)
+						     struct rcu_data *rdp,
+						     unsigned long flags)
 {
 	long ql = rsp->qlen;
 	long qll = rsp->qlen_lazy;
@@ -2185,14 +2194,14 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
 	/* First, enqueue the donelist, if any.  This preserves CB ordering. */
 	if (rsp->orphan_donelist != NULL) {
 		__call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
-					rsp->orphan_donetail, ql, qll);
+					rsp->orphan_donetail, ql, qll, flags);
 		ql = qll = 0;
 		rsp->orphan_donelist = NULL;
 		rsp->orphan_donetail = &rsp->orphan_donelist;
 	}
 	if (rsp->orphan_nxtlist != NULL) {
 		__call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
-					rsp->orphan_nxttail, ql, qll);
+					rsp->orphan_nxttail, ql, qll, flags);
 		ql = qll = 0;
 		rsp->orphan_nxtlist = NULL;
 		rsp->orphan_nxttail = &rsp->orphan_nxtlist;
@@ -2314,6 +2323,22 @@ static int rcu_nocb_kthread(void *arg)
 	return 0;
 }

+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+{
+	return ACCESS_ONCE(rdp->nocb_defer_wakeup);
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+	if (!rcu_nocb_need_deferred_wakeup(rdp))
+		return;
+	ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
+	wake_up(&rdp->nocb_wq);
+	trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
+}
+
 /* Initialize per-rcu_data variables for no-CBs CPUs. */
 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 {
@@ -2369,13 +2394,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 }

 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
-			    bool lazy)
+			    bool lazy, unsigned long flags)
 {
 	return 0;
 }

 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
-						     struct rcu_data *rdp)
+						     struct rcu_data *rdp,
+						     unsigned long flags)
 {
 	return 0;
 }
@@ -2384,6 +2410,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 {
 }

+static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+{
+	return false;
+}
+
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+}
+
 static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
 {
 }
--
cgit v1.2.3
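The enqueue-side decision in this patch generalizes beyond RCU.  Here is
a minimal sketch of the deferred-wakeup technique under simplified
assumptions; struct sketch_data and the sketch_* functions are
hypothetical stand-ins, not kernel APIs, and only the
irqs_disabled_flags() test and the deferral flag mirror the patch:

#include <linux/irqflags.h>
#include <linux/wait.h>

/* Hypothetical per-CPU state; only its shape mirrors rcu_data. */
struct sketch_data {
	wait_queue_head_t wq;
	bool defer_wakeup;
};

/*
 * Wake the worker immediately when it is safe to do so, otherwise
 * record a deferred wakeup.  "flags" is the value saved by
 * local_irq_save() in the caller; irqs_disabled_flags() serves as a
 * proxy for "a scheduler lock might be held".
 */
static void sketch_wake_or_defer(struct sketch_data *sd, unsigned long flags)
{
	if (!irqs_disabled_flags(flags))
		wake_up(&sd->wq);	 /* Safe: no scheduler lock held. */
	else
		sd->defer_wakeup = true; /* Unsafe: defer to a safe point. */
}

/* Called later from a context known not to hold scheduler locks. */
static void sketch_flush_deferred(struct sketch_data *sd)
{
	if (!sd->defer_wakeup)
		return;
	sd->defer_wakeup = false;
	wake_up(&sd->wq);
}

The design errs toward deferral: irqs_disabled_flags() can be true even
when no scheduler lock is held, but a spurious deferral merely delays the
wakeup until the next check, whereas a wake_up() under rq->lock can
deadlock as in the splat above.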
From 79a62f957e0b37c59610a96d018cc341aebb48f4 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Wed, 30 Oct 2013 04:13:22 -0700
Subject: rcu: Warn on allegedly impossible rcu_read_unlock_special() from irq

After commit 10f39bb1b2c1 (rcu: protect __rcu_read_unlock() against
scheduler-using irq handlers), it is no longer possible to enter the
main body of rcu_read_unlock_special() from an NMI, interrupt, or
softirq handler.  In theory, this implies that the check for "in_irq()
|| in_serving_softirq()" must always fail, so that in theory this check
could be removed entirely.

In practice, this commit wraps this condition with a WARN_ON_ONCE().
If this warning never triggers, then the condition will be removed
entirely.

[ paulmck: And one way of triggering the WARN_ON() is if a scheduling
  clock interrupt occurs in an RCU read-side critical section, setting
  RCU_READ_UNLOCK_NEED_QS, which is handled by rcu_read_unlock_special().
  Updated this commit to return if only that bit was set. ]

Signed-off-by: Lai Jiangshan
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree_plugin.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel/rcu/tree_plugin.h')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 752ffaa0d681..fa7a18b62253 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -361,10 +361,14 @@ void rcu_read_unlock_special(struct task_struct *t)
 	special = t->rcu_read_unlock_special;
 	if (special & RCU_READ_UNLOCK_NEED_QS) {
 		rcu_preempt_qs(smp_processor_id());
+		if (!t->rcu_read_unlock_special) {
+			local_irq_restore(flags);
+			return;
+		}
 	}

-	/* Hardware IRQ handlers cannot block. */
-	if (in_irq() || in_serving_softirq()) {
+	/* Hardware IRQ handlers cannot block, complain if they get here. */
+	if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
 		local_irq_restore(flags);
 		return;
 	}
--
cgit v1.2.3
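The idiom used here, keeping an allegedly dead branch alive while
instrumenting it, is a general one.  A minimal sketch with a hypothetical
condition (sketch_handle() is not from the patch); WARN_ON_ONCE()
evaluates its argument and returns its value, so it can gate the old
bail-out path directly while reporting any violated assumption exactly
once:

#include <linux/bug.h>
#include <linux/errno.h>

static int sketch_handle(bool allegedly_impossible)
{
	if (WARN_ON_ONCE(allegedly_impossible)) {
		/* Keep the old bail-out path until the warning proves silent. */
		return -EINVAL;
	}
	/* Normal path. */
	return 0;
}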
From a096932f0c9c9dca9cce72f1c0fb2395df8f2dff Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Fri, 8 Nov 2013 09:03:10 -0800
Subject: rcu: Don't activate RCU core on NO_HZ_FULL CPUs

Whenever a CPU receives a scheduling-clock interrupt, RCU checks to see
if the RCU core needs anything from this CPU.  If so, RCU raises
RCU_SOFTIRQ to carry out any needed processing.

This approach has worked well historically, but it is undesirable on
NO_HZ_FULL CPUs.  Such CPUs are expected to spend almost all of their
time in userspace, so that scheduling-clock interrupts can be disabled
while there is only one runnable task on the CPU in question.
Unfortunately, raising any softirq has the potential to wake up
ksoftirqd, which would provide the second runnable task on that CPU,
preventing disabling of scheduling-clock interrupts.

What is needed instead is for RCU to leave NO_HZ_FULL CPUs alone,
relying on the grace-period kthreads' quiescent-state forcing to do any
needed RCU work on behalf of those CPUs.

This commit therefore refrains from raising RCU_SOFTIRQ on any
NO_HZ_FULL CPUs during any grace periods that have been in effect for
less than one second.  The one-second limit handles the case where an
inappropriate workload is running on a NO_HZ_FULL CPU that features
lots of scheduling-clock interrupts, but no idle or userspace time.

Reported-by: Mike Galbraith
Signed-off-by: Paul E. McKenney
Tested-by: Mike Galbraith
Toasted-by: Frederic Weisbecker
---
 kernel/rcu/tree_plugin.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel/rcu/tree_plugin.h')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fa7a18b62253..e0885cb6c599 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2872,3 +2872,23 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
 }

 #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+/*
+ * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
+ * grace-period kthread will do force_quiescent_state() processing?
+ * The idea is to avoid waking up RCU core processing on such a
+ * CPU unless the grace period has extended for too long.
+ *
+ * This code relies on the fact that all NO_HZ_FULL CPUs are also
+ * CONFIG_RCU_NOCB_CPUs.
+ */
+static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_cpu(smp_processor_id()) &&
+	    (!rcu_gp_in_progress(rsp) ||
+	     ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
+		return 1;
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
+	return 0;
+}
--
cgit v1.2.3
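The one-second test above depends on wrap-safe time comparison: jiffies
is an unsigned long counter that wraps, so a plain less-than eventually
gives the wrong answer.  A minimal sketch of the idea;
SKETCH_ULONG_CMP_LT() is written out here on the assumption that it
matches RCU's ULONG_CMP_LT(), and sketch_gp_is_young() is hypothetical:

#include <linux/jiffies.h>
#include <linux/kernel.h>

/* Wrap-safe "a is before b" for unsigned long counters. */
#define SKETCH_ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))

/*
 * With modular arithmetic, (a) - (b) exceeds ULONG_MAX / 2 exactly
 * when a is behind b by less than half the counter range, so the
 * test keeps working even after jiffies wraps past zero.
 */
static bool sketch_gp_is_young(unsigned long gp_start)
{
	return SKETCH_ULONG_CMP_LT(jiffies, gp_start + HZ);
}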