From fe15d706cfc1cb321dbe2329b04b5ca185edff60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Jan 2012 13:30:33 -0800 Subject: rcu: Add lockdep-RCU checks for simple self-deadlock It is illegal to have a grace period within a same-flavor RCU read-side critical section, so this commit adds lockdep-RCU checks to splat when such abuse is encountered. This commit does not detect more elaborate RCU deadlock situations. These situations might be a job for lockdep enhancements. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6c4a6722abfd..3cf713a724c1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1816,6 +1816,10 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); */ void synchronize_sched(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; wait_rcu_gp(call_rcu_sched); @@ -1833,6 +1837,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched); */ void synchronize_rcu_bh(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; wait_rcu_gp(call_rcu_bh); -- cgit v1.2.3 From 0bb7b59d6e2b8440cd7097097dd4bbfc4d76ed07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Jan 2012 14:44:39 -0800 Subject: rcu: Add diagnostic for misaligned rcu_head structures The push for energy efficiency will require that RCU tag rcu_head structures to indicate whether or not their invocation is time critical. This tagging is best carried out in the bottom bits of the ->next pointers in the rcu_head structures. This tagging requires that the rcu_head structures be properly aligned, so this commit adds the required diagnostics. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 3cf713a724c1..570f7530f4b3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1707,6 +1707,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; + WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ debug_rcu_head_queue(head); head->func = func; head->next = NULL; -- cgit v1.2.3 From 486e259340fc4c60474f2c14703e3b3634bb58ca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Jan 2012 14:11:30 -0800 Subject: rcu: Avoid waking up CPUs having only kfree_rcu() callbacks When CONFIG_RCU_FAST_NO_HZ is enabled, RCU will allow a given CPU to enter dyntick-idle mode even if it still has RCU callbacks queued. RCU avoids system hangs in this case by scheduling a timer for several jiffies in the future. However, if all of the callbacks on that CPU are from kfree_rcu(), there is no reason to wake the CPU up, as it is not a problem to defer freeing of memory. This commit therefore tracks the number of callbacks on a given CPU that are from kfree_rcu(), and avoids scheduling the timer if all of a given CPU's callbacks are from kfree_rcu(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 570f7530f4b3..acf2d67ad2f4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1261,6 +1261,7 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + receive_rdp->qlen_lazy += rdp->qlen_lazy; receive_rdp->qlen += rdp->qlen; receive_rdp->n_cbs_adopted += rdp->qlen; rdp->n_cbs_orphaned += rdp->qlen; @@ -1268,6 +1269,7 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen_lazy = 0; rdp->qlen = 0; } @@ -1368,11 +1370,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int bl, count; + int bl, count, count_lazy; /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { - trace_rcu_batch_start(rsp->name, 0, 0); + trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -1385,7 +1387,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) */ local_irq_save(flags); bl = rdp->blimit; - trace_rcu_batch_start(rsp->name, rdp->qlen, bl); + trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); list = rdp->nxtlist; rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; *rdp->nxttail[RCU_DONE_TAIL] = NULL; @@ -1396,12 +1398,13 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); /* Invoke callbacks. */ - count = 0; + count = count_lazy = 0; while (list) { next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - __rcu_reclaim(rsp->name, list); + if (__rcu_reclaim(rsp->name, list)) + count_lazy++; list = next; /* Stop only if limit reached and CPU has something to do. */ if (++count >= bl && @@ -1416,6 +1419,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rcu_is_callbacks_kthread()); /* Update count, and requeue any remaining callbacks. */ + rdp->qlen_lazy -= count_lazy; rdp->qlen -= count; rdp->n_cbs_invoked += count; if (list != NULL) { @@ -1702,7 +1706,7 @@ static void invoke_rcu_core(void) static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp) + struct rcu_state *rsp, bool lazy) { unsigned long flags; struct rcu_data *rdp; @@ -1727,12 +1731,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), *rdp->nxttail[RCU_NEXT_TAIL] = head; rdp->nxttail[RCU_NEXT_TAIL] = &head->next; rdp->qlen++; + if (lazy) + rdp->qlen_lazy++; if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, - rdp->qlen); + rdp->qlen_lazy, rdp->qlen); else - trace_rcu_callback(rsp->name, head, rdp->qlen); + trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); /* If interrupts were disabled, don't dive into RCU core. 
*/ if (irqs_disabled_flags(flags)) { @@ -1779,16 +1785,16 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), */ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_sched_state); + __call_rcu(head, func, &rcu_sched_state, 0); } EXPORT_SYMBOL_GPL(call_rcu_sched); /* - * Queue an RCU for invocation after a quicker grace period. + * Queue an RCU callback for invocation after a quicker grace period. */ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_bh_state); + __call_rcu(head, func, &rcu_bh_state, 0); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -2036,6 +2042,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen_lazy = 0; rdp->qlen = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); -- cgit v1.2.3 From e5601400081651060a59bd1f45f2821bb8e97f95 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 7 Jan 2012 11:03:57 -0800 Subject: rcu: Simplify offline processing Move ->qsmaskinit and blkd_tasks[] manipulation to the CPU_DYING notifier. This simplifies the code by eliminating a potential deadlock and by reducing the responsibilities of force_quiescent_state(). Also rename functions to make their connection to the CPU-hotplug stages explicit. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 160 ++++++++++++++++++++++++++----------------------------- 1 file changed, 75 insertions(+), 85 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index acf2d67ad2f4..575f91d03f06 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -943,6 +943,10 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat * in preparation for detecting the next grace period. The caller must hold * the root node's ->lock, which is released before return. Hard irqs must * be disabled. + * + * Note that it is legal for a dying CPU (which is marked as offline) to + * invoke this function. This can happen when the dying CPU reports its + * quiescent state. */ static void rcu_start_gp(struct rcu_state *rsp, unsigned long flags) @@ -1245,118 +1249,101 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) /* * Move a dying CPU's RCU callbacks to online CPU's callback list. - * Synchronization is not required because this function executes - * in stop_machine() context. + * Also record a quiescent state for this CPU for the current grace period. + * Synchronization and interrupt disabling are not required because + * this function executes in stop_machine() context. Therefore, cleanup + * operations that might block must be done later from the CPU_DEAD + * notifier. + * + * Note that the outgoing CPU's bit has already been cleared in the + * cpu_online_mask. This allows us to randomly pick a callback + * destination from the bits set in that mask. 
*/ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { + unsigned long flags; int i; - /* current DYING CPU is cleared in the cpu_online_mask */ + unsigned long mask; + int need_report; int receive_cpu = cpumask_any(cpu_online_mask); struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); + struct rcu_node *rnp = rdp->mynode; /* For dying CPU. */ + + /* Move callbacks to some other CPU. */ + if (rdp->nxtlist != NULL) { + *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; + receive_rdp->nxttail[RCU_NEXT_TAIL] = + rdp->nxttail[RCU_NEXT_TAIL]; + receive_rdp->qlen_lazy += rdp->qlen_lazy; + receive_rdp->qlen += rdp->qlen; + receive_rdp->n_cbs_adopted += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; + + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen_lazy = 0; + rdp->qlen = 0; + } - if (rdp->nxtlist == NULL) - return; /* irqs disabled, so comparison is stable. */ - - *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - receive_rdp->qlen_lazy += rdp->qlen_lazy; - receive_rdp->qlen += rdp->qlen; - receive_rdp->n_cbs_adopted += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; - - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp->qlen_lazy = 0; - rdp->qlen = 0; -} - -/* - * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy - * and move all callbacks from the outgoing CPU to the current one. - * There can only be one CPU hotplug operation at a time, so no other - * CPU can be attempting to update rcu_cpu_kthread_task. - */ -static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long mask; - int need_report = 0; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp; - - rcu_stop_cpu_kthread(cpu); - - /* Exclude any attempts to start a new grace period. */ - raw_spin_lock_irqsave(&rsp->onofflock, flags); - - /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ + /* Record a quiescent state for the dying CPU. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ + trace_rcu_grace_period(rsp->name, + rnp->gpnum + 1 - !!(rnp->qsmask & mask), + "cpuofl"); + rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); + /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ + + /* + * Remove the dying CPU from the bitmasks in the rcu_node + * hierarchy. Because we are in stop_machine() context, we + * automatically exclude ->onofflock critical sections. + */ do { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irqsave(&rnp->lock, flags); rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { - if (rnp != rdp->mynode) - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ - else - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - - !!(rnp->qsmask & mask), - "cpuofl"); + raw_spin_unlock_irqrestore(&rnp->lock, flags); break; } if (rnp == rdp->mynode) { - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - - !!(rnp->qsmask & mask), - "cpuofl"); need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + if (need_report & RCU_OFL_TASKS_NORM_GP) + rcu_report_unblock_qs_rnp(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (need_report & RCU_OFL_TASKS_EXP_GP) + rcu_report_exp_rnp(rsp, rnp, true); } else - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); mask = rnp->grpmask; rnp = rnp->parent; } while (rnp != NULL); - - /* - * We still hold the leaf rcu_node structure lock here, and - * irqs are still disabled. The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->onofflock - * held leads to deadlock. - */ - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ - rnp = rdp->mynode; - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); - rcu_node_kthread_setaffinity(rnp, -1); } /* - * Remove the specified CPU from the RCU hierarchy and move any pending - * callbacks that it might have to the current CPU. This code assumes - * that at least one CPU in the system will remain running at all times. - * Any attempt to offline -all- CPUs is likely to strand RCU callbacks. + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no other + * CPU can be attempting to update rcu_cpu_kthread_task. */ -static void rcu_offline_cpu(int cpu) +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { - __rcu_offline_cpu(cpu, &rcu_sched_state); - __rcu_offline_cpu(cpu, &rcu_bh_state); - rcu_preempt_offline_cpu(cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; + + rcu_stop_cpu_kthread(cpu); + rcu_node_kthread_setaffinity(rnp, -1); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } -static void rcu_offline_cpu(int cpu) +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { } @@ -1725,6 +1712,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ @@ -2155,16 +2143,18 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, * touch any data without introducing corruption. We send the * dying CPU's callbacks to an arbitrarily chosen online CPU. 
*/ - rcu_send_cbs_to_online(&rcu_bh_state); - rcu_send_cbs_to_online(&rcu_sched_state); - rcu_preempt_send_cbs_to_online(); + rcu_cleanup_dying_cpu(&rcu_bh_state); + rcu_cleanup_dying_cpu(&rcu_sched_state); + rcu_preempt_cleanup_dying_cpu(); rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - rcu_offline_cpu(cpu); + rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); + rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); + rcu_preempt_cleanup_dead_cpu(cpu); break; default: break; -- cgit v1.2.3 From 8146c4e2e2c1972216afece5c50e072e86120e42 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Jan 2012 14:23:29 -0800 Subject: rcu: Check for callback invocation from offline CPUs Because quiescent states are now reported from offline CPUs in CPU_DYING state, there is some possibility that such a CPU might note the end of a grace period and attempt to start invoking callbacks. This would be a very bad thing, and is supposed to be prevented by the fact that the CPU_DYING CPU gets rid of all its callbacks before reporting the quiescent state. However, there is other CPU-offline code in the kernel, and it is quite possible that someone will invoke RCU core processing from that code. Therefore, this commit adds a warning for this case. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 575f91d03f06..ac3a810d2db7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1373,6 +1373,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * races with call_rcu() from interrupt handlers. */ local_irq_save(flags); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); bl = rdp->blimit; trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); list = rdp->nxtlist; -- cgit v1.2.3 From a50c3af910e06f35bc0c68f89d8fef98c0fec0ea Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Jan 2012 17:52:31 -0800 Subject: rcu: Don't make callbacks go through second full grace period RCU's current CPU-offline code path dumps all of the outgoing CPU's callbacks onto the RCU_NEXT_TAIL portion of the surviving CPU's callback list. This means that all the ready-to-invoke callbacks from the outgoing CPU must wait for another full RCU grace period. This was just fine when CPU-hotplug events were rare, but there is increasing evidence that users are planning to make increasing use of CPU hotplug. Therefore, this commit changes the callback-dumping procedure so that callbacks that are ready to invoke are moved to the RCU_DONE_TAIL portion of the surviving CPU's callback list. This avoids running these callbacks through a second unnecessary grace period. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 6 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ac3a810d2db7..7789e666394d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1270,24 +1270,64 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); struct rcu_node *rnp = rdp->mynode; /* For dying CPU. */ - /* Move callbacks to some other CPU. */ + /* First, adjust the counts. 
*/ + if (rdp->nxtlist != NULL) { + receive_rdp->qlen_lazy += rdp->qlen_lazy; + receive_rdp->qlen += rdp->qlen; + rdp->qlen_lazy = 0; + rdp->qlen = 0; + } + + /* + * Next, move ready-to-invoke callbacks to be invoked on some + * other CPU. These will not be required to pass through another + * grace period: They are done, regardless of CPU. + */ + if (rdp->nxtlist != NULL && + rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { + struct rcu_head *oldhead; + struct rcu_head **oldtail; + struct rcu_head **newtail; + + oldhead = rdp->nxtlist; + oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; + rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; + *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; + newtail = rdp->nxttail[RCU_DONE_TAIL]; + for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { + if (receive_rdp->nxttail[i] == oldtail) + receive_rdp->nxttail[i] = newtail; + if (rdp->nxttail[i] == newtail) + rdp->nxttail[i] = &rdp->nxtlist; + } + } + + /* + * Finally, put the rest of the callbacks at the end of the list. + * The ones that made it partway through get to start over: We + * cannot assume that grace periods are synchronized across CPUs. + * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but + * this does not seem compelling. Not yet, anyway.) + */ if (rdp->nxtlist != NULL) { *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - receive_rdp->qlen_lazy += rdp->qlen_lazy; - receive_rdp->qlen += rdp->qlen; receive_rdp->n_cbs_adopted += rdp->qlen; rdp->n_cbs_orphaned += rdp->qlen; rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; - rdp->qlen_lazy = 0; - rdp->qlen = 0; } - /* Record a quiescent state for the dying CPU. */ + /* + * Record a quiescent state for the dying CPU. This is safe + * only because we have already cleared out the callbacks. + * (Otherwise, the RCU core might try to schedule the invocation + * of callbacks on this now-offline CPU, which would be bad.) + */ mask = rdp->grpmask; /* rnp->grplo is constant. */ trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), -- cgit v1.2.3 From f38bd1020f797694b6b5e06f5f06c87688fc84c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jan 2012 11:34:50 -0800 Subject: rcu: Remove single-rcu_node optimization in rcu_start_gp() The grace-period initialization sequence in rcu_start_gp() has a special case for systems where the rcu_node tree is a single rcu_node structure. This made sense some years ago when systems were smaller and up to 64 CPUs could share a single rcu_node structure, but now that large systems are common and a given leaf rcu_node structure can support only 16 CPUs (due to lock contention on the rcu_node's ->lock field), this optimization is almost never taken. And even the small mobile platforms that might make use of it might rather have the kernel text reduction. Therefore, this commit removes the check for single-rcu_node trees. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7789e666394d..49bb363a0837 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -984,26 +984,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. 
*/ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); - - /* Special-case the common single-level case. */ - if (NUM_RCU_NODES == 1) { - rcu_preempt_check_blocked_tasks(rnp); - rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; - rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ - rcu_start_gp_per_cpu(rsp, rnp, rdp); - rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, - rnp->level, rnp->grplo, - rnp->grphi, rnp->qsmask); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ - /* Exclude any concurrent CPU-hotplug operations. */ raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ -- cgit v1.2.3 From 30fbcc90b02187c55c57ff0ecf57cecbd487d694 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Jan 2012 11:01:14 -0800 Subject: rcu: Clean up straggling rcu_preempt_needs_cpu() name The recent updates to RCU_CPU_FAST_NO_HZ have an rcu_needs_cpu() that does more than just check for callbacks, so get the name for rcu_preempt_needs_cpu() consistent with that change, now calling it rcu_preempt_cpu_has_callbacks(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 49bb363a0837..0569ba11e35a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1958,7 +1958,7 @@ static int rcu_cpu_has_callbacks(int cpu) /* RCU callbacks either ready or pending? */ return per_cpu(rcu_sched_data, cpu).nxtlist || per_cpu(rcu_bh_data, cpu).nxtlist || - rcu_preempt_needs_cpu(cpu); + rcu_preempt_cpu_has_callbacks(cpu); } static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; -- cgit v1.2.3 From c44e2cddacc2cf299186bad5697d738ea19668b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Jan 2012 13:08:18 -0800 Subject: rcu: Check for idle-loop entry while in RCU read-side critical section The inner idle loop is an extended quiescent state for all flavors of RCU, but there have been recent bug involving use of RCU read-side primitives from within the idle loop. Therefore, this commit enlists lockdep-RCU to detect attempts to enter the inner idle loop while in an RCU read-side critical section, emitting a lockdep-RCU splat if so. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0569ba11e35a..195bf1fb59ea 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -366,6 +366,17 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) atomic_inc(&rdtp->dynticks); smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + + /* + * The idle task is not permitted to enter the idle loop while + * in an RCU read-side critical section. 
+ */ + rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), + "Illegal idle entry in RCU read-side critical section."); + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), + "Illegal idle entry in RCU-bh read-side critical section."); + rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), + "Illegal idle entry in RCU-sched read-side critical section."); } /** -- cgit v1.2.3 From 27565d64a4e564e72c22d8c91a3cfcb9442383e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Jan 2012 19:35:08 -0800 Subject: rcu: Remove #ifdef CONFIG_SMP from TREE_RCU Now that both TINY_RCU and TINY_PREEMPT_RCU have been in place for awhile, it is time to remove UP support from TREE_RCU, which is what this commit does. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 195bf1fb59ea..61adb351d2c7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -301,8 +301,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) return &rsp->node[0]; } -#ifdef CONFIG_SMP - /* * If the specified CPU is offline, tell the caller that it is in * a quiescent state. Otherwise, whack it with a reschedule IPI. @@ -339,8 +337,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) return 0; } -#endif /* #ifdef CONFIG_SMP */ - /* * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle * @@ -606,8 +602,6 @@ int rcu_is_cpu_rrupt_from_idle(void) return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } -#ifdef CONFIG_SMP - /* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU @@ -651,8 +645,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return rcu_implicit_offline_qs(rdp); } -#endif /* #ifdef CONFIG_SMP */ - static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; @@ -1517,8 +1509,6 @@ void rcu_check_callbacks(int cpu, int user) trace_rcu_utilization("End scheduler-tick"); } -#ifdef CONFIG_SMP - /* * Scan the leaf rcu_node structures, processing dyntick state for any that * have not yet encountered a quiescent state, using the function specified. @@ -1641,15 +1631,6 @@ unlock_fqs_ret: trace_rcu_utilization("End fqs"); } -#else /* #ifdef CONFIG_SMP */ - -static void force_quiescent_state(struct rcu_state *rsp, int relaxed) -{ - set_need_resched(); -} - -#endif /* #else #ifdef CONFIG_SMP */ - /* * This does the RCU core processing work for the specified rcu_state * and rcu_data structures. This may be called only from the CPU to -- cgit v1.2.3 From 13cfcca0e4e2d4cee1d0183c049eb34e54ac976e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Jan 2012 15:32:18 -0800 Subject: rcu: Set RCU CPU stall times via sysfs The default CONFIG_RCU_CPU_STALL_TIMEOUT value of 60 seconds has served Linux users well for production use for quite some time. However, for debugging, there will be more than three minutes between subsequent stall-warning messages. This can be an annoyingly long wait if you are trying to work out where the offending infinite loop is hiding. Therefore, this commit provides a rcu_cpu_stall_timeout sysfs parameter that may be adjusted at boot time and at runtime to speed up debugging. Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 61adb351d2c7..cfdab9898e33 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -208,8 +208,11 @@ module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); -int rcu_cpu_stall_suppress __read_mostly; +int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; + module_param(rcu_cpu_stall_suppress, int, 0644); +module_param(rcu_cpu_stall_timeout, int, 0644); static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -645,10 +648,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return rcu_implicit_offline_qs(rdp); } +static int jiffies_till_stall_check(void) +{ + int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. + */ + if (till_stall_check < 3) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; + till_stall_check = 3; + } else if (till_stall_check > 300) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} + static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; - rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; + rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); } static void print_other_cpu_stall(struct rcu_state *rsp) @@ -667,7 +688,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; /* * Now rat on any tasks that got kicked up to the root rcu_node @@ -726,8 +747,8 @@ static void print_cpu_stall(struct rcu_state *rsp) raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) - rsp->jiffies_stall = - jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + rsp->jiffies_stall = jiffies + + 3 * jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); set_need_resched(); /* kick ourselves to get things going. */ -- cgit v1.2.3 From a858af2875fb291d0f4b0a4419fefbf03c2379c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Jan 2012 13:29:10 -0800 Subject: rcu: Print scheduling-clock information on RCU CPU stall-warning messages There have been situations where RCU CPU stall warnings were caused by issues in scheduling-clock timer initialization. To make it easier to track these down, this commit causes the RCU CPU stall-warning messages to print out the number of scheduling-clock interrupts taken in the current grace period for each stalled CPU. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index cfdab9898e33..dccd2f78db4e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -689,12 +689,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) return; } rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; - - /* - * Now rat on any tasks that got kicked up to the root rcu_node - * due to CPU offlining. - */ - ndetected = rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -702,8 +696,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", + printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", rsp->name); + print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); ndetected += rcu_print_task_stall(rnp); @@ -712,11 +707,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) continue; for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) { - printk(" %d", rnp->grplo + cpu); + print_cpu_stall_info(rsp, rnp->grplo + cpu); ndetected++; } } - printk("} (detected by %d, t=%ld jiffies)\n", + + /* + * Now rat on any tasks that got kicked up to the root rcu_node + * due to CPU offlining. + */ + rnp = rcu_get_root(rsp); + raw_spin_lock_irqsave(&rnp->lock, flags); + ndetected = rcu_print_task_stall(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + print_cpu_stall_info_end(); + printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); if (ndetected == 0) printk(KERN_ERR "INFO: Stall ended before state dump start\n"); @@ -740,8 +746,11 @@ static void print_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", - rsp->name, smp_processor_id(), jiffies - rsp->gp_start); + printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); + print_cpu_stall_info_begin(); + print_cpu_stall_info(rsp, smp_processor_id()); + print_cpu_stall_info_end(); + printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); if (!trigger_all_cpu_backtrace()) dump_stack(); @@ -831,6 +840,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rdp->passed_quiesce = 0; } else rdp->qs_pending = 0; + zero_cpu_stall_ticks(rdp); } } @@ -1496,6 +1506,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) void rcu_check_callbacks(int cpu, int user) { trace_rcu_utilization("Start scheduler-tick"); + increment_cpu_stall_ticks(); if (user || rcu_is_cpu_rrupt_from_idle()) { /* -- cgit v1.2.3 From c0d6d01bffdce19fa19baad6cb8cc3eed7bfd6f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jan 2012 12:41:26 -0800 Subject: rcu: Check for illegal use of RCU from offlined CPUs Although it is legal to use RCU during early boot, it is anything but legal to use RCU at runtime from an offlined CPU. After all, RCU explicitly ignores offlined CPUs. This commit therefore adds checks for runtime use of RCU from offlined CPUs. These checks are not perfect, in particular, they can be subverted through use of things like rcu_dereference_raw(). 
Note that it is not possible to put checks in rcu_read_lock() and friends due to the fact that these primitives are used in code that might be used under either RCU or lock-based protection, which means that checking rcu_read_lock() gets you fat piles of false positives. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dccd2f78db4e..bcf7db2f2fd2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -591,6 +591,35 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Is the current CPU online? Disable preemption to avoid false positives + * that could otherwise happen due to the current CPU number being sampled, + * this task being preempted, its old CPU being taken offline, resuming + * on some other CPU, then determining that its old CPU is now offline. + * It is OK to use RCU on an offline processor during initial boot, hence + * the check for rcu_scheduler_fully_active. + * + * Disable checking if in an NMI handler because we cannot safely report + * errors from NMI handlers anyway. + */ +bool rcu_lockdep_current_cpu_online(void) +{ + bool ret; + + if (in_nmi()) + return 1; + preempt_disable(); + ret = cpu_online(smp_processor_id()) || + !rcu_scheduler_fully_active; + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + #endif /* #ifdef CONFIG_PROVE_RCU */ /** -- cgit v1.2.3 From 3d3b7db0a22085cfc05c3318b9874f7fb8266d18 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jan 2012 17:05:46 -0800 Subject: rcu: Move synchronize_sched_expedited() to rcutree.c Now that TREE_RCU and TREE_PREEMPT_RCU no longer do anything different for the single-CPU case, there is no need for multiple definitions of synchronize_sched_expedited(). It is no longer in any sense a plug-in, so move it from kernel/rcutree_plugin.h to kernel/rcutree.c. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index bcf7db2f2fd2..05470d4caba3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -50,6 +50,8 @@ #include #include #include +#include +#include #include "rcutree.h" #include @@ -1918,6 +1920,121 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. + */ + smp_mb(); /* See above comment block. */ + return 0; +} + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. 
This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + * + * This implementation can be thought of as an application of ticket + * locking to RCU, with sync_sched_expedited_started and + * sync_sched_expedited_done taking on the roles of the halves + * of the ticket-lock word. Each task atomically increments + * sync_sched_expedited_started upon entry, snapshotting the old value, + * then attempts to stop all the CPUs. If this succeeds, then each + * CPU will have executed a context switch, resulting in an RCU-sched + * grace period. We are then done, so we use atomic_cmpxchg() to + * update sync_sched_expedited_done to match our snapshot -- but + * only if someone else has not already advanced past our snapshot. + * + * On the other hand, if try_stop_cpus() fails, we check the value + * of sync_sched_expedited_done. If it has advanced past our + * initial snapshot, then someone else must have forced a grace period + * some time after we took our snapshot. In this case, our work is + * done for us, and we can simply return. Otherwise, we try again, + * but keep our initial snapshot for purposes of checking for someone + * doing our work for us. + * + * If we fail too many times in a row, we fall back to synchronize_sched(). + */ +void synchronize_sched_expedited(void) +{ + int firstsnap, s, snap, trycount = 0; + + /* Note that atomic_inc_return() implies full memory barrier. */ + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); + get_online_cpus(); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); + + /* + * Each pass through the following loop attempts to force a + * context switch on each CPU. + */ + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + NULL) == -EAGAIN) { + put_online_cpus(); + + /* No joy, try again later. Or just synchronize_sched(). */ + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + + /* Check to see if someone else did our work for us. */ + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + + /* + * Refetching sync_sched_expedited_started allows later + * callers to piggyback on our grace period. We subtract + * 1 to get the same token that the last incrementer got. + * We retry after they started, so our grace period works + * for them, and they started after our first try, so their + * grace period works for us. + */ + get_online_cpus(); + snap = atomic_read(&sync_sched_expedited_started); + smp_mb(); /* ensure read is before try_stop_cpus(). */ + } + + /* + * Everyone up to our most recent fetch is covered by our grace + * period. Update the counter, but only if our work is still + * relevant -- which it won't be if someone who started later + * than we did beat us to the punch. 
+ */ + do { + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { + smp_mb(); /* ensure test happens before caller kfree */ + break; + } + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. -- cgit v1.2.3 From 2036d94a7b61ca5032ce90f2bda06afec0fe713e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Jan 2012 17:02:47 -0800 Subject: rcu: Rework detection of use of RCU by offline CPUs Because newly offlined CPUs continue executing after completing the CPU_DYING notifiers, they legitimately enter the scheduler and use RCU while appearing to be offline. This calls for a more sophisticated approach as follows: 1. RCU marks the CPU online during the CPU_UP_PREPARE phase. 2. RCU marks the CPU offline during the CPU_DEAD phase. 3. Diagnostics regarding use of read-side RCU by offline CPUs use RCU's accounting rather than the cpu_online_map. (Note that __call_rcu() still uses cpu_online_map to detect illegal invocations within CPU_DYING notifiers.) 4. Offline CPUs are prevented from hanging the system by force_quiescent_state(), which pays attention to cpu_online_map. Some additional work (in a later commit) will be needed to guarantee that force_quiescent_state() waits a full jiffy before assuming that a CPU is offline, for example, when called from idle entry. (This commit also makes the one-jiffy wait explicit, since the old-style implicit wait can now be defeated by RCU_FAST_NO_HZ and by rcutorture.) This approach avoids the false positives encountered when attempting to use more exact classification of CPU online/offline state. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 113 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 46 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 05470d4caba3..708469a06860 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -320,25 +320,18 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static int rcu_implicit_offline_qs(struct rcu_data *rdp) { /* - * If the CPU is offline, it is in a quiescent state. We can - * trust its state not to change because interrupts are disabled. + * If the CPU is offline for more than a jiffy, it is in a quiescent + * state. We can trust its state not to change because interrupts + * are disabled. The reason for the jiffy's worth of slack is to + * handle CPUs initializing on the way up and finding their way + * to the idle loop on the way down. */ - if (cpu_is_offline(rdp->cpu)) { + if (cpu_is_offline(rdp->cpu) && + ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); rdp->offline_fqs++; return 1; } - - /* - * The CPU is online, so send it a reschedule IPI. This forces - * it through the scheduler, and (inefficiently) also handles cases - * where idle loops fail to inform RCU about the CPU being idle. - */ - if (rdp->cpu != smp_processor_id()) - smp_send_reschedule(rdp->cpu); - else - set_need_resched(); - rdp->resched_ipi++; return 0; } @@ -601,19 +594,33 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); * this task being preempted, its old CPU being taken offline, resuming * on some other CPU, then determining that its old CPU is now offline. 
* It is OK to use RCU on an offline processor during initial boot, hence - * the check for rcu_scheduler_fully_active. + * the check for rcu_scheduler_fully_active. Note also that it is OK + * for a CPU coming online to use RCU for one jiffy prior to marking itself + * online in the cpu_online_mask. Similarly, it is OK for a CPU going + * offline to continue to use RCU for one jiffy after marking itself + * offline in the cpu_online_mask. This leniency is necessary given the + * non-atomic nature of the online and offline processing, for example, + * the fact that a CPU enters the scheduler after completing the CPU_DYING + * notifiers. + * + * This is also why RCU internally marks CPUs online during the + * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. * * Disable checking if in an NMI handler because we cannot safely report * errors from NMI handlers anyway. */ bool rcu_lockdep_current_cpu_online(void) { + struct rcu_data *rdp; + struct rcu_node *rnp; bool ret; if (in_nmi()) return 1; preempt_disable(); - ret = cpu_online(smp_processor_id()) || + rdp = &__get_cpu_var(rcu_sched_data); + rnp = rdp->mynode; + ret = (rdp->grpmask & rnp->qsmaskinit) || !rcu_scheduler_fully_active; preempt_enable(); return ret; @@ -1308,14 +1315,12 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { - unsigned long flags; int i; unsigned long mask; - int need_report; int receive_cpu = cpumask_any(cpu_online_mask); struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); - struct rcu_node *rnp = rdp->mynode; /* For dying CPU. */ + RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ /* First, adjust the counts. */ if (rdp->nxtlist != NULL) { @@ -1381,32 +1386,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) "cpuofl"); rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ - - /* - * Remove the dying CPU from the bitmasks in the rcu_node - * hierarchy. Because we are in stop_machine() context, we - * automatically exclude ->onofflock critical sections. - */ - do { - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->qsmaskinit &= ~mask; - if (rnp->qsmaskinit != 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - break; - } - if (rnp == rdp->mynode) { - need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); - } else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - mask = rnp->grpmask; - rnp = rnp->parent; - } while (rnp != NULL); } /* @@ -1417,11 +1396,53 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { + unsigned long flags; + unsigned long mask; + int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ + /* Adjust any no-longer-needed kthreads. */ rcu_stop_cpu_kthread(cpu); rcu_node_kthread_setaffinity(rnp, -1); + + /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ + + /* Exclude any attempts to start a new grace period. 
*/ + raw_spin_lock_irqsave(&rsp->onofflock, flags); + + /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ + mask = rdp->grpmask; /* rnp->grplo is constant. */ + do { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->qsmaskinit &= ~mask; + if (rnp->qsmaskinit != 0) { + if (rnp != rdp->mynode) + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + break; + } + if (rnp == rdp->mynode) + need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + else + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + mask = rnp->grpmask; + rnp = rnp->parent; + } while (rnp != NULL); + + /* + * We still hold the leaf rcu_node structure lock here, and + * irqs are still disabled. The reason for this subterfuge is + * because invoking rcu_report_unblock_qs_rnp() with ->onofflock + * held leads to deadlock. + */ + raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + rnp = rdp->mynode; + if (need_report & RCU_OFL_TASKS_NORM_GP) + rcu_report_unblock_qs_rnp(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (need_report & RCU_OFL_TASKS_EXP_GP) + rcu_report_exp_rnp(rsp, rnp, true); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From 236fefafe5d3d34b78ed2ccf5510909716112326 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Jan 2012 14:00:41 -0800 Subject: rcu: Call out dangers of expedited RCU primitives The expedited RCU primitives can be quite useful, but they have some high costs as well. This commit updates and creates docbook comments calling out the costs, and updates the RCU documentation as well. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 708469a06860..df0e3c1bb68e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1961,15 +1961,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data) return 0; } -/* - * Wait for an rcu-sched grace period to elapse, but use "big hammer" - * approach to force grace period to end quickly. This consumes - * significant time on all CPUs, and is thus not recommended for - * any sort of common-case code. +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. * - * Note that it is illegal to call this function while holding any - * lock that is acquired by a CPU-hotplug notifier. Failing to - * observe this restriction will result in deadlock. + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. * * This implementation can be thought of as an application of ticket * locking to RCU, with sync_sched_expedited_started and -- cgit v1.2.3 From 29e37d814188ac8d60f2120583704d3ef6d634b4 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 17 Nov 2011 16:55:56 -0800 Subject: rcu: Allow nesting of rcu_idle_enter() and rcu_idle_exit() Use of RCU in the idle loop is incorrect, quite a few instances of just that have made their way into mainline, primarily event tracing. The problem with RCU read-side critical sections on CPUs that RCU believes to be idle is that RCU is completely ignoring the CPU, along with any attempts and RCU read-side critical sections. The approaches of eliminating the offending uses and of pushing the definition of idle down beyond the offending uses have both proved impractical. The new approach is to encapsulate offending uses of RCU with rcu_idle_exit() and rcu_idle_enter(), but this requires nesting for code that is invoked both during idle and and during normal execution. Therefore, this commit modifies rcu_idle_enter() and rcu_idle_exit() to permit nesting. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Deepthi Dharwar --- kernel/rcutree.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index df0e3c1bb68e..92b47760edf3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -198,7 +198,7 @@ void rcu_note_context_switch(int cpu) EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_NESTING, + .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), }; @@ -394,7 +394,11 @@ void rcu_idle_enter(void) local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting = 0; + WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); + if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + rdtp->dynticks_nesting = 0; + else + rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; rcu_idle_enter_common(rdtp, oldval); local_irq_restore(flags); } @@ -467,7 +471,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) * Exit idle mode, in other words, -enter- the mode in which RCU * read-side critical sections can occur. * - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to + * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to * allow for the possibility of usermode upcalls messing up our count * of interrupt nesting level during the busy period that is just * now starting. 
@@ -481,8 +485,11 @@ void rcu_idle_exit(void) local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE(oldval != 0); - rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; + WARN_ON_ONCE(oldval < 0); + if (oldval & DYNTICK_TASK_NEST_MASK) + rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; + else + rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } @@ -2253,7 +2260,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qlen_lazy = 0; rdp->qlen = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; @@ -2281,7 +2288,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; + rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); rcu_prepare_for_idle_init(cpu); -- cgit v1.2.3 From 8a2ecf474d3ee8dd5d001490349e422cec52f39f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Feb 2012 15:42:04 -0800 Subject: rcu: Add RCU_NONIDLE() for idle-loop RCU read-side critical sections RCU, RCU-bh, and RCU-sched read-side critical sections are forbidden in the inner idle loop, that is, between the rcu_idle_enter() and the rcu_idle_exit() -- RCU will happily ignore any such read-side critical sections. However, things like powertop need tracepoints in the inner idle loop. This commit therefore provides an RCU_NONIDLE() macro that can be used to wrap code in the idle loop that requires RCU read-side critical sections. Suggested-by: Steven Rostedt Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Deepthi Dharwar --- kernel/rcutree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 92b47760edf3..eacc10b03531 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -402,6 +402,7 @@ void rcu_idle_enter(void) rcu_idle_enter_common(rdtp, oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_enter); /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -493,6 +494,7 @@ void rcu_idle_exit(void) rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_exit); /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle -- cgit v1.2.3 From 1cc85961e214773cb7d7f2ccbe3bc644dd466df0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 17 Feb 2012 13:20:31 -0800 Subject: rcu: Stop spurious warnings from synchronize_sched_expedited synchronize_sched_expedited() is spamming CONFIG_DEBUG_PREEMPT=y users with an unintended warning from the cpu_is_offline() check: use raw_smp_processor_id() instead of smp_processor_id() there. Because the warning is under a get_online_cpus(), it is not possible for any CPUs to go offline, though it is quite possible that the task might migrate between the raw_smp_processor_id() and the check of cpu_is_offline(). This is not a problem because the task cannot migrate from an offline CPU to an online one or vice versa. 
The point of the check is to verify that synchronize_sched_expedited() is not called from an offline CPU, for example, from a CPU_DYING notifier, or, more important, from an outgoing CPU making its way from its CPU_DYING notifiers to the idle loop. Signed-off-by: Hugh Dickins Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index eacc10b03531..1050d6d3922c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2014,7 +2014,7 @@ void synchronize_sched_expedited(void) /* Note that atomic_inc_return() implies full memory barrier. */ firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); get_online_cpus(); - WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); + WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); /* * Each pass through the following loop attempts to force a -- cgit v1.2.3
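As an illustration of the simple self-deadlock that the rcu_lockdep_assert() calls added to synchronize_sched() and synchronize_rcu_bh() above are meant to catch, consider the following hypothetical (and deliberately broken) fragment. The function name is made up; the primitives are the ones used in the patches:

	/* Hypothetical example, for illustration only. */
	static void buggy_updater(void)
	{
		rcu_read_lock_sched();
		/* The grace period below cannot end until this reader ends... */
		synchronize_sched();	/* ...so this never returns: self-deadlock. */
		rcu_read_unlock_sched();
	}

With CONFIG_PROVE_RCU=y, the new assertion emits a lockdep-RCU splat at the synchronize_sched() call (the deadlock itself still occurs, but it is now diagnosed rather than silent).

Similarly, a minimal sketch of how the RCU_NONIDLE() macro from the later commit might be used to wrap RCU-using code in the inner idle loop; the tracepoint name is hypothetical:

	/*
	 * Inner idle loop, where RCU would otherwise ignore this CPU.
	 * trace_example_idle_event() is a made-up tracepoint standing in
	 * for something like a power-management event.
	 */
	RCU_NONIDLE(trace_example_idle_event(smp_processor_id()));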