summaryrefslogtreecommitdiff
path: root/kernel/rcu
diff options
context:
space:
mode:
authorPaul E. McKenney <paulmck@linux.vnet.ibm.com>2015-08-04 08:40:58 -0700
committerPaul E. McKenney <paulmck@linux.vnet.ibm.com>2015-08-04 08:40:58 -0700
commit8ff4fbfd69a6c7b9598f8c1f2df34f89bac02c1a (patch)
treea56504a1f6223a9a755e30ab4ce0fbeb79b94f8d /kernel/rcu
parent9a54f98e341d09793247a6e598012edefb5ae7cb (diff)
parentaf859beaaba4d57883b08f4acbcb3974bc1f975e (diff)
Merge branches 'fixes.2015.07.22a' and 'initexp.2015.08.04a' into HEAD
fixes.2015.07.22a: Miscellaneous fixes. initexp.2015.08.04a: Initialization and expedited updates. (Single branch due to conflicts.)
Diffstat (limited to 'kernel/rcu')
-rw-r--r--kernel/rcu/tree.c589
-rw-r--r--kernel/rcu/tree.h84
-rw-r--r--kernel/rcu/tree_plugin.h117
-rw-r--r--kernel/rcu/tree_trace.c19
4 files changed, 401 insertions, 408 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0a73d26357a2..9f75f25cc5d9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -70,6 +70,8 @@ MODULE_ALIAS("rcutree");
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];
/*
* In order to export the rcu_state name to the tracing tools, it
@@ -124,13 +126,8 @@ module_param(rcu_fanout_exact, bool, 0444);
static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
-static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
- NUM_RCU_LVL_0,
- NUM_RCU_LVL_1,
- NUM_RCU_LVL_2,
- NUM_RCU_LVL_3,
- NUM_RCU_LVL_4,
-};
+/* Number of rcu_nodes at specified level. */
+static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/*
@@ -1178,9 +1175,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
j = jiffies;
gpa = READ_ONCE(rsp->gp_activity);
if (j - gpa > 2 * HZ)
- pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n",
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
rsp->name, j - gpa,
- rsp->gpnum, rsp->completed, rsp->gp_flags);
+ rsp->gpnum, rsp->completed,
+ rsp->gp_flags, rsp->gp_state,
+ rsp->gp_kthread ? rsp->gp_kthread->state : 0);
}
/*
@@ -1906,6 +1905,26 @@ static int rcu_gp_init(struct rcu_state *rsp)
}
/*
+ * Helper function for wait_event_interruptible_timeout() wakeup
+ * at force-quiescent-state time.
+ */
+static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /* Someone like call_rcu() requested a force-quiescent-state scan. */
+ *gfp = READ_ONCE(rsp->gp_flags);
+ if (*gfp & RCU_GP_FLAG_FQS)
+ return true;
+
+ /* The current grace period has completed. */
+ if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
+ return true;
+
+ return false;
+}
+
+/*
* Do one round of quiescent-state forcing.
*/
static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
@@ -2041,6 +2060,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
wait_event_interruptible(rsp->gp_wq,
READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
+ rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
@@ -2068,11 +2088,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
ret = wait_event_interruptible_timeout(rsp->gp_wq,
- ((gf = READ_ONCE(rsp->gp_flags)) &
- RCU_GP_FLAG_FQS) ||
- (!READ_ONCE(rnp->qsmask) &&
- !rcu_preempt_blocked_readers_cgp(rnp)),
- j);
+ rcu_gp_fqs_check_wake(rsp, &gf), j);
+ rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
if (!READ_ONCE(rnp->qsmask) &&
@@ -2110,7 +2127,9 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
/* Handle grace-period end. */
+ rsp->gp_state = RCU_GP_CLEANUP;
rcu_gp_cleanup(rsp);
+ rsp->gp_state = RCU_GP_CLEANED;
}
}
@@ -3305,23 +3324,195 @@ void cond_synchronize_sched(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(cond_synchronize_sched);
-static int synchronize_sched_expedited_cpu_stop(void *data)
+/* Adjust sequence number for start of update-side operation. */
+static void rcu_seq_start(unsigned long *sp)
+{
+ WRITE_ONCE(*sp, *sp + 1);
+ smp_mb(); /* Ensure update-side operation after counter increment. */
+ WARN_ON_ONCE(!(*sp & 0x1));
+}
+
+/* Adjust sequence number for end of update-side operation. */
+static void rcu_seq_end(unsigned long *sp)
+{
+ smp_mb(); /* Ensure update-side operation before counter increment. */
+ WRITE_ONCE(*sp, *sp + 1);
+ WARN_ON_ONCE(*sp & 0x1);
+}
+
+/* Take a snapshot of the update side's sequence number. */
+static unsigned long rcu_seq_snap(unsigned long *sp)
+{
+ unsigned long s;
+
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
+ s = (READ_ONCE(*sp) + 3) & ~0x1;
+ smp_mb(); /* Above access must not bleed into critical section. */
+ return s;
+}
+
+/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred.
+ */
+static bool rcu_seq_done(unsigned long *sp, unsigned long s)
+{
+ return ULONG_CMP_GE(READ_ONCE(*sp), s);
+}
+
+/* Wrapper functions for expedited grace periods. */
+static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
+{
+ rcu_seq_start(&rsp->expedited_sequence);
+}
+static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
+{
+ rcu_seq_end(&rsp->expedited_sequence);
+ smp_mb(); /* Ensure that consecutive grace periods serialize. */
+}
+static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
+{
+ return rcu_seq_snap(&rsp->expedited_sequence);
+}
+static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
+{
+ return rcu_seq_done(&rsp->expedited_sequence, s);
+}
+
+/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
+static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp,
+ atomic_long_t *stat, unsigned long s)
{
+ if (rcu_exp_gp_seq_done(rsp, s)) {
+ if (rnp)
+ mutex_unlock(&rnp->exp_funnel_mutex);
+ else if (rdp)
+ mutex_unlock(&rdp->exp_funnel_mutex);
+ /* Ensure test happens before caller kfree(). */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(stat);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Funnel-lock acquisition for expedited grace periods. Returns a
+ * pointer to the root rcu_node structure, or NULL if some other
+ * task did the expedited grace period for us.
+ */
+static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp0;
+ struct rcu_node *rnp1 = NULL;
+
/*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
+ * First try directly acquiring the root lock in order to reduce
+ * latency in the common case where expedited grace periods are
+ * rare. We check mutex_is_locked() to avoid pathological levels of
+ * memory contention on ->exp_funnel_mutex in the heavy-load case.
*/
- smp_mb(); /* See above comment block. */
+ rnp0 = rcu_get_root(rsp);
+ if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
+ if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
+ if (sync_exp_work_done(rsp, rnp0, NULL,
+ &rsp->expedited_workdone0, s))
+ return NULL;
+ return rnp0;
+ }
+ }
+
+ /*
+ * Each pass through the following loop works its way
+ * up the rcu_node tree, returning if others have done the
+ * work or otherwise falls through holding the root rnp's
+ * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
+ * can be inexact, as it is just promoting locality and is not
+ * strictly needed for correctness.
+ */
+ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+ if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
+ return NULL;
+ mutex_lock(&rdp->exp_funnel_mutex);
+ rnp0 = rdp->mynode;
+ for (; rnp0 != NULL; rnp0 = rnp0->parent) {
+ if (sync_exp_work_done(rsp, rnp1, rdp,
+ &rsp->expedited_workdone2, s))
+ return NULL;
+ mutex_lock(&rnp0->exp_funnel_mutex);
+ if (rnp1)
+ mutex_unlock(&rnp1->exp_funnel_mutex);
+ else
+ mutex_unlock(&rdp->exp_funnel_mutex);
+ rnp1 = rnp0;
+ }
+ if (sync_exp_work_done(rsp, rnp1, rdp,
+ &rsp->expedited_workdone3, s))
+ return NULL;
+ return rnp1;
+}
+
+/* Invoked on each online non-idle CPU for expedited quiescent state. */
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+ struct rcu_data *rdp = data;
+ struct rcu_state *rsp = rdp->rsp;
+
+ /* We are here: If we are last, do the wakeup. */
+ rdp->exp_done = true;
+ if (atomic_dec_and_test(&rsp->expedited_need_qs))
+ wake_up(&rsp->expedited_wq);
return 0;
}
+static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
+{
+ int cpu;
+ unsigned long jiffies_stall;
+ unsigned long jiffies_start;
+ struct rcu_data *rdp;
+ int ret;
+
+ jiffies_stall = rcu_jiffies_till_stall_check();
+ jiffies_start = jiffies;
+
+ for (;;) {
+ ret = wait_event_interruptible_timeout(
+ rsp->expedited_wq,
+ !atomic_read(&rsp->expedited_need_qs),
+ jiffies_stall);
+ if (ret > 0)
+ return;
+ if (ret < 0) {
+ /* Hit a signal, disable CPU stall warnings. */
+ wait_event(rsp->expedited_wq,
+ !atomic_read(&rsp->expedited_need_qs));
+ return;
+ }
+ pr_err("INFO: %s detected expedited stalls on CPUs: {",
+ rsp->name);
+ for_each_online_cpu(cpu) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+
+ if (rdp->exp_done)
+ continue;
+ pr_cont(" %d", cpu);
+ }
+ pr_cont(" } %lu jiffies s: %lu\n",
+ jiffies - jiffies_start, rsp->expedited_sequence);
+ for_each_online_cpu(cpu) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+
+ if (rdp->exp_done)
+ continue;
+ dump_cpu_task(cpu);
+ }
+ jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+ }
+}
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3333,58 +3524,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
* restructure your code to batch your updates, and then use a single
* synchronize_sched() instead.
*
- * This implementation can be thought of as an application of ticket
- * locking to RCU, with sync_sched_expedited_started and
- * sync_sched_expedited_done taking on the roles of the halves
- * of the ticket-lock word. Each task atomically increments
- * sync_sched_expedited_started upon entry, snapshotting the old value,
- * then attempts to stop all the CPUs. If this succeeds, then each
- * CPU will have executed a context switch, resulting in an RCU-sched
- * grace period. We are then done, so we use atomic_cmpxchg() to
- * update sync_sched_expedited_done to match our snapshot -- but
- * only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done. If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot. In this case, our work is
- * done for us, and we can simply return. Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
+ * This implementation can be thought of as an application of sequence
+ * locking to expedited grace periods, but using the sequence counter to
+ * determine when someone else has already done the work instead of for
+ * retrying readers.
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
+ unsigned long s;
+ struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_sched_state;
- /*
- * If we are in danger of counter wrap, just do synchronize_sched().
- * By allowing sync_sched_expedited_started to advance no more than
- * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
- * that more than 3.5 billion CPUs would be required to force a
- * counter wrap on a 32-bit system. Quite a few more CPUs would of
- * course be required on a 64-bit system.
- */
- if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
- (ulong)atomic_long_read(&rsp->expedited_done) +
- ULONG_MAX / 8)) {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_wrap);
- return;
- }
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap(rsp);
- /*
- * Take a ticket. Note that atomic_inc_return() implies a
- * full memory barrier.
- */
- snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3393,100 +3547,38 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
+ rnp = exp_funnel_lock(rsp, s);
+ if (rnp == NULL) {
+ put_online_cpus();
+ return; /* Someone else did our work for us. */
}
- /*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
- */
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
+ rcu_exp_gp_seq_start(rsp);
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ init_waitqueue_head(&rsp->expedited_wq);
+ atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
+ for_each_online_cpu(cpu) {
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ rdp->exp_done = false;
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* Skip our CPU and any idle CPUs. */
+ if (raw_smp_processor_id() == cpu ||
+ !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+ atomic_inc(&rsp->expedited_need_qs);
+ stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
+ rdp, &rdp->exp_stop_work);
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
-all_cpus_idle:
- free_cpumask_var(cm);
+ /* Remove extra count and, if necessary, wait for CPUs to stop. */
+ if (!atomic_dec_and_test(&rsp->expedited_need_qs))
+ synchronize_sched_expedited_wait(rsp);
- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did already did their update.
- */
- do {
- atomic_long_inc(&rsp->expedited_done_tries);
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_done_lost);
- break;
- }
- } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
- atomic_long_inc(&rsp->expedited_done_exit);
+ rcu_exp_gp_seq_end(rsp);
+ mutex_unlock(&rnp->exp_funnel_mutex);
put_online_cpus();
}
@@ -3623,10 +3715,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
struct rcu_state *rsp = rdp->rsp;
if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
- _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
complete(&rsp->barrier_completion);
} else {
- _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
}
}
@@ -3638,7 +3730,7 @@ static void rcu_barrier_func(void *type)
struct rcu_state *rsp = type;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
- _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
atomic_inc(&rsp->barrier_cpu_count);
rsp->call(&rdp->barrier_head, rcu_barrier_callback);
}
@@ -3651,55 +3743,24 @@ static void _rcu_barrier(struct rcu_state *rsp)
{
int cpu;
struct rcu_data *rdp;
- unsigned long snap = READ_ONCE(rsp->n_barrier_done);
- unsigned long snap_done;
+ unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Begin", -1, snap);
+ _rcu_barrier_trace(rsp, "Begin", -1, s);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
mutex_lock(&rsp->barrier_mutex);
- /*
- * Ensure that all prior references, including to ->n_barrier_done,
- * are ordered before the _rcu_barrier() machinery.
- */
- smp_mb(); /* See above block comment. */
-
- /*
- * Recheck ->n_barrier_done to see if others did our work for us.
- * This means checking ->n_barrier_done for an even-to-odd-to-even
- * transition. The "if" expression below therefore rounds the old
- * value up to the next even number and adds two before comparing.
- */
- snap_done = rsp->n_barrier_done;
- _rcu_barrier_trace(rsp, "Check", -1, snap_done);
-
- /*
- * If the value in snap is odd, we needed to wait for the current
- * rcu_barrier() to complete, then wait for the next one, in other
- * words, we need the value of snap_done to be three larger than
- * the value of snap. On the other hand, if the value in snap is
- * even, we only had to wait for the next rcu_barrier() to complete,
- * in other words, we need the value of snap_done to be only two
- * greater than the value of snap. The "(snap + 3) & ~0x1" computes
- * this for us (thank you, Linus!).
- */
- if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
- _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+ /* Did someone else do our work for us? */
+ if (rcu_seq_done(&rsp->barrier_sequence, s)) {
+ _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rsp->barrier_mutex);
return;
}
- /*
- * Increment ->n_barrier_done to avoid duplicate work. Use
- * WRITE_ONCE() to prevent the compiler from speculating
- * the increment to precede the early-exit check.
- */
- WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
- WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
- _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
- smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
+ /* Mark the start of the barrier operation. */
+ rcu_seq_start(&rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
/*
* Initialize the count to one rather than to zero in order to
@@ -3723,10 +3784,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
if (rcu_is_nocb_cpu(cpu)) {
if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
_rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
} else {
_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
smp_mb__before_atomic();
atomic_inc(&rsp->barrier_cpu_count);
__call_rcu(&rdp->barrier_head,
@@ -3734,11 +3795,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
}
} else if (READ_ONCE(rdp->qlen)) {
_rcu_barrier_trace(rsp, "OnlineQ", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
} else {
_rcu_barrier_trace(rsp, "OnlineNQ", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
}
}
put_online_cpus();
@@ -3750,16 +3811,13 @@ static void _rcu_barrier(struct rcu_state *rsp)
if (atomic_dec_and_test(&rsp->barrier_cpu_count))
complete(&rsp->barrier_completion);
- /* Increment ->n_barrier_done to prevent duplicate work. */
- smp_mb(); /* Keep increment after above mechanism. */
- WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
- WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
- _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
- smp_mb(); /* Keep increment before caller's subsequent code. */
-
/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
wait_for_completion(&rsp->barrier_completion);
+ /* Mark the end of the barrier operation. */
+ _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
+ rcu_seq_end(&rsp->barrier_sequence);
+
/* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rsp->barrier_mutex);
}
@@ -3822,6 +3880,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
rdp->cpu = cpu;
rdp->rsp = rsp;
+ mutex_init(&rdp->exp_funnel_mutex);
rcu_boot_init_nocb_percpu_data(rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -4013,22 +4072,22 @@ void rcu_scheduler_starting(void)
* Compute the per-level fanout, either using the exact fanout specified
* or balancing the tree, depending on the rcu_fanout_exact boot parameter.
*/
-static void __init rcu_init_levelspread(struct rcu_state *rsp)
+static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
{
int i;
if (rcu_fanout_exact) {
- rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+ levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
for (i = rcu_num_lvls - 2; i >= 0; i--)
- rsp->levelspread[i] = RCU_FANOUT;
+ levelspread[i] = RCU_FANOUT;
} else {
int ccur;
int cprv;
cprv = nr_cpu_ids;
for (i = rcu_num_lvls - 1; i >= 0; i--) {
- ccur = rsp->levelcnt[i];
- rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+ ccur = levelcnt[i];
+ levelspread[i] = (cprv + ccur - 1) / ccur;
cprv = ccur;
}
}
@@ -4040,23 +4099,20 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
static void __init rcu_init_one(struct rcu_state *rsp,
struct rcu_data __percpu *rda)
{
- static const char * const buf[] = {
- "rcu_node_0",
- "rcu_node_1",
- "rcu_node_2",
- "rcu_node_3" }; /* Match MAX_RCU_LVLS */
- static const char * const fqs[] = {
- "rcu_node_fqs_0",
- "rcu_node_fqs_1",
- "rcu_node_fqs_2",
- "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const buf[] = RCU_NODE_NAME_INIT;
+ static const char * const fqs[] = RCU_FQS_NAME_INIT;
+ static const char * const exp[] = RCU_EXP_NAME_INIT;
+ static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT;
static u8 fl_mask = 0x1;
+
+ int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
+ int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
int cpustride = 1;
int i;
int j;
struct rcu_node *rnp;
- BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
+ BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
/* Silence gcc 4.8 false positive about array index out of range. */
if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
@@ -4065,19 +4121,19 @@ static void __init rcu_init_one(struct rcu_state *rsp,
/* Initialize the level-tracking arrays. */
for (i = 0; i < rcu_num_lvls; i++)
- rsp->levelcnt[i] = num_rcu_lvl[i];
+ levelcnt[i] = num_rcu_lvl[i];
for (i = 1; i < rcu_num_lvls; i++)
- rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
- rcu_init_levelspread(rsp);
+ rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1];
+ rcu_init_levelspread(levelspread, levelcnt);
rsp->flavor_mask = fl_mask;
fl_mask <<= 1;
/* Initialize the elements themselves, starting from the leaves. */
for (i = rcu_num_lvls - 1; i >= 0; i--) {
- cpustride *= rsp->levelspread[i];
+ cpustride *= levelspread[i];
rnp = rsp->level[i];
- for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+ for (j = 0; j < levelcnt[i]; j++, rnp++) {
raw_spin_lock_init(&rnp->lock);
lockdep_set_class_and_name(&rnp->lock,
&rcu_node_class[i], buf[i]);
@@ -4097,14 +4153,23 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rnp->grpmask = 0;
rnp->parent = NULL;
} else {
- rnp->grpnum = j % rsp->levelspread[i - 1];
+ rnp->grpnum = j % levelspread[i - 1];
rnp->grpmask = 1UL << rnp->grpnum;
rnp->parent = rsp->level[i - 1] +
- j / rsp->levelspread[i - 1];
+ j / levelspread[i - 1];
}
rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks);
rcu_init_one_nocb(rnp);
+ mutex_init(&rnp->exp_funnel_mutex);
+ if (rsp == &rcu_sched_state)
+ lockdep_set_class_and_name(
+ &rnp->exp_funnel_mutex,
+ &rcu_exp_sched_class[i], exp_sched[i]);
+ else
+ lockdep_set_class_and_name(
+ &rnp->exp_funnel_mutex,
+ &rcu_exp_class[i], exp[i]);
}
}
@@ -4128,9 +4193,7 @@ static void __init rcu_init_geometry(void)
{
ulong d;
int i;
- int j;
- int n = nr_cpu_ids;
- int rcu_capacity[MAX_RCU_LVLS + 1];
+ int rcu_capacity[RCU_NUM_LVLS];
/*
* Initialize any unspecified boot parameters.
@@ -4153,47 +4216,49 @@ static void __init rcu_init_geometry(void)
rcu_fanout_leaf, nr_cpu_ids);
/*
- * Compute number of nodes that can be handled an rcu_node tree
- * with the given number of levels. Setting rcu_capacity[0] makes
- * some of the arithmetic easier.
- */
- rcu_capacity[0] = 1;
- rcu_capacity[1] = rcu_fanout_leaf;
- for (i = 2; i <= MAX_RCU_LVLS; i++)
- rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
-
- /*
* The boot-time rcu_fanout_leaf parameter is only permitted
* to increase the leaf-level fanout, not decrease it. Of course,
* the leaf-level fanout cannot exceed the number of bits in
- * the rcu_node masks. Finally, the tree must be able to accommodate
- * the configured number of CPUs. Complain and fall back to the
- * compile-time values if these limits are exceeded.
+ * the rcu_node masks. Complain and fall back to the compile-
+ * time values if these limits are exceeded.
*/
if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
- rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
- n > rcu_capacity[MAX_RCU_LVLS]) {
+ rcu_fanout_leaf > sizeof(unsigned long) * 8) {
+ rcu_fanout_leaf = RCU_FANOUT_LEAF;
WARN_ON(1);
return;
}
+ /*
+ * Compute number of nodes that can be handled an rcu_node tree
+ * with the given number of levels.
+ */
+ rcu_capacity[0] = rcu_fanout_leaf;
+ for (i = 1; i < RCU_NUM_LVLS; i++)
+ rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
+
+ /*
+ * The tree must be able to accommodate the configured number of CPUs.
+ * If this limit is exceeded than we have a serious problem elsewhere.
+ */
+ if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1])
+ panic("rcu_init_geometry: rcu_capacity[] is too small");
+
+ /* Calculate the number of levels in the tree. */
+ for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
+ }
+ rcu_num_lvls = i + 1;
+
/* Calculate the number of rcu_nodes at each level of the tree. */
- for (i = 1; i <= MAX_RCU_LVLS; i++)
- if (n <= rcu_capacity[i]) {
- for (j = 0; j <= i; j++)
- num_rcu_lvl[j] =
- DIV_ROUND_UP(n, rcu_capacity[i - j]);
- rcu_num_lvls = i;
- for (j = i + 1; j <= MAX_RCU_LVLS; j++)
- num_rcu_lvl[j] = 0;
- break;
- }
+ for (i = 0; i < rcu_num_lvls; i++) {
+ int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
+ num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
+ }
/* Calculate the total number of rcu_node structures. */
rcu_num_nodes = 0;
- for (i = 0; i <= MAX_RCU_LVLS; i++)
+ for (i = 0; i < rcu_num_lvls; i++)
rcu_num_nodes += num_rcu_lvl[i];
- rcu_num_nodes -= n;
}
/*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 4adb7ca0bf47..0412030ca882 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
+#include <linux/stop_machine.h>
/*
* Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -36,8 +37,6 @@
* Of course, your mileage may vary.
*/
-#define MAX_RCU_LVLS 4
-
#ifdef CONFIG_RCU_FANOUT
#define RCU_FANOUT CONFIG_RCU_FANOUT
#else /* #ifdef CONFIG_RCU_FANOUT */
@@ -66,38 +65,53 @@
#if NR_CPUS <= RCU_FANOUT_1
# define RCU_NUM_LVLS 1
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 (NR_CPUS)
-# define NUM_RCU_LVL_2 0
-# define NUM_RCU_LVL_3 0
-# define NUM_RCU_LVL_4 0
+# define NUM_RCU_NODES NUM_RCU_LVL_0
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
+# define RCU_EXP_SCHED_NAME_INIT \
+ { "rcu_node_exp_sched_0" }
#elif NR_CPUS <= RCU_FANOUT_2
# define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_2 (NR_CPUS)
-# define NUM_RCU_LVL_3 0
-# define NUM_RCU_LVL_4 0
+# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
+# define RCU_EXP_SCHED_NAME_INIT \
+ { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" }
#elif NR_CPUS <= RCU_FANOUT_3
# define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_3 (NR_CPUS)
-# define NUM_RCU_LVL_4 0
+# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
+# define RCU_EXP_SCHED_NAME_INIT \
+ { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" }
#elif NR_CPUS <= RCU_FANOUT_4
# define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_4 (NR_CPUS)
+# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
+# define RCU_EXP_SCHED_NAME_INIT \
+ { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" }
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
-#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
-#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
-
extern int rcu_num_lvls;
extern int rcu_num_nodes;
@@ -236,6 +250,8 @@ struct rcu_node {
int need_future_gp[2];
/* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
+
+ struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp;
/*
@@ -287,12 +303,13 @@ struct rcu_data {
bool gpwrap; /* Possible gpnum/completed wrap. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
unsigned long ticks_this_gp; /* The number of scheduling-clock */
/* ticks this CPU has handled */
/* during and after the last grace */
/* period it is aware of. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+ struct cpu_stop_work exp_stop_work;
+ /* Expedited grace-period control */
+ /* for CPU stopping. */
/* 2) batch handling */
/*
@@ -355,11 +372,13 @@ struct rcu_data {
unsigned long n_rp_nocb_defer_wakeup;
unsigned long n_rp_need_nothing;
- /* 6) _rcu_barrier() and OOM callbacks. */
+ /* 6) _rcu_barrier(), OOM callbacks, and expediting. */
struct rcu_head barrier_head;
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+ struct mutex exp_funnel_mutex;
+ bool exp_done; /* Expedited QS for this CPU? */
/* 7) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
@@ -387,9 +406,7 @@ struct rcu_data {
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 8) RCU CPU stall data. */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
unsigned int softirq_snap; /* Snapshot of softirq activity. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
int cpu;
struct rcu_state *rsp;
@@ -442,9 +459,9 @@ do { \
*/
struct rcu_state {
struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
- struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
- u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
- u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
+ struct rcu_node *level[RCU_NUM_LVLS + 1];
+ /* Hierarchy levels (+1 to */
+ /* shut bogus gcc warning) */
u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
@@ -479,21 +496,18 @@ struct rcu_state {
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
- unsigned long n_barrier_done; /* ++ at start and end of */
+ unsigned long barrier_sequence; /* ++ at start and end of */
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
- atomic_long_t expedited_start; /* Starting ticket. */
- atomic_long_t expedited_done; /* Done ticket. */
- atomic_long_t expedited_wrap; /* # near-wrap incidents. */
- atomic_long_t expedited_tryfail; /* # acquisition failures. */
+ unsigned long expedited_sequence; /* Take a ticket. */
+ atomic_long_t expedited_workdone0; /* # done by others #0. */
atomic_long_t expedited_workdone1; /* # done by others #1. */
atomic_long_t expedited_workdone2; /* # done by others #2. */
+ atomic_long_t expedited_workdone3; /* # done by others #3. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
- atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
- atomic_long_t expedited_done_tries; /* # tries to update _done. */
- atomic_long_t expedited_done_lost; /* # times beaten to _done. */
- atomic_long_t expedited_done_exit; /* # times exited _done loop. */
+ atomic_t expedited_need_qs; /* # CPUs left to check in. */
+ wait_queue_head_t expedited_wq; /* Wait for check-ins. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
@@ -527,7 +541,11 @@ struct rcu_state {
/* Values for rcu_state structure's gp_flags field. */
#define RCU_GP_WAIT_INIT 0 /* Initial state. */
#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
-#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
+#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */
+#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */
+#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */
+#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */
+#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */
extern struct list_head rcu_struct_flavors;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 80a7c17907fe..b2bf3963a0ae 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -82,10 +82,8 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tRCU lockdep checking is enabled.\n");
if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
pr_info("\tRCU torture testing starts during boot.\n");
- if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO))
- pr_info("\tAdditional per-CPU info printed with stalls.\n");
- if (NUM_RCU_LVL_4 != 0)
- pr_info("\tFour-level hierarchy is enabled.\n");
+ if (RCU_NUM_LVLS >= 4)
+ pr_info("\tFour(or more)-level hierarchy is enabled.\n");
if (RCU_FANOUT_LEAF != 16)
pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
RCU_FANOUT_LEAF);
@@ -418,8 +416,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
rcu_print_detail_task_stall_rnp(rnp);
}
-#ifdef CONFIG_RCU_CPU_STALL_INFO
-
static void rcu_print_task_stall_begin(struct rcu_node *rnp)
{
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
@@ -431,18 +427,6 @@ static void rcu_print_task_stall_end(void)
pr_cont("\n");
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
-static void rcu_print_task_stall_begin(struct rcu_node *rnp)
-{
-}
-
-static void rcu_print_task_stall_end(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
/*
* Scan the current list of tasks blocked within RCU read-side critical
* sections, printing out the tid of each.
@@ -552,8 +536,6 @@ void synchronize_rcu(void)
EXPORT_SYMBOL_GPL(synchronize_rcu);
static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
-static unsigned long sync_rcu_preempt_exp_count;
-static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
/*
* Return non-zero if there are any tasks in RCU read-side critical
@@ -573,7 +555,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp)
* for the current expedited grace period. Works only for preemptible
* RCU -- other RCU implementation use other means.
*
- * Caller must hold sync_rcu_preempt_exp_mutex.
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
@@ -589,7 +571,7 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
- * Caller must hold sync_rcu_preempt_exp_mutex.
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake)
@@ -628,7 +610,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
* set the ->expmask bits on the leaf rcu_node structures to tell phase 2
* that work is needed here.
*
- * Caller must hold sync_rcu_preempt_exp_mutex.
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void
sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -671,7 +653,7 @@ sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
* invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
* enabling rcu_read_unlock_special() to do the bit-clearing.
*
- * Caller must hold sync_rcu_preempt_exp_mutex.
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void
sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -719,51 +701,17 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
void synchronize_rcu_expedited(void)
{
struct rcu_node *rnp;
+ struct rcu_node *rnp_unlock;
struct rcu_state *rsp = rcu_state_p;
- unsigned long snap;
- int trycount = 0;
+ unsigned long s;
- smp_mb(); /* Caller's modifications seen first by other CPUs. */
- snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
- smp_mb(); /* Above access cannot bleed into critical section. */
+ s = rcu_exp_gp_seq_snap(rsp);
- /*
- * Block CPU-hotplug operations. This means that any CPU-hotplug
- * operation that finds an rcu_node structure with tasks in the
- * process of being boosted will know that all tasks blocking
- * this expedited grace period will already be in the process of
- * being boosted. This simplifies the process of moving tasks
- * from leaf to root rcu_node structures.
- */
- if (!try_get_online_cpus()) {
- /* CPU-hotplug operation in flight, fall back to normal GP. */
- wait_rcu_gp(call_rcu);
- return;
- }
+ rnp_unlock = exp_funnel_lock(rsp, s);
+ if (rnp_unlock == NULL)
+ return; /* Someone else did our work for us. */
- /*
- * Acquire lock, falling back to synchronize_rcu() if too many
- * lock-acquisition failures. Of course, if someone does the
- * expedited grace period for us, just leave.
- */
- while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
- if (ULONG_CMP_LT(snap,
- READ_ONCE(sync_rcu_preempt_exp_count))) {
- put_online_cpus();
- goto mb_ret; /* Others did our work for us. */
- }
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- put_online_cpus();
- wait_rcu_gp(call_rcu);
- return;
- }
- }
- if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
- put_online_cpus();
- goto unlock_mb_ret; /* Others did our work for us. */
- }
+ rcu_exp_gp_seq_start(rsp);
/* force all RCU readers onto ->blkd_tasks lists. */
synchronize_sched_expedited();
@@ -779,20 +727,14 @@ void synchronize_rcu_expedited(void)
rcu_for_each_leaf_node(rsp, rnp)
sync_rcu_preempt_exp_init2(rsp, rnp);
- put_online_cpus();
-
/* Wait for snapshotted ->blkd_tasks lists to drain. */
rnp = rcu_get_root(rsp);
wait_event(sync_rcu_preempt_exp_wq,
sync_rcu_preempt_exp_done(rnp));
/* Clean up and exit. */
- smp_mb(); /* ensure expedited GP seen before counter increment. */
- WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1);
-unlock_mb_ret:
- mutex_unlock(&sync_rcu_preempt_exp_mutex);
-mb_ret:
- smp_mb(); /* ensure subsequent action seen after grace period. */
+ rcu_exp_gp_seq_end(rsp);
+ mutex_unlock(&rnp_unlock->exp_funnel_mutex);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
@@ -1703,8 +1645,6 @@ early_initcall(rcu_register_oom_notifier);
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
-
#ifdef CONFIG_RCU_FAST_NO_HZ
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
@@ -1793,33 +1733,6 @@ static void increment_cpu_stall_ticks(void)
raw_cpu_inc(rsp->rda->ticks_this_gp);
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
-static void print_cpu_stall_info_begin(void)
-{
- pr_cont(" {");
-}
-
-static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
-{
- pr_cont(" %d", cpu);
-}
-
-static void print_cpu_stall_info_end(void)
-{
- pr_cont("} ");
-}
-
-static void zero_cpu_stall_ticks(struct rcu_data *rdp)
-{
-}
-
-static void increment_cpu_stall_ticks(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
#ifdef CONFIG_RCU_NOCB_CPU
/*
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 3ea7ffc7d5c4..6fc4c5ff3bb5 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -81,9 +81,9 @@ static void r_stop(struct seq_file *m, void *v)
static int show_rcubarrier(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "bcc: %d nbd: %lu\n",
+ seq_printf(m, "bcc: %d bseq: %lu\n",
atomic_read(&rsp->barrier_cpu_count),
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
return 0;
}
@@ -185,18 +185,15 @@ static int show_rcuexp(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
- atomic_long_read(&rsp->expedited_start),
- atomic_long_read(&rsp->expedited_done),
- atomic_long_read(&rsp->expedited_wrap),
- atomic_long_read(&rsp->expedited_tryfail),
+ seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
+ rsp->expedited_sequence,
+ atomic_long_read(&rsp->expedited_workdone0),
atomic_long_read(&rsp->expedited_workdone1),
atomic_long_read(&rsp->expedited_workdone2),
+ atomic_long_read(&rsp->expedited_workdone3),
atomic_long_read(&rsp->expedited_normal),
- atomic_long_read(&rsp->expedited_stoppedcpus),
- atomic_long_read(&rsp->expedited_done_tries),
- atomic_long_read(&rsp->expedited_done_lost),
- atomic_long_read(&rsp->expedited_done_exit));
+ atomic_read(&rsp->expedited_need_qs),
+ rsp->expedited_sequence / 2);
return 0;
}