diff options
Diffstat (limited to 'kernel/rcu')
-rw-r--r-- | kernel/rcu/Kconfig | 14 | ||||
-rw-r--r-- | kernel/rcu/rcu.h | 67 | ||||
-rw-r--r-- | kernel/rcu/rcuperf.c | 66 | ||||
-rw-r--r-- | kernel/rcu/rcutorture.c | 397 | ||||
-rw-r--r-- | kernel/rcu/srcutiny.c | 29 | ||||
-rw-r--r-- | kernel/rcu/srcutree.c | 31 | ||||
-rw-r--r-- | kernel/rcu/tiny.c | 154 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 2213 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 132 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 426 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 790 | ||||
-rw-r--r-- | kernel/rcu/update.c | 70 |
12 files changed, 2004 insertions, 2385 deletions
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 9210379c0353..939a2056c87a 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -196,7 +196,7 @@ config RCU_BOOST This option boosts the priority of preempted RCU readers that block the current preemptible RCU grace period for too long. This option also prevents heavy loads from blocking RCU - callback invocation for all flavors of RCU. + callback invocation. Say Y here if you are working with real-time apps or heavy loads Say N here if you are unsure. @@ -225,12 +225,12 @@ config RCU_NOCB_CPU callback invocation to energy-efficient CPUs in battery-powered asymmetric multiprocessors. - This option offloads callback invocation from the set of - CPUs specified at boot time by the rcu_nocbs parameter. - For each such CPU, a kthread ("rcuox/N") will be created to - invoke callbacks, where the "N" is the CPU being offloaded, - and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and - "s" for RCU-sched. Nothing prevents this kthread from running + This option offloads callback invocation from the set of CPUs + specified at boot time by the rcu_nocbs parameter. For each + such CPU, a kthread ("rcuox/N") will be created to invoke + callbacks, where the "N" is the CPU being offloaded, and where + the "p" for RCU-preempt (PREEMPT kernels) and "s" for RCU-sched + (!PREEMPT kernels). Nothing prevents this kthread from running on the specified CPUs, but (1) the kthreads may be preempted between each callback, and (2) affinity or cgroups can be used to force the kthreads to run on whatever set of CPUs is desired. diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4d04683c31b2..2866166863f0 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -176,8 +176,9 @@ static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally - * by call_rcu() and rcu callback execution, and are therefore not part of the - * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. + * by call_rcu() and rcu callback execution, and are therefore not part + * of the RCU API. These are in rcupdate.h because they are used by all + * RCU implementations. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD @@ -223,6 +224,7 @@ void kfree(const void *); */ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) { + rcu_callback_t f; unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); @@ -233,7 +235,9 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) return true; } else { RCU_TRACE(trace_rcu_invoke_callback(rn, head);) - head->func(head); + f = head->func; + WRITE_ONCE(head->func, (rcu_callback_t)0L); + f(head); rcu_lock_release(&rcu_callback_map); return false; } @@ -328,40 +332,35 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) } } -/* Returns first leaf rcu_node of the specified RCU flavor. */ -#define rcu_first_leaf_node(rsp) ((rsp)->level[rcu_num_lvls - 1]) +/* Returns a pointer to the first leaf rcu_node structure. */ +#define rcu_first_leaf_node() (rcu_state.level[rcu_num_lvls - 1]) /* Is this rcu_node a leaf? */ #define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1) /* Is this rcu_node the last leaf? */ -#define rcu_is_last_leaf_node(rsp, rnp) ((rnp) == &(rsp)->node[rcu_num_nodes - 1]) +#define rcu_is_last_leaf_node(rnp) ((rnp) == &rcu_state.node[rcu_num_nodes - 1]) /* - * Do a full breadth-first scan of the rcu_node structures for the - * specified rcu_state structure. + * Do a full breadth-first scan of the {s,}rcu_node structures for the + * specified state structure (for SRCU) or the only rcu_state structure + * (for RCU). */ -#define rcu_for_each_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) +#define srcu_for_each_node_breadth_first(sp, rnp) \ + for ((rnp) = &(sp)->node[0]; \ + (rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++) +#define rcu_for_each_node_breadth_first(rnp) \ + srcu_for_each_node_breadth_first(&rcu_state, rnp) /* - * Do a breadth-first scan of the non-leaf rcu_node structures for the - * specified rcu_state structure. Note that if there is a singleton - * rcu_node tree with but one rcu_node structure, this loop is a no-op. + * Scan the leaves of the rcu_node hierarchy for the rcu_state structure. + * Note that if there is a singleton rcu_node tree with but one rcu_node + * structure, this loop -will- visit the rcu_node structure. It is still + * a leaf node, even if it is also the root node. */ -#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; !rcu_is_leaf_node(rsp, rnp); (rnp)++) - -/* - * Scan the leaves of the rcu_node hierarchy for the specified rcu_state - * structure. Note that if there is a singleton rcu_node tree with but - * one rcu_node structure, this loop -will- visit the rcu_node structure. - * It is still a leaf node, even if it is also the root node. - */ -#define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = rcu_first_leaf_node(rsp); \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) +#define rcu_for_each_leaf_node(rnp) \ + for ((rnp) = rcu_first_leaf_node(); \ + (rnp) < &rcu_state.node[rcu_num_nodes]; (rnp)++) /* * Iterate over all possible CPUs in a leaf RCU node. @@ -435,6 +434,12 @@ do { \ #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ +#ifdef CONFIG_SRCU +void srcu_init(void); +#else /* #ifdef CONFIG_SRCU */ +static inline void srcu_init(void) { } +#endif /* #else #ifdef CONFIG_SRCU */ + #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ static inline bool rcu_gp_is_normal(void) { return true; } @@ -515,29 +520,19 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #ifdef CONFIG_TINY_RCU static inline unsigned long rcu_get_gp_seq(void) { return 0; } -static inline unsigned long rcu_bh_get_gp_seq(void) { return 0; } -static inline unsigned long rcu_sched_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } -static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; } static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } -static inline void rcu_bh_force_quiescent_state(void) { } -static inline void rcu_sched_force_quiescent_state(void) { } static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } #else /* #ifdef CONFIG_TINY_RCU */ unsigned long rcu_get_gp_seq(void); -unsigned long rcu_bh_get_gp_seq(void); -unsigned long rcu_sched_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); -unsigned long rcu_exp_batches_completed_sched(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); void rcu_force_quiescent_state(void); -void rcu_bh_force_quiescent_state(void); -void rcu_sched_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; extern struct workqueue_struct *rcu_par_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 34244523550e..b459da70b4fc 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -190,36 +190,6 @@ static struct rcu_perf_ops rcu_ops = { }; /* - * Definitions for rcu_bh perf testing. - */ - -static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH) -{ - rcu_read_lock_bh(); - return 0; -} - -static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH) -{ - rcu_read_unlock_bh(); -} - -static struct rcu_perf_ops rcu_bh_ops = { - .ptype = RCU_BH_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = rcu_bh_perf_read_lock, - .readunlock = rcu_bh_perf_read_unlock, - .get_gp_seq = rcu_bh_get_gp_seq, - .gp_diff = rcu_seq_diff, - .exp_completed = rcu_exp_batches_completed_sched, - .async = call_rcu_bh, - .gp_barrier = rcu_barrier_bh, - .sync = synchronize_rcu_bh, - .exp_sync = synchronize_rcu_bh_expedited, - .name = "rcu_bh" -}; - -/* * Definitions for srcu perf testing. */ @@ -306,36 +276,6 @@ static struct rcu_perf_ops srcud_ops = { }; /* - * Definitions for sched perf testing. - */ - -static int sched_perf_read_lock(void) -{ - preempt_disable(); - return 0; -} - -static void sched_perf_read_unlock(int idx) -{ - preempt_enable(); -} - -static struct rcu_perf_ops sched_ops = { - .ptype = RCU_SCHED_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = sched_perf_read_lock, - .readunlock = sched_perf_read_unlock, - .get_gp_seq = rcu_sched_get_gp_seq, - .gp_diff = rcu_seq_diff, - .exp_completed = rcu_exp_batches_completed_sched, - .async = call_rcu_sched, - .gp_barrier = rcu_barrier_sched, - .sync = synchronize_sched, - .exp_sync = synchronize_sched_expedited, - .name = "sched" -}; - -/* * Definitions for RCU-tasks perf testing. */ @@ -611,7 +551,7 @@ rcu_perf_cleanup(void) kfree(writer_n_durations); } - /* Do flavor-specific cleanup operations. */ + /* Do torture-type-specific cleanup operations. */ if (cur_ops->cleanup != NULL) cur_ops->cleanup(); @@ -661,8 +601,7 @@ rcu_perf_init(void) long i; int firsterr = 0; static struct rcu_perf_ops *perf_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, - &tasks_ops, + &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, }; if (!torture_init_begin(perf_type, verbose)) @@ -680,6 +619,7 @@ rcu_perf_init(void) for (i = 0; i < ARRAY_SIZE(perf_ops); i++) pr_cont(" %s", perf_ops[i]->name); pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); firsterr = -EINVAL; goto unwind; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c596c6f1e457..210c77460365 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -66,15 +66,19 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@jos /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ #define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) -#define RCUTORTURE_RDR_BH 0x1 /* Extend readers by disabling bh. */ -#define RCUTORTURE_RDR_IRQ 0x2 /* ... disabling interrupts. */ -#define RCUTORTURE_RDR_PREEMPT 0x4 /* ... disabling preemption. */ -#define RCUTORTURE_RDR_RCU 0x8 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 4 /* Number of bits defined above. */ -#define RCUTORTURE_MAX_EXTEND (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | \ - RCUTORTURE_RDR_PREEMPT) +#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ +#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ +#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ +#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ +#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ +#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */ +#define RCUTORTURE_MAX_EXTEND \ + (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ + RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) #define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */ /* Must be power of two minus one. */ +#define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3) torture_param(int, cbflood_inter_holdoff, HZ, "Holdoff between floods (jiffies)"); @@ -89,6 +93,12 @@ torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); +torture_param(bool, fwd_progress, 1, "Test grace-period forward progress"); +torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait"); +torture_param(int, fwd_progress_holdoff, 60, + "Time between forward-progress tests (s)"); +torture_param(bool, fwd_progress_need_resched, 1, + "Hide cond_resched() behind need_resched()"); torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, @@ -125,7 +135,7 @@ torture_param(int, verbose, 1, static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); static int nrealreaders; static int ncbflooders; @@ -137,6 +147,7 @@ static struct task_struct **cbflood_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *stall_task; +static struct task_struct *fwd_prog_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; @@ -197,6 +208,18 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_STOPPING", }; +/* Record reader segment types and duration for first failing read. */ +struct rt_read_seg { + int rt_readstate; + unsigned long rt_delay_jiffies; + unsigned long rt_delay_ms; + unsigned long rt_delay_us; + bool rt_preempted; +}; +static int err_segs_recorded; +static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; +static int rt_read_nsegs; + static const char *rcu_torture_writer_state_getname(void) { unsigned int i = READ_ONCE(rcu_torture_writer_state); @@ -278,7 +301,8 @@ struct rcu_torture_ops { void (*init)(void); void (*cleanup)(void); int (*readlock)(void); - void (*read_delay)(struct torture_random_state *rrsp); + void (*read_delay)(struct torture_random_state *rrsp, + struct rt_read_seg *rtrsp); void (*readunlock)(int idx); unsigned long (*get_gp_seq)(void); unsigned long (*gp_diff)(unsigned long new, unsigned long old); @@ -291,6 +315,7 @@ struct rcu_torture_ops { void (*cb_barrier)(void); void (*fqs)(void); void (*stats)(void); + int (*stall_dur)(void); int irq_capable; int can_boost; int extendables; @@ -310,12 +335,13 @@ static int rcu_torture_read_lock(void) __acquires(RCU) return 0; } -static void rcu_read_delay(struct torture_random_state *rrsp) +static void +rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) { unsigned long started; unsigned long completed; const unsigned long shortdelay_us = 200; - const unsigned long longdelay_ms = 50; + unsigned long longdelay_ms = 300; unsigned long long ts; /* We want a short delay sometimes to make a reader delay the grace @@ -325,16 +351,23 @@ static void rcu_read_delay(struct torture_random_state *rrsp) if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); + if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK)) + longdelay_ms = 5; /* Avoid triggering BH limits. */ mdelay(longdelay_ms); + rtrsp->rt_delay_ms = longdelay_ms; completed = cur_ops->get_gp_seq(); do_trace_rcu_torture_read(cur_ops->name, NULL, ts, started, completed); } - if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) { udelay(shortdelay_us); + rtrsp->rt_delay_us = shortdelay_us; + } if (!preempt_count() && - !(torture_random(rrsp) % (nrealreaders * 500))) + !(torture_random(rrsp) % (nrealreaders * 500))) { torture_preempt_schedule(); /* QS only if preemptible. */ + rtrsp->rt_preempted = true; + } } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -429,53 +462,14 @@ static struct rcu_torture_ops rcu_ops = { .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, .stats = NULL, + .stall_dur = rcu_jiffies_till_stall_check, .irq_capable = 1, .can_boost = rcu_can_boost(), + .extendables = RCUTORTURE_MAX_EXTEND, .name = "rcu" }; /* - * Definitions for rcu_bh torture testing. - */ - -static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) -{ - rcu_read_lock_bh(); - return 0; -} - -static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) -{ - rcu_read_unlock_bh(); -} - -static void rcu_bh_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops rcu_bh_ops = { - .ttype = RCU_BH_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .get_gp_seq = rcu_bh_get_gp_seq, - .gp_diff = rcu_seq_diff, - .deferred_free = rcu_bh_torture_deferred_free, - .sync = synchronize_rcu_bh, - .exp_sync = synchronize_rcu_bh_expedited, - .call = call_rcu_bh, - .cb_barrier = rcu_barrier_bh, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .extendables = (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ), - .ext_irq_conflict = RCUTORTURE_RDR_RCU, - .name = "rcu_bh" -}; - -/* * Don't even think about trying any of these in real life!!! * The names includes "busted", and they really means it! * The only purpose of these functions is to provide a buggy RCU @@ -531,7 +525,8 @@ static int srcu_torture_read_lock(void) __acquires(srcu_ctlp) return srcu_read_lock(srcu_ctlp); } -static void srcu_read_delay(struct torture_random_state *rrsp) +static void +srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) { long delay; const long uspertick = 1000000 / HZ; @@ -541,10 +536,12 @@ static void srcu_read_delay(struct torture_random_state *rrsp) delay = torture_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay && in_task()) + if (!delay && in_task()) { schedule_timeout_interruptible(longdelay); - else - rcu_read_delay(rrsp); + rtrsp->rt_delay_jiffies = longdelay; + } else { + rcu_read_delay(rrsp, rtrsp); + } } static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp) @@ -663,48 +660,6 @@ static struct rcu_torture_ops busted_srcud_ops = { }; /* - * Definitions for sched torture testing. - */ - -static int sched_torture_read_lock(void) -{ - preempt_disable(); - return 0; -} - -static void sched_torture_read_unlock(int idx) -{ - preempt_enable(); -} - -static void rcu_sched_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops sched_ops = { - .ttype = RCU_SCHED_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .get_gp_seq = rcu_sched_get_gp_seq, - .gp_diff = rcu_seq_diff, - .deferred_free = rcu_sched_torture_deferred_free, - .sync = synchronize_sched, - .exp_sync = synchronize_sched_expedited, - .get_state = get_state_synchronize_sched, - .cond_sync = cond_synchronize_sched, - .call = call_rcu_sched, - .cb_barrier = rcu_barrier_sched, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .extendables = RCUTORTURE_MAX_EXTEND, - .name = "sched" -}; - -/* * Definitions for RCU-tasks torture testing. */ @@ -1116,7 +1071,8 @@ rcu_torture_writer(void *arg) break; } } - rcu_torture_current_version++; + WRITE_ONCE(rcu_torture_current_version, + rcu_torture_current_version + 1); /* Cycle through nesting levels of rcu_expedite_gp() calls. */ if (can_expedite && !(torture_random(&rand) & 0xff & (!!expediting - 1))) { @@ -1132,7 +1088,10 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - stutter_wait("rcu_torture_writer"); + if (stutter_wait("rcu_torture_writer")) + for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) + if (list_empty(&rcu_tortures[i].rtort_free)) + WARN_ON_ONCE(1); } while (!torture_must_stop()); /* Reset expediting back to unexpedited. */ if (expediting > 0) @@ -1199,7 +1158,8 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) * change, do a ->read_delay(). */ static void rcutorture_one_extend(int *readstate, int newstate, - struct torture_random_state *trsp) + struct torture_random_state *trsp, + struct rt_read_seg *rtrsp) { int idxnew = -1; int idxold = *readstate; @@ -1208,6 +1168,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, WARN_ON_ONCE(idxold < 0); WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); + rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ if (statesnew & RCUTORTURE_RDR_BH) @@ -1216,6 +1177,10 @@ static void rcutorture_one_extend(int *readstate, int newstate, local_irq_disable(); if (statesnew & RCUTORTURE_RDR_PREEMPT) preempt_disable(); + if (statesnew & RCUTORTURE_RDR_RBH) + rcu_read_lock_bh(); + if (statesnew & RCUTORTURE_RDR_SCHED) + rcu_read_lock_sched(); if (statesnew & RCUTORTURE_RDR_RCU) idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; @@ -1226,12 +1191,16 @@ static void rcutorture_one_extend(int *readstate, int newstate, local_bh_enable(); if (statesold & RCUTORTURE_RDR_PREEMPT) preempt_enable(); + if (statesold & RCUTORTURE_RDR_RBH) + rcu_read_unlock_bh(); + if (statesold & RCUTORTURE_RDR_SCHED) + rcu_read_unlock_sched(); if (statesold & RCUTORTURE_RDR_RCU) cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); /* Delay if neither beginning nor end and there was a change. */ if ((statesnew || statesold) && *readstate && newstate) - cur_ops->read_delay(trsp); + cur_ops->read_delay(trsp, rtrsp); /* Update the reader state. */ if (idxnew == -1) @@ -1260,18 +1229,19 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) { int mask = rcutorture_extend_mask_max(); unsigned long randmask1 = torture_random(trsp) >> 8; - unsigned long randmask2 = randmask1 >> 1; + unsigned long randmask2 = randmask1 >> 3; WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); - /* Half the time lots of bits, half the time only one bit. */ - if (randmask1 & 0x1) + /* Most of the time lots of bits, half the time only one bit. */ + if (!(randmask1 & 0x7)) mask = mask & randmask2; else mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS)); + /* Can't enable bh w/irq disabled. */ if ((mask & RCUTORTURE_RDR_IRQ) && - !(mask & RCUTORTURE_RDR_BH) && - (oldmask & RCUTORTURE_RDR_BH)) - mask |= RCUTORTURE_RDR_BH; /* Can't enable bh w/irq disabled. */ + ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) || + (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH)))) + mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; if ((mask & RCUTORTURE_RDR_IRQ) && !(mask & cur_ops->ext_irq_conflict) && (oldmask & cur_ops->ext_irq_conflict)) @@ -1283,20 +1253,25 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) * Do a randomly selected number of extensions of an existing RCU read-side * critical section. */ -static void rcutorture_loop_extend(int *readstate, - struct torture_random_state *trsp) +static struct rt_read_seg * +rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, + struct rt_read_seg *rtrsp) { int i; + int j; int mask = rcutorture_extend_mask_max(); WARN_ON_ONCE(!*readstate); /* -Existing- RCU read-side critsect! */ if (!((mask - 1) & mask)) - return; /* Current RCU flavor not extendable. */ - i = (torture_random(trsp) >> 3) & RCUTORTURE_RDR_MAX_LOOPS; - while (i--) { + return rtrsp; /* Current RCU reader not extendable. */ + /* Bias towards larger numbers of loops. */ + i = (torture_random(trsp) >> 3); + i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; + for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); - rcutorture_one_extend(readstate, mask, trsp); + rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]); } + return &rtrsp[j]; } /* @@ -1306,16 +1281,20 @@ static void rcutorture_loop_extend(int *readstate, */ static bool rcu_torture_one_read(struct torture_random_state *trsp) { + int i; unsigned long started; unsigned long completed; int newstate; struct rcu_torture *p; int pipe_count; int readstate = 0; + struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } }; + struct rt_read_seg *rtrsp = &rtseg[0]; + struct rt_read_seg *rtrsp1; unsigned long long ts; newstate = rcutorture_extend_mask(readstate, trsp); - rcutorture_one_extend(&readstate, newstate, trsp); + rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, @@ -1325,12 +1304,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&readstate, 0, trsp); + rcutorture_one_extend(&readstate, 0, trsp, rtrsp); return false; } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - rcutorture_loop_extend(&readstate, trsp); + rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); preempt_disable(); pipe_count = p->rtort_pipe_count; if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -1351,8 +1330,17 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - rcutorture_one_extend(&readstate, 0, trsp); + rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + + /* If error or close call, record the sequence of reader protections. */ + if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { + i = 0; + for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++) + err_segs[i++] = *rtrsp1; + rt_read_nsegs = i; + } + return true; } @@ -1387,6 +1375,9 @@ static void rcu_torture_timer(struct timer_list *unused) static int rcu_torture_reader(void *arg) { + unsigned long lastsleep = jiffies; + long myid = (long)arg; + int mynumonline = myid; DEFINE_TORTURE_RANDOM(rand); struct timer_list t; @@ -1402,6 +1393,12 @@ rcu_torture_reader(void *arg) } if (!rcu_torture_one_read(&rand)) schedule_timeout_interruptible(HZ); + if (time_after(jiffies, lastsleep)) { + schedule_timeout_interruptible(1); + lastsleep = jiffies + 10; + } + while (num_online_cpus() < mynumonline && !torture_must_stop()) + schedule_timeout_interruptible(HZ / 5); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { @@ -1655,6 +1652,121 @@ static int __init rcu_torture_stall_init(void) return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } +/* State structure for forward-progress self-propagating RCU callback. */ +struct fwd_cb_state { + struct rcu_head rh; + int stop; +}; + +/* + * Forward-progress self-propagating RCU callback function. Because + * callbacks run from softirq, this function is an implicit RCU read-side + * critical section. + */ +static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp) +{ + struct fwd_cb_state *fcsp = container_of(rhp, struct fwd_cb_state, rh); + + if (READ_ONCE(fcsp->stop)) { + WRITE_ONCE(fcsp->stop, 2); + return; + } + cur_ops->call(&fcsp->rh, rcu_torture_fwd_prog_cb); +} + +/* Carry out grace-period forward-progress testing. */ +static int rcu_torture_fwd_prog(void *args) +{ + unsigned long cver; + unsigned long dur; + struct fwd_cb_state fcs; + unsigned long gps; + int idx; + int sd; + int sd4; + bool selfpropcb = false; + unsigned long stopat; + int tested = 0; + int tested_tries = 0; + static DEFINE_TORTURE_RANDOM(trs); + + VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started"); + if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) + set_user_nice(current, MAX_NICE); + if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) { + init_rcu_head_on_stack(&fcs.rh); + selfpropcb = true; + } + do { + schedule_timeout_interruptible(fwd_progress_holdoff * HZ); + if (selfpropcb) { + WRITE_ONCE(fcs.stop, 0); + cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); + } + cver = READ_ONCE(rcu_torture_current_version); + gps = cur_ops->get_gp_seq(); + sd = cur_ops->stall_dur() + 1; + sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; + dur = sd4 + torture_random(&trs) % (sd - sd4); + stopat = jiffies + dur; + while (time_before(jiffies, stopat) && !torture_must_stop()) { + idx = cur_ops->readlock(); + udelay(10); + cur_ops->readunlock(idx); + if (!fwd_progress_need_resched || need_resched()) + cond_resched(); + } + tested_tries++; + if (!time_before(jiffies, stopat) && !torture_must_stop()) { + tested++; + cver = READ_ONCE(rcu_torture_current_version) - cver; + gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + WARN_ON(!cver && gps < 2); + pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); + } + if (selfpropcb) { + WRITE_ONCE(fcs.stop, 1); + cur_ops->sync(); /* Wait for running CB to complete. */ + cur_ops->cb_barrier(); /* Wait for queued callbacks. */ + } + /* Avoid slow periods, better to test when busy. */ + stutter_wait("rcu_torture_fwd_prog"); + } while (!torture_must_stop()); + if (selfpropcb) { + WARN_ON(READ_ONCE(fcs.stop) != 2); + destroy_rcu_head_on_stack(&fcs.rh); + } + /* Short runs might not contain a valid forward-progress attempt. */ + WARN_ON(!tested && tested_tries >= 5); + pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + torture_kthread_stopping("rcu_torture_fwd_prog"); + return 0; +} + +/* If forward-progress checking is requested and feasible, spawn the thread. */ +static int __init rcu_torture_fwd_prog_init(void) +{ + if (!fwd_progress) + return 0; /* Not requested, so don't do it. */ + if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); + return 0; + } + if (stall_cpu > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); + if (IS_MODULE(CONFIG_RCU_TORTURE_TESTS)) + return -EINVAL; /* In module, can fail back to user. */ + WARN_ON(1); /* Make sure rcutorture notices conflict. */ + return 0; + } + if (fwd_progress_holdoff <= 0) + fwd_progress_holdoff = 1; + if (fwd_progress_div <= 0) + fwd_progress_div = 4; + return torture_create_kthread(rcu_torture_fwd_prog, + NULL, fwd_prog_task); +} + /* Callback function for RCU barrier testing. */ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) { @@ -1817,6 +1929,7 @@ static enum cpuhp_state rcutor_hp; static void rcu_torture_cleanup(void) { + int firsttime; int flags = 0; unsigned long gp_seq = 0; int i; @@ -1828,6 +1941,7 @@ rcu_torture_cleanup(void) } rcu_torture_barrier_cleanup(); + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); torture_stop_kthread(rcu_torture_stall, stall_task); torture_stop_kthread(rcu_torture_writer, writer_task); @@ -1860,7 +1974,7 @@ rcu_torture_cleanup(void) cpuhp_remove_state(rcutor_hp); /* - * Wait for all RCU callbacks to fire, then do flavor-specific + * Wait for all RCU callbacks to fire, then do torture-type-specific * cleanup operations. */ if (cur_ops->cb_barrier != NULL) @@ -1870,6 +1984,33 @@ rcu_torture_cleanup(void) rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + if (err_segs_recorded) { + pr_alert("Failure/close-call rcutorture reader segments:\n"); + if (rt_read_nsegs == 0) + pr_alert("\t: No segments recorded!!!\n"); + firsttime = 1; + for (i = 0; i < rt_read_nsegs; i++) { + pr_alert("\t%d: %#x ", i, err_segs[i].rt_readstate); + if (err_segs[i].rt_delay_jiffies != 0) { + pr_cont("%s%ldjiffies", firsttime ? "" : "+", + err_segs[i].rt_delay_jiffies); + firsttime = 0; + } + if (err_segs[i].rt_delay_ms != 0) { + pr_cont("%s%ldms", firsttime ? "" : "+", + err_segs[i].rt_delay_ms); + firsttime = 0; + } + if (err_segs[i].rt_delay_us != 0) { + pr_cont("%s%ldus", firsttime ? "" : "+", + err_segs[i].rt_delay_us); + firsttime = 0; + } + pr_cont("%s\n", + err_segs[i].rt_preempted ? "preempted" : ""); + + } + } if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); else if (torture_onoff_failures()) @@ -1939,12 +2080,12 @@ static void rcu_test_debug_objects(void) static int __init rcu_torture_init(void) { - int i; + long i; int cpu; int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &sched_ops, &tasks_ops, + &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, + &busted_srcud_ops, &tasks_ops, }; if (!torture_init_begin(torture_type, verbose)) @@ -1963,6 +2104,7 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_cont(" %s", torture_ops[i]->name); pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); firsterr = -EINVAL; goto unwind; } @@ -2013,6 +2155,8 @@ rcu_torture_init(void) per_cpu(rcu_torture_batch, cpu)[i] = 0; } } + err_segs_recorded = 0; + rt_read_nsegs = 0; /* Start up the kthreads. */ @@ -2044,7 +2188,7 @@ rcu_torture_init(void) goto unwind; } for (i = 0; i < nrealreaders; i++) { - firsterr = torture_create_kthread(rcu_torture_reader, NULL, + firsterr = torture_create_kthread(rcu_torture_reader, (void *)i, reader_tasks[i]); if (firsterr) goto unwind; @@ -2100,6 +2244,9 @@ rcu_torture_init(void) firsterr = rcu_torture_stall_init(); if (firsterr) goto unwind; + firsterr = rcu_torture_fwd_prog_init(); + if (firsterr) + goto unwind; firsterr = rcu_torture_barrier_init(); if (firsterr) goto unwind; diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 04fc2ed71af8..b46e6683f8c9 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -34,6 +34,8 @@ #include "rcu.h" int rcu_scheduler_active __read_mostly; +static LIST_HEAD(srcu_boot_list); +static bool srcu_init_done; static int init_srcu_struct_fields(struct srcu_struct *sp) { @@ -46,6 +48,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) sp->srcu_gp_waiting = false; sp->srcu_idx = 0; INIT_WORK(&sp->srcu_work, srcu_drive_gp); + INIT_LIST_HEAD(&sp->srcu_work.entry); return 0; } @@ -179,8 +182,12 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, *sp->srcu_cb_tail = rhp; sp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); - if (!READ_ONCE(sp->srcu_gp_running)) - schedule_work(&sp->srcu_work); + if (!READ_ONCE(sp->srcu_gp_running)) { + if (likely(srcu_init_done)) + schedule_work(&sp->srcu_work); + else if (list_empty(&sp->srcu_work.entry)) + list_add(&sp->srcu_work.entry, &srcu_boot_list); + } } EXPORT_SYMBOL_GPL(call_srcu); @@ -204,3 +211,21 @@ void __init rcu_scheduler_starting(void) { rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } + +/* + * Queue work for srcu_struct structures with early boot callbacks. + * The work won't actually execute until the workqueue initialization + * phase that takes place after the scheduler starts. + */ +void __init srcu_init(void) +{ + struct srcu_struct *sp; + + srcu_init_done = true; + while (!list_empty(&srcu_boot_list)) { + sp = list_first_entry(&srcu_boot_list, + struct srcu_struct, srcu_work.entry); + list_del_init(&sp->srcu_work.entry); + schedule_work(&sp->srcu_work); + } +} diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6c9866a854b1..a8846ed7f352 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -51,6 +51,10 @@ module_param(exp_holdoff, ulong, 0444); static ulong counter_wrap_check = (ULONG_MAX >> 2); module_param(counter_wrap_check, ulong, 0444); +/* Early-boot callback-management, so early that no lock is required! */ +static LIST_HEAD(srcu_boot_list); +static bool __read_mostly srcu_init_done; + static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); static void process_srcu(struct work_struct *work); @@ -105,7 +109,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) rcu_init_levelspread(levelspread, num_rcu_lvl); /* Each pass through this loop initializes one srcu_node structure. */ - rcu_for_each_node_breadth_first(sp, snp) { + srcu_for_each_node_breadth_first(sp, snp) { spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); @@ -235,7 +239,6 @@ static void check_init_srcu_struct(struct srcu_struct *sp) { unsigned long flags; - WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT); /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ @@ -561,7 +564,7 @@ static void srcu_gp_end(struct srcu_struct *sp) /* Initiate callback invocation as needed. */ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - rcu_for_each_node_breadth_first(sp, snp) { + srcu_for_each_node_breadth_first(sp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; last_lvl = snp >= sp->level[rcu_num_lvls - 1]; @@ -701,7 +704,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); srcu_gp_start(sp); - queue_delayed_work(rcu_gp_wq, &sp->work, srcu_get_delay(sp)); + if (likely(srcu_init_done)) + queue_delayed_work(rcu_gp_wq, &sp->work, + srcu_get_delay(sp)); + else if (list_empty(&sp->work.work.entry)) + list_add(&sp->work.work.entry, &srcu_boot_list); } spin_unlock_irqrestore_rcu_node(sp, flags); } @@ -980,7 +987,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * There are memory-ordering constraints implied by synchronize_srcu(). * On systems with more than one CPU, when synchronize_srcu() returns, * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last corresponding SRCU-sched read-side critical section + * the end of its last corresponding SRCU read-side critical section * whose beginning preceded the call to synchronize_srcu(). In addition, * each CPU having an SRCU read-side critical section that extends beyond * the return from synchronize_srcu() is guaranteed to have executed a @@ -1308,3 +1315,17 @@ static int __init srcu_bootup_announce(void) return 0; } early_initcall(srcu_bootup_announce); + +void __init srcu_init(void) +{ + struct srcu_struct *sp; + + srcu_init_done = true; + while (!list_empty(&srcu_boot_list)) { + sp = list_first_entry(&srcu_boot_list, struct srcu_struct, + work.work.entry); + check_init_srcu_struct(sp); + list_del_init(&sp->work.work.entry); + queue_work(rcu_gp_wq, &sp->work.work); + } +} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index befc9321a89c..5f5963ba313e 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -46,69 +46,27 @@ struct rcu_ctrlblk { }; /* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_sched_ctrlblk = { - .donetail = &rcu_sched_ctrlblk.rcucblist, - .curtail = &rcu_sched_ctrlblk.rcucblist, +static struct rcu_ctrlblk rcu_ctrlblk = { + .donetail = &rcu_ctrlblk.rcucblist, + .curtail = &rcu_ctrlblk.rcucblist, }; -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .donetail = &rcu_bh_ctrlblk.rcucblist, - .curtail = &rcu_bh_ctrlblk.rcucblist, -}; - -void rcu_barrier_bh(void) -{ - wait_rcu_gp(call_rcu_bh); -} -EXPORT_SYMBOL(rcu_barrier_bh); - -void rcu_barrier_sched(void) -{ - wait_rcu_gp(call_rcu_sched); -} -EXPORT_SYMBOL(rcu_barrier_sched); - -/* - * Helper function for rcu_sched_qs() and rcu_bh_qs(). - * Also irqs are disabled to avoid confusion due to interrupt handlers - * invoking call_rcu(). - */ -static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) -{ - if (rcp->donetail != rcp->curtail) { - rcp->donetail = rcp->curtail; - return 1; - } - - return 0; -} - -/* - * Record an rcu quiescent state. And an rcu_bh quiescent state while we - * are at it, given that any rcu quiescent state is also an rcu_bh - * quiescent state. Use "+" instead of "||" to defeat short circuiting. - */ -void rcu_sched_qs(void) +void rcu_barrier(void) { - unsigned long flags; - - local_irq_save(flags); - if (rcu_qsctr_help(&rcu_sched_ctrlblk) + - rcu_qsctr_help(&rcu_bh_ctrlblk)) - raise_softirq(RCU_SOFTIRQ); - local_irq_restore(flags); + wait_rcu_gp(call_rcu); } +EXPORT_SYMBOL(rcu_barrier); -/* - * Record an rcu_bh quiescent state. - */ -void rcu_bh_qs(void) +/* Record an rcu quiescent state. */ +void rcu_qs(void) { unsigned long flags; local_irq_save(flags); - if (rcu_qsctr_help(&rcu_bh_ctrlblk)) + if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { + rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; raise_softirq(RCU_SOFTIRQ); + } local_irq_restore(flags); } @@ -120,34 +78,33 @@ void rcu_bh_qs(void) */ void rcu_check_callbacks(int user) { - if (user) - rcu_sched_qs(); - if (user || !in_softirq()) - rcu_bh_qs(); + if (user) { + rcu_qs(); + } else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { + set_tsk_need_resched(current); + set_preempt_need_resched(); + } } -/* - * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure - * whose grace period has elapsed. - */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) +/* Invoke the RCU callbacks whose grace period has elapsed. */ +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_head *next, *list; unsigned long flags; /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); - if (rcp->donetail == &rcp->rcucblist) { + if (rcu_ctrlblk.donetail == &rcu_ctrlblk.rcucblist) { /* No callbacks ready, so just leave. */ local_irq_restore(flags); return; } - list = rcp->rcucblist; - rcp->rcucblist = *rcp->donetail; - *rcp->donetail = NULL; - if (rcp->curtail == rcp->donetail) - rcp->curtail = &rcp->rcucblist; - rcp->donetail = &rcp->rcucblist; + list = rcu_ctrlblk.rcucblist; + rcu_ctrlblk.rcucblist = *rcu_ctrlblk.donetail; + *rcu_ctrlblk.donetail = NULL; + if (rcu_ctrlblk.curtail == rcu_ctrlblk.donetail) + rcu_ctrlblk.curtail = &rcu_ctrlblk.rcucblist; + rcu_ctrlblk.donetail = &rcu_ctrlblk.rcucblist; local_irq_restore(flags); /* Invoke the callbacks on the local list. */ @@ -162,37 +119,31 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) } } -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) -{ - __rcu_process_callbacks(&rcu_sched_ctrlblk); - __rcu_process_callbacks(&rcu_bh_ctrlblk); -} - /* * Wait for a grace period to elapse. But it is illegal to invoke - * synchronize_sched() from within an RCU read-side critical section. - * Therefore, any legal call to synchronize_sched() is a quiescent - * state, and so on a UP system, synchronize_sched() need do nothing. - * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the - * benefits of doing might_sleep() to reduce latency.) + * synchronize_rcu() from within an RCU read-side critical section. + * Therefore, any legal call to synchronize_rcu() is a quiescent + * state, and so on a UP system, synchronize_rcu() need do nothing. + * (But Lai Jiangshan points out the benefits of doing might_sleep() + * to reduce latency.) * * Cool, huh? (Due to Josh Triplett.) */ -void synchronize_sched(void) +void synchronize_rcu(void) { RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU read-side critical section"); + "Illegal synchronize_rcu() in RCU read-side critical section"); } -EXPORT_SYMBOL_GPL(synchronize_sched); +EXPORT_SYMBOL_GPL(synchronize_rcu); /* - * Helper function for call_rcu() and call_rcu_bh(). + * Post an RCU callback to be invoked after the end of an RCU grace + * period. But since we have but one CPU, that would be after any + * quiescent state. */ -static void __call_rcu(struct rcu_head *head, - rcu_callback_t func, - struct rcu_ctrlblk *rcp) +void call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; @@ -201,39 +152,20 @@ static void __call_rcu(struct rcu_head *head, head->next = NULL; local_irq_save(flags); - *rcp->curtail = head; - rcp->curtail = &head->next; + *rcu_ctrlblk.curtail = head; + rcu_ctrlblk.curtail = &head->next; local_irq_restore(flags); if (unlikely(is_idle_task(current))) { - /* force scheduling for rcu_sched_qs() */ + /* force scheduling for rcu_qs() */ resched_cpu(0); } } - -/* - * Post an RCU callback to be invoked after the end of an RCU-sched grace - * period. But since we have but one CPU, that would be after any - * quiescent state. - */ -void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func, &rcu_sched_ctrlblk); -} -EXPORT_SYMBOL_GPL(call_rcu_sched); - -/* - * Post an RCU bottom-half callback to be invoked after any subsequent - * quiescent state. - */ -void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func, &rcu_bh_ctrlblk); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); +EXPORT_SYMBOL_GPL(call_rcu); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); rcu_early_boot_tests(); + srcu_init(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0b760c1369f7..121f833acd04 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -61,6 +61,7 @@ #include <linux/trace_events.h> #include <linux/suspend.h> #include <linux/ftrace.h> +#include <linux/tick.h> #include "tree.h" #include "rcu.h" @@ -73,45 +74,31 @@ /* Data structures. */ /* - * In order to export the rcu_state name to the tracing tools, it - * needs to be added in the __tracepoint_string section. - * This requires defining a separate variable tp_<sname>_varname - * that points to the string being used, and this will allow - * the tracing userspace tools to be able to decipher the string - * address to the matching string. + * Steal a bit from the bottom of ->dynticks for idle entry/exit + * control. Initially this is for TLB flushing. */ -#ifdef CONFIG_TRACING -# define DEFINE_RCU_TPS(sname) \ -static char sname##_varname[] = #sname; \ -static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; -# define RCU_STATE_NAME(sname) sname##_varname -#else -# define DEFINE_RCU_TPS(sname) -# define RCU_STATE_NAME(sname) __stringify(sname) +#define RCU_DYNTICK_CTRL_MASK 0x1 +#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) +#ifndef rcu_eqs_special_exit +#define rcu_eqs_special_exit() do { } while (0) #endif -#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ -DEFINE_RCU_TPS(sname) \ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ -struct rcu_state sname##_state = { \ - .level = { &sname##_state.node[0] }, \ - .rda = &sname##_data, \ - .call = cr, \ - .gp_state = RCU_GP_IDLE, \ - .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, \ - .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .name = RCU_STATE_NAME(sname), \ - .abbr = sabbr, \ - .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ - .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ - .ofl_lock = __SPIN_LOCK_UNLOCKED(sname##_state.ofl_lock), \ -} - -RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); -RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); - -static struct rcu_state *const rcu_state_p; -LIST_HEAD(rcu_struct_flavors); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { + .dynticks_nesting = 1, + .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, + .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), +}; +struct rcu_state rcu_state = { + .level = { &rcu_state.node[0] }, + .gp_state = RCU_GP_IDLE, + .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, + .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex), + .name = RCU_NAME, + .abbr = RCU_ABBR, + .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), + .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex), + .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock), +}; /* Dump rcu_node combining tree at boot to verify correct setup. */ static bool dump_tree; @@ -158,16 +145,14 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; -static void -rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long gps, unsigned long flags); +static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, + unsigned long gps, unsigned long flags); static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); -static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); -static void rcu_report_exp_rdp(struct rcu_state *rsp, - struct rcu_data *rdp, bool wake); +static void invoke_rcu_callbacks(struct rcu_data *rdp); +static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ @@ -183,7 +168,7 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); -/* Retreive RCU kthreads priority for rcutorture */ +/* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { return kthread_prio; @@ -217,67 +202,24 @@ unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. */ -static int rcu_gp_in_progress(struct rcu_state *rsp) +static int rcu_gp_in_progress(void) { - return rcu_seq_state(rcu_seq_current(&rsp->gp_seq)); -} - -/* - * Note a quiescent state. Because we do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period, this just sets a flag. - * The caller must have disabled preemption. - */ -void rcu_sched_qs(void) -{ - RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!"); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) - return; - trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gp_seq), - TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); + return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq)); } -void rcu_bh_qs(void) +void rcu_softirq_qs(void) { - RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); - if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { - trace_rcu_grace_period(TPS("rcu_bh"), - __this_cpu_read(rcu_bh_data.gp_seq), - TPS("cpuqs")); - __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); - } + rcu_qs(); + rcu_preempt_deferred_qs(current); } /* - * Steal a bit from the bottom of ->dynticks for idle entry/exit - * control. Initially this is for TLB flushing. - */ -#define RCU_DYNTICK_CTRL_MASK 0x1 -#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) -#ifndef rcu_eqs_special_exit -#define rcu_eqs_special_exit() do { } while (0) -#endif - -static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = 1, - .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, - .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), -}; - -/* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state. */ static void rcu_dynticks_eqs_enter(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -285,7 +227,7 @@ static void rcu_dynticks_eqs_enter(void) * critical sections, and we also must force ordering with the * next idle sojourn. */ - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); /* Better be in an extended quiescent state! */ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICK_CTRL_CTR)); @@ -300,7 +242,7 @@ static void rcu_dynticks_eqs_enter(void) */ static void rcu_dynticks_eqs_exit(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -308,11 +250,11 @@ static void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { - atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); + atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); smp_mb__after_atomic(); /* _exit after clearing mask. */ /* Prefer duplicate flushes to losing a flush. */ rcu_eqs_special_exit(); @@ -331,11 +273,11 @@ static void rcu_dynticks_eqs_exit(void) */ static void rcu_dynticks_eqs_online(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) + if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR) return; - atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); } /* @@ -345,18 +287,18 @@ static void rcu_dynticks_eqs_online(void) */ bool rcu_dynticks_curr_cpu_in_eqs(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); } /* * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. */ -int rcu_dynticks_snap(struct rcu_dynticks *rdtp) +int rcu_dynticks_snap(struct rcu_data *rdp) { - int snap = atomic_add_return(0, &rdtp->dynticks); + int snap = atomic_add_return(0, &rdp->dynticks); return snap & ~RCU_DYNTICK_CTRL_MASK; } @@ -371,13 +313,13 @@ static bool rcu_dynticks_in_eqs(int snap) } /* - * Return true if the CPU corresponding to the specified rcu_dynticks + * Return true if the CPU corresponding to the specified rcu_data * structure has spent some time in an extended quiescent state since * rcu_dynticks_snap() returned the specified snapshot. */ -static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) +static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) { - return snap != rcu_dynticks_snap(rdtp); + return snap != rcu_dynticks_snap(rdp); } /* @@ -391,14 +333,14 @@ bool rcu_eqs_special_set(int cpu) { int old; int new; - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); do { - old = atomic_read(&rdtp->dynticks); + old = atomic_read(&rdp->dynticks); if (old & RCU_DYNTICK_CTRL_CTR) return false; new = old | RCU_DYNTICK_CTRL_MASK; - } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); + } while (atomic_cmpxchg(&rdp->dynticks, old, new) != old); return true; } @@ -413,82 +355,30 @@ bool rcu_eqs_special_set(int cpu) * * The caller must have disabled interrupts and must not be idle. */ -static void rcu_momentary_dyntick_idle(void) +static void __maybe_unused rcu_momentary_dyntick_idle(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); int special; - raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); - special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); + special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, + &this_cpu_ptr(&rcu_data)->dynticks); /* It is illegal to call this from idle state. */ WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); + rcu_preempt_deferred_qs(current); } -/* - * Note a context switch. This is a quiescent state for RCU-sched, - * and requires special handling for preemptible RCU. - * The caller must have disabled interrupts. - */ -void rcu_note_context_switch(bool preempt) -{ - barrier(); /* Avoid RCU read-side critical sections leaking down. */ - trace_rcu_utilization(TPS("Start context switch")); - rcu_sched_qs(); - rcu_preempt_note_context_switch(preempt); - /* Load rcu_urgent_qs before other flags. */ - if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) - goto out; - this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); - if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) - rcu_momentary_dyntick_idle(); - this_cpu_inc(rcu_dynticks.rcu_qs_ctr); - if (!preempt) - rcu_tasks_qs(current); -out: - trace_rcu_utilization(TPS("End context switch")); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ -} -EXPORT_SYMBOL_GPL(rcu_note_context_switch); - -/* - * Register a quiescent state for all RCU flavors. If there is an - * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight - * dyntick-idle quiescent state visible to other CPUs (but only for those - * RCU flavors in desperate need of a quiescent state, which will normally - * be none of them). Either way, do a lightweight quiescent state for - * all RCU flavors. - * - * The barrier() calls are redundant in the common case when this is - * called externally, but just in case this is called from within this - * file. +/** + * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle * + * If the current CPU is idle or running at a first-level (not nested) + * interrupt from idle, return true. The caller must have at least + * disabled preemption. */ -void rcu_all_qs(void) +static int rcu_is_cpu_rrupt_from_idle(void) { - unsigned long flags; - - if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs)) - return; - preempt_disable(); - /* Load rcu_urgent_qs before other flags. */ - if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) { - preempt_enable(); - return; - } - this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); - barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { - local_irq_save(flags); - rcu_momentary_dyntick_idle(); - local_irq_restore(flags); - } - if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) - rcu_sched_qs(); - this_cpu_inc(rcu_dynticks.rcu_qs_ctr); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ - preempt_enable(); + return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 && + __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; } -EXPORT_SYMBOL_GPL(rcu_all_qs); #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ static long blimit = DEFAULT_RCU_BLIMIT; @@ -505,13 +395,47 @@ static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; +/* + * How long the grace period must be before we start recruiting + * quiescent-state help from rcu_note_context_switch(). + */ +static ulong jiffies_till_sched_qs = ULONG_MAX; +module_param(jiffies_till_sched_qs, ulong, 0444); +static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */ +module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ + +/* + * Make sure that we give the grace-period kthread time to detect any + * idle CPUs before taking active measures to force quiescent states. + * However, don't go below 100 milliseconds, adjusted upwards for really + * large systems. + */ +static void adjust_jiffies_till_sched_qs(void) +{ + unsigned long j; + + /* If jiffies_till_sched_qs was specified, respect the request. */ + if (jiffies_till_sched_qs != ULONG_MAX) { + WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs); + return; + } + j = READ_ONCE(jiffies_till_first_fqs) + + 2 * READ_ONCE(jiffies_till_next_fqs); + if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) + j = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; + pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j); + WRITE_ONCE(jiffies_to_sched_qs, j); +} + static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp) { ulong j; int ret = kstrtoul(val, 0, &j); - if (!ret) + if (!ret) { WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j); + adjust_jiffies_till_sched_qs(); + } return ret; } @@ -520,8 +444,10 @@ static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param ulong j; int ret = kstrtoul(val, 0, &j); - if (!ret) + if (!ret) { WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1)); + adjust_jiffies_till_sched_qs(); + } return ret; } @@ -539,15 +465,8 @@ module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_fi module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644); module_param(rcu_kick_kthreads, bool, 0644); -/* - * How long the grace period must be before we start recruiting - * quiescent-state help from rcu_note_context_switch(). - */ -static ulong jiffies_till_sched_qs = HZ / 10; -module_param(jiffies_till_sched_qs, ulong, 0444); - -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); -static void force_quiescent_state(struct rcu_state *rsp); +static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); +static void force_quiescent_state(void); static int rcu_pending(void); /* @@ -555,29 +474,11 @@ static int rcu_pending(void); */ unsigned long rcu_get_gp_seq(void) { - return READ_ONCE(rcu_state_p->gp_seq); + return READ_ONCE(rcu_state.gp_seq); } EXPORT_SYMBOL_GPL(rcu_get_gp_seq); /* - * Return the number of RCU-sched GPs completed thus far for debug & stats. - */ -unsigned long rcu_sched_get_gp_seq(void) -{ - return READ_ONCE(rcu_sched_state.gp_seq); -} -EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); - -/* - * Return the number of RCU-bh GPs completed thus far for debug & stats. - */ -unsigned long rcu_bh_get_gp_seq(void) -{ - return READ_ONCE(rcu_bh_state.gp_seq); -} -EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq); - -/* * Return the number of RCU expedited batches completed thus far for * debug & stats. Odd numbers mean that a batch is in progress, even * numbers mean idle. The value returned will thus be roughly double @@ -585,48 +486,20 @@ EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq); */ unsigned long rcu_exp_batches_completed(void) { - return rcu_state_p->expedited_sequence; + return rcu_state.expedited_sequence; } EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); /* - * Return the number of RCU-sched expedited batches completed thus far - * for debug & stats. Similar to rcu_exp_batches_completed(). - */ -unsigned long rcu_exp_batches_completed_sched(void) -{ - return rcu_sched_state.expedited_sequence; -} -EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); - -/* * Force a quiescent state. */ void rcu_force_quiescent_state(void) { - force_quiescent_state(rcu_state_p); + force_quiescent_state(); } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* - * Force a quiescent state for RCU BH. - */ -void rcu_bh_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_bh_state); -} -EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); - -/* - * Force a quiescent state for RCU-sched. - */ -void rcu_sched_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_sched_state); -} -EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); - -/* * Show the state of the grace-period kthreads. */ void show_rcu_gp_kthreads(void) @@ -634,31 +507,28 @@ void show_rcu_gp_kthreads(void) int cpu; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_state *rsp; - for_each_rcu_flavor(rsp) { - pr_info("%s: wait state: %d ->state: %#lx\n", - rsp->name, rsp->gp_state, rsp->gp_kthread->state); - rcu_for_each_node_breadth_first(rsp, rnp) { - if (ULONG_CMP_GE(rsp->gp_seq, rnp->gp_seq_needed)) - continue; - pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", - rnp->grplo, rnp->grphi, rnp->gp_seq, - rnp->gp_seq_needed); - if (!rcu_is_leaf_node(rnp)) + pr_info("%s: wait state: %d ->state: %#lx\n", rcu_state.name, + rcu_state.gp_state, rcu_state.gp_kthread->state); + rcu_for_each_node_breadth_first(rnp) { + if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) + continue; + pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", + rnp->grplo, rnp->grphi, rnp->gp_seq, + rnp->gp_seq_needed); + if (!rcu_is_leaf_node(rnp)) + continue; + for_each_leaf_node_possible_cpu(rnp, cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->gpwrap || + ULONG_CMP_GE(rcu_state.gp_seq, + rdp->gp_seq_needed)) continue; - for_each_leaf_node_possible_cpu(rnp, cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (rdp->gpwrap || - ULONG_CMP_GE(rsp->gp_seq, - rdp->gp_seq_needed)) - continue; - pr_info("\tcpu %d ->gp_seq_needed %lu\n", - cpu, rdp->gp_seq_needed); - } + pr_info("\tcpu %d ->gp_seq_needed %lu\n", + cpu, rdp->gp_seq_needed); } - /* sched_show_task(rsp->gp_kthread); */ } + /* sched_show_task(rcu_state.gp_kthread); */ } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); @@ -668,34 +538,25 @@ EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq) { - struct rcu_state *rsp = NULL; - switch (test_type) { case RCU_FLAVOR: - rsp = rcu_state_p; - break; case RCU_BH_FLAVOR: - rsp = &rcu_bh_state; - break; case RCU_SCHED_FLAVOR: - rsp = &rcu_sched_state; + *flags = READ_ONCE(rcu_state.gp_flags); + *gp_seq = rcu_seq_current(&rcu_state.gp_seq); break; default: break; } - if (rsp == NULL) - return; - *flags = READ_ONCE(rsp->gp_flags); - *gp_seq = rcu_seq_current(&rsp->gp_seq); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); /* - * Return the root node of the specified rcu_state structure. + * Return the root node of the rcu_state structure. */ -static struct rcu_node *rcu_get_root(struct rcu_state *rsp) +static struct rcu_node *rcu_get_root(void) { - return &rsp->node[0]; + return &rcu_state.node[0]; } /* @@ -708,28 +569,25 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) */ static void rcu_eqs_enter(bool user) { - struct rcu_state *rsp; - struct rcu_data *rdp; - struct rcu_dynticks *rdtp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - rdtp = this_cpu_ptr(&rcu_dynticks); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); + WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE); + WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting == 0); - if (rdtp->dynticks_nesting != 1) { - rdtp->dynticks_nesting--; + rdp->dynticks_nesting == 0); + if (rdp->dynticks_nesting != 1) { + rdp->dynticks_nesting--; return; } lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks); + trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); - do_nocb_deferred_wakeup(rdp); - } + rdp = this_cpu_ptr(&rcu_data); + do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); - WRITE_ONCE(rdtp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + rcu_preempt_deferred_qs(current); + WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); } @@ -770,44 +628,61 @@ void rcu_user_enter(void) } #endif /* CONFIG_NO_HZ_FULL */ -/** - * rcu_nmi_exit - inform RCU of exit from NMI context - * +/* * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting + * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * - * If you add or remove a call to rcu_nmi_exit(), be sure to test + * If you add or remove a call to rcu_nmi_exit_common(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_nmi_exit(void) +static __always_inline void rcu_nmi_exit_common(bool irq) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); /* * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. * (We are exiting an NMI handler, so RCU better be paying attention * to us!) */ - WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); + WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0); WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); /* * If the nesting level is not 1, the CPU wasn't RCU-idle, so * leave it in non-RCU-idle state. */ - if (rdtp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, rdtp->dynticks_nmi_nesting - 2, rdtp->dynticks); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ - rdtp->dynticks_nmi_nesting - 2); + if (rdp->dynticks_nmi_nesting != 1) { + trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); + WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ + rdp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0, rdtp->dynticks); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks); + WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + + if (irq) + rcu_prepare_for_idle(); + rcu_dynticks_eqs_enter(); + + if (irq) + rcu_dynticks_task_enter(); +} + +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * @irq: Is this call from rcu_irq_exit? + * + * If you add or remove a call to rcu_nmi_exit(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void rcu_nmi_exit(void) +{ + rcu_nmi_exit_common(false); } /** @@ -831,14 +706,8 @@ void rcu_nmi_exit(void) */ void rcu_irq_exit(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - lockdep_assert_irqs_disabled(); - if (rdtp->dynticks_nmi_nesting == 1) - rcu_prepare_for_idle(); - rcu_nmi_exit(); - if (rdtp->dynticks_nmi_nesting == 0) - rcu_dynticks_task_enter(); + rcu_nmi_exit_common(true); } /* @@ -866,24 +735,25 @@ void rcu_irq_exit_irqson(void) */ static void rcu_eqs_exit(bool user) { - struct rcu_dynticks *rdtp; + struct rcu_data *rdp; long oldval; lockdep_assert_irqs_disabled(); - rdtp = this_cpu_ptr(&rcu_dynticks); - oldval = rdtp->dynticks_nesting; + rdp = this_cpu_ptr(&rcu_data); + oldval = rdp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { - rdtp->dynticks_nesting++; + rdp->dynticks_nesting++; return; } rcu_dynticks_task_exit(); rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks); + trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - WRITE_ONCE(rdtp->dynticks_nesting, 1); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + WRITE_ONCE(rdp->dynticks_nesting, 1); + WARN_ON_ONCE(rdp->dynticks_nmi_nesting); + WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } /** @@ -921,24 +791,25 @@ void rcu_user_exit(void) #endif /* CONFIG_NO_HZ_FULL */ /** - * rcu_nmi_enter - inform RCU of entry to NMI context + * rcu_nmi_enter_common - inform RCU of entry to NMI context + * @irq: Is this call from rcu_irq_enter? * - * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and - * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know + * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and + * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know * that the CPU is active. This implementation permits nested NMIs, as * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) * - * If you add or remove a call to rcu_nmi_enter(), be sure to test + * If you add or remove a call to rcu_nmi_enter_common(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_nmi_enter(void) +static __always_inline void rcu_nmi_enter_common(bool irq) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); long incby = 2; /* Complain about underflow. */ - WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); + WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); /* * If idle from RCU viewpoint, atomically increment ->dynticks @@ -949,18 +820,34 @@ void rcu_nmi_enter(void) * period (observation due to Andy Lutomirski). */ if (rcu_dynticks_curr_cpu_in_eqs()) { + + if (irq) + rcu_dynticks_task_exit(); + rcu_dynticks_eqs_exit(); + + if (irq) + rcu_cleanup_after_idle(); + incby = 1; } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), - rdtp->dynticks_nmi_nesting, - rdtp->dynticks_nmi_nesting + incby, rdtp->dynticks); - WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ - rdtp->dynticks_nmi_nesting + incby); + rdp->dynticks_nmi_nesting, + rdp->dynticks_nmi_nesting + incby, rdp->dynticks); + WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ + rdp->dynticks_nmi_nesting + incby); barrier(); } /** + * rcu_nmi_enter - inform RCU of entry to NMI context + */ +void rcu_nmi_enter(void) +{ + rcu_nmi_enter_common(false); +} + +/** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * * Enter an interrupt handler, which might possibly result in exiting @@ -984,14 +871,8 @@ void rcu_nmi_enter(void) */ void rcu_irq_enter(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - lockdep_assert_irqs_disabled(); - if (rdtp->dynticks_nmi_nesting == 0) - rcu_dynticks_task_exit(); - rcu_nmi_enter(); - if (rdtp->dynticks_nmi_nesting == 1) - rcu_cleanup_after_idle(); + rcu_nmi_enter_common(true); } /* @@ -1043,7 +924,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t) cpu = task_cpu(t); if (!task_curr(t)) return; /* This task is not running on that CPU. */ - smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true); + smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true); } #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) @@ -1054,11 +935,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t) * Disable preemption to avoid false positives that could otherwise * happen due to the current CPU number being sampled, this task being * preempted, its old CPU being taken offline, resuming on some other CPU, - * then determining that its old CPU is now offline. Because there are - * multiple flavors of RCU, and because this function can be called in the - * midst of updating the flavors while a given CPU coming online or going - * offline, it is necessary to check all flavors. If any of the flavors - * believe that given CPU is online, it is considered to be online. + * then determining that its old CPU is now offline. * * Disable checking if in an NMI handler because we cannot safely * report errors from NMI handlers anyway. In addition, it is OK to use @@ -1069,39 +946,22 @@ bool rcu_lockdep_current_cpu_online(void) { struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_state *rsp; + bool ret = false; if (in_nmi() || !rcu_scheduler_fully_active) return true; preempt_disable(); - for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; - if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) { - preempt_enable(); - return true; - } - } + rdp = this_cpu_ptr(&rcu_data); + rnp = rdp->mynode; + if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) + ret = true; preempt_enable(); - return false; + return ret; } EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ -/** - * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle - * - * If the current CPU is idle or running at a first-level (not nested) - * interrupt from idle, return true. The caller must have at least - * disabled preemption. - */ -static int rcu_is_cpu_rrupt_from_idle(void) -{ - return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 0 && - __this_cpu_read(rcu_dynticks.dynticks_nmi_nesting) <= 1; -} - /* * We are reporting a quiescent state on behalf of some other CPU, so * it is our responsibility to check for and handle potential overflow @@ -1126,9 +986,9 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) */ static int dyntick_save_progress_counter(struct rcu_data *rdp) { - rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); + rdp->dynticks_snap = rcu_dynticks_snap(rdp); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); + trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); return 1; } @@ -1177,35 +1037,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); - rdp->dynticks_fqs++; - rcu_gpnum_ovf(rnp, rdp); - return 1; - } - - /* - * Has this CPU encountered a cond_resched() since the beginning - * of the grace period? For this to be the case, the CPU has to - * have noticed the current grace period. This might not be the - * case for nohz_full CPUs looping in the kernel. - */ - jtsq = jiffies_till_sched_qs; - ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); - if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && - READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && - rcu_seq_current(&rdp->gp_seq) == rnp->gp_seq && !rdp->gpwrap) { - trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("rqc")); + if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) { + trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rnp, rdp); return 1; - } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { - /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ - smp_store_release(ruqp, true); } /* If waiting too long on an offline CPU, complain. */ if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && - time_after(jiffies, rdp->rsp->gp_start + HZ)) { + time_after(jiffies, rcu_state.gp_start + HZ)) { bool onl; struct rcu_node *rnp1; @@ -1226,39 +1066,56 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) /* * A CPU running for an extended time within the kernel can - * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, - * even context-switching back and forth between a pair of - * in-kernel CPU-bound tasks cannot advance grace periods. - * So if the grace period is old enough, make the CPU pay attention. - * Note that the unsynchronized assignments to the per-CPU - * rcu_need_heavy_qs variable are safe. Yes, setting of - * bits can be lost, but they will be set again on the next - * force-quiescent-state pass. So lost bit sets do not result - * in incorrect behavior, merely in a grace period lasting - * a few jiffies longer than it might otherwise. Because - * there are at most four threads involved, and because the - * updates are only once every few jiffies, the probability of - * lossage (and thus of slight grace-period extension) is - * quite low. + * delay RCU grace periods: (1) At age jiffies_to_sched_qs, + * set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set + * both .rcu_need_heavy_qs and .rcu_urgent_qs. Note that the + * unsynchronized assignments to the per-CPU rcu_need_heavy_qs + * variable are safe because the assignments are repeated if this + * CPU failed to pass through a quiescent state. This code + * also checks .jiffies_resched in case jiffies_to_sched_qs + * is set way high. */ - rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); + jtsq = READ_ONCE(jiffies_to_sched_qs); + ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu); + rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && - (time_after(jiffies, rdp->rsp->gp_start + jtsq) || - time_after(jiffies, rdp->rsp->jiffies_resched))) { + (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || + time_after(jiffies, rcu_state.jiffies_resched))) { WRITE_ONCE(*rnhqp, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ smp_store_release(ruqp, true); - rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */ + } else if (time_after(jiffies, rcu_state.gp_start + jtsq)) { + WRITE_ONCE(*ruqp, true); } /* - * If more than halfway to RCU CPU stall-warning time, do a - * resched_cpu() to try to loosen things up a bit. Also check to - * see if the CPU is getting hammered with interrupts, but only - * once per grace period, just to keep the IPIs down to a dull roar. + * NO_HZ_FULL CPUs can run in-kernel without rcu_check_callbacks! + * The above code handles this, but only for straight cond_resched(). + * And some in-kernel loops check need_resched() before calling + * cond_resched(), which defeats the above code for CPUs that are + * running in-kernel with scheduling-clock interrupts disabled. + * So hit them over the head with the resched_cpu() hammer! */ - if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) { + if (tick_nohz_full_cpu(rdp->cpu) && + time_after(jiffies, + READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) { resched_cpu(rdp->cpu); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); + } + + /* + * If more than halfway to RCU CPU stall-warning time, invoke + * resched_cpu() more frequently to try to loosen things up a bit. + * Also check to see if the CPU is getting hammered with interrupts, + * but only once per grace period, just to keep the IPIs down to + * a dull roar. + */ + if (time_after(jiffies, rcu_state.jiffies_resched)) { + if (time_after(jiffies, + READ_ONCE(rdp->last_fqs_resched) + jtsq)) { + resched_cpu(rdp->cpu); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); + } if (IS_ENABLED(CONFIG_IRQ_WORK) && !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { @@ -1272,17 +1129,17 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 0; } -static void record_gp_stall_check_time(struct rcu_state *rsp) +static void record_gp_stall_check_time(void) { unsigned long j = jiffies; unsigned long j1; - rsp->gp_start = j; + rcu_state.gp_start = j; j1 = rcu_jiffies_till_stall_check(); /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rsp->jiffies_stall, j + j1); /* ^^^ */ - rsp->jiffies_resched = j + j1 / 2; - rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs); + smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + rcu_state.jiffies_resched = j + j1 / 2; + rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); } /* @@ -1298,25 +1155,23 @@ static const char *gp_state_getname(short gs) /* * Complain about starvation of grace-period kthread. */ -static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) +static void rcu_check_gp_kthread_starvation(void) { - unsigned long gpa; + struct task_struct *gpk = rcu_state.gp_kthread; unsigned long j; - j = jiffies; - gpa = READ_ONCE(rsp->gp_activity); - if (j - gpa > 2 * HZ) { + j = jiffies - READ_ONCE(rcu_state.gp_activity); + if (j > 2 * HZ) { pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", - rsp->name, j - gpa, - (long)rcu_seq_current(&rsp->gp_seq), - rsp->gp_flags, - gp_state_getname(rsp->gp_state), rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : ~0, - rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); - if (rsp->gp_kthread) { + rcu_state.name, j, + (long)rcu_seq_current(&rcu_state.gp_seq), + rcu_state.gp_flags, + gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, + gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); + if (gpk) { pr_err("RCU grace-period kthread stack dump:\n"); - sched_show_task(rsp->gp_kthread); - wake_up_process(rsp->gp_kthread); + sched_show_task(gpk); + wake_up_process(gpk); } } } @@ -1327,13 +1182,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) * that don't support NMI-based stack dumps. The NMI-triggered stack * traces are more accurate because they are printed by the target CPU. */ -static void rcu_dump_cpu_stacks(struct rcu_state *rsp) +static void rcu_dump_cpu_stacks(void) { int cpu; unsigned long flags; struct rcu_node *rnp; - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); for_each_leaf_node_possible_cpu(rnp, cpu) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) @@ -1347,19 +1202,20 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) * If too much time has passed in the current grace period, and if * so configured, go kick the relevant kthreads. */ -static void rcu_stall_kick_kthreads(struct rcu_state *rsp) +static void rcu_stall_kick_kthreads(void) { unsigned long j; if (!rcu_kick_kthreads) return; - j = READ_ONCE(rsp->jiffies_kick_kthreads); - if (time_after(jiffies, j) && rsp->gp_kthread && - (rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) { - WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + j = READ_ONCE(rcu_state.jiffies_kick_kthreads); + if (time_after(jiffies, j) && rcu_state.gp_kthread && + (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", + rcu_state.name); rcu_ftrace_dump(DUMP_ALL); - wake_up_process(rsp->gp_kthread); - WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); + wake_up_process(rcu_state.gp_kthread); + WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); } } @@ -1369,18 +1225,18 @@ static void panic_on_rcu_stall(void) panic("RCU Stall\n"); } -static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) +static void print_other_cpu_stall(unsigned long gp_seq) { int cpu; unsigned long flags; unsigned long gpa; unsigned long j; int ndetected = 0; - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp = rcu_get_root(); long totqlen = 0; /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(rsp); + rcu_stall_kick_kthreads(); if (rcu_cpu_stall_suppress) return; @@ -1389,15 +1245,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - pr_err("INFO: %s detected stalls on CPUs/tasks:", rsp->name); + pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); print_cpu_stall_info_begin(); - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for_each_leaf_node_possible_cpu(rnp, cpu) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { - print_cpu_stall_info(rsp, cpu); + print_cpu_stall_info(cpu); ndetected++; } } @@ -1406,52 +1262,52 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, + totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data, cpu)->cblist); pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rsp->gp_start), - (long)rcu_seq_current(&rsp->gp_seq), totqlen); + smp_processor_id(), (long)(jiffies - rcu_state.gp_start), + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); if (ndetected) { - rcu_dump_cpu_stacks(rsp); + rcu_dump_cpu_stacks(); /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(rsp); + rcu_print_detail_task_stall(); } else { - if (rcu_seq_current(&rsp->gp_seq) != gp_seq) { + if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = READ_ONCE(rsp->gp_activity); + gpa = READ_ONCE(rcu_state.gp_activity); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", - rsp->name, j - gpa, j, gpa, - jiffies_till_next_fqs, - rcu_get_root(rsp)->qsmask); + rcu_state.name, j - gpa, j, gpa, + READ_ONCE(jiffies_till_next_fqs), + rcu_get_root()->qsmask); /* In this case, the current CPU might be at fault. */ sched_show_task(current); } } /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) - WRITE_ONCE(rsp->jiffies_stall, + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - rcu_check_gp_kthread_starvation(rsp); + rcu_check_gp_kthread_starvation(); panic_on_rcu_stall(); - force_quiescent_state(rsp); /* Kick them all. */ + force_quiescent_state(); /* Kick them all. */ } -static void print_cpu_stall(struct rcu_state *rsp) +static void print_cpu_stall(void) { int cpu; unsigned long flags; - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rcu_get_root(); long totqlen = 0; /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(rsp); + rcu_stall_kick_kthreads(); if (rcu_cpu_stall_suppress) return; @@ -1460,27 +1316,27 @@ static void print_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - pr_err("INFO: %s self-detected stall on CPU", rsp->name); + pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); print_cpu_stall_info_begin(); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); - print_cpu_stall_info(rsp, smp_processor_id()); + print_cpu_stall_info(smp_processor_id()); raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, + totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(&rcu_data, cpu)->cblist); pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rsp->gp_start, - (long)rcu_seq_current(&rsp->gp_seq), totqlen); + jiffies - rcu_state.gp_start, + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - rcu_check_gp_kthread_starvation(rsp); + rcu_check_gp_kthread_starvation(); - rcu_dump_cpu_stacks(rsp); + rcu_dump_cpu_stacks(); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) - WRITE_ONCE(rsp->jiffies_stall, + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1493,10 +1349,11 @@ static void print_cpu_stall(struct rcu_state *rsp) * progress and it could be we're stuck in kernel space without context * switches for an entirely unreasonable amount of time. */ - resched_cpu(smp_processor_id()); + set_tsk_need_resched(current); + set_preempt_need_resched(); } -static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) +static void check_cpu_stall(struct rcu_data *rdp) { unsigned long gs1; unsigned long gs2; @@ -1507,54 +1364,55 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) struct rcu_node *rnp; if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || - !rcu_gp_in_progress(rsp)) + !rcu_gp_in_progress()) return; - rcu_stall_kick_kthreads(rsp); + rcu_stall_kick_kthreads(); j = jiffies; /* * Lots of memory barriers to reject false positives. * - * The idea is to pick up rsp->gp_seq, then rsp->jiffies_stall, - * then rsp->gp_start, and finally another copy of rsp->gp_seq. - * These values are updated in the opposite order with memory - * barriers (or equivalent) during grace-period initialization - * and cleanup. Now, a false positive can occur if we get an new - * value of rsp->gp_start and a old value of rsp->jiffies_stall. - * But given the memory barriers, the only way that this can happen - * is if one grace period ends and another starts between these - * two fetches. This is detected by comparing the second fetch - * of rsp->gp_seq with the previous fetch from rsp->gp_seq. + * The idea is to pick up rcu_state.gp_seq, then + * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally + * another copy of rcu_state.gp_seq. These values are updated in + * the opposite order with memory barriers (or equivalent) during + * grace-period initialization and cleanup. Now, a false positive + * can occur if we get an new value of rcu_state.gp_start and a old + * value of rcu_state.jiffies_stall. But given the memory barriers, + * the only way that this can happen is if one grace period ends + * and another starts between these two fetches. This is detected + * by comparing the second fetch of rcu_state.gp_seq with the + * previous fetch from rcu_state.gp_seq. * - * Given this check, comparisons of jiffies, rsp->jiffies_stall, - * and rsp->gp_start suffice to forestall false positives. + * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, + * and rcu_state.gp_start suffice to forestall false positives. */ - gs1 = READ_ONCE(rsp->gp_seq); + gs1 = READ_ONCE(rcu_state.gp_seq); smp_rmb(); /* Pick up ->gp_seq first... */ - js = READ_ONCE(rsp->jiffies_stall); + js = READ_ONCE(rcu_state.jiffies_stall); smp_rmb(); /* ...then ->jiffies_stall before the rest... */ - gps = READ_ONCE(rsp->gp_start); + gps = READ_ONCE(rcu_state.gp_start); smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ - gs2 = READ_ONCE(rsp->gp_seq); + gs2 = READ_ONCE(rcu_state.gp_seq); if (gs1 != gs2 || ULONG_CMP_LT(j, js) || ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - if (rcu_gp_in_progress(rsp) && + if (rcu_gp_in_progress() && (READ_ONCE(rnp->qsmask) & rdp->grpmask) && - cmpxchg(&rsp->jiffies_stall, js, jn) == js) { + cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* We haven't checked in, so go dump stack. */ - print_cpu_stall(rsp); + print_cpu_stall(); - } else if (rcu_gp_in_progress(rsp) && + } else if (rcu_gp_in_progress() && ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && - cmpxchg(&rsp->jiffies_stall, js, jn) == js) { + cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp, gs2); + print_other_cpu_stall(gs2); } } @@ -1569,17 +1427,14 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) */ void rcu_cpu_stall_reset(void) { - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2); + WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); } /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long gp_seq_req, const char *s) { - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, gp_seq_req, + trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req, rnp->level, rnp->grplo, rnp->grphi, s); } @@ -1603,7 +1458,6 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, unsigned long gp_seq_req) { bool ret = false; - struct rcu_state *rsp = rdp->rsp; struct rcu_node *rnp; /* @@ -1647,18 +1501,18 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, } /* If GP already in progress, just leave, otherwise start one. */ - if (rcu_gp_in_progress(rsp)) { + if (rcu_gp_in_progress()) { trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot")); goto unlock_out; } trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); - WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); - rsp->gp_req_activity = jiffies; - if (!rsp->gp_kthread) { + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT); + rcu_state.gp_req_activity = jiffies; + if (!rcu_state.gp_kthread) { trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1675,10 +1529,10 @@ unlock_out: * Clean up any old requests for the just-ended grace period. Also return * whether any additional grace periods have been requested. */ -static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static bool rcu_future_gp_cleanup(struct rcu_node *rnp) { bool needmore; - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed); if (!needmore) @@ -1689,19 +1543,18 @@ static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) } /* - * Awaken the grace-period kthread for the specified flavor of RCU. - * Don't do a self-awaken, and don't bother awakening when there is - * nothing for the grace-period kthread to do (as in several CPUs - * raced to awaken, and we lost), and finally don't try to awaken - * a kthread that has not yet been created. + * Awaken the grace-period kthread. Don't do a self-awaken, and don't + * bother awakening when there is nothing for the grace-period kthread + * to do (as in several CPUs raced to awaken, and we lost), and finally + * don't try to awaken a kthread that has not yet been created. */ -static void rcu_gp_kthread_wake(struct rcu_state *rsp) +static void rcu_gp_kthread_wake(void) { - if (current == rsp->gp_kthread || - !READ_ONCE(rsp->gp_flags) || - !rsp->gp_kthread) + if (current == rcu_state.gp_kthread || + !READ_ONCE(rcu_state.gp_flags) || + !rcu_state.gp_kthread) return; - swake_up_one(&rsp->gp_wq); + swake_up_one(&rcu_state.gp_wq); } /* @@ -1716,8 +1569,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) * * The caller must hold rnp->lock with interrupts disabled. */ -static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) +static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) { unsigned long gp_seq_req; bool ret = false; @@ -1738,15 +1590,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - gp_seq_req = rcu_seq_snap(&rsp->gp_seq); + gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq); if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req)) ret = rcu_start_this_gp(rnp, rdp, gp_seq_req); /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccWaitCB")); + trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB")); else - trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccReadyCB")); + trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB")); return ret; } @@ -1757,25 +1609,24 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * that a new grace-period request be made, invokes rcu_accelerate_cbs() * while holding the leaf rcu_node structure's ->lock. */ -static void rcu_accelerate_cbs_unlocked(struct rcu_state *rsp, - struct rcu_node *rnp, +static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, struct rcu_data *rdp) { unsigned long c; bool needwake; lockdep_assert_irqs_disabled(); - c = rcu_seq_snap(&rsp->gp_seq); + c = rcu_seq_snap(&rcu_state.gp_seq); if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { /* Old request still live, so mark recent callbacks. */ (void)rcu_segcblist_accelerate(&rdp->cblist, c); return; } raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + needwake = rcu_accelerate_cbs(rnp, rdp); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_gp_kthread_wake(); } /* @@ -1788,8 +1639,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_state *rsp, * * The caller must hold rnp->lock with interrupts disabled. */ -static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) +static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) { raw_lockdep_assert_held_rcu_node(rnp); @@ -1804,7 +1654,7 @@ static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq); /* Classify any remaining callbacks. */ - return rcu_accelerate_cbs(rsp, rnp, rdp); + return rcu_accelerate_cbs(rnp, rdp); } /* @@ -1813,8 +1663,7 @@ static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * structure corresponding to the current CPU, and must have irqs disabled. * Returns true if the grace-period kthread needs to be awakened. */ -static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) +static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { bool ret; bool need_gp; @@ -1827,10 +1676,10 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || unlikely(READ_ONCE(rdp->gpwrap))) { - ret = rcu_advance_cbs(rsp, rnp, rdp); /* Advance callbacks. */ - trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuend")); + ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */ + trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); } else { - ret = rcu_accelerate_cbs(rsp, rnp, rdp); /* Recent callbacks. */ + ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */ } /* Now handle the beginnings of any new-to-this-CPU grace periods. */ @@ -1841,10 +1690,9 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, * set up to detect a quiescent state, otherwise don't * go looking for one. */ - trace_rcu_grace_period(rsp->name, rnp->gp_seq, TPS("cpustart")); + trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); } @@ -1856,7 +1704,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, return ret; } -static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) +static void note_gp_changes(struct rcu_data *rdp) { unsigned long flags; bool needwake; @@ -1870,16 +1718,16 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); return; } - needwake = __note_gp_changes(rsp, rnp, rdp); + needwake = __note_gp_changes(rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_gp_kthread_wake(); } -static void rcu_gp_slow(struct rcu_state *rsp, int delay) +static void rcu_gp_slow(int delay) { if (delay > 0 && - !(rcu_seq_ctr(rsp->gp_seq) % + !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) schedule_timeout_uninterruptible(delay); } @@ -1887,24 +1735,24 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) /* * Initialize a new grace period. Return false if no grace period required. */ -static bool rcu_gp_init(struct rcu_state *rsp) +static bool rcu_gp_init(void) { unsigned long flags; unsigned long oldmask; unsigned long mask; struct rcu_data *rdp; - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp = rcu_get_root(); - WRITE_ONCE(rsp->gp_activity, jiffies); + WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); - if (!READ_ONCE(rsp->gp_flags)) { + if (!READ_ONCE(rcu_state.gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq_rcu_node(rnp); return false; } - WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ + WRITE_ONCE(rcu_state.gp_flags, 0); /* Clear all flags: New GP. */ - if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { + if (WARN_ON_ONCE(rcu_gp_in_progress())) { /* * Grace period already in progress, don't start another. * Not supposed to be able to happen. @@ -1914,10 +1762,10 @@ static bool rcu_gp_init(struct rcu_state *rsp) } /* Advance to a new grace period and initialize state. */ - record_gp_stall_check_time(rsp); + record_gp_stall_check_time(); /* Record GP times before starting GP, hence rcu_seq_start(). */ - rcu_seq_start(&rsp->gp_seq); - trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("start")); + rcu_seq_start(&rcu_state.gp_seq); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -1926,15 +1774,15 @@ static bool rcu_gp_init(struct rcu_state *rsp) * for subsequent online CPUs, and that quiescent-state forcing * will handle subsequent offline CPUs. */ - rsp->gp_state = RCU_GP_ONOFF; - rcu_for_each_leaf_node(rsp, rnp) { - spin_lock(&rsp->ofl_lock); + rcu_state.gp_state = RCU_GP_ONOFF; + rcu_for_each_leaf_node(rnp) { + raw_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ raw_spin_unlock_irq_rcu_node(rnp); - spin_unlock(&rsp->ofl_lock); + raw_spin_unlock(&rcu_state.ofl_lock); continue; } @@ -1970,45 +1818,45 @@ static bool rcu_gp_init(struct rcu_state *rsp) } raw_spin_unlock_irq_rcu_node(rnp); - spin_unlock(&rsp->ofl_lock); + raw_spin_unlock(&rcu_state.ofl_lock); } - rcu_gp_slow(rsp, gp_preinit_delay); /* Races with CPU hotplug. */ + rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ /* * Set the quiescent-state-needed bits in all the rcu_node - * structures for all currently online CPUs in breadth-first order, - * starting from the root rcu_node structure, relying on the layout - * of the tree within the rsp->node[] array. Note that other CPUs - * will access only the leaves of the hierarchy, thus seeing that no - * grace period is in progress, at least until the corresponding - * leaf node has been initialized. + * structures for all currently online CPUs in breadth-first + * order, starting from the root rcu_node structure, relying on the + * layout of the tree within the rcu_state.node[] array. Note that + * other CPUs will access only the leaves of the hierarchy, thus + * seeing that no grace period is in progress, at least until the + * corresponding leaf node has been initialized. * * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. */ - rsp->gp_state = RCU_GP_INIT; - rcu_for_each_node_breadth_first(rsp, rnp) { - rcu_gp_slow(rsp, gp_init_delay); + rcu_state.gp_state = RCU_GP_INIT; + rcu_for_each_node_breadth_first(rnp) { + rcu_gp_slow(gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); - rdp = this_cpu_ptr(rsp->rda); - rcu_preempt_check_blocked_tasks(rsp, rnp); + rdp = this_cpu_ptr(&rcu_data); + rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; - WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); + WRITE_ONCE(rnp->gp_seq, rcu_state.gp_seq); if (rnp == rdp->mynode) - (void)__note_gp_changes(rsp, rnp, rdp); + (void)__note_gp_changes(rnp, rdp); rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gp_seq, + trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); /* Quiescent states for tasks on any now-offline CPUs. */ mask = rnp->qsmask & ~rnp->qsmaskinitnext; rnp->rcu_gp_init_mask = mask; if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); else raw_spin_unlock_irq_rcu_node(rnp); cond_resched_tasks_rcu_qs(); - WRITE_ONCE(rsp->gp_activity, jiffies); + WRITE_ONCE(rcu_state.gp_activity, jiffies); } return true; @@ -2018,12 +1866,12 @@ static bool rcu_gp_init(struct rcu_state *rsp) * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state * time. */ -static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) +static bool rcu_gp_fqs_check_wake(int *gfp) { - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp = rcu_get_root(); /* Someone like call_rcu() requested a force-quiescent-state scan. */ - *gfp = READ_ONCE(rsp->gp_flags); + *gfp = READ_ONCE(rcu_state.gp_flags); if (*gfp & RCU_GP_FLAG_FQS) return true; @@ -2037,45 +1885,110 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) /* * Do one round of quiescent-state forcing. */ -static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) +static void rcu_gp_fqs(bool first_time) { - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp = rcu_get_root(); - WRITE_ONCE(rsp->gp_activity, jiffies); - rsp->n_force_qs++; + WRITE_ONCE(rcu_state.gp_activity, jiffies); + rcu_state.n_force_qs++; if (first_time) { /* Collect dyntick-idle snapshots. */ - force_qs_rnp(rsp, dyntick_save_progress_counter); + force_qs_rnp(dyntick_save_progress_counter); } else { /* Handle dyntick-idle and offline CPUs. */ - force_qs_rnp(rsp, rcu_implicit_dynticks_qs); + force_qs_rnp(rcu_implicit_dynticks_qs); } /* Clear flag to prevent immediate re-entry. */ - if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq_rcu_node(rnp); - WRITE_ONCE(rsp->gp_flags, - READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, + READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq_rcu_node(rnp); } } /* + * Loop doing repeated quiescent-state forcing until the grace period ends. + */ +static void rcu_gp_fqs_loop(void) +{ + bool first_gp_fqs; + int gf; + unsigned long j; + int ret; + struct rcu_node *rnp = rcu_get_root(); + + first_gp_fqs = true; + j = READ_ONCE(jiffies_till_first_fqs); + ret = 0; + for (;;) { + if (!ret) { + rcu_state.jiffies_force_qs = jiffies + j; + WRITE_ONCE(rcu_state.jiffies_kick_kthreads, + jiffies + 3 * j); + } + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), + TPS("fqswait")); + rcu_state.gp_state = RCU_GP_WAIT_FQS; + ret = swait_event_idle_timeout_exclusive( + rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + rcu_state.gp_state = RCU_GP_DOING_FQS; + /* Locking provides needed memory barriers. */ + /* If grace period done, leave loop. */ + if (!READ_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)) + break; + /* If time for quiescent-state forcing, do it. */ + if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || + (gf & RCU_GP_FLAG_FQS)) { + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), + TPS("fqsstart")); + rcu_gp_fqs(first_gp_fqs); + first_gp_fqs = false; + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), + TPS("fqsend")); + cond_resched_tasks_rcu_qs(); + WRITE_ONCE(rcu_state.gp_activity, jiffies); + ret = 0; /* Force full wait till next FQS. */ + j = READ_ONCE(jiffies_till_next_fqs); + } else { + /* Deal with stray signal. */ + cond_resched_tasks_rcu_qs(); + WRITE_ONCE(rcu_state.gp_activity, jiffies); + WARN_ON(signal_pending(current)); + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), + TPS("fqswaitsig")); + ret = 1; /* Keep old FQS timing. */ + j = jiffies; + if (time_after(jiffies, rcu_state.jiffies_force_qs)) + j = 1; + else + j = rcu_state.jiffies_force_qs - j; + } + } +} + +/* * Clean up after the old grace period. */ -static void rcu_gp_cleanup(struct rcu_state *rsp) +static void rcu_gp_cleanup(void) { unsigned long gp_duration; bool needgp = false; unsigned long new_gp_seq; struct rcu_data *rdp; - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp = rcu_get_root(); struct swait_queue_head *sq; - WRITE_ONCE(rsp->gp_activity, jiffies); + WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); - gp_duration = jiffies - rsp->gp_start; - if (gp_duration > rsp->gp_max) - rsp->gp_max = gp_duration; + gp_duration = jiffies - rcu_state.gp_start; + if (gp_duration > rcu_state.gp_max) + rcu_state.gp_max = gp_duration; /* * We know the grace period is complete, but to everyone else @@ -2096,48 +2009,50 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * the rcu_node structures before the beginning of the next grace * period is recorded in any of the rcu_node structures. */ - new_gp_seq = rsp->gp_seq; + new_gp_seq = rcu_state.gp_seq; rcu_seq_end(&new_gp_seq); - rcu_for_each_node_breadth_first(rsp, rnp) { + rcu_for_each_node_breadth_first(rnp) { raw_spin_lock_irq_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) - dump_blkd_tasks(rsp, rnp, 10); + dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->gp_seq, new_gp_seq); - rdp = this_cpu_ptr(rsp->rda); + rdp = this_cpu_ptr(&rcu_data); if (rnp == rdp->mynode) - needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; + needgp = __note_gp_changes(rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ - needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp; + needgp = rcu_future_gp_cleanup(rnp) || needgp; sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); cond_resched_tasks_rcu_qs(); - WRITE_ONCE(rsp->gp_activity, jiffies); - rcu_gp_slow(rsp, gp_cleanup_delay); + WRITE_ONCE(rcu_state.gp_activity, jiffies); + rcu_gp_slow(gp_cleanup_delay); } - rnp = rcu_get_root(rsp); - raw_spin_lock_irq_rcu_node(rnp); /* GP before rsp->gp_seq update. */ + rnp = rcu_get_root(); + raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */ /* Declare grace period done. */ - rcu_seq_end(&rsp->gp_seq); - trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("end")); - rsp->gp_state = RCU_GP_IDLE; + rcu_seq_end(&rcu_state.gp_seq); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); + rcu_state.gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ - rdp = this_cpu_ptr(rsp->rda); + rdp = this_cpu_ptr(&rcu_data); if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed, TPS("CleanupMore")); needgp = true; } /* Advance CBs to reduce false positives below. */ - if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { - WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - rsp->gp_req_activity = jiffies; - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), + if (!rcu_accelerate_cbs(rnp, rdp) && needgp) { + WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); + rcu_state.gp_req_activity = jiffies; + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), TPS("newreq")); } else { - WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); + WRITE_ONCE(rcu_state.gp_flags, + rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); } @@ -2145,116 +2060,60 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* * Body of kthread that handles grace periods. */ -static int __noreturn rcu_gp_kthread(void *arg) +static int __noreturn rcu_gp_kthread(void *unused) { - bool first_gp_fqs; - int gf; - unsigned long j; - int ret; - struct rcu_state *rsp = arg; - struct rcu_node *rnp = rcu_get_root(rsp); - rcu_bind_gp_kthread(); for (;;) { /* Handle grace-period start. */ for (;;) { - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gp_seq), + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), TPS("reqwait")); - rsp->gp_state = RCU_GP_WAIT_GPS; - swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & - RCU_GP_FLAG_INIT); - rsp->gp_state = RCU_GP_DONE_GPS; + rcu_state.gp_state = RCU_GP_WAIT_GPS; + swait_event_idle_exclusive(rcu_state.gp_wq, + READ_ONCE(rcu_state.gp_flags) & + RCU_GP_FLAG_INIT); + rcu_state.gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ - if (rcu_gp_init(rsp)) + if (rcu_gp_init()) break; cond_resched_tasks_rcu_qs(); - WRITE_ONCE(rsp->gp_activity, jiffies); + WRITE_ONCE(rcu_state.gp_activity, jiffies); WARN_ON(signal_pending(current)); - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gp_seq), + trace_rcu_grace_period(rcu_state.name, + READ_ONCE(rcu_state.gp_seq), TPS("reqwaitsig")); } /* Handle quiescent-state forcing. */ - first_gp_fqs = true; - j = jiffies_till_first_fqs; - ret = 0; - for (;;) { - if (!ret) { - rsp->jiffies_force_qs = jiffies + j; - WRITE_ONCE(rsp->jiffies_kick_kthreads, - jiffies + 3 * j); - } - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gp_seq), - TPS("fqswait")); - rsp->gp_state = RCU_GP_WAIT_FQS; - ret = swait_event_idle_timeout_exclusive(rsp->gp_wq, - rcu_gp_fqs_check_wake(rsp, &gf), j); - rsp->gp_state = RCU_GP_DOING_FQS; - /* Locking provides needed memory barriers. */ - /* If grace period done, leave loop. */ - if (!READ_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)) - break; - /* If time for quiescent-state forcing, do it. */ - if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || - (gf & RCU_GP_FLAG_FQS)) { - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gp_seq), - TPS("fqsstart")); - rcu_gp_fqs(rsp, first_gp_fqs); - first_gp_fqs = false; - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gp_seq), - TPS("fqsend")); - cond_resched_tasks_rcu_qs(); - WRITE_ONCE(rsp->gp_activity, jiffies); - ret = 0; /* Force full wait till next FQS. */ - j = jiffies_till_next_fqs; - } else { - /* Deal with stray signal. */ - cond_resched_tasks_rcu_qs(); - WRITE_ONCE(rsp->gp_activity, jiffies); - WARN_ON(signal_pending(current)); - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gp_seq), - TPS("fqswaitsig")); - ret = 1; /* Keep old FQS timing. */ - j = jiffies; - if (time_after(jiffies, rsp->jiffies_force_qs)) - j = 1; - else - j = rsp->jiffies_force_qs - j; - } - } + rcu_gp_fqs_loop(); /* Handle grace-period end. */ - rsp->gp_state = RCU_GP_CLEANUP; - rcu_gp_cleanup(rsp); - rsp->gp_state = RCU_GP_CLEANED; + rcu_state.gp_state = RCU_GP_CLEANUP; + rcu_gp_cleanup(); + rcu_state.gp_state = RCU_GP_CLEANED; } } /* - * Report a full set of quiescent states to the specified rcu_state data - * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period - * kthread if another grace period is required. Whether we wake - * the grace-period kthread or it awakens itself for the next round - * of quiescent-state forcing, that kthread will clean up after the - * just-completed grace period. Note that the caller must hold rnp->lock, - * which is released before return. + * Report a full set of quiescent states to the rcu_state data structure. + * Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if + * another grace period is required. Whether we wake the grace-period + * kthread or it awakens itself for the next round of quiescent-state + * forcing, that kthread will clean up after the just-completed grace + * period. Note that the caller must hold rnp->lock, which is released + * before return. */ -static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) +static void rcu_report_qs_rsp(unsigned long flags) + __releases(rcu_get_root()->lock) { - raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp)); - WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); - WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); - rcu_gp_kthread_wake(rsp); + raw_lockdep_assert_held_rcu_node(rcu_get_root()); + WARN_ON_ONCE(!rcu_gp_in_progress()); + WRITE_ONCE(rcu_state.gp_flags, + READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags); + rcu_gp_kthread_wake(); } /* @@ -2271,9 +2130,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * disabled. This allows propagating quiescent state due to resumed tasks * during grace-period initialization. */ -static void -rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long gps, unsigned long flags) +static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, + unsigned long gps, unsigned long flags) __releases(rnp->lock) { unsigned long oldmask = 0; @@ -2296,7 +2154,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; - trace_rcu_quiescent_state_report(rsp->name, rnp->gp_seq, + trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq, mask, rnp->qsmask, rnp->level, rnp->grplo, rnp->grphi, !!rnp->gp_tasks); @@ -2326,19 +2184,18 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * state for this grace period. Invoke rcu_report_qs_rsp() * to clean up and start the next grace period if one is needed. */ - rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ + rcu_report_qs_rsp(flags); /* releases rnp->lock. */ } /* * Record a quiescent state for all tasks that were previously queued * on the specified rcu_node structure and that were blocking the current - * RCU grace period. The caller must hold the specified rnp->lock with + * RCU grace period. The caller must hold the corresponding rnp->lock with * irqs disabled, and this lock is released upon return, but irqs remain * disabled. */ static void __maybe_unused -rcu_report_unblock_qs_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags) +rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { unsigned long gps; @@ -2346,8 +2203,7 @@ rcu_report_unblock_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(rcu_state_p == &rcu_sched_state) || - WARN_ON_ONCE(rsp != rcu_state_p) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2361,7 +2217,7 @@ rcu_report_unblock_qs_rnp(struct rcu_state *rsp, * Only one rcu_node structure in the tree, so don't * try to report up to its nonexistent parent! */ - rcu_report_qs_rsp(rsp, flags); + rcu_report_qs_rsp(flags); return; } @@ -2370,7 +2226,7 @@ rcu_report_unblock_qs_rnp(struct rcu_state *rsp, mask = rnp->grpmask; raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ - rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); + rcu_report_qs_rnp(mask, rnp_p, gps, flags); } /* @@ -2378,7 +2234,7 @@ rcu_report_unblock_qs_rnp(struct rcu_state *rsp, * structure. This must be called from the specified CPU. */ static void -rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) +rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -2397,7 +2253,6 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -2411,12 +2266,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + needwake = rcu_accelerate_cbs(rnp, rdp); - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_gp_kthread_wake(); } } @@ -2427,10 +2282,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * quiescent state for this grace period, and record that fact if so. */ static void -rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) +rcu_check_quiescent_state(struct rcu_data *rdp) { /* Check for grace-period ends and beginnings. */ - note_gp_changes(rsp, rdp); + note_gp_changes(rdp); /* * Does this CPU still need to do its part for current grace period? @@ -2450,24 +2305,26 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rsp, rdp); + rcu_report_qs_rdp(rdp->cpu, rdp); } /* - * Trace the fact that this CPU is going offline. + * Near the end of the offline process. Trace the fact that this CPU + * is going offline. */ -static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) +int rcutree_dying_cpu(unsigned int cpu) { RCU_TRACE(bool blkd;) - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) + RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);) RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; + return 0; RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) - trace_rcu_grace_period(rsp->name, rnp->gp_seq, + trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); + return 0; } /* @@ -2521,23 +2378,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) * There can only be one CPU hotplug operation at a time, so no need for * explicit locking. */ -static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) +int rcutree_dead_cpu(unsigned int cpu) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; + return 0; /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); + /* Do any needed no-CB deferred wakeups from this CPU. */ + do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu)); + return 0; } /* * Invoke any RCU callbacks that have made it to the end of their grace * period. Thottle as specified by rdp->blimit. */ -static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) +static void rcu_do_batch(struct rcu_data *rdp) { unsigned long flags; struct rcu_head *rhp; @@ -2546,10 +2406,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { - trace_rcu_batch_start(rsp->name, + trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), 0); - trace_rcu_batch_end(rsp->name, 0, + trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -2564,7 +2424,8 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); bl = rdp->blimit; - trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), + trace_rcu_batch_start(rcu_state.name, + rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); local_irq_restore(flags); @@ -2573,7 +2434,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rhp = rcu_cblist_dequeue(&rcl); for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { debug_rcu_head_unqueue(rhp); - if (__rcu_reclaim(rsp->name, rhp)) + if (__rcu_reclaim(rcu_state.name, rhp)) rcu_cblist_dequeued_lazy(&rcl); /* * Stop only if limit reached and CPU has something to do. @@ -2587,7 +2448,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); count = -rcl.len; - trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(), + trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); /* Update counts and requeue any remaining callbacks. */ @@ -2603,7 +2464,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ if (count == 0 && rdp->qlen_last_fqs_check != 0) { rdp->qlen_last_fqs_check = 0; - rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->n_force_qs_snap = rcu_state.n_force_qs; } else if (count < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = count; @@ -2631,37 +2492,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) void rcu_check_callbacks(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); - increment_cpu_stall_ticks(); - if (user || rcu_is_cpu_rrupt_from_idle()) { - - /* - * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. - * - * No memory barrier is required here because both - * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local - * variables that other CPUs neither access nor modify, - * at least not while the corresponding CPU is online. - */ - - rcu_sched_qs(); - rcu_bh_qs(); - rcu_note_voluntary_context_switch(current); - - } else if (!in_softirq()) { - - /* - * Get here if this CPU did not take its interrupt from - * softirq, in other words, if it is not interrupting - * a rcu_bh read-side critical section. This is an _bh - * critical section, so note it. - */ - - rcu_bh_qs(); + raw_cpu_inc(rcu_data.ticks_this_gp); + /* The load-acquire pairs with the store-release setting to true. */ + if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { + /* Idle and userspace execution already are quiescent states. */ + if (!rcu_is_cpu_rrupt_from_idle() && !user) { + set_tsk_need_resched(current); + set_preempt_need_resched(); + } + __this_cpu_write(rcu_data.rcu_urgent_qs, false); } - rcu_preempt_check_callbacks(); + rcu_flavor_check_callbacks(user); if (rcu_pending()) invoke_rcu_core(); @@ -2675,20 +2516,19 @@ void rcu_check_callbacks(int user) * * The caller must have suppressed start of new grace periods. */ -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) +static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) { int cpu; unsigned long flags; unsigned long mask; struct rcu_node *rnp; - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { cond_resched_tasks_rcu_qs(); mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (rcu_state_p == &rcu_sched_state || - rsp != rcu_state_p || + if (!IS_ENABLED(CONFIG_PREEMPT) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they @@ -2705,13 +2545,13 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { - if (f(per_cpu_ptr(rsp->rda, cpu))) + if (f(per_cpu_ptr(&rcu_data, cpu))) mask |= bit; } } if (mask != 0) { /* Idle/offline CPUs, report (releases rnp->lock). */ - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2723,7 +2563,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ -static void force_quiescent_state(struct rcu_state *rsp) +static void force_quiescent_state(void) { unsigned long flags; bool ret; @@ -2731,9 +2571,9 @@ static void force_quiescent_state(struct rcu_state *rsp) struct rcu_node *rnp_old = NULL; /* Funnel through hierarchy to reduce memory contention. */ - rnp = __this_cpu_read(rsp->rda->mynode); + rnp = __this_cpu_read(rcu_data.mynode); for (; rnp != NULL; rnp = rnp->parent) { - ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || + ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) || !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); @@ -2741,18 +2581,19 @@ static void force_quiescent_state(struct rcu_state *rsp) return; rnp_old = rnp; } - /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ + /* rnp_old == rcu_get_root(), rnp == NULL. */ /* Reached the root of the rcu_node tree, acquire lock. */ raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); - if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } - WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, + READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); - rcu_gp_kthread_wake(rsp); + rcu_gp_kthread_wake(); } /* @@ -2760,30 +2601,29 @@ static void force_quiescent_state(struct rcu_state *rsp) * RCU to come out of its idle mode. */ static void -rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) +rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp) { const unsigned long gpssdelay = rcu_jiffies_till_stall_check() * HZ; unsigned long flags; unsigned long j; - struct rcu_node *rnp_root = rcu_get_root(rsp); + struct rcu_node *rnp_root = rcu_get_root(); static atomic_t warned = ATOMIC_INIT(0); - if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress(rsp) || + if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) return; j = jiffies; /* Expensive access, and in common case don't get here. */ - if (time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) || + if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || atomic_read(&warned)) return; raw_spin_lock_irqsave_rcu_node(rnp, flags); j = jiffies; - if (rcu_gp_in_progress(rsp) || + if (rcu_gp_in_progress() || ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, READ_ONCE(rsp->gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rsp->gp_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || atomic_read(&warned)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; @@ -2793,21 +2633,21 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, if (rnp_root != rnp) raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ j = jiffies; - if (rcu_gp_in_progress(rsp) || + if (rcu_gp_in_progress() || ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rsp->gp_req_activity + gpssdelay) || - time_before(j, rsp->gp_activity + gpssdelay) || + time_before(j, rcu_state.gp_req_activity + gpssdelay) || + time_before(j, rcu_state.gp_activity + gpssdelay) || atomic_xchg(&warned, 1)) { raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n", - __func__, (long)READ_ONCE(rsp->gp_seq), + __func__, (long)READ_ONCE(rcu_state.gp_seq), (long)READ_ONCE(rnp_root->gp_seq_needed), - j - rsp->gp_req_activity, j - rsp->gp_activity, - rsp->gp_flags, rsp->gp_state, rsp->name, - rsp->gp_kthread ? rsp->gp_kthread->state : 0x1ffffL); + j - rcu_state.gp_req_activity, j - rcu_state.gp_activity, + rcu_state.gp_flags, rcu_state.gp_state, rcu_state.name, + rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL); WARN_ON(1); if (rnp_root != rnp) raw_spin_unlock_rcu_node(rnp_root); @@ -2815,69 +2655,65 @@ rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * This does the RCU core processing work for the specified rcu_state - * and rcu_data structures. This may be called only from the CPU to - * whom the rdp belongs. + * This does the RCU core processing work for the specified rcu_data + * structures. This may be called only from the CPU to whom the rdp + * belongs. */ -static void -__rcu_process_callbacks(struct rcu_state *rsp) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { unsigned long flags; - struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + if (cpu_is_offline(smp_processor_id())) + return; + trace_rcu_utilization(TPS("Start RCU core")); WARN_ON_ONCE(!rdp->beenonline); + /* Report any deferred quiescent states if preemption enabled. */ + if (!(preempt_count() & PREEMPT_MASK)) { + rcu_preempt_deferred_qs(current); + } else if (rcu_preempt_need_deferred_qs(current)) { + set_tsk_need_resched(current); + set_preempt_need_resched(); + } + /* Update RCU state based on any recent quiescent states. */ - rcu_check_quiescent_state(rsp, rdp); + rcu_check_quiescent_state(rdp); /* No grace period and unregistered callbacks? */ - if (!rcu_gp_in_progress(rsp) && + if (!rcu_gp_in_progress() && rcu_segcblist_is_enabled(&rdp->cblist)) { local_irq_save(flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) - rcu_accelerate_cbs_unlocked(rsp, rnp, rdp); + rcu_accelerate_cbs_unlocked(rnp, rdp); local_irq_restore(flags); } - rcu_check_gp_start_stall(rsp, rnp, rdp); + rcu_check_gp_start_stall(rnp, rdp); /* If there are callbacks ready, invoke them. */ if (rcu_segcblist_ready_cbs(&rdp->cblist)) - invoke_rcu_callbacks(rsp, rdp); + invoke_rcu_callbacks(rdp); /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); -} - -/* - * Do RCU core processing for the current CPU. - */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) -{ - struct rcu_state *rsp; - - if (cpu_is_offline(smp_processor_id())) - return; - trace_rcu_utilization(TPS("Start RCU core")); - for_each_rcu_flavor(rsp) - __rcu_process_callbacks(rsp); trace_rcu_utilization(TPS("End RCU core")); } /* - * Schedule RCU callback invocation. If the specified type of RCU - * does not support RCU priority boosting, just do a direct call, - * otherwise wake up the per-CPU kernel kthread. Note that because we - * are running on the current CPU with softirqs disabled, the - * rcu_cpu_kthread_task cannot disappear out from under us. + * Schedule RCU callback invocation. If the running implementation of RCU + * does not support RCU priority boosting, just do a direct call, otherwise + * wake up the per-CPU kernel kthread. Note that because we are running + * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task + * cannot disappear out from under us. */ -static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +static void invoke_rcu_callbacks(struct rcu_data *rdp) { if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) return; - if (likely(!rsp->boost)) { - rcu_do_batch(rsp, rdp); + if (likely(!rcu_state.boost)) { + rcu_do_batch(rdp); return; } invoke_rcu_callbacks_kthread(); @@ -2892,8 +2728,8 @@ static void invoke_rcu_core(void) /* * Handle any core-RCU processing required by a call_rcu() invocation. */ -static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, - struct rcu_head *head, unsigned long flags) +static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, + unsigned long flags) { /* * If called from an extended quiescent state, invoke the RCU @@ -2917,18 +2753,18 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ - note_gp_changes(rsp, rdp); + note_gp_changes(rdp); /* Start a new grace period if one not already started. */ - if (!rcu_gp_in_progress(rsp)) { - rcu_accelerate_cbs_unlocked(rsp, rdp->mynode, rdp); + if (!rcu_gp_in_progress()) { + rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; - if (rsp->n_force_qs == rdp->n_force_qs_snap && + if (rcu_state.n_force_qs == rdp->n_force_qs_snap && rcu_segcblist_first_pend_cb(&rdp->cblist) != head) - force_quiescent_state(rsp); - rdp->n_force_qs_snap = rsp->n_force_qs; + force_quiescent_state(); + rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); } } @@ -2944,12 +2780,11 @@ static void rcu_leak_callback(struct rcu_head *rhp) /* * Helper function for call_rcu() and friends. The cpu argument will * normally be -1, indicating "currently running CPU". It may specify - * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() + * a CPU only if that CPU is a no-CBs CPU. Currently, only rcu_barrier() * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, rcu_callback_t func, - struct rcu_state *rsp, int cpu, bool lazy) +__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) { unsigned long flags; struct rcu_data *rdp; @@ -2971,14 +2806,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, head->func = func; head->next = NULL; local_irq_save(flags); - rdp = this_cpu_ptr(rsp->rda); + rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { int offline; if (cpu != -1) - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = per_cpu_ptr(&rcu_data, cpu); if (likely(rdp->mynode)) { /* Post-boot, so this should be for a no-CBs CPU. */ offline = !__call_rcu_nocb(rdp, head, lazy, flags); @@ -3001,72 +2836,60 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, rcu_idle_count_callbacks_posted(); if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, + trace_rcu_kfree_callback(rcu_state.name, head, + (unsigned long)func, rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); else - trace_rcu_callback(rsp->name, head, + trace_rcu_callback(rcu_state.name, head, rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ - __call_rcu_core(rsp, rdp, head, flags); + __call_rcu_core(rdp, head, flags); local_irq_restore(flags); } /** - * call_rcu_sched() - Queue an RCU for invocation after sched grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_sched() assumes - * that the read-side critical sections end on enabling of preemption - * or on voluntary preemption. - * RCU read-side critical sections are delimited by: - * - * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR - * - anything that disables preemption. - * - * These may be nested. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func, &rcu_sched_state, -1, 0); -} -EXPORT_SYMBOL_GPL(call_rcu_sched); - -/** - * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. + * call_rcu() - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. * @func: actual callback function to be invoked after the grace period * * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_bh() assumes - * that the read-side critical sections end on completion of a softirq - * handler. This means that read-side critical sections in process - * context must not be interrupted by softirqs. This interface is to be - * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by: - * - * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR - * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. - * - * These may be nested. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func, &rcu_bh_state, -1, 0); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and + * may be nested. In addition, regions of code across which interrupts, + * preemption, or softirqs have been disabled also serve as RCU read-side + * critical sections. This includes hardware interrupt handlers, softirq + * handlers, and NMI handlers. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). + */ +void call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + __call_rcu(head, func, -1, 0); +} +EXPORT_SYMBOL_GPL(call_rcu); /* * Queue an RCU callback for lazy invocation after a grace period. @@ -3075,110 +2898,12 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); * callbacks in the list of pending callbacks. Until then, this * function may only be called from __kfree_rcu(). */ -void kfree_call_rcu(struct rcu_head *head, - rcu_callback_t func) +void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, rcu_state_p, -1, 1); + __call_rcu(head, func, -1, 1); } EXPORT_SYMBOL_GPL(kfree_call_rcu); -/* - * Because a context switch is a grace period for RCU-sched and RCU-bh, - * any blocking grace-period wait automatically implies a grace period - * if there is only one CPU online at any point time during execution - * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to - * occasionally incorrectly indicate that there are multiple CPUs online - * when there was in fact only one the whole time, as this just adds - * some overhead: RCU still operates correctly. - */ -static int rcu_blocking_is_gp(void) -{ - int ret; - - might_sleep(); /* Check for RCU read-side critical section. */ - preempt_disable(); - ret = num_online_cpus() <= 1; - preempt_enable(); - return ret; -} - -/** - * synchronize_sched - wait until an rcu-sched grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-sched - * grace period has elapsed, in other words after all currently executing - * rcu-sched read-side critical sections have completed. These read-side - * critical sections are delimited by rcu_read_lock_sched() and - * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), - * local_irq_disable(), and so on may be used in place of - * rcu_read_lock_sched(). - * - * This means that all preempt_disable code sequences, including NMI and - * non-threaded hardware-interrupt handlers, in progress on entry will - * have completed before this primitive returns. However, this does not - * guarantee that softirq handlers will have completed, since in some - * kernels, these handlers can run in process context, and can block. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_sched() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-sched read-side critical section whose beginning - * preceded the call to synchronize_sched(). In addition, each CPU having - * an RCU read-side critical section that extends beyond the return from - * synchronize_sched() is guaranteed to have executed a full memory barrier - * after the beginning of synchronize_sched() and before the beginning of - * that RCU read-side critical section. Note that these guarantees include - * CPUs that are offline, idle, or executing in user mode, as well as CPUs - * that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_sched(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - */ -void synchronize_sched(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU-sched read-side critical section"); - if (rcu_blocking_is_gp()) - return; - if (rcu_gp_is_expedited()) - synchronize_sched_expedited(); - else - wait_rcu_gp(call_rcu_sched); -} -EXPORT_SYMBOL_GPL(synchronize_sched); - -/** - * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. - * - * Control will return to the caller some time after a full rcu_bh grace - * period has elapsed, in other words after all currently executing rcu_bh - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), - * and may be nested. - * - * See the description of synchronize_sched() for more detailed information - * on memory ordering guarantees. - */ -void synchronize_rcu_bh(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); - if (rcu_blocking_is_gp()) - return; - if (rcu_gp_is_expedited()) - synchronize_rcu_bh_expedited(); - else - wait_rcu_gp(call_rcu_bh); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_bh); - /** * get_state_synchronize_rcu - Snapshot current RCU state * @@ -3193,7 +2918,7 @@ unsigned long get_state_synchronize_rcu(void) * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - return rcu_seq_snap(&rcu_state_p->gp_seq); + return rcu_seq_snap(&rcu_state.gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -3213,70 +2938,30 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void cond_synchronize_rcu(unsigned long oldstate) { - if (!rcu_seq_done(&rcu_state_p->gp_seq, oldstate)) + if (!rcu_seq_done(&rcu_state.gp_seq, oldstate)) synchronize_rcu(); else smp_mb(); /* Ensure GP ends before subsequent accesses. */ } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); -/** - * get_state_synchronize_sched - Snapshot current RCU-sched state - * - * Returns a cookie that is used by a later call to cond_synchronize_sched() - * to determine whether or not a full grace period has elapsed in the - * meantime. - */ -unsigned long get_state_synchronize_sched(void) -{ - /* - * Any prior manipulation of RCU-protected data must happen - * before the load from ->gp_seq. - */ - smp_mb(); /* ^^^ */ - return rcu_seq_snap(&rcu_sched_state.gp_seq); -} -EXPORT_SYMBOL_GPL(get_state_synchronize_sched); - -/** - * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period - * - * @oldstate: return value from earlier call to get_state_synchronize_sched() - * - * If a full RCU-sched grace period has elapsed since the earlier call to - * get_state_synchronize_sched(), just return. Otherwise, invoke - * synchronize_sched() to wait for a full grace period. - * - * Yes, this function does not take counter wrap into account. But - * counter wrap is harmless. If the counter wraps, we have waited for - * more than 2 billion grace periods (and way more on a 64-bit system!), - * so waiting for one additional grace period should be just fine. - */ -void cond_synchronize_sched(unsigned long oldstate) -{ - if (!rcu_seq_done(&rcu_sched_state.gp_seq, oldstate)) - synchronize_sched(); - else - smp_mb(); /* Ensure GP ends before subsequent accesses. */ -} -EXPORT_SYMBOL_GPL(cond_synchronize_sched); - /* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, for the specified type of RCU, returning 1 if so. - * The checks are in order of increasing expense: checks that can be - * carried out against CPU-local state are performed first. However, - * we must check for CPU stalls first, else we might not get a chance. + * Check to see if there is any immediate RCU-related work to be done by + * the current CPU, returning 1 if so and zero otherwise. The checks are + * in order of increasing expense: checks that can be carried out against + * CPU-local state are performed first. However, we must check for CPU + * stalls first, else we might not get a chance. */ -static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) +static int rcu_pending(void) { + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rsp, rdp); + check_cpu_stall(rdp); /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ - if (rcu_nohz_full_cpu(rsp)) + if (rcu_nohz_full_cpu()) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ @@ -3288,7 +2973,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ - if (!rcu_gp_in_progress(rsp) && + if (!rcu_gp_in_progress() && rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; @@ -3307,21 +2992,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, returning 1 if so. This function is part of the - * RCU implementation; it is -not- an exported member of the RCU API. - */ -static int rcu_pending(void) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda))) - return 1; - return 0; -} - -/* * Return true if the specified CPU has any callback. If all_lazy is * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) @@ -3331,17 +3001,12 @@ static bool rcu_cpu_has_callbacks(bool *all_lazy) bool al = true; bool hc = false; struct rcu_data *rdp; - struct rcu_state *rsp; - for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); - if (rcu_segcblist_empty(&rdp->cblist)) - continue; + rdp = this_cpu_ptr(&rcu_data); + if (!rcu_segcblist_empty(&rdp->cblist)) { hc = true; - if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) { + if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) al = false; - break; - } } if (all_lazy) *all_lazy = al; @@ -3349,81 +3014,80 @@ static bool rcu_cpu_has_callbacks(bool *all_lazy) } /* - * Helper function for _rcu_barrier() tracing. If tracing is disabled, + * Helper function for rcu_barrier() tracing. If tracing is disabled, * the compiler is expected to optimize this away. */ -static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, - int cpu, unsigned long done) +static void rcu_barrier_trace(const char *s, int cpu, unsigned long done) { - trace_rcu_barrier(rsp->name, s, cpu, - atomic_read(&rsp->barrier_cpu_count), done); + trace_rcu_barrier(rcu_state.name, s, cpu, + atomic_read(&rcu_state.barrier_cpu_count), done); } /* - * RCU callback function for _rcu_barrier(). If we are last, wake - * up the task executing _rcu_barrier(). + * RCU callback function for rcu_barrier(). If we are last, wake + * up the task executing rcu_barrier(). */ static void rcu_barrier_callback(struct rcu_head *rhp) { - struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); - struct rcu_state *rsp = rdp->rsp; - - if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, TPS("LastCB"), -1, - rsp->barrier_sequence); - complete(&rsp->barrier_completion); + if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { + rcu_barrier_trace(TPS("LastCB"), -1, + rcu_state.barrier_sequence); + complete(&rcu_state.barrier_completion); } else { - _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence); + rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence); } } /* * Called with preemption disabled, and from cross-cpu IRQ context. */ -static void rcu_barrier_func(void *type) +static void rcu_barrier_func(void *unused) { - struct rcu_state *rsp = type; - struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); - _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence); + rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { - atomic_inc(&rsp->barrier_cpu_count); + atomic_inc(&rcu_state.barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); - _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1, - rsp->barrier_sequence); + rcu_barrier_trace(TPS("IRQNQ"), -1, + rcu_state.barrier_sequence); } } -/* - * Orchestrate the specified type of RCU barrier, waiting for all - * RCU callbacks of the specified type to complete. +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + * + * Note that this primitive does not necessarily wait for an RCU grace period + * to complete. For example, if there are no RCU callbacks queued anywhere + * in the system, then rcu_barrier() is within its rights to return + * immediately, without waiting for anything, much less an RCU grace period. */ -static void _rcu_barrier(struct rcu_state *rsp) +void rcu_barrier(void) { int cpu; struct rcu_data *rdp; - unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); + unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); - _rcu_barrier_trace(rsp, TPS("Begin"), -1, s); + rcu_barrier_trace(TPS("Begin"), -1, s); /* Take mutex to serialize concurrent rcu_barrier() requests. */ - mutex_lock(&rsp->barrier_mutex); + mutex_lock(&rcu_state.barrier_mutex); /* Did someone else do our work for us? */ - if (rcu_seq_done(&rsp->barrier_sequence, s)) { - _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1, - rsp->barrier_sequence); + if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { + rcu_barrier_trace(TPS("EarlyExit"), -1, + rcu_state.barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ - mutex_unlock(&rsp->barrier_mutex); + mutex_unlock(&rcu_state.barrier_mutex); return; } /* Mark the start of the barrier operation. */ - rcu_seq_start(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence); + rcu_seq_start(&rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence); /* * Initialize the count to one rather than to zero in order to @@ -3431,8 +3095,8 @@ static void _rcu_barrier(struct rcu_state *rsp) * (or preemption of this task). Exclude CPU-hotplug operations * to ensure that no offline CPU has callbacks queued. */ - init_completion(&rsp->barrier_completion); - atomic_set(&rsp->barrier_cpu_count, 1); + init_completion(&rcu_state.barrier_completion); + atomic_set(&rcu_state.barrier_cpu_count, 1); get_online_cpus(); /* @@ -3443,26 +3107,26 @@ static void _rcu_barrier(struct rcu_state *rsp) for_each_possible_cpu(cpu) { if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) continue; - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = per_cpu_ptr(&rcu_data, cpu); if (rcu_is_nocb_cpu(cpu)) { - if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { - _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu, - rsp->barrier_sequence); + if (!rcu_nocb_cpu_needs_barrier(cpu)) { + rcu_barrier_trace(TPS("OfflineNoCB"), cpu, + rcu_state.barrier_sequence); } else { - _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu, - rsp->barrier_sequence); + rcu_barrier_trace(TPS("OnlineNoCB"), cpu, + rcu_state.barrier_sequence); smp_mb__before_atomic(); - atomic_inc(&rsp->barrier_cpu_count); + atomic_inc(&rcu_state.barrier_cpu_count); __call_rcu(&rdp->barrier_head, - rcu_barrier_callback, rsp, cpu, 0); + rcu_barrier_callback, cpu, 0); } } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { - _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu, - rsp->barrier_sequence); - smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); + rcu_barrier_trace(TPS("OnlineQ"), cpu, + rcu_state.barrier_sequence); + smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); } else { - _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu, - rsp->barrier_sequence); + rcu_barrier_trace(TPS("OnlineNQ"), cpu, + rcu_state.barrier_sequence); } } put_online_cpus(); @@ -3471,37 +3135,20 @@ static void _rcu_barrier(struct rcu_state *rsp) * Now that we have an rcu_barrier_callback() callback on each * CPU, and thus each counted, remove the initial count. */ - if (atomic_dec_and_test(&rsp->barrier_cpu_count)) - complete(&rsp->barrier_completion); + if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) + complete(&rcu_state.barrier_completion); /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ - wait_for_completion(&rsp->barrier_completion); + wait_for_completion(&rcu_state.barrier_completion); /* Mark the end of the barrier operation. */ - _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence); - rcu_seq_end(&rsp->barrier_sequence); + rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence); + rcu_seq_end(&rcu_state.barrier_sequence); /* Other rcu_barrier() invocations can now safely proceed. */ - mutex_unlock(&rsp->barrier_mutex); -} - -/** - * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. - */ -void rcu_barrier_bh(void) -{ - _rcu_barrier(&rcu_bh_state); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -/** - * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. - */ -void rcu_barrier_sched(void) -{ - _rcu_barrier(&rcu_sched_state); + mutex_unlock(&rcu_state.barrier_mutex); } -EXPORT_SYMBOL_GPL(rcu_barrier_sched); +EXPORT_SYMBOL_GPL(rcu_barrier); /* * Propagate ->qsinitmask bits up the rcu_node tree to account for the @@ -3535,46 +3182,46 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) * Do boot-time initialization of a CPU's per-CPU RCU data. */ static void __init -rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) +rcu_boot_init_percpu_data(int cpu) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); - rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); - rdp->rcu_ofl_gp_seq = rsp->gp_seq; + WARN_ON_ONCE(rdp->dynticks_nesting != 1); + WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); + rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; - rdp->rcu_onl_gp_seq = rsp->gp_seq; + rdp->rcu_onl_gp_seq = rcu_state.gp_seq; rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; rdp->cpu = cpu; - rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); } /* - * Initialize a CPU's per-CPU RCU data. Note that only one online or + * Invoked early in the CPU-online process, when pretty much all services + * are available. The incoming CPU is not present. + * + * Initializes a CPU's per-CPU RCU data. Note that only one online or * offline event can be happening at a given time. Note also that we can * accept some slop in the rsp->gp_seq access due to the fact that this * CPU cannot possibly have any RCU callbacks in flight yet. */ -static void -rcu_init_percpu_data(int cpu, struct rcu_state *rsp) +int rcutree_prepare_cpu(unsigned int cpu) { unsigned long flags; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_node *rnp = rcu_get_root(); /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->qlen_last_fqs_check = 0; - rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->blimit = blimit; if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ !init_nocb_callback_list(rdp)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ - rdp->dynticks->dynticks_nesting = 1; /* CPU not up, no tearing. */ + rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ @@ -3589,25 +3236,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gp_seq = rnp->gp_seq; rdp->gp_seq_needed = rnp->gp_seq; rdp->cpu_no_qs.b.norm = true; - rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; - trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuonl")); + trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -} - -/* - * Invoked early in the CPU-online process, when pretty much all - * services are available. The incoming CPU is not present. - */ -int rcutree_prepare_cpu(unsigned int cpu) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - rcu_init_percpu_data(cpu, rsp); - rcu_prepare_kthreads(cpu); rcu_spawn_all_nocb_kthreads(cpu); @@ -3619,7 +3252,7 @@ int rcutree_prepare_cpu(unsigned int cpu) */ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) { - struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); } @@ -3633,15 +3266,12 @@ int rcutree_online_cpu(unsigned int cpu) unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_state *rsp; - for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->ffmask |= rdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask |= rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (IS_ENABLED(CONFIG_TREE_SRCU)) srcu_online_cpu(cpu); if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) @@ -3660,15 +3290,12 @@ int rcutree_offline_cpu(unsigned int cpu) unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_state *rsp; - for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->ffmask &= ~rdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask &= ~rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcutree_affinity_setting(cpu, cpu); if (IS_ENABLED(CONFIG_TREE_SRCU)) @@ -3676,32 +3303,6 @@ int rcutree_offline_cpu(unsigned int cpu) return 0; } -/* - * Near the end of the offline process. We do only tracing here. - */ -int rcutree_dying_cpu(unsigned int cpu) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - rcu_cleanup_dying_cpu(rsp); - return 0; -} - -/* - * The outgoing CPU is gone and we are running elsewhere. - */ -int rcutree_dead_cpu(unsigned int cpu) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) { - rcu_cleanup_dead_cpu(cpu, rsp); - do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); - } - return 0; -} - static DEFINE_PER_CPU(int, rcu_cpu_started); /* @@ -3723,137 +3324,113 @@ void rcu_cpu_starting(unsigned int cpu) unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_state *rsp; if (per_cpu(rcu_cpu_started, cpu)) return; per_cpu(rcu_cpu_started, cpu) = 1; - for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - mask = rdp->grpmask; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->qsmaskinitnext |= mask; - oldmask = rnp->expmaskinitnext; - rnp->expmaskinitnext |= mask; - oldmask ^= rnp->expmaskinitnext; - nbits = bitmap_weight(&oldmask, BITS_PER_LONG); - /* Allow lockless access for expedited grace periods. */ - smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ - rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ - rdp->rcu_onl_gp_seq = READ_ONCE(rsp->gp_seq); - rdp->rcu_onl_gp_flags = READ_ONCE(rsp->gp_flags); - if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ - /* Report QS -after- changing ->qsmaskinitnext! */ - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); - } else { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->qsmaskinitnext |= mask; + oldmask = rnp->expmaskinitnext; + rnp->expmaskinitnext |= mask; + oldmask ^= rnp->expmaskinitnext; + nbits = bitmap_weight(&oldmask, BITS_PER_LONG); + /* Allow lockless access for expedited grace periods. */ + smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ + rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); + rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); + if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ + /* Report QS -after- changing ->qsmaskinitnext! */ + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); + } else { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } #ifdef CONFIG_HOTPLUG_CPU /* - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinitnext - * bit masks. + * The outgoing function has no further need of RCU, so remove it from + * the rcu_node tree's ->qsmaskinitnext bit masks. + * + * Note that this function is special in that it is invoked directly + * from the outgoing CPU rather than from the cpuhp_step mechanism. + * This is because this function must be invoked at a precise location. */ -static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +void rcu_report_dead(unsigned int cpu) { unsigned long flags; unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + /* QS for any half-done expedited grace period. */ + preempt_disable(); + rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); + preempt_enable(); + rcu_preempt_deferred_qs(current); + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - spin_lock(&rsp->ofl_lock); + raw_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ - rdp->rcu_ofl_gp_seq = READ_ONCE(rsp->gp_seq); - rdp->rcu_ofl_gp_flags = READ_ONCE(rsp->gp_flags); + rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); + rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ - rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - spin_unlock(&rsp->ofl_lock); -} - -/* - * The outgoing function has no further need of RCU, so remove it from - * the list of CPUs that RCU must track. - * - * Note that this function is special in that it is invoked directly - * from the outgoing CPU rather than from the cpuhp_step mechanism. - * This is because this function must be invoked at a precise location. - */ -void rcu_report_dead(unsigned int cpu) -{ - struct rcu_state *rsp; - - /* QS for any half-done expedited RCU-sched GP. */ - preempt_disable(); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(rcu_sched_state.rda), true); - preempt_enable(); - for_each_rcu_flavor(rsp) - rcu_cleanup_dying_idle_cpu(cpu, rsp); + raw_spin_unlock(&rcu_state.ofl_lock); per_cpu(rcu_cpu_started, cpu) = 0; } -/* Migrate the dead CPU's callbacks to the current CPU. */ -static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) +/* + * The outgoing CPU has just passed through the dying-idle state, and we + * are being invoked from the CPU that was IPIed to continue the offline + * operation. Migrate the outgoing CPU's callbacks to the current CPU. + */ +void rcutree_migrate_callbacks(int cpu) { unsigned long flags; struct rcu_data *my_rdp; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_node *rnp_root = rcu_get_root(); bool needwake; if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ local_irq_save(flags); - my_rdp = this_cpu_ptr(rsp->rda); + my_rdp = this_cpu_ptr(&rcu_data); if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { local_irq_restore(flags); return; } raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ /* Leverage recent GPs and set GP for new callbacks. */ - needwake = rcu_advance_cbs(rsp, rnp_root, rdp) || - rcu_advance_cbs(rsp, rnp_root, my_rdp); + needwake = rcu_advance_cbs(rnp_root, rdp) || + rcu_advance_cbs(rnp_root, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_gp_kthread_wake(); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", cpu, rcu_segcblist_n_cbs(&rdp->cblist), rcu_segcblist_first_cb(&rdp->cblist)); } - -/* - * The outgoing CPU has just passed through the dying-idle state, - * and we are being invoked from the CPU that was IPIed to continue the - * offline operation. We need to migrate the outgoing CPU's callbacks. - */ -void rcutree_migrate_callbacks(int cpu) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - rcu_migrate_callbacks(cpu, rsp); -} #endif /* @@ -3881,14 +3458,13 @@ static int rcu_pm_notify(struct notifier_block *self, } /* - * Spawn the kthreads that handle each RCU flavor's grace periods. + * Spawn the kthreads that handle RCU's grace periods. */ static int __init rcu_spawn_gp_kthread(void) { unsigned long flags; int kthread_prio_in = kthread_prio; struct rcu_node *rnp; - struct rcu_state *rsp; struct sched_param sp; struct task_struct *t; @@ -3908,19 +3484,17 @@ static int __init rcu_spawn_gp_kthread(void) kthread_prio, kthread_prio_in); rcu_scheduler_fully_active = 1; - for_each_rcu_flavor(rsp) { - t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); - BUG_ON(IS_ERR(t)); - rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rsp->gp_kthread = t; - if (kthread_prio) { - sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - wake_up_process(t); + t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); + BUG_ON(IS_ERR(t)); + rnp = rcu_get_root(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_state.gp_kthread = t; + if (kthread_prio) { + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + wake_up_process(t); rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); return 0; @@ -3947,9 +3521,9 @@ void rcu_scheduler_starting(void) } /* - * Helper function for rcu_init() that initializes one rcu_state structure. + * Helper function for rcu_init() that initializes the rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp) +static void __init rcu_init_one(void) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; @@ -3971,14 +3545,15 @@ static void __init rcu_init_one(struct rcu_state *rsp) /* Initialize the level-tracking arrays. */ for (i = 1; i < rcu_num_lvls; i++) - rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1]; + rcu_state.level[i] = + rcu_state.level[i - 1] + num_rcu_lvl[i - 1]; rcu_init_levelspread(levelspread, num_rcu_lvl); /* Initialize the elements themselves, starting from the leaves. */ for (i = rcu_num_lvls - 1; i >= 0; i--) { cpustride *= levelspread[i]; - rnp = rsp->level[i]; + rnp = rcu_state.level[i]; for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) { raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), @@ -3986,9 +3561,9 @@ static void __init rcu_init_one(struct rcu_state *rsp) raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, &rcu_fqs_class[i], fqs[i]); - rnp->gp_seq = rsp->gp_seq; - rnp->gp_seq_needed = rsp->gp_seq; - rnp->completedqs = rsp->gp_seq; + rnp->gp_seq = rcu_state.gp_seq; + rnp->gp_seq_needed = rcu_state.gp_seq; + rnp->completedqs = rcu_state.gp_seq; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; @@ -4001,8 +3576,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->parent = NULL; } else { rnp->grpnum = j % levelspread[i - 1]; - rnp->grpmask = 1UL << rnp->grpnum; - rnp->parent = rsp->level[i - 1] + + rnp->grpmask = BIT(rnp->grpnum); + rnp->parent = rcu_state.level[i - 1] + j / levelspread[i - 1]; } rnp->level = i; @@ -4016,16 +3591,15 @@ static void __init rcu_init_one(struct rcu_state *rsp) } } - init_swait_queue_head(&rsp->gp_wq); - init_swait_queue_head(&rsp->expedited_wq); - rnp = rcu_first_leaf_node(rsp); + init_swait_queue_head(&rcu_state.gp_wq); + init_swait_queue_head(&rcu_state.expedited_wq); + rnp = rcu_first_leaf_node(); for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; - per_cpu_ptr(rsp->rda, i)->mynode = rnp; - rcu_boot_init_percpu_data(i, rsp); + per_cpu_ptr(&rcu_data, i)->mynode = rnp; + rcu_boot_init_percpu_data(i); } - list_add(&rsp->flavors, &rcu_struct_flavors); } /* @@ -4051,6 +3625,8 @@ static void __init rcu_init_geometry(void) jiffies_till_first_fqs = d; if (jiffies_till_next_fqs == ULONG_MAX) jiffies_till_next_fqs = d; + if (jiffies_till_sched_qs == ULONG_MAX) + adjust_jiffies_till_sched_qs(); /* If the compile-time values are accurate, just leave. */ if (rcu_fanout_leaf == RCU_FANOUT_LEAF && @@ -4109,16 +3685,16 @@ static void __init rcu_init_geometry(void) /* * Dump out the structure of the rcu_node combining tree associated - * with the rcu_state structure referenced by rsp. + * with the rcu_state structure. */ -static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) +static void __init rcu_dump_rcu_node_tree(void) { int level = 0; struct rcu_node *rnp; pr_info("rcu_node tree layout dump\n"); pr_info(" "); - rcu_for_each_node_breadth_first(rsp, rnp) { + rcu_for_each_node_breadth_first(rnp) { if (rnp->level != level) { pr_cont("\n"); pr_info(" "); @@ -4140,11 +3716,9 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_bh_state); - rcu_init_one(&rcu_sched_state); + rcu_init_one(); if (dump_tree) - rcu_dump_rcu_node_tree(&rcu_sched_state); - __rcu_init_preempt(); + rcu_dump_rcu_node_tree(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* @@ -4164,6 +3738,7 @@ void __init rcu_init(void) WARN_ON(!rcu_gp_wq); rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_par_gp_wq); + srcu_init(); } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4e74df768c57..703e19ff532d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -34,34 +34,9 @@ #include "rcu_segcblist.h" -/* - * Dynticks per-CPU state. - */ -struct rcu_dynticks { - long dynticks_nesting; /* Track process nesting level. */ - long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ - atomic_t dynticks; /* Even value for idle, else odd. */ - bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ - unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ - bool rcu_urgent_qs; /* GP old need light quiescent state. */ -#ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* Are all CPU's CBs lazy? */ - unsigned long nonlazy_posted; - /* # times non-lazy CBs posted to CPU. */ - unsigned long nonlazy_posted_snap; - /* idle-period nonlazy_posted snapshot. */ - unsigned long last_accelerate; - /* Last jiffy CBs were accelerated. */ - unsigned long last_advance_all; - /* Last jiffy CBs were all advanced. */ - int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ -#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -}; - /* Communicate arguments to a workqueue handler. */ struct rcu_exp_work { smp_call_func_t rew_func; - struct rcu_state *rew_rsp; unsigned long rew_s; struct work_struct rew_work; }; @@ -170,7 +145,7 @@ struct rcu_node { * are indexed relative to this interval rather than the global CPU ID space. * This generates the bit for a CPU in node-local masks. */ -#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) +#define leaf_node_cpu_bit(rnp, cpu) (BIT((cpu) - (rnp)->grplo)) /* * Union to allow "aggregate OR" operation on the need for a quiescent @@ -189,12 +164,11 @@ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ unsigned long gp_seq_needed; /* Track rsp->rcu_gp_seq_needed ctr. */ - unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ - /* for rcu_all_qs() invocations. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ + bool deferred_qs; /* This CPU awaiting a deferred QS? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ @@ -213,23 +187,27 @@ struct rcu_data { long blimit; /* Upper limit on a processed batch */ /* 3) dynticks interface. */ - struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ int dynticks_snap; /* Per-GP tracking for dynticks. */ - - /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ - unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ - unsigned long cond_resched_completed; - /* Grace period that needs help */ - /* from cond_resched(). */ - - /* 5) _rcu_barrier(), OOM callbacks, and expediting. */ - struct rcu_head barrier_head; + long dynticks_nesting; /* Track process nesting level. */ + long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ + atomic_t dynticks; /* Even value for idle, else odd. */ + bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ + bool rcu_urgent_qs; /* GP old need light quiescent state. */ #ifdef CONFIG_RCU_FAST_NO_HZ - struct rcu_head oom_head; + bool all_lazy; /* Are all CPU's CBs lazy? */ + unsigned long nonlazy_posted; /* # times non-lazy CB posted to CPU. */ + unsigned long nonlazy_posted_snap; + /* Nonlazy_posted snapshot. */ + unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ + unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ + int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + + /* 4) rcu_barrier(), OOM callbacks, and expediting. */ + struct rcu_head barrier_head; int exp_dynticks_snap; /* Double-check need for IPI. */ - /* 6) Callback offloading. */ + /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU struct rcu_head *nocb_head; /* CBs waiting for kthread. */ struct rcu_head **nocb_tail; @@ -256,7 +234,7 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 7) Diagnostic data, including RCU CPU stall warnings. */ + /* 6) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ @@ -266,9 +244,9 @@ struct rcu_data { short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */ unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ short rcu_onl_gp_flags; /* ->gp_flags at last online. */ + unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ int cpu; - struct rcu_state *rsp; }; /* Values for nocb_defer_wakeup field in struct rcu_data. */ @@ -314,8 +292,6 @@ struct rcu_state { struct rcu_node *level[RCU_NUM_LVLS + 1]; /* Hierarchy levels (+1 to */ /* shut bogus gcc warning) */ - struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ - call_rcu_func_t call; /* call_rcu() flavor. */ int ncpus; /* # CPUs seen so far. */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -334,7 +310,7 @@ struct rcu_state { atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ unsigned long barrier_sequence; /* ++ at start and end of */ - /* _rcu_barrier(). */ + /* rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ struct mutex exp_mutex; /* Serialize expedited GP. */ @@ -366,9 +342,8 @@ struct rcu_state { /* jiffies. */ const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ - struct list_head flavors; /* List of RCU flavors. */ - spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; + raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ }; @@ -388,7 +363,6 @@ struct rcu_state { #define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */ -#ifndef RCU_TREE_NONCORE static const char * const gp_state_names[] = { "RCU_GP_IDLE", "RCU_GP_WAIT_GPS", @@ -400,13 +374,29 @@ static const char * const gp_state_names[] = { "RCU_GP_CLEANUP", "RCU_GP_CLEANED", }; -#endif /* #ifndef RCU_TREE_NONCORE */ - -extern struct list_head rcu_struct_flavors; -/* Sequence through rcu_state structures for each RCU flavor. */ -#define for_each_rcu_flavor(rsp) \ - list_for_each_entry((rsp), &rcu_struct_flavors, flavors) +/* + * In order to export the rcu_state name to the tracing tools, it + * needs to be added in the __tracepoint_string section. + * This requires defining a separate variable tp_<sname>_varname + * that points to the string being used, and this will allow + * the tracing userspace tools to be able to decipher the string + * address to the matching string. + */ +#ifdef CONFIG_PREEMPT_RCU +#define RCU_ABBR 'p' +#define RCU_NAME_RAW "rcu_preempt" +#else /* #ifdef CONFIG_PREEMPT_RCU */ +#define RCU_ABBR 's' +#define RCU_NAME_RAW "rcu_sched" +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +#ifndef CONFIG_TRACING +#define RCU_NAME RCU_NAME_RAW +#else /* #ifdef CONFIG_TRACING */ +static char rcu_name[] = RCU_NAME_RAW; +static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; +#define RCU_NAME rcu_name +#endif /* #else #ifdef CONFIG_TRACING */ /* * RCU implementation internal declarations: @@ -419,7 +409,7 @@ extern struct rcu_state rcu_bh_state; extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ -int rcu_dynticks_snap(struct rcu_dynticks *rdtp); +int rcu_dynticks_snap(struct rcu_data *rdp); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -428,45 +418,37 @@ DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DECLARE_PER_CPU(char, rcu_cpu_has_work); #endif /* #ifdef CONFIG_RCU_BOOST */ -#ifndef RCU_TREE_NONCORE - /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -static void rcu_preempt_note_context_switch(bool preempt); +static void rcu_qs(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_print_detail_task_stall(struct rcu_state *rsp); +static void rcu_print_detail_task_stall(void); static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); -static void rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, - struct rcu_node *rnp); -static void rcu_preempt_check_callbacks(void); +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +static void rcu_flavor_check_callbacks(int user); void call_rcu(struct rcu_head *head, rcu_callback_t func); -static void __init __rcu_init_preempt(void); -static void dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, - int ncheck); +static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); -#ifdef CONFIG_RCU_BOOST -static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp); -#endif /* #ifdef CONFIG_RCU_BOOST */ static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); static void rcu_prepare_for_idle(void); static void rcu_idle_count_callbacks_posted(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); +static bool rcu_preempt_need_deferred_qs(struct task_struct *t); +static void rcu_preempt_deferred_qs(struct task_struct *t); static void print_cpu_stall_info_begin(void); -static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); +static void print_cpu_stall_info(int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); -static void increment_cpu_stall_ticks(void); -static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); +static bool rcu_nocb_cpu_needs_barrier(int cpu); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); @@ -481,11 +463,11 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_all_nocb_kthreads(int cpu); static void __init rcu_spawn_nocb_kthreads(void); #ifdef CONFIG_RCU_NOCB_CPU -static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); +static void __init rcu_organize_nocb_kthreads(void); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_bind_gp_kthread(void); -static bool rcu_nohz_full_cpu(struct rcu_state *rsp); +static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); @@ -496,5 +478,3 @@ void srcu_offline_cpu(unsigned int cpu); void srcu_online_cpu(unsigned int cpu) { } void srcu_offline_cpu(unsigned int cpu) { } #endif /* #else #ifdef CONFIG_SRCU */ - -#endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 0b2c2ad69629..8d18c1014e2b 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -25,39 +25,39 @@ /* * Record the start of an expedited grace period. */ -static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +static void rcu_exp_gp_seq_start(void) { - rcu_seq_start(&rsp->expedited_sequence); + rcu_seq_start(&rcu_state.expedited_sequence); } /* * Return then value that expedited-grace-period counter will have * at the end of the current grace period. */ -static __maybe_unused unsigned long rcu_exp_gp_seq_endval(struct rcu_state *rsp) +static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) { - return rcu_seq_endval(&rsp->expedited_sequence); + return rcu_seq_endval(&rcu_state.expedited_sequence); } /* * Record the end of an expedited grace period. */ -static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +static void rcu_exp_gp_seq_end(void) { - rcu_seq_end(&rsp->expedited_sequence); + rcu_seq_end(&rcu_state.expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } /* * Take a snapshot of the expedited-grace-period counter. */ -static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +static unsigned long rcu_exp_gp_seq_snap(void) { unsigned long s; smp_mb(); /* Caller's modifications seen first by other CPUs. */ - s = rcu_seq_snap(&rsp->expedited_sequence); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + s = rcu_seq_snap(&rcu_state.expedited_sequence); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("snap")); return s; } @@ -66,9 +66,9 @@ static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) * if a full expedited grace period has elapsed since that snapshot * was taken. */ -static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +static bool rcu_exp_gp_seq_done(unsigned long s) { - return rcu_seq_done(&rsp->expedited_sequence, s); + return rcu_seq_done(&rcu_state.expedited_sequence, s); } /* @@ -78,26 +78,26 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) * ever been online. This means that this function normally takes its * no-work-to-do fastpath. */ -static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +static void sync_exp_reset_tree_hotplug(void) { bool done; unsigned long flags; unsigned long mask; unsigned long oldmask; - int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */ + int ncpus = smp_load_acquire(&rcu_state.ncpus); /* Order vs. locking. */ struct rcu_node *rnp; struct rcu_node *rnp_up; /* If no new CPUs onlined since last time, nothing to do. */ - if (likely(ncpus == rsp->ncpus_snap)) + if (likely(ncpus == rcu_state.ncpus_snap)) return; - rsp->ncpus_snap = ncpus; + rcu_state.ncpus_snap = ncpus; /* * Each pass through the following loop propagates newly onlined * CPUs for the current rcu_node structure up the rcu_node tree. */ - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -135,13 +135,13 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) * Reset the ->expmask values in the rcu_node tree in preparation for * a new expedited grace period. */ -static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +static void __maybe_unused sync_exp_reset_tree(void) { unsigned long flags; struct rcu_node *rnp; - sync_exp_reset_tree_hotplug(rsp); - rcu_for_each_node_breadth_first(rsp, rnp) { + sync_exp_reset_tree_hotplug(); + rcu_for_each_node_breadth_first(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; @@ -194,7 +194,7 @@ static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) * * Caller must hold the specified rcu_node structure's ->lock. */ -static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, +static void __rcu_report_exp_rnp(struct rcu_node *rnp, bool wake, unsigned long flags) __releases(rnp->lock) { @@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - swake_up_one(&rsp->expedited_wq); + swake_up_one(&rcu_state.expedited_wq); } break; } @@ -229,20 +229,19 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, bool wake) +static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) { unsigned long flags; raw_spin_lock_irqsave_rcu_node(rnp, flags); - __rcu_report_exp_rnp(rsp, rnp, wake, flags); + __rcu_report_exp_rnp(rnp, wake, flags); } /* * Report expedited quiescent state for multiple CPUs, all covered by the * specified leaf rcu_node structure. */ -static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, +static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long mask, bool wake) { unsigned long flags; @@ -253,23 +252,23 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, return; } rnp->expmask &= ~mask; - __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ + __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */ } /* * Report expedited quiescent state for specified rcu_data (CPU). */ -static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, - bool wake) +static void rcu_report_exp_rdp(struct rcu_data *rdp) { - rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); + WRITE_ONCE(rdp->deferred_qs, false); + rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } -/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s) +/* Common code for work-done checking. */ +static bool sync_exp_work_done(unsigned long s) { - if (rcu_exp_gp_seq_done(rsp, s)) { - trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); + if (rcu_exp_gp_seq_done(s)) { + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ return true; @@ -284,28 +283,28 @@ static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s) * with the mutex held, indicating that the caller must actually do the * expedited grace period. */ -static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +static bool exp_funnel_lock(unsigned long s) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); struct rcu_node *rnp = rdp->mynode; - struct rcu_node *rnp_root = rcu_get_root(rsp); + struct rcu_node *rnp_root = rcu_get_root(); /* Low-contention fastpath. */ if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && (rnp == rnp_root || ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && - mutex_trylock(&rsp->exp_mutex)) + mutex_trylock(&rcu_state.exp_mutex)) goto fastpath; /* * Each pass through the following loop works its way up * the rcu_node tree, returning if others have done the work or - * otherwise falls through to acquire rsp->exp_mutex. The mapping + * otherwise falls through to acquire ->exp_mutex. The mapping * from CPU to rcu_node structure can be inexact, as it is just * promoting locality and is not strictly needed for correctness. */ for (; rnp != NULL; rnp = rnp->parent) { - if (sync_exp_work_done(rsp, s)) + if (sync_exp_work_done(s)) return true; /* Work not done, either wait here or go up. */ @@ -314,68 +313,29 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) /* Someone else doing GP, so wait for them. */ spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level, rnp->grplo, rnp->grphi, TPS("wait")); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(rsp, s)); + sync_exp_work_done(s)); return true; } rnp->exp_seq_rq = s; /* Followers can wait on us. */ spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, - rnp->grphi, TPS("nxtlvl")); + trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level, + rnp->grplo, rnp->grphi, TPS("nxtlvl")); } - mutex_lock(&rsp->exp_mutex); + mutex_lock(&rcu_state.exp_mutex); fastpath: - if (sync_exp_work_done(rsp, s)) { - mutex_unlock(&rsp->exp_mutex); + if (sync_exp_work_done(s)) { + mutex_unlock(&rcu_state.exp_mutex); return true; } - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + rcu_exp_gp_seq_start(); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("start")); return false; } -/* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *data) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - struct rcu_state *rsp = data; - - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - if (rcu_is_cpu_rrupt_from_idle()) { - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); - return; - } - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); - /* Store .exp before .rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - resched_cpu(smp_processor_id()); -} - -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_sched_state; - - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) - return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); - WARN_ON_ONCE(ret); -} - /* * Select the CPUs within the specified rcu_node that the upcoming * expedited grace period needs to wait for. @@ -391,7 +351,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) struct rcu_exp_work *rewp = container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); - struct rcu_state *rsp = rewp->rew_rsp; func = rewp->rew_func; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -400,15 +359,14 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) mask_ofl_test = 0; for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int snap; if (raw_smp_processor_id() == cpu || !(rnp->qsmaskinitnext & mask)) { mask_ofl_test |= mask; } else { - snap = rcu_dynticks_snap(rdtp); + snap = rcu_dynticks_snap(rdp); if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; else @@ -429,17 +387,16 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) /* IPI the remaining CPUs for expedited quiescent state. */ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); if (!(mask_ofl_ipi & mask)) continue; retry_ipi: - if (rcu_dynticks_in_eqs_since(rdp->dynticks, - rdp->exp_dynticks_snap)) { + if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { mask_ofl_test |= mask; continue; } - ret = smp_call_function_single(cpu, func, rsp, 0); + ret = smp_call_function_single(cpu, func, NULL, 0); if (!ret) { mask_ofl_ipi &= ~mask; continue; @@ -450,7 +407,7 @@ retry_ipi: (rnp->expmask & mask)) { /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); + trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl")); schedule_timeout_uninterruptible(1); goto retry_ipi; } @@ -462,33 +419,31 @@ retry_ipi: /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); } /* * Select the nodes that the upcoming expedited grace period needs * to wait for. */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) +static void sync_rcu_exp_select_cpus(smp_call_func_t func) { int cpu; struct rcu_node *rnp; - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); - sync_exp_reset_tree(rsp); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); + trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset")); + sync_exp_reset_tree(); + trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("select")); /* Schedule work for each leaf rcu_node structure. */ - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ rnp->rew.rew_func = func; - rnp->rew.rew_rsp = rsp; if (!READ_ONCE(rcu_par_gp_wq) || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || - rcu_is_last_leaf_node(rsp, rnp)) { + rcu_is_last_leaf_node(rnp)) { /* No workqueues yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; @@ -505,12 +460,12 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, } /* Wait for workqueue jobs (if any) to complete. */ - rcu_for_each_leaf_node(rsp, rnp) + rcu_for_each_leaf_node(rnp) if (rnp->exp_need_flush) flush_work(&rnp->rew.rew_work); } -static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +static void synchronize_sched_expedited_wait(void) { int cpu; unsigned long jiffies_stall; @@ -518,16 +473,16 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) unsigned long mask; int ndetected; struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(rsp); + struct rcu_node *rnp_root = rcu_get_root(); int ret; - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("startwait")); + trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); jiffies_stall = rcu_jiffies_till_stall_check(); jiffies_start = jiffies; for (;;) { ret = swait_event_timeout_exclusive( - rsp->expedited_wq, + rcu_state.expedited_wq, sync_rcu_preempt_exp_done_unlocked(rnp_root), jiffies_stall); if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) @@ -537,9 +492,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) continue; panic_on_rcu_stall(); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rsp->name); + rcu_state.name); ndetected = 0; - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { ndetected += rcu_print_task_exp_stall(rnp); for_each_leaf_node_possible_cpu(rnp, cpu) { struct rcu_data *rdp; @@ -548,7 +503,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) if (!(rnp->expmask & mask)) continue; ndetected++; - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = per_cpu_ptr(&rcu_data, cpu); pr_cont(" %d-%c%c%c", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], @@ -556,11 +511,11 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rsp->expedited_sequence, + jiffies - jiffies_start, rcu_state.expedited_sequence, rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); if (ndetected) { pr_err("blocking rcu_node structures:"); - rcu_for_each_node_breadth_first(rsp, rnp) { + rcu_for_each_node_breadth_first(rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ if (sync_rcu_preempt_exp_done_unlocked(rnp)) @@ -572,7 +527,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } pr_cont("\n"); } - rcu_for_each_leaf_node(rsp, rnp) { + rcu_for_each_leaf_node(rnp) { for_each_leaf_node_possible_cpu(rnp, cpu) { mask = leaf_node_cpu_bit(rnp, cpu); if (!(rnp->expmask & mask)) @@ -590,21 +545,21 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) * grace period. Also update all the ->exp_seq_rq counters as needed * in order to avoid counter-wrap problems. */ -static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +static void rcu_exp_wait_wake(unsigned long s) { struct rcu_node *rnp; - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + synchronize_sched_expedited_wait(); + rcu_exp_gp_seq_end(); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end")); /* * Switch over to wakeup mode, allowing the next GP, but -only- the * next GP, to proceed. */ - mutex_lock(&rsp->exp_wake_mutex); + mutex_lock(&rcu_state.exp_wake_mutex); - rcu_for_each_node_breadth_first(rsp, rnp) { + rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { spin_lock(&rnp->exp_lock); /* Recheck, avoid hang in case someone just arrived. */ @@ -613,24 +568,23 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) spin_unlock(&rnp->exp_lock); } smp_mb(); /* All above changes before wakeup. */ - wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]); + wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]); } - trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_wake_mutex); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake")); + mutex_unlock(&rcu_state.exp_wake_mutex); } /* * Common code to drive an expedited grace period forward, used by * workqueues and mid-boot-time tasks. */ -static void rcu_exp_sel_wait_wake(struct rcu_state *rsp, - smp_call_func_t func, unsigned long s) +static void rcu_exp_sel_wait_wake(smp_call_func_t func, unsigned long s) { /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, func); + sync_rcu_exp_select_cpus(func); /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rsp, s); + rcu_exp_wait_wake(s); } /* @@ -641,15 +595,14 @@ static void wait_rcu_exp_gp(struct work_struct *wp) struct rcu_exp_work *rewp; rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s); + rcu_exp_sel_wait_wake(rewp->rew_func, rewp->rew_s); } /* - * Given an rcu_state pointer and a smp_call_function() handler, kick - * off the specified flavor of expedited grace period. + * Given a smp_call_function() handler, kick off the specified + * implementation of expedited grace period. */ -static void _synchronize_rcu_expedited(struct rcu_state *rsp, - smp_call_func_t func) +static void _synchronize_rcu_expedited(smp_call_func_t func) { struct rcu_data *rdp; struct rcu_exp_work rew; @@ -658,71 +611,37 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, /* If expedited grace periods are prohibited, fall back to normal. */ if (rcu_gp_is_normal()) { - wait_rcu_gp(rsp->call); + wait_rcu_gp(call_rcu); return; } /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) + s = rcu_exp_gp_seq_snap(); + if (exp_funnel_lock(s)) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { /* Direct call during scheduler init and early_initcalls(). */ - rcu_exp_sel_wait_wake(rsp, func, s); + rcu_exp_sel_wait_wake(func, s); } else { /* Marshall arguments & schedule the expedited grace period. */ rew.rew_func = func; - rew.rew_rsp = rsp; rew.rew_s = s; INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); queue_work(rcu_gp_wq, &rew.rew_work); } /* Wait for expedited grace period to complete. */ - rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - rnp = rcu_get_root(rsp); + rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); + rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(rsp, s)); + sync_exp_work_done(s)); smp_mb(); /* Workqueue actions happen before return. */ /* Let the next expedited grace period start. */ - mutex_unlock(&rsp->exp_mutex); -} - -/** - * synchronize_sched_expedited - Brute-force RCU-sched grace period - * - * Wait for an RCU-sched grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_sched_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_sched() instead. - * - * This implementation can be thought of as an application of sequence - * locking to expedited grace periods, but using the sequence counter to - * determine when someone else has already done the work instead of for - * retrying readers. - */ -void synchronize_sched_expedited(void) -{ - struct rcu_state *rsp = &rcu_sched_state; - - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched_expedited() in RCU read-side critical section"); - - /* If only one CPU, this is automatically a grace period. */ - if (rcu_blocking_is_gp()) - return; - - _synchronize_rcu_expedited(rsp, sync_sched_exp_handler); + mutex_unlock(&rcu_state.exp_mutex); } -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); #ifdef CONFIG_PREEMPT_RCU @@ -733,34 +652,78 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); * ->expmask fields in the rcu_node tree. Otherwise, immediately * report the quiescent state. */ -static void sync_rcu_exp_handler(void *info) +static void sync_rcu_exp_handler(void *unused) { - struct rcu_data *rdp; - struct rcu_state *rsp = info; + unsigned long flags; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; struct task_struct *t = current; /* - * Within an RCU read-side critical section, request that the next - * rcu_read_unlock() report. Unless this RCU read-side critical - * section has already blocked, in which case it is already set - * up for the expedited grace period to wait on it. + * First, the common case of not being in an RCU read-side + * critical section. If also enabled or idle, immediately + * report the quiescent state, otherwise defer. */ - if (t->rcu_read_lock_nesting > 0 && - !t->rcu_read_unlock_special.b.blocked) { - t->rcu_read_unlock_special.b.exp_need_qs = true; + if (!t->rcu_read_lock_nesting) { + if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || + rcu_dynticks_curr_cpu_in_eqs()) { + rcu_report_exp_rdp(rdp); + } else { + rdp->deferred_qs = true; + set_tsk_need_resched(t); + set_preempt_need_resched(); + } return; } /* - * We are either exiting an RCU read-side critical section (negative - * values of t->rcu_read_lock_nesting) or are not in one at all - * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU - * read-side critical section that blocked before this expedited - * grace period started. Either way, we can immediately report - * the quiescent state. + * Second, the less-common case of being in an RCU read-side + * critical section. In this case we can count on a future + * rcu_read_unlock(). However, this rcu_read_unlock() might + * execute on some other CPU, but in that case there will be + * a future context switch. Either way, if the expedited + * grace period is still waiting on this CPU, set ->deferred_qs + * so that the eventual quiescent state will be reported. + * Note that there is a large group of race conditions that + * can have caused this quiescent state to already have been + * reported, so we really do need to check ->expmask. + */ + if (t->rcu_read_lock_nesting > 0) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (rnp->expmask & rdp->grpmask) + rdp->deferred_qs = true; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + + /* + * The final and least likely case is where the interrupted + * code was just about to or just finished exiting the RCU-preempt + * read-side critical section, and no, we can't tell which. + * So either way, set ->deferred_qs to flag later code that + * a quiescent state is required. + * + * If the CPU is fully enabled (or if some buggy RCU-preempt + * read-side critical section is being used from idle), just + * invoke rcu_preempt_defer_qs() to immediately report the + * quiescent state. We cannot use rcu_read_unlock_special() + * because we are in an interrupt handler, which will cause that + * function to take an early exit without doing anything. + * + * Otherwise, force a context switch after the CPU enables everything. */ - rdp = this_cpu_ptr(rsp->rda); - rcu_report_exp_rdp(rsp, rdp, true); + rdp->deferred_qs = true; + if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { + rcu_preempt_deferred_qs(t); + } else { + set_tsk_need_resched(t); + set_preempt_need_resched(); + } +} + +/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ } /** @@ -780,11 +743,11 @@ static void sync_rcu_exp_handler(void *info) * you are using synchronize_rcu_expedited() in a loop, please restructure * your code to batch your updates, and then Use a single synchronize_rcu() * instead. + * + * This has the same semantics as (but is more brutal than) synchronize_rcu(). */ void synchronize_rcu_expedited(void) { - struct rcu_state *rsp = rcu_state_p; - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), @@ -792,19 +755,82 @@ void synchronize_rcu_expedited(void) if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; - _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler); + _synchronize_rcu_expedited(sync_rcu_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #else /* #ifdef CONFIG_PREEMPT_RCU */ +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static void sync_sched_exp_handler(void *unused) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = this_cpu_ptr(&rcu_data); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) + return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); + return; + } + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, NULL, 0); + WARN_ON_ONCE(ret); +} + /* - * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptible RCU does not exist, map to rcu-sched. + * Because a context switch is a grace period for !PREEMPT, any + * blocking grace-period wait automatically implies a grace period if + * there is only one CPU online at any point time during execution of + * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to + * occasionally incorrectly indicate that there are multiple CPUs online + * when there was in fact only one the whole time, as this just adds some + * overhead: RCU still operates correctly. */ +static int rcu_blocking_is_gp(void) +{ + int ret; + + might_sleep(); /* Check for RCU read-side critical section. */ + preempt_disable(); + ret = num_online_cpus() <= 1; + preempt_enable(); + return ret; +} + +/* PREEMPT=n implementation of synchronize_rcu_expedited(). */ void synchronize_rcu_expedited(void) { - synchronize_sched_expedited(); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); + + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + _synchronize_rcu_expedited(sync_sched_exp_handler); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a97c20ea9bce..05915e536336 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -38,8 +38,7 @@ #include "../locking/rtmutex_common.h" /* - * Control variables for per-CPU and per-rcu_node kthreads. These - * handle all flavors of RCU. + * Control variables for per-CPU and per-rcu_node kthreads. */ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -106,6 +105,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); if (jiffies_till_next_fqs != ULONG_MAX) pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs); + if (jiffies_till_sched_qs != ULONG_MAX) + pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs); if (rcu_kick_kthreads) pr_info("\tKick kthreads if too-long grace period.\n"); if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) @@ -123,12 +124,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_PREEMPT_RCU -RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); -static struct rcu_state *const rcu_state_p = &rcu_preempt_state; -static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; - -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake); +static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake); static void rcu_read_unlock_special(struct task_struct *t); /* @@ -284,13 +280,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) */ - if (blkd_state & RCU_EXP_BLKD && - t->rcu_read_unlock_special.b.exp_need_qs) { - t->rcu_read_unlock_special.b.exp_need_qs = false; - rcu_report_exp_rdp(rdp->rsp, rdp, true); - } else { - WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); - } + if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs) + rcu_report_exp_rdp(rdp); + else + WARN_ON_ONCE(rdp->deferred_qs); } /* @@ -306,15 +299,15 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * * Callers to this function must disable preemption. */ -static void rcu_preempt_qs(void) +static void rcu_qs(void) { - RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); - if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n"); + if (__this_cpu_read(rcu_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), - __this_cpu_read(rcu_data_p->gp_seq), + __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); - __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false); - barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ + __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); + barrier(); /* Coordinate with rcu_flavor_check_callbacks(). */ current->rcu_read_unlock_special.b.need_qs = false; } } @@ -332,19 +325,20 @@ static void rcu_preempt_qs(void) * * Caller must disable interrupts. */ -static void rcu_preempt_note_context_switch(bool preempt) +void rcu_note_context_switch(bool preempt) { struct task_struct *t = current; - struct rcu_data *rdp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp; + barrier(); /* Avoid RCU read-side critical sections leaking down. */ + trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); t->rcu_read_unlock_special.b.blocked = true; @@ -357,7 +351,7 @@ static void rcu_preempt_note_context_switch(bool preempt) */ WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); - trace_rcu_preempt_task(rdp->rsp->name, + trace_rcu_preempt_task(rcu_state.name, t->pid, (rnp->qsmask & rdp->grpmask) ? rnp->gp_seq @@ -371,6 +365,9 @@ static void rcu_preempt_note_context_switch(bool preempt) * behalf of preempted instance of __rcu_read_unlock(). */ rcu_read_unlock_special(t); + rcu_preempt_deferred_qs(t); + } else { + rcu_preempt_deferred_qs(t); } /* @@ -382,8 +379,13 @@ static void rcu_preempt_note_context_switch(bool preempt) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. */ - rcu_preempt_qs(); + rcu_qs(); + if (rdp->deferred_qs) + rcu_report_exp_rdp(rdp); + trace_rcu_utilization(TPS("End context switch")); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ } +EXPORT_SYMBOL_GPL(rcu_note_context_switch); /* * Check for preempted RCU readers blocking the current grace period @@ -464,74 +466,56 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) } /* - * Handle special cases during rcu_read_unlock(), such as needing to - * notify RCU core processing or task having blocked during the RCU - * read-side critical section. + * Report deferred quiescent states. The deferral time can + * be quite short, for example, in the case of the call from + * rcu_read_unlock_special(). */ -static void rcu_read_unlock_special(struct task_struct *t) +static void +rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) { bool empty_exp; bool empty_norm; bool empty_exp_now; - unsigned long flags; struct list_head *np; bool drop_boost_mutex = false; struct rcu_data *rdp; struct rcu_node *rnp; union rcu_special special; - /* NMI handlers cannot block and cannot safely manipulate state. */ - if (in_nmi()) - return; - - local_irq_save(flags); - /* * If RCU core is waiting for this CPU to exit its critical section, * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; + rdp = this_cpu_ptr(&rcu_data); + if (!special.s && !rdp->deferred_qs) { + local_irq_restore(flags); + return; + } if (special.b.need_qs) { - rcu_preempt_qs(); + rcu_qs(); t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s) { + if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { local_irq_restore(flags); return; } } /* - * Respond to a request for an expedited grace period, but only if - * we were not preempted, meaning that we were running on the same - * CPU throughout. If we were preempted, the exp_need_qs flag - * would have been cleared at the time of the first preemption, - * and the quiescent state would be reported when we were dequeued. + * Respond to a request by an expedited grace period for a + * quiescent state from this CPU. Note that requests from + * tasks are handled when removing the task from the + * blocked-tasks list below. */ - if (special.b.exp_need_qs) { - WARN_ON_ONCE(special.b.blocked); - t->rcu_read_unlock_special.b.exp_need_qs = false; - rdp = this_cpu_ptr(rcu_state_p->rda); - rcu_report_exp_rdp(rcu_state_p, rdp, true); + if (rdp->deferred_qs) { + rcu_report_exp_rdp(rdp); if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); return; } } - /* Hardware IRQ handlers cannot block, complain if they get here. */ - if (in_irq() || in_serving_softirq()) { - lockdep_rcu_suspicious(__FILE__, __LINE__, - "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); - pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n", - t->rcu_read_unlock_special.s, - t->rcu_read_unlock_special.b.blocked, - t->rcu_read_unlock_special.b.exp_need_qs, - t->rcu_read_unlock_special.b.need_qs); - local_irq_restore(flags); - return; - } - /* Clean up if blocked during RCU read-side critical section. */ if (special.b.blocked) { t->rcu_read_unlock_special.b.blocked = false; @@ -582,7 +566,7 @@ static void rcu_read_unlock_special(struct task_struct *t) rnp->grplo, rnp->grphi, !!rnp->gp_tasks); - rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags); + rcu_report_unblock_qs_rnp(rnp, flags); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -596,13 +580,79 @@ static void rcu_read_unlock_special(struct task_struct *t) * then we need to report up the rcu_node hierarchy. */ if (!empty_exp && empty_exp_now) - rcu_report_exp_rnp(rcu_state_p, rnp, true); + rcu_report_exp_rnp(rnp, true); } else { local_irq_restore(flags); } } /* + * Is a deferred quiescent-state pending, and are we also not in + * an RCU read-side critical section? It is the caller's responsibility + * to ensure it is otherwise safe to report any deferred quiescent + * states. The reason for this is that it is safe to report a + * quiescent state during context switch even though preemption + * is disabled. This function cannot be expected to understand these + * nuances, so the caller must handle them. + */ +static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +{ + return (this_cpu_ptr(&rcu_data)->deferred_qs || + READ_ONCE(t->rcu_read_unlock_special.s)) && + t->rcu_read_lock_nesting <= 0; +} + +/* + * Report a deferred quiescent state if needed and safe to do so. + * As with rcu_preempt_need_deferred_qs(), "safe" involves only + * not being in an RCU read-side critical section. The caller must + * evaluate safety in terms of interrupt, softirq, and preemption + * disabling. + */ +static void rcu_preempt_deferred_qs(struct task_struct *t) +{ + unsigned long flags; + bool couldrecurse = t->rcu_read_lock_nesting >= 0; + + if (!rcu_preempt_need_deferred_qs(t)) + return; + if (couldrecurse) + t->rcu_read_lock_nesting -= INT_MIN; + local_irq_save(flags); + rcu_preempt_deferred_qs_irqrestore(t, flags); + if (couldrecurse) + t->rcu_read_lock_nesting += INT_MIN; +} + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ +static void rcu_read_unlock_special(struct task_struct *t) +{ + unsigned long flags; + bool preempt_bh_were_disabled = + !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); + bool irqs_were_disabled; + + /* NMI handlers cannot block and cannot safely manipulate state. */ + if (in_nmi()) + return; + + local_irq_save(flags); + irqs_were_disabled = irqs_disabled_flags(flags); + if ((preempt_bh_were_disabled || irqs_were_disabled) && + t->rcu_read_unlock_special.b.blocked) { + /* Need to defer quiescent state until everything is enabled. */ + raise_softirq_irqoff(RCU_SOFTIRQ); + local_irq_restore(flags); + return; + } + rcu_preempt_deferred_qs_irqrestore(t, flags); +} + +/* * Dump detailed information for all tasks blocking the current RCU * grace period on the specified rcu_node structure. */ @@ -633,12 +683,12 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) * Dump detailed information for all tasks blocking the current RCU * grace period. */ -static void rcu_print_detail_task_stall(struct rcu_state *rsp) +static void rcu_print_detail_task_stall(void) { - struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp = rcu_get_root(); rcu_print_detail_task_stall_rnp(rnp); - rcu_for_each_leaf_node(rsp, rnp) + rcu_for_each_leaf_node(rnp) rcu_print_detail_task_stall_rnp(rnp); } @@ -706,14 +756,13 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * Also, if there are blocked tasks on the list, they automatically * block the newly created grace period, so set up ->gp_tasks accordingly. */ -static void -rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) - dump_blkd_tasks(rsp, rnp, 10); + dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp) && (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { rnp->gp_tasks = rnp->blkd_tasks.next; @@ -732,62 +781,38 @@ rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) * * Caller must disable hard irqs. */ -static void rcu_preempt_check_callbacks(void) +static void rcu_flavor_check_callbacks(int user) { - struct rcu_state *rsp = &rcu_preempt_state; struct task_struct *t = current; - if (t->rcu_read_lock_nesting == 0) { - rcu_preempt_qs(); + if (user || rcu_is_cpu_rrupt_from_idle()) { + rcu_note_voluntary_context_switch(current); + } + if (t->rcu_read_lock_nesting > 0 || + (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { + /* No QS, force context switch if deferred. */ + if (rcu_preempt_need_deferred_qs(t)) { + set_tsk_need_resched(t); + set_preempt_need_resched(); + } + } else if (rcu_preempt_need_deferred_qs(t)) { + rcu_preempt_deferred_qs(t); /* Report deferred QS. */ + return; + } else if (!t->rcu_read_lock_nesting) { + rcu_qs(); /* Report immediate QS. */ return; } + + /* If GP is oldish, ask for help from rcu_read_unlock_special(). */ if (t->rcu_read_lock_nesting > 0 && - __this_cpu_read(rcu_data_p->core_needs_qs) && - __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm) && + __this_cpu_read(rcu_data.core_needs_qs) && + __this_cpu_read(rcu_data.cpu_no_qs.b.norm) && !t->rcu_read_unlock_special.b.need_qs && - time_after(jiffies, rsp->gp_start + HZ)) + time_after(jiffies, rcu_state.gp_start + HZ)) t->rcu_read_unlock_special.b.need_qs = true; } /** - * call_rcu() - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all pre-existing RCU read-side - * critical sections have completed. However, the callback function - * might well execute concurrently with RCU read-side critical sections - * that started after call_rcu() was invoked. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing RCU read-side critical section. On systems with more - * than one CPU, this means that when "func()" is invoked, each CPU is - * guaranteed to have executed a full memory barrier since the end of its - * last RCU read-side critical section whose beginning preceded the call - * to call_rcu(). It also means that each CPU executing an RCU read-side - * critical section that continues beyond the start of "func()" must have - * executed a memory barrier after the call_rcu() but before the beginning - * of that RCU read-side critical section. Note that these guarantees - * include CPUs that are offline, idle, or executing in user mode, as - * well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting RCU callback function "func()", then both CPU A and CPU B are - * guaranteed to execute a full memory barrier during the time interval - * between the call to call_rcu() and the invocation of "func()" -- even - * if CPU A and CPU B are the same CPU (but again only if the system has - * more than one CPU). - */ -void call_rcu(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func, rcu_state_p, -1, 0); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/** * synchronize_rcu - wait until a grace period has elapsed. * * Control will return to the caller some time after a full grace @@ -797,14 +822,28 @@ EXPORT_SYMBOL_GPL(call_rcu); * concurrently with new RCU read-side critical sections that began while * synchronize_rcu() was waiting. RCU read-side critical sections are * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. + * In addition, regions of code across which interrupts, preemption, or + * softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_rcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last RCU read-side critical section whose beginning + * preceded the call to synchronize_rcu(). In addition, each CPU having + * an RCU read-side critical section that extends beyond the return from + * synchronize_rcu() is guaranteed to have executed a full memory barrier + * after the beginning of synchronize_rcu() and before the beginning of + * that RCU read-side critical section. Note that these guarantees include + * CPUs that are offline, idle, or executing in user mode, as well as CPUs + * that are executing in the kernel. * - * See the description of synchronize_sched() for more detailed - * information on memory-ordering guarantees. However, please note - * that -only- the memory-ordering guarantees apply. For example, - * synchronize_rcu() is -not- guaranteed to wait on things like code - * protected by preempt_disable(), instead, synchronize_rcu() is -only- - * guaranteed to wait on RCU read-side critical sections, that is, sections - * of code protected by rcu_read_lock(). + * Furthermore, if CPU A invoked synchronize_rcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). */ void synchronize_rcu(void) { @@ -821,28 +860,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -/** - * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. - * - * Note that this primitive does not necessarily wait for an RCU grace period - * to complete. For example, if there are no RCU callbacks queued anywhere - * in the system, then rcu_barrier() is within its rights to return - * immediately, without waiting for anything, much less an RCU grace period. - */ -void rcu_barrier(void) -{ - _rcu_barrier(rcu_state_p); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - -/* - * Initialize preemptible RCU's state structures. - */ -static void __init __rcu_init_preempt(void) -{ - rcu_init_one(rcu_state_p); -} - /* * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, @@ -859,6 +876,7 @@ void exit_rcu(void) barrier(); t->rcu_read_unlock_special.b.blocked = true; __rcu_read_unlock(); + rcu_preempt_deferred_qs(current); } /* @@ -866,7 +884,7 @@ void exit_rcu(void) * specified number of elements. */ static void -dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) +dump_blkd_tasks(struct rcu_node *rnp, int ncheck) { int cpu; int i; @@ -893,7 +911,7 @@ dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) } pr_cont("\n"); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = per_cpu_ptr(&rcu_data, cpu); onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", cpu, ".o"[onl], @@ -904,8 +922,6 @@ dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) #else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state *const rcu_state_p = &rcu_sched_state; - /* * Tell them what RCU they are running. */ @@ -916,14 +932,85 @@ static void __init rcu_bootup_announce(void) } /* - * Because preemptible RCU does not exist, we never have to check for - * CPUs being in quiescent states. + * Note a quiescent state for PREEMPT=n. Because we do not need to know + * how many quiescent states passed, just if there was at least one since + * the start of the grace period, this just sets a flag. The caller must + * have disabled preemption. */ -static void rcu_preempt_note_context_switch(bool preempt) +static void rcu_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!"); + if (!__this_cpu_read(rcu_data.cpu_no_qs.s)) + return; + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); + __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) + return; + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); } /* + * Register an urgently needed quiescent state. If there is an + * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * dyntick-idle quiescent state visible to other CPUs, which will in + * some cases serve for expedited as well as normal grace periods. + * Either way, register a lightweight quiescent state. + * + * The barrier() calls are redundant in the common case when this is + * called externally, but just in case this is called from within this + * file. + * + */ +void rcu_all_qs(void) +{ + unsigned long flags; + + if (!raw_cpu_read(rcu_data.rcu_urgent_qs)) + return; + preempt_disable(); + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { + preempt_enable(); + return; + } + this_cpu_write(rcu_data.rcu_urgent_qs, false); + barrier(); /* Avoid RCU read-side critical sections leaking down. */ + if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { + local_irq_save(flags); + rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } + rcu_qs(); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ + preempt_enable(); +} +EXPORT_SYMBOL_GPL(rcu_all_qs); + +/* + * Note a PREEMPT=n context switch. The caller must have disabled interrupts. + */ +void rcu_note_context_switch(bool preempt) +{ + barrier(); /* Avoid RCU read-side critical sections leaking down. */ + trace_rcu_utilization(TPS("Start context switch")); + rcu_qs(); + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) + goto out; + this_cpu_write(rcu_data.rcu_urgent_qs, false); + if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) + rcu_momentary_dyntick_idle(); + if (!preempt) + rcu_tasks_qs(current); +out: + trace_rcu_utilization(TPS("End context switch")); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ +} +EXPORT_SYMBOL_GPL(rcu_note_context_switch); + +/* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. */ @@ -941,10 +1028,20 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) } /* + * Because there is no preemptible RCU, there can be no deferred quiescent + * states. + */ +static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +{ + return false; +} +static void rcu_preempt_deferred_qs(struct task_struct *t) { } + +/* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ -static void rcu_print_detail_task_stall(struct rcu_state *rsp) +static void rcu_print_detail_task_stall(void) { } @@ -972,36 +1069,54 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. */ -static void -rcu_preempt_check_blocked_tasks(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { WARN_ON_ONCE(rnp->qsmask); } /* - * Because preemptible RCU does not exist, it never has any callbacks - * to check. + * Check to see if this CPU is in a non-context-switch quiescent state + * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). + * Also schedule RCU core processing. + * + * This function must be called from hardirq context. It is normally + * invoked from the scheduling-clock interrupt. */ -static void rcu_preempt_check_callbacks(void) +static void rcu_flavor_check_callbacks(int user) { -} + if (user || rcu_is_cpu_rrupt_from_idle()) { -/* - * Because preemptible RCU does not exist, rcu_barrier() is just - * another name for rcu_barrier_sched(). - */ -void rcu_barrier(void) -{ - rcu_barrier_sched(); + /* + * Get here if this CPU took its interrupt from user + * mode or from the idle loop, and if this is not a + * nested interrupt. In this case, the CPU is in + * a quiescent state, so note it. + * + * No memory barrier is required here because rcu_qs() + * references only CPU-local variables that other CPUs + * neither access nor modify, at least not while the + * corresponding CPU is online. + */ + + rcu_qs(); + } } -EXPORT_SYMBOL_GPL(rcu_barrier); -/* - * Because preemptible RCU does not exist, it need not be initialized. - */ -static void __init __rcu_init_preempt(void) +/* PREEMPT=n implementation of synchronize_rcu(). */ +void synchronize_rcu(void) { + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); + if (rcu_blocking_is_gp()) + return; + if (rcu_gp_is_expedited()) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu); } +EXPORT_SYMBOL_GPL(synchronize_rcu); /* * Because preemptible RCU does not exist, tasks cannot possibly exit @@ -1015,7 +1130,7 @@ void exit_rcu(void) * Dump the guaranteed-empty blocked-tasks state. Trust but verify. */ static void -dump_blkd_tasks(struct rcu_state *rsp, struct rcu_node *rnp, int ncheck) +dump_blkd_tasks(struct rcu_node *rnp, int ncheck) { WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); } @@ -1212,21 +1327,20 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * already exist. We only create this kthread for preemptible RCU. * Returns zero if all is well, a negated errno otherwise. */ -static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) +static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { - int rnp_index = rnp - &rsp->node[0]; + int rnp_index = rnp - rcu_get_root(); unsigned long flags; struct sched_param sp; struct task_struct *t; - if (rcu_state_p != rsp) + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) return 0; if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) return 0; - rsp->boost = 1; + rcu_state.boost = 1; if (rnp->boost_kthread_task != NULL) return 0; t = kthread_create(rcu_boost_kthread, (void *)rnp, @@ -1244,9 +1358,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, static void rcu_kthread_do_work(void) { - rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); - rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); - rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); + rcu_do_batch(this_cpu_ptr(&rcu_data)); } static void rcu_cpu_kthread_setup(unsigned int cpu) @@ -1268,9 +1380,9 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu) } /* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the - * RCU softirq used in flavors and configurations of RCU that do not - * support RCU priority boosting. + * Per-CPU kernel thread that invokes RCU callbacks. This replaces + * the RCU softirq used in configurations of RCU that do not support RCU + * priority boosting. */ static void rcu_cpu_kthread(unsigned int cpu) { @@ -1353,18 +1465,18 @@ static void __init rcu_spawn_boost_kthreads(void) for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rcu_for_each_leaf_node(rcu_state_p, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); + rcu_for_each_leaf_node(rnp) + (void)rcu_spawn_one_boost_kthread(rnp); } static void rcu_prepare_kthreads(int cpu) { - struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ if (rcu_scheduler_fully_active) - (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); + (void)rcu_spawn_one_boost_kthread(rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1411,8 +1523,8 @@ static void rcu_prepare_kthreads(int cpu) * 1 if so. This function is part of the RCU implementation; it is -not- * an exported member of the RCU API. * - * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs - * any flavor of RCU. + * Because we not have RCU_FAST_NO_HZ, just check whether or not this + * CPU has RCU callbacks queued. */ int rcu_needs_cpu(u64 basemono, u64 *nextevt) { @@ -1478,41 +1590,36 @@ static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); /* - * Try to advance callbacks for all flavors of RCU on the current CPU, but - * only if it has been awhile since the last time we did so. Afterwards, - * if there are any callbacks ready for immediate invocation, return true. + * Try to advance callbacks on the current CPU, but only if it has been + * awhile since the last time we did so. Afterwards, if there are any + * callbacks ready for immediate invocation, return true. */ static bool __maybe_unused rcu_try_advance_all_cbs(void) { bool cbs_ready = false; - struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp; - struct rcu_state *rsp; /* Exit early if we advanced recently. */ - if (jiffies == rdtp->last_advance_all) + if (jiffies == rdp->last_advance_all) return false; - rdtp->last_advance_all = jiffies; + rdp->last_advance_all = jiffies; - for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; + rnp = rdp->mynode; - /* - * Don't bother checking unless a grace period has - * completed since we last checked and there are - * callbacks not yet ready to invoke. - */ - if ((rcu_seq_completed_gp(rdp->gp_seq, - rcu_seq_current(&rnp->gp_seq)) || - unlikely(READ_ONCE(rdp->gpwrap))) && - rcu_segcblist_pend_cbs(&rdp->cblist)) - note_gp_changes(rsp, rdp); - - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - cbs_ready = true; - } + /* + * Don't bother checking unless a grace period has + * completed since we last checked and there are + * callbacks not yet ready to invoke. + */ + if ((rcu_seq_completed_gp(rdp->gp_seq, + rcu_seq_current(&rnp->gp_seq)) || + unlikely(READ_ONCE(rdp->gpwrap))) && + rcu_segcblist_pend_cbs(&rdp->cblist)) + note_gp_changes(rdp); + + if (rcu_segcblist_ready_cbs(&rdp->cblist)) + cbs_ready = true; return cbs_ready; } @@ -1526,16 +1633,16 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) */ int rcu_needs_cpu(u64 basemono, u64 *nextevt) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); unsigned long dj; lockdep_assert_irqs_disabled(); /* Snapshot to detect later posting of non-lazy callback. */ - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; + rdp->nonlazy_posted_snap = rdp->nonlazy_posted; /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { + if (!rcu_cpu_has_callbacks(&rdp->all_lazy)) { *nextevt = KTIME_MAX; return 0; } @@ -1546,10 +1653,10 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) invoke_rcu_core(); return 1; } - rdtp->last_accelerate = jiffies; + rdp->last_accelerate = jiffies; /* Request timer delay depending on laziness, and round. */ - if (!rdtp->all_lazy) { + if (!rdp->all_lazy) { dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; } else { @@ -1572,10 +1679,8 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) static void rcu_prepare_for_idle(void) { bool needwake; - struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp; - struct rcu_state *rsp; int tne; lockdep_assert_irqs_disabled(); @@ -1584,10 +1689,10 @@ static void rcu_prepare_for_idle(void) /* Handle nohz enablement switches conservatively. */ tne = READ_ONCE(tick_nohz_active); - if (tne != rdtp->tick_nohz_enabled_snap) { + if (tne != rdp->tick_nohz_enabled_snap) { if (rcu_cpu_has_callbacks(NULL)) invoke_rcu_core(); /* force nohz to see update. */ - rdtp->tick_nohz_enabled_snap = tne; + rdp->tick_nohz_enabled_snap = tne; return; } if (!tne) @@ -1598,10 +1703,10 @@ static void rcu_prepare_for_idle(void) * callbacks, invoke RCU core for the side-effect of recalculating * idle duration on re-entry to idle. */ - if (rdtp->all_lazy && - rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { - rdtp->all_lazy = false; - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; + if (rdp->all_lazy && + rdp->nonlazy_posted != rdp->nonlazy_posted_snap) { + rdp->all_lazy = false; + rdp->nonlazy_posted_snap = rdp->nonlazy_posted; invoke_rcu_core(); return; } @@ -1610,19 +1715,16 @@ static void rcu_prepare_for_idle(void) * If we have not yet accelerated this jiffy, accelerate all * callbacks on this CPU. */ - if (rdtp->last_accelerate == jiffies) + if (rdp->last_accelerate == jiffies) return; - rdtp->last_accelerate = jiffies; - for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); - if (!rcu_segcblist_pend_cbs(&rdp->cblist)) - continue; + rdp->last_accelerate = jiffies; + if (rcu_segcblist_pend_cbs(&rdp->cblist)) { rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + needwake = rcu_accelerate_cbs(rnp, rdp); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ if (needwake) - rcu_gp_kthread_wake(rsp); + rcu_gp_kthread_wake(); } } @@ -1650,104 +1752,23 @@ static void rcu_cleanup_after_idle(void) */ static void rcu_idle_count_callbacks_posted(void) { - __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); -} - -/* - * Data for flushing lazy RCU callbacks at OOM time. - */ -static atomic_t oom_callback_count; -static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); - -/* - * RCU OOM callback -- decrement the outstanding count and deliver the - * wake-up if we are the last one. - */ -static void rcu_oom_callback(struct rcu_head *rhp) -{ - if (atomic_dec_and_test(&oom_callback_count)) - wake_up(&oom_callback_wq); -} - -/* - * Post an rcu_oom_notify callback on the current CPU if it has at - * least one lazy callback. This will unnecessarily post callbacks - * to CPUs that already have a non-lazy callback at the end of their - * callback list, but this is an infrequent operation, so accept some - * extra overhead to keep things simple. - */ -static void rcu_oom_notify_cpu(void *unused) -{ - struct rcu_state *rsp; - struct rcu_data *rdp; - - for_each_rcu_flavor(rsp) { - rdp = raw_cpu_ptr(rsp->rda); - if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) { - atomic_inc(&oom_callback_count); - rsp->call(&rdp->oom_head, rcu_oom_callback); - } - } -} - -/* - * If low on memory, ensure that each CPU has a non-lazy callback. - * This will wake up CPUs that have only lazy callbacks, in turn - * ensuring that they free up the corresponding memory in a timely manner. - * Because an uncertain amount of memory will be freed in some uncertain - * timeframe, we do not claim to have freed anything. - */ -static int rcu_oom_notify(struct notifier_block *self, - unsigned long notused, void *nfreed) -{ - int cpu; - - /* Wait for callbacks from earlier instance to complete. */ - wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); - smp_mb(); /* Ensure callback reuse happens after callback invocation. */ - - /* - * Prevent premature wakeup: ensure that all increments happen - * before there is a chance of the counter reaching zero. - */ - atomic_set(&oom_callback_count, 1); - - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched_tasks_rcu_qs(); - } - - /* Unconditionally decrement: no need to wake ourselves up. */ - atomic_dec(&oom_callback_count); - - return NOTIFY_OK; + __this_cpu_add(rcu_data.nonlazy_posted, 1); } -static struct notifier_block rcu_oom_nb = { - .notifier_call = rcu_oom_notify -}; - -static int __init rcu_register_oom_notifier(void) -{ - register_oom_notifier(&rcu_oom_nb); - return 0; -} -early_initcall(rcu_register_oom_notifier); - #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #ifdef CONFIG_RCU_FAST_NO_HZ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + unsigned long nlpd = rdp->nonlazy_posted - rdp->nonlazy_posted_snap; sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", - rdtp->last_accelerate & 0xffff, jiffies & 0xffff, + rdp->last_accelerate & 0xffff, jiffies & 0xffff, ulong2long(nlpd), - rdtp->all_lazy ? 'L' : '.', - rdtp->tick_nohz_enabled_snap ? '.' : 'D'); + rdp->all_lazy ? 'L' : '.', + rdp->tick_nohz_enabled_snap ? '.' : 'D'); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ @@ -1768,21 +1789,19 @@ static void print_cpu_stall_info_begin(void) /* * Print out diagnostic information for the specified stalled CPU. * - * If the specified CPU is aware of the current RCU grace period - * (flavor specified by rsp), then print the number of scheduling - * clock interrupts the CPU has taken during the time that it has - * been aware. Otherwise, print the number of RCU grace periods - * that this CPU is ignorant of, for example, "1" if the CPU was - * aware of the previous grace period. + * If the specified CPU is aware of the current RCU grace period, then + * print the number of scheduling clock interrupts the CPU has taken + * during the time that it has been aware. Otherwise, print the number + * of RCU grace periods that this CPU is ignorant of, for example, "1" + * if the CPU was aware of the previous grace period. * * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. */ -static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) +static void print_cpu_stall_info(int cpu) { unsigned long delta; char fast_no_hz[72]; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = rdp->dynticks; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; unsigned long ticks_value; @@ -1792,7 +1811,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) */ touch_nmi_watchdog(); - ticks_value = rcu_seq_ctr(rsp->gp_seq - rdp->gp_seq); + ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); if (ticks_value) { ticks_title = "GPs behind"; } else { @@ -1810,10 +1829,10 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - rcu_dynticks_snap(rdtp) & 0xfff, - rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, + rcu_dynticks_snap(rdp) & 0xfff, + rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, + READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, fast_no_hz); } @@ -1823,20 +1842,12 @@ static void print_cpu_stall_info_end(void) pr_err("\t"); } -/* Zero ->ticks_this_gp for all flavors of RCU. */ +/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ static void zero_cpu_stall_ticks(struct rcu_data *rdp) { rdp->ticks_this_gp = 0; rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); -} - -/* Increment ->ticks_this_gp for all flavors of RCU. */ -static void increment_cpu_stall_ticks(void) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - raw_cpu_inc(rsp->rda->ticks_this_gp); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); } #ifdef CONFIG_RCU_NOCB_CPU @@ -1958,17 +1969,17 @@ static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) mod_timer(&rdp->nocb_timer, jiffies + 1); WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } /* - * Does the specified CPU need an RCU callback for the specified flavor + * Does the specified CPU need an RCU callback for this invocation * of rcu_barrier()? */ -static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +static bool rcu_nocb_cpu_needs_barrier(int cpu) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); unsigned long ret; #ifdef CONFIG_PROVE_RCU struct rcu_head *rhp; @@ -1979,7 +1990,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) * There needs to be a barrier before this function is called, * but associated with a prior determination that no more * callbacks would be posted. In the worst case, the first - * barrier in _rcu_barrier() suffices (but the caller cannot + * barrier in rcu_barrier() suffices (but the caller cannot * necessarily rely on this, not a substitute for the caller * getting the concurrency design right!). There must also be * a barrier between the following load an posting of a callback @@ -2037,7 +2048,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, /* If we are not being polled and there is a kthread, awaken it ... */ t = READ_ONCE(rdp->nocb_kthread); if (rcu_nocb_poll || !t) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNotPoll")); return; } @@ -2046,7 +2057,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ wake_nocb_leader(rdp, false); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, @@ -2057,7 +2068,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, /* ... or if many callbacks queued. */ if (!irqs_disabled_flags(flags)) { wake_nocb_leader(rdp, true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeOvf")); } else { wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, @@ -2065,7 +2076,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, } rdp->qlen_last_fqs_check = LONG_MAX / 2; } else { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } return; } @@ -2087,12 +2098,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, return false; __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); if (__is_kfree_rcu_offset((unsigned long)rhp->func)) - trace_rcu_kfree_callback(rdp->rsp->name, rhp, + trace_rcu_kfree_callback(rcu_state.name, rhp, (unsigned long)rhp->func, -atomic_long_read(&rdp->nocb_q_count_lazy), -atomic_long_read(&rdp->nocb_q_count)); else - trace_rcu_callback(rdp->rsp->name, rhp, + trace_rcu_callback(rcu_state.name, rhp, -atomic_long_read(&rdp->nocb_q_count_lazy), -atomic_long_read(&rdp->nocb_q_count)); @@ -2142,7 +2153,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; local_irq_save(flags); - c = rcu_seq_snap(&rdp->rsp->gp_seq); + c = rcu_seq_snap(&rcu_state.gp_seq); if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { local_irq_restore(flags); } else { @@ -2150,7 +2161,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) needwake = rcu_start_this_gp(rnp, rdp, c); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) - rcu_gp_kthread_wake(rdp->rsp); + rcu_gp_kthread_wake(); } /* @@ -2187,7 +2198,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); + trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep")); swait_event_interruptible_exclusive(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); @@ -2197,7 +2208,7 @@ wait_again: raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll")); + trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll")); } /* @@ -2224,7 +2235,7 @@ wait_again: if (rcu_nocb_poll) { schedule_timeout_interruptible(1); } else { - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, + trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("WokeEmpty")); } goto wait_again; @@ -2269,7 +2280,7 @@ wait_again: static void nocb_follower_wait(struct rcu_data *rdp) { for (;;) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); swait_event_interruptible_exclusive(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { @@ -2277,7 +2288,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) return; } WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty")); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); } } @@ -2312,10 +2323,10 @@ static int rcu_nocb_kthread(void *arg) rdp->nocb_follower_tail = &rdp->nocb_follower_head; raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); BUG_ON(!list); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty")); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty")); /* Each pass through the following loop invokes a callback. */ - trace_rcu_batch_start(rdp->rsp->name, + trace_rcu_batch_start(rcu_state.name, atomic_long_read(&rdp->nocb_q_count_lazy), atomic_long_read(&rdp->nocb_q_count), -1); c = cl = 0; @@ -2323,23 +2334,23 @@ static int rcu_nocb_kthread(void *arg) next = list->next; /* Wait for enqueuing to complete, if needed. */ while (next == NULL && &list->next != tail) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WaitQueue")); schedule_timeout_interruptible(1); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeQueue")); next = list->next; } debug_rcu_head_unqueue(list); local_bh_disable(); - if (__rcu_reclaim(rdp->rsp->name, list)) + if (__rcu_reclaim(rcu_state.name, list)) cl++; c++; local_bh_enable(); cond_resched_tasks_rcu_qs(); list = next; } - trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); + trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1); smp_mb__before_atomic(); /* _add after CB invocation. */ atomic_long_add(-c, &rdp->nocb_q_count); atomic_long_add(-cl, &rdp->nocb_q_count_lazy); @@ -2367,7 +2378,7 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); } /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ @@ -2393,7 +2404,6 @@ void __init rcu_init_nohz(void) { int cpu; bool need_rcu_nocb_mask = false; - struct rcu_state *rsp; #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) @@ -2427,11 +2437,9 @@ void __init rcu_init_nohz(void) if (rcu_nocb_poll) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); - for_each_rcu_flavor(rsp) { - for_each_cpu(cpu, rcu_nocb_mask) - init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu)); - rcu_organize_nocb_kthreads(rsp); - } + for_each_cpu(cpu, rcu_nocb_mask) + init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu)); + rcu_organize_nocb_kthreads(); } /* Initialize per-rcu_data variables for no-CBs CPUs. */ @@ -2446,16 +2454,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) /* * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are - * brought online out of order, this can require re-organizing the - * leader-follower relationships. + * rcuo kthread, spawn it. If the CPUs are brought online out of order, + * this can require re-organizing the leader-follower relationships. */ -static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) +static void rcu_spawn_one_nocb_kthread(int cpu) { struct rcu_data *rdp; struct rcu_data *rdp_last; struct rcu_data *rdp_old_leader; - struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu); + struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu); struct task_struct *t; /* @@ -2485,9 +2492,9 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) rdp_spawn->nocb_next_follower = rdp_old_leader; } - /* Spawn the kthread for this CPU and RCU flavor. */ + /* Spawn the kthread for this CPU. */ t = kthread_run(rcu_nocb_kthread, rdp_spawn, - "rcuo%c/%d", rsp->abbr, cpu); + "rcuo%c/%d", rcu_state.abbr, cpu); BUG_ON(IS_ERR(t)); WRITE_ONCE(rdp_spawn->nocb_kthread, t); } @@ -2498,11 +2505,8 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) */ static void rcu_spawn_all_nocb_kthreads(int cpu) { - struct rcu_state *rsp; - if (rcu_scheduler_fully_active) - for_each_rcu_flavor(rsp) - rcu_spawn_one_nocb_kthread(rsp, cpu); + rcu_spawn_one_nocb_kthread(cpu); } /* @@ -2526,7 +2530,7 @@ module_param(rcu_nocb_leader_stride, int, 0444); /* * Initialize leader-follower relationships for all no-CBs CPU. */ -static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) +static void __init rcu_organize_nocb_kthreads(void) { int cpu; int ls = rcu_nocb_leader_stride; @@ -2548,7 +2552,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) * we will spawn the needed set of rcu_nocb_kthread() kthreads. */ for_each_cpu(cpu, rcu_nocb_mask) { - rdp = per_cpu_ptr(rsp->rda, cpu); + rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu >= nl) { /* New leader, set up for followers & next leader. */ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; @@ -2585,7 +2589,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +static bool rcu_nocb_cpu_needs_barrier(int cpu) { WARN_ON_ONCE(1); /* Should be dead code. */ return false; @@ -2654,12 +2658,12 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) * This code relies on the fact that all NO_HZ_FULL CPUs are also * CONFIG_RCU_NOCB_CPU CPUs. */ -static bool rcu_nohz_full_cpu(struct rcu_state *rsp) +static bool rcu_nohz_full_cpu(void) { #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(smp_processor_id()) && - (!rcu_gp_in_progress(rsp) || - ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ))) + (!rcu_gp_in_progress() || + ULONG_CMP_LT(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) return true; #endif /* #ifdef CONFIG_NO_HZ_FULL */ return false; diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 39cb23d22109..f203b94f6b5b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -203,11 +203,7 @@ void rcu_test_sync_prims(void) if (!IS_ENABLED(CONFIG_PROVE_RCU)) return; synchronize_rcu(); - synchronize_rcu_bh(); - synchronize_sched(); synchronize_rcu_expedited(); - synchronize_rcu_bh_expedited(); - synchronize_sched_expedited(); } #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) @@ -298,7 +294,7 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held); * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. * - * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * Note that rcu_read_lock_bh() is disallowed if the CPU is either idle or * offline from an RCU perspective, so check for those as well. */ int rcu_read_lock_bh_held(void) @@ -336,7 +332,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, int i; int j; - /* Initialize and register callbacks for each flavor specified. */ + /* Initialize and register callbacks for each crcu_array element. */ for (i = 0; i < n; i++) { if (checktiny && (crcu_array[i] == call_rcu || @@ -472,6 +468,7 @@ int rcu_jiffies_till_stall_check(void) } return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; } +EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); void rcu_sysrq_start(void) { @@ -701,19 +698,19 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_sched() + * transitions to complete. Invoking synchronize_rcu() * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_sched(), + * interrupts disabled. Without this synchronize_rcu(), * a read-side critical section that started before the * grace period might be incorrectly seen as having started * after the grace period. * - * This synchronize_sched() also dispenses with the + * This synchronize_rcu() also dispenses with the * need for a memory barrier on the first store to * ->rcu_tasks_holdout, as it forces the store to happen * after the beginning of the grace period. */ - synchronize_sched(); + synchronize_rcu(); /* * There were callbacks, so we need to wait for an @@ -740,7 +737,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) * This does only part of the job, ensuring that all * tasks that were previously exiting reach the point * where they have disabled preemption, allowing the - * later synchronize_sched() to finish the job. + * later synchronize_rcu() to finish the job. */ synchronize_srcu(&tasks_rcu_exit_srcu); @@ -790,20 +787,20 @@ static int __noreturn rcu_tasks_kthread(void *arg) * cause their RCU-tasks read-side critical sections to * extend past the end of the grace period. However, * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_sched() + * interrupts disabled, we can use synchronize_rcu() * to force the needed ordering on all such CPUs. * - * This synchronize_sched() also confines all + * This synchronize_rcu() also confines all * ->rcu_tasks_holdout accesses to be within the grace * period, avoiding the need for memory barriers for * ->rcu_tasks_holdout accesses. * - * In addition, this synchronize_sched() waits for exiting + * In addition, this synchronize_rcu() waits for exiting * tasks to complete their final preempt_disable() region * of execution, cleaning up after the synchronize_srcu() * above. */ - synchronize_sched(); + synchronize_rcu(); /* Invoke the callbacks. */ while (list) { @@ -870,15 +867,10 @@ static void __init rcu_tasks_bootup_oddness(void) #ifdef CONFIG_PROVE_RCU /* - * Early boot self test parameters, one for each flavor + * Early boot self test parameters. */ static bool rcu_self_test; -static bool rcu_self_test_bh; -static bool rcu_self_test_sched; - module_param(rcu_self_test, bool, 0444); -module_param(rcu_self_test_bh, bool, 0444); -module_param(rcu_self_test_sched, bool, 0444); static int rcu_self_test_counter; @@ -888,25 +880,16 @@ static void test_callback(struct rcu_head *r) pr_info("RCU test callback executed %d\n", rcu_self_test_counter); } +DEFINE_STATIC_SRCU(early_srcu); + static void early_boot_test_call_rcu(void) { static struct rcu_head head; + static struct rcu_head shead; call_rcu(&head, test_callback); -} - -static void early_boot_test_call_rcu_bh(void) -{ - static struct rcu_head head; - - call_rcu_bh(&head, test_callback); -} - -static void early_boot_test_call_rcu_sched(void) -{ - static struct rcu_head head; - - call_rcu_sched(&head, test_callback); + if (IS_ENABLED(CONFIG_SRCU)) + call_srcu(&early_srcu, &shead, test_callback); } void rcu_early_boot_tests(void) @@ -915,10 +898,6 @@ void rcu_early_boot_tests(void) if (rcu_self_test) early_boot_test_call_rcu(); - if (rcu_self_test_bh) - early_boot_test_call_rcu_bh(); - if (rcu_self_test_sched) - early_boot_test_call_rcu_sched(); rcu_test_sync_prims(); } @@ -930,16 +909,11 @@ static int rcu_verify_early_boot_tests(void) if (rcu_self_test) { early_boot_test_counter++; rcu_barrier(); + if (IS_ENABLED(CONFIG_SRCU)) { + early_boot_test_counter++; + srcu_barrier(&early_srcu); + } } - if (rcu_self_test_bh) { - early_boot_test_counter++; - rcu_barrier_bh(); - } - if (rcu_self_test_sched) { - early_boot_test_counter++; - rcu_barrier_sched(); - } - if (rcu_self_test_counter != early_boot_test_counter) { WARN_ON(1); ret = -1; |