|  |  |  |
|---|---|---|
| author | Boqun Feng <boqun.feng@gmail.com> | 2026-01-04 18:52:53 +0800 |
| committer | Boqun Feng <boqun.feng@gmail.com> | 2026-01-04 18:52:53 +0800 |
| commit | 60908279164a1dc651b7f4685cfbfe5161a4a797 (patch) | |
| tree | 5f8ebdb1d737f6119ec600731464b4c135b95586 /kernel | |
| parent | 8f0b4cce4481fb22653697cced8d0d04027cb1e8 (diff) | |
| parent | 760f05bc830d86667c07af6c80dc58d599061a67 (diff) | |
Merge branch 'rcu-tasks-trace.20260101a'
* rcu-tasks-trace.20260101a:
rcutorture: Test rcu_tasks_trace_expedite_current()
srcu: Create an rcu_tasks_trace_expedite_current() function
checkpatch: Deprecate rcu_read_{,un}lock_trace()
rcu: Update Requirements.rst for RCU Tasks Trace
rcu: Add noinstr-fast rcu_read_{,un}lock_tasks_trace() APIs
rcu: Move rcu_tasks_trace_srcu_struct out of #ifdef CONFIG_TASKS_RCU_GENERIC
rcu: Clean up after the SRCU-fastification of RCU Tasks Trace
context_tracking: Remove rcu_task_trace_heavyweight_{enter,exit}()
rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast
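
The net effect of the series is to replace the bespoke Tasks Trace grace-period machinery with a thin layer over SRCU-fast while leaving the read-side contract seen by tracing and BPF callers unchanged. Below is a minimal sketch of that contract, using the existing reader API that the checkpatch patch above now deprecates; the hook, handler structure, and update-side function are illustrative names, not code from the series.

```c
/*
 * Sketch only: the read-side/update-side pairing that RCU Tasks Trace
 * guarantees.  my_hook(), struct my_handler, and my_remove_handler() are
 * illustrative names, not taken from the patches.
 */
#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/slab.h>

struct my_handler {
	void (*func)(unsigned long ip);
};
static struct my_handler __rcu *my_handler_ptr;

static void my_hook(unsigned long ip)
{
	struct my_handler *h;

	rcu_read_lock_trace();	/* checkpatch now steers new code toward rcu_read_lock_tasks_trace() */
	h = rcu_dereference_check(my_handler_ptr, rcu_read_lock_trace_held());
	if (h)
		h->func(ip);
	rcu_read_unlock_trace();
}

static void my_remove_handler(void)
{
	struct my_handler *old = rcu_dereference_protected(my_handler_ptr, 1);

	RCU_INIT_POINTER(my_handler_ptr, NULL);
	synchronize_rcu_tasks_trace();	/* wait for every my_hook() reader to finish */
	kfree(old);
}
```
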
Diffstat (limited to 'kernel')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | kernel/context_tracking.c | 20 |
| -rw-r--r-- | kernel/fork.c | 3 |
| -rw-r--r-- | kernel/rcu/Kconfig | 43 |
| -rw-r--r-- | kernel/rcu/rcu.h | 9 |
| -rw-r--r-- | kernel/rcu/rcuscale.c | 7 |
| -rw-r--r-- | kernel/rcu/rcutorture.c | 3 |
| -rw-r--r-- | kernel/rcu/tasks.h | 708 |
7 files changed, 41 insertions, 752 deletions
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index fb5be6e9b423..a743e7ffa6c0 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -54,24 +54,6 @@ static __always_inline void rcu_task_enter(void) #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } -/* Turn on heavyweight RCU tasks trace readers on kernel exit. */ -static __always_inline void rcu_task_trace_heavyweight_enter(void) -{ -#ifdef CONFIG_TASKS_TRACE_RCU - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) - current->trc_reader_special.b.need_mb = true; -#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ -} - -/* Turn off heavyweight RCU tasks trace readers on kernel entry. */ -static __always_inline void rcu_task_trace_heavyweight_exit(void) -{ -#ifdef CONFIG_TASKS_TRACE_RCU - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) - current->trc_reader_special.b.need_mb = false; -#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ -} - /* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state, that is, @@ -85,7 +67,6 @@ static noinstr void ct_kernel_exit_state(int offset) * critical sections, and we also must force ordering with the * next idle sojourn. */ - rcu_task_trace_heavyweight_enter(); // Before CT state update! // RCU is still watching. Better not be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu()); (void)ct_state_inc(offset); @@ -108,7 +89,6 @@ static noinstr void ct_kernel_enter_state(int offset) */ seq = ct_state_inc(offset); // RCU is now watching. Better not be in an extended quiescent state! - rcu_task_trace_heavyweight_exit(); // After CT state update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & CT_RCU_WATCHING)); } diff --git a/kernel/fork.c b/kernel/fork.c index b1f3915d5f8e..d7ed107cbb47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1828,9 +1828,6 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; - p->trc_reader_special.s = 0; - INIT_LIST_HEAD(&p->trc_holdout_list); - INIT_LIST_HEAD(&p->trc_blkd_node); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 4d9b21f69eaa..762299291e09 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -82,7 +82,7 @@ config NEED_SRCU_NMI_SAFE def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU config TASKS_RCU_GENERIC - def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU + def_bool TASKS_RCU || TASKS_RUDE_RCU help This option enables generic infrastructure code supporting task-based RCU implementations. Not for manual selection. @@ -142,6 +142,29 @@ config TASKS_TRACE_RCU default n select IRQ_WORK +config TASKS_TRACE_RCU_NO_MB + bool "Override RCU Tasks Trace inclusion of read-side memory barriers" + depends on RCU_EXPERT && TASKS_TRACE_RCU + default ARCH_WANTS_NO_INSTR + help + This option prevents the use of read-side memory barriers in + rcu_read_lock_tasks_trace() and rcu_read_unlock_tasks_trace() + even in kernels built with CONFIG_ARCH_WANTS_NO_INSTR=n, that is, + in kernels that do not have noinstr set up in entry/exit code. + By setting this option, you are promising to carefully review + use of ftrace, BPF, and friends to ensure that no tracing + operation is attached to a function that runs in that portion + of the entry/exit code that RCU does not watch, that is, + where rcu_is_watching() returns false. 
Alternatively, you + might choose to never remove traces except by rebooting. + + Those wishing to disable read-side memory barriers for an entire + architecture can select this Kconfig option, hence the polarity. + + Say Y here if you need speed and will review use of tracing. + Say N here for certain esoteric testing of RCU itself. + Take the default if you are unsure. + config RCU_STALL_COMMON def_bool TREE_RCU help @@ -313,24 +336,6 @@ config RCU_NOCB_CPU_CB_BOOST Say Y here if you want to set RT priority for offloading kthreads. Say N here if you are building a !PREEMPT_RT kernel and are unsure. -config TASKS_TRACE_RCU_READ_MB - bool "Tasks Trace RCU readers use memory barriers in user and idle" - depends on RCU_EXPERT && TASKS_TRACE_RCU - default PREEMPT_RT || NR_CPUS < 8 - help - Use this option to further reduce the number of IPIs sent - to CPUs executing in userspace or idle during tasks trace - RCU grace periods. Given that a reasonable setting of - the rcupdate.rcu_task_ipi_delay kernel boot parameter - eliminates such IPIs for many workloads, proper setting - of this Kconfig option is important mostly for aggressive - real-time installations and for battery-powered devices, - hence the default chosen above. - - Say Y here if you hate IPIs. - Say N here if you hate read-side memory barriers. - Take the default if you are unsure. - config RCU_LAZY bool "RCU callback lazy invocation functionality" depends on RCU_NOCB_CPU diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 9cf01832a6c3..dc5d614b372c 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -544,10 +544,6 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void); void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq); #endif // # ifdef CONFIG_TASKS_RUDE_RCU -#ifdef CONFIG_TASKS_TRACE_RCU -void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq); -#endif - #ifdef CONFIG_TASKS_RCU_GENERIC void tasks_cblist_init_generic(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ @@ -673,11 +669,6 @@ void show_rcu_tasks_rude_gp_kthread(void); #else static inline void show_rcu_tasks_rude_gp_kthread(void) {} #endif -#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU) -void show_rcu_tasks_trace_gp_kthread(void); -#else -static inline void show_rcu_tasks_trace_gp_kthread(void) {} -#endif #ifdef CONFIG_TINY_RCU static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 7484d8ad5767..1c50f89fbd6f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -400,11 +400,6 @@ static void tasks_trace_scale_read_unlock(int idx) rcu_read_unlock_trace(); } -static void rcu_tasks_trace_scale_stats(void) -{ - rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG); -} - static struct rcu_scale_ops tasks_tracing_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -416,8 +411,6 @@ static struct rcu_scale_ops tasks_tracing_ops = { .gp_barrier = rcu_barrier_tasks_trace, .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, - .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, - .stats = IS_ENABLED(CONFIG_TINY_RCU) ? 
NULL : rcu_tasks_trace_scale_stats, .name = "tasks-tracing" }; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 07e51974b06b..d00b043823ae 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1178,10 +1178,9 @@ static struct rcu_torture_ops tasks_tracing_ops = { .deferred_free = rcu_tasks_tracing_torture_deferred_free, .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, + .exp_current = rcu_tasks_trace_expedite_current, .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, - .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, - .get_gp_data = rcu_tasks_trace_get_gp_data, .cbflood_max = 50000, .irq_capable = 1, .slow_gps = 1, diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 2dc044fd126e..76f952196a29 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -161,11 +161,6 @@ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); #endif -/* Avoid IPIing CPUs early in the grace period. */ -#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0) -static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; -module_param(rcu_task_ipi_delay, int, 0644); - /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ #define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30) #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) @@ -718,7 +713,6 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } - /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { @@ -801,9 +795,7 @@ static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *t #endif // #ifndef CONFIG_TINY_RCU -static void exit_tasks_rcu_finish_trace(struct task_struct *t); - -#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) +#if defined(CONFIG_TASKS_RCU) //////////////////////////////////////////////////////////////////////// // @@ -898,7 +890,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) rtp->postgp_func(rtp); } -#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */ +#endif /* #if defined(CONFIG_TASKS_RCU) */ #ifdef CONFIG_TASKS_RCU @@ -1322,13 +1314,11 @@ void exit_tasks_rcu_finish(void) raw_spin_lock_irqsave_rcu_node(rtpcp, flags); list_del_init(&t->rcu_tasks_exit_list); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - - exit_tasks_rcu_finish_trace(t); } #else /* #ifdef CONFIG_TASKS_RCU */ void exit_tasks_rcu_start(void) { } -void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } +void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_RUDE_RCU @@ -1449,682 +1439,11 @@ EXPORT_SYMBOL_GPL(rcu_tasks_rude_get_gp_data); #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ -//////////////////////////////////////////////////////////////////////// -// -// Tracing variant of Tasks RCU. This variant is designed to be used -// to protect tracing hooks, including those of BPF. This variant -// therefore: -// -// 1. Has explicit read-side markers to allow finite grace periods -// in the face of in-kernel loops for PREEMPT=n builds. -// -// 2. Protects code in the idle loop, exception entry/exit, and -// CPU-hotplug code paths, similar to the capabilities of SRCU. -// -// 3. Avoids expensive read-side instructions, having overhead similar -// to that of Preemptible RCU. 
-// -// There are of course downsides. For example, the grace-period code -// can send IPIs to CPUs, even when those CPUs are in the idle loop or -// in nohz_full userspace. If needed, these downsides can be at least -// partially remedied. -// -// Perhaps most important, this variant of RCU does not affect the vanilla -// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace -// readers can operate from idle, offline, and exception entry/exit in no -// way allows rcu_preempt and rcu_sched readers to also do so. -// -// The implementation uses rcu_tasks_wait_gp(), which relies on function -// pointers in the rcu_tasks structure. The rcu_spawn_tasks_trace_kthread() -// function sets these function pointers up so that rcu_tasks_wait_gp() -// invokes these functions in this order: -// -// rcu_tasks_trace_pregp_step(): -// Disables CPU hotplug, adds all currently executing tasks to the -// holdout list, then checks the state of all tasks that blocked -// or were preempted within their current RCU Tasks Trace read-side -// critical section, adding them to the holdout list if appropriate. -// Finally, this function re-enables CPU hotplug. -// The ->pertask_func() pointer is NULL, so there is no per-task processing. -// rcu_tasks_trace_postscan(): -// Invokes synchronize_rcu() to wait for late-stage exiting tasks -// to finish exiting. -// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty: -// Scans the holdout list, attempting to identify a quiescent state -// for each task on the list. If there is a quiescent state, the -// corresponding task is removed from the holdout list. Once this -// list is empty, the grace period has completed. -// rcu_tasks_trace_postgp(): -// Provides the needed full memory barrier and does debug checks. -// -// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks. -// -// Pre-grace-period update-side code is ordered before the grace period -// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period -// read-side code is ordered before the grace period by atomic operations -// on .b.need_qs flag of each task involved in this process, or by scheduler -// context-switch ordering (for locked-down non-running readers). - -// The lockdep state must be outside of #ifdef to be useful. -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_trace_key; -struct lockdep_map rcu_trace_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key); -EXPORT_SYMBOL_GPL(rcu_trace_lock_map); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -#ifdef CONFIG_TASKS_TRACE_RCU - -// Record outstanding IPIs to each CPU. No point in sending two... -static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); - -// The number of detections of task quiescent state relying on -// heavyweight readers executing explicit memory barriers. -static unsigned long n_heavy_reader_attempts; -static unsigned long n_heavy_reader_updates; -static unsigned long n_heavy_reader_ofl_updates; -static unsigned long n_trc_holdouts; - -void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, - "RCU Tasks Trace"); - -/* Load from ->trc_reader_special.b.need_qs with proper ordering. */ -static u8 rcu_ld_need_qs(struct task_struct *t) -{ - smp_mb(); // Enforce full grace-period ordering. - return smp_load_acquire(&t->trc_reader_special.b.need_qs); -} - -/* Store to ->trc_reader_special.b.need_qs with proper ordering. 
*/ -static void rcu_st_need_qs(struct task_struct *t, u8 v) -{ - smp_store_release(&t->trc_reader_special.b.need_qs, v); - smp_mb(); // Enforce full grace-period ordering. -} - -/* - * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for - * the four-byte operand-size restriction of some platforms. - * - * Returns the old value, which is often ignored. - */ -u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) -{ - return cmpxchg(&t->trc_reader_special.b.need_qs, old, new); -} -EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); - -/* - * If we are the last reader, signal the grace-period kthread. - * Also remove from the per-CPU list of blocked tasks. - */ -void rcu_read_unlock_trace_special(struct task_struct *t) -{ - unsigned long flags; - struct rcu_tasks_percpu *rtpcp; - union rcu_special trs; - - // Open-coded full-word version of rcu_ld_need_qs(). - smp_mb(); // Enforce full grace-period ordering. - trs = smp_load_acquire(&t->trc_reader_special); - - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) - smp_mb(); // Pairs with update-side barriers. - // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. - if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) { - u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS, - TRC_NEED_QS_CHECKED); - - WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result); - } - if (trs.b.blocked) { - rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu); - raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - list_del_init(&t->trc_blkd_node); - WRITE_ONCE(t->trc_reader_special.b.blocked, false); - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - } - WRITE_ONCE(t->trc_reader_nesting, 0); -} -EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); - -/* Add a newly blocked reader task to its CPU's list. */ -void rcu_tasks_trace_qs_blkd(struct task_struct *t) -{ - unsigned long flags; - struct rcu_tasks_percpu *rtpcp; - - local_irq_save(flags); - rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu); - raw_spin_lock_rcu_node(rtpcp); // irqs already disabled - t->trc_blkd_cpu = smp_processor_id(); - if (!rtpcp->rtp_blkd_tasks.next) - INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); - list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); - WRITE_ONCE(t->trc_reader_special.b.blocked, true); - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); -} -EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd); - -/* Add a task to the holdout list, if it is not already on the list. */ -static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) -{ - if (list_empty(&t->trc_holdout_list)) { - get_task_struct(t); - list_add(&t->trc_holdout_list, bhp); - n_trc_holdouts++; - } -} - -/* Remove a task from the holdout list, if it is in fact present. */ -static void trc_del_holdout(struct task_struct *t) -{ - if (!list_empty(&t->trc_holdout_list)) { - list_del_init(&t->trc_holdout_list); - put_task_struct(t); - n_trc_holdouts--; - } -} - -/* IPI handler to check task state. */ -static void trc_read_check_handler(void *t_in) -{ - int nesting; - struct task_struct *t = current; - struct task_struct *texp = t_in; - - // If the task is no longer running on this CPU, leave. - if (unlikely(texp != t)) - goto reset_ipi; // Already on holdout list, so will check later. - - // If the task is not in a read-side critical section, and - // if this is the last reader, awaken the grace-period kthread. 
- nesting = READ_ONCE(t->trc_reader_nesting); - if (likely(!nesting)) { - rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); - goto reset_ipi; - } - // If we are racing with an rcu_read_unlock_trace(), try again later. - if (unlikely(nesting < 0)) - goto reset_ipi; - - // Get here if the task is in a read-side critical section. - // Set its state so that it will update state for the grace-period - // kthread upon exit from that critical section. - rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED); - -reset_ipi: - // Allow future IPIs to be sent on CPU and for task. - // Also order this IPI handler against any later manipulations of - // the intended task. - smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ - smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ -} - -/* Callback function for scheduler to check locked-down task. */ -static int trc_inspect_reader(struct task_struct *t, void *bhp_in) -{ - struct list_head *bhp = bhp_in; - int cpu = task_cpu(t); - int nesting; - bool ofl = cpu_is_offline(cpu); - - if (task_curr(t) && !ofl) { - // If no chance of heavyweight readers, do it the hard way. - if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) - return -EINVAL; - - // If heavyweight readers are enabled on the remote task, - // we can inspect its state despite its currently running. - // However, we cannot safely change its state. - n_heavy_reader_attempts++; - // Check for "running" idle tasks on offline CPUs. - if (!rcu_watching_zero_in_eqs(cpu, &t->trc_reader_nesting)) - return -EINVAL; // No quiescent state, do it the hard way. - n_heavy_reader_updates++; - nesting = 0; - } else { - // The task is not running, so C-language access is safe. - nesting = t->trc_reader_nesting; - WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t)))); - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl) - n_heavy_reader_ofl_updates++; - } - - // If not exiting a read-side critical section, mark as checked - // so that the grace-period kthread will remove it from the - // holdout list. - if (!nesting) { - rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); - return 0; // In QS, so done. - } - if (nesting < 0) - return -EINVAL; // Reader transitioning, try again later. - - // The task is in a read-side critical section, so set up its - // state so that it will update state upon exit from that critical - // section. - if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) - trc_add_holdout(t, bhp); - return 0; -} - -/* Attempt to extract the state for the specified task. */ -static void trc_wait_for_one_reader(struct task_struct *t, - struct list_head *bhp) -{ - int cpu; - - // If a previous IPI is still in flight, let it complete. - if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI - return; - - // The current task had better be in a quiescent state. - if (t == current) { - rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); - WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); - return; - } - - // Attempt to nail down the task for inspection. - get_task_struct(t); - if (!task_call_func(t, trc_inspect_reader, bhp)) { - put_task_struct(t); - return; - } - put_task_struct(t); - - // If this task is not yet on the holdout list, then we are in - // an RCU read-side critical section. Otherwise, the invocation of - // trc_add_holdout() that added it to the list did the necessary - // get_task_struct(). Either way, the task cannot be freed out - // from under this code. 
- - // If currently running, send an IPI, either way, add to list. - trc_add_holdout(t, bhp); - if (task_curr(t) && - time_after(jiffies + 1, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { - // The task is currently running, so try IPIing it. - cpu = task_cpu(t); - - // If there is already an IPI outstanding, let it happen. - if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0) - return; - - per_cpu(trc_ipi_to_cpu, cpu) = true; - t->trc_ipi_to_cpu = cpu; - rcu_tasks_trace.n_ipis++; - if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) { - // Just in case there is some other reason for - // failure than the target CPU being offline. - WARN_ONCE(1, "%s(): smp_call_function_single() failed for CPU: %d\n", - __func__, cpu); - rcu_tasks_trace.n_ipis_fails++; - per_cpu(trc_ipi_to_cpu, cpu) = false; - t->trc_ipi_to_cpu = -1; - } - } -} - -/* - * Initialize for first-round processing for the specified task. - * Return false if task is NULL or already taken care of, true otherwise. - */ -static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself) -{ - // During early boot when there is only the one boot CPU, there - // is no idle task for the other CPUs. Also, the grace-period - // kthread is always in a quiescent state. In addition, just return - // if this task is already on the list. - if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list)) - return false; - - rcu_st_need_qs(t, 0); - t->trc_ipi_to_cpu = -1; - return true; -} - -/* Do first-round processing for the specified task. */ -static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) -{ - if (rcu_tasks_trace_pertask_prep(t, true)) - trc_wait_for_one_reader(t, hop); -} - -/* Initialize for a new RCU-tasks-trace grace period. */ -static void rcu_tasks_trace_pregp_step(struct list_head *hop) -{ - LIST_HEAD(blkd_tasks); - int cpu; - unsigned long flags; - struct rcu_tasks_percpu *rtpcp; - struct task_struct *t; - - // There shouldn't be any old IPIs, but... - for_each_possible_cpu(cpu) - WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); - - // Disable CPU hotplug across the CPU scan for the benefit of - // any IPIs that might be needed. This also waits for all readers - // in CPU-hotplug code paths. - cpus_read_lock(); - - // These rcu_tasks_trace_pertask_prep() calls are serialized to - // allow safe access to the hop list. - for_each_online_cpu(cpu) { - rcu_read_lock(); - // Note that cpu_curr_snapshot() picks up the target - // CPU's current task while its runqueue is locked with - // an smp_mb__after_spinlock(). This ensures that either - // the grace-period kthread will see that task's read-side - // critical section or the task will see the updater's pre-GP - // accesses. The trailing smp_mb() in cpu_curr_snapshot() - // does not currently play a role other than simplify - // that function's ordering semantics. If these simplified - // ordering semantics continue to be redundant, that smp_mb() - // might be removed. - t = cpu_curr_snapshot(cpu); - if (rcu_tasks_trace_pertask_prep(t, true)) - trc_add_holdout(t, hop); - rcu_read_unlock(); - cond_resched_tasks_rcu_qs(); - } - - // Only after all running tasks have been accounted for is it - // safe to take care of the tasks that have blocked within their - // current RCU tasks trace read-side critical section. 
- for_each_possible_cpu(cpu) { - rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu); - raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks); - while (!list_empty(&blkd_tasks)) { - rcu_read_lock(); - t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node); - list_del_init(&t->trc_blkd_node); - list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - rcu_tasks_trace_pertask(t, hop); - rcu_read_unlock(); - raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - } - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - cond_resched_tasks_rcu_qs(); - } - - // Re-enable CPU hotplug now that the holdout list is populated. - cpus_read_unlock(); -} - -/* - * Do intermediate processing between task and holdout scans. - */ -static void rcu_tasks_trace_postscan(struct list_head *hop) -{ - // Wait for late-stage exiting tasks to finish exiting. - // These might have passed the call to exit_tasks_rcu_finish(). - - // If you remove the following line, update rcu_trace_implies_rcu_gp()!!! - synchronize_rcu(); - // Any tasks that exit after this point will set - // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs. -} - -/* Communicate task state back to the RCU tasks trace stall warning request. */ -struct trc_stall_chk_rdr { - int nesting; - int ipi_to_cpu; - u8 needqs; -}; - -static int trc_check_slow_task(struct task_struct *t, void *arg) -{ - struct trc_stall_chk_rdr *trc_rdrp = arg; - - if (task_curr(t) && cpu_online(task_cpu(t))) - return false; // It is running, so decline to inspect it. - trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting); - trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu); - trc_rdrp->needqs = rcu_ld_need_qs(t); - return true; -} - -/* Show the state of a task stalling the current RCU tasks trace GP. */ -static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) -{ - int cpu; - struct trc_stall_chk_rdr trc_rdr; - bool is_idle_tsk = is_idle_task(t); - - if (*firstreport) { - pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - if (!task_call_func(t, trc_check_slow_task, &trc_rdr)) - pr_alert("P%d: %c%c\n", - t->pid, - ".I"[t->trc_ipi_to_cpu >= 0], - ".i"[is_idle_tsk]); - else - pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n", - t->pid, - ".I"[trc_rdr.ipi_to_cpu >= 0], - ".i"[is_idle_tsk], - ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], - ".B"[!!data_race(t->trc_reader_special.b.blocked)], - trc_rdr.nesting, - " !CN"[trc_rdr.needqs & 0x3], - " ?"[trc_rdr.needqs > 0x3], - cpu, cpu_online(cpu) ? "" : "(offline)"); - sched_show_task(t); -} - -/* List stalled IPIs for RCU tasks trace. */ -static void show_stalled_ipi_trace(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - if (per_cpu(trc_ipi_to_cpu, cpu)) - pr_alert("\tIPI outstanding to CPU %d\n", cpu); -} - -/* Do one scan of the holdout list. */ -static void check_all_holdout_tasks_trace(struct list_head *hop, - bool needreport, bool *firstreport) -{ - struct task_struct *g, *t; - - // Disable CPU hotplug across the holdout list scan for IPIs. - cpus_read_lock(); - - list_for_each_entry_safe(t, g, hop, trc_holdout_list) { - // If safe and needed, try to check the current task. - if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && - !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED)) - trc_wait_for_one_reader(t, hop); - - // If check succeeded, remove this task from the list. 
- if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 && - rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED) - trc_del_holdout(t); - else if (needreport) - show_stalled_task_trace(t, firstreport); - cond_resched_tasks_rcu_qs(); - } - - // Re-enable CPU hotplug now that the holdout list scan has completed. - cpus_read_unlock(); - - if (needreport) { - if (*firstreport) - pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n"); - show_stalled_ipi_trace(); - } -} - -static void rcu_tasks_trace_empty_fn(void *unused) -{ -} - -/* Wait for grace period to complete and provide ordering. */ -static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) -{ - int cpu; - - // Wait for any lingering IPI handlers to complete. Note that - // if a CPU has gone offline or transitioned to userspace in the - // meantime, all IPI handlers should have been drained beforehand. - // Yes, this assumes that CPUs process IPIs in order. If that ever - // changes, there will need to be a recheck and/or timed wait. - for_each_online_cpu(cpu) - if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))) - smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1); - - smp_mb(); // Caller's code must be ordered after wakeup. - // Pairs with pretty much every ordering primitive. -} - -/* Report any needed quiescent state for this exiting task. */ -static void exit_tasks_rcu_finish_trace(struct task_struct *t) -{ - union rcu_special trs = READ_ONCE(t->trc_reader_special); - - rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); - WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); - if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked)) - rcu_read_unlock_trace_special(t); - else - WRITE_ONCE(t->trc_reader_nesting, 0); -} - -/** - * call_rcu_tasks_trace() - Queue a callback trace task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a trace rcu-tasks - * grace period elapses, in other words after all currently executing - * trace rcu-tasks read-side critical sections have completed. These - * read-side critical sections are delimited by calls to rcu_read_lock_trace() - * and rcu_read_unlock_trace(). - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) -{ - call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace); -} -EXPORT_SYMBOL_GPL(call_rcu_tasks_trace); - -/** - * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period - * - * Control will return to the caller some time after a trace rcu-tasks - * grace period has elapsed, in other words after all currently executing - * trace rcu-tasks read-side critical sections have elapsed. These read-side - * critical sections are delimited by calls to rcu_read_lock_trace() - * and rcu_read_unlock_trace(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function preambles - * and profiling hooks. The synchronize_rcu_tasks_trace() function is not - * (yet) intended for heavy use from multiple CPUs. - * - * See the description of synchronize_rcu() for more detailed information - * on memory ordering guarantees. 
- */ -void synchronize_rcu_tasks_trace(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section"); - synchronize_rcu_tasks_generic(&rcu_tasks_trace); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); - -/** - * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks_trace(void) -{ - rcu_barrier_tasks_generic(&rcu_tasks_trace); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); - -int rcu_tasks_trace_lazy_ms = -1; -module_param(rcu_tasks_trace_lazy_ms, int, 0444); - -static int __init rcu_spawn_tasks_trace_kthread(void) -{ - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { - rcu_tasks_trace.gp_sleep = HZ / 10; - rcu_tasks_trace.init_fract = HZ / 10; - } else { - rcu_tasks_trace.gp_sleep = HZ / 200; - if (rcu_tasks_trace.gp_sleep <= 0) - rcu_tasks_trace.gp_sleep = 1; - rcu_tasks_trace.init_fract = HZ / 200; - if (rcu_tasks_trace.init_fract <= 0) - rcu_tasks_trace.init_fract = 1; - } - if (rcu_tasks_trace_lazy_ms >= 0) - rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms); - rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; - rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; - rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; - rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; - rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); - return 0; -} - -#if !defined(CONFIG_TINY_RCU) -void show_rcu_tasks_trace_gp_kthread(void) -{ - char buf[64]; - - snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu", - data_race(n_trc_holdouts), - data_race(n_heavy_reader_ofl_updates), - data_race(n_heavy_reader_updates), - data_race(n_heavy_reader_attempts)); - show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); -} -EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); - -void rcu_tasks_trace_torture_stats_print(char *tt, char *tf) -{ - rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, ""); -} -EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print); -#endif // !defined(CONFIG_TINY_RCU) - -struct task_struct *get_rcu_tasks_trace_gp_kthread(void) -{ - return rcu_tasks_trace.kthread_ptr; -} -EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread); - -void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq) -{ - *flags = 0; - *gp_seq = rcu_seq_current(&rcu_tasks_trace.tasks_gp_seq); -} -EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data); - -#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ -static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } -#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ - #ifndef CONFIG_TINY_RCU void show_rcu_tasks_gp_kthreads(void) { show_rcu_tasks_classic_gp_kthread(); show_rcu_tasks_rude_gp_kthread(); - show_rcu_tasks_trace_gp_kthread(); } #endif /* #ifndef CONFIG_TINY_RCU */ @@ -2251,10 +1570,6 @@ void __init tasks_cblist_init_generic(void) #ifdef CONFIG_TASKS_RUDE_RCU cblist_init_generic(&rcu_tasks_rude); #endif - -#ifdef CONFIG_TASKS_TRACE_RCU - cblist_init_generic(&rcu_tasks_trace); -#endif } static int __init rcu_init_tasks_generic(void) @@ -2267,10 +1582,6 @@ static int __init rcu_init_tasks_generic(void) rcu_spawn_tasks_rude_kthread(); #endif -#ifdef CONFIG_TASKS_TRACE_RCU - rcu_spawn_tasks_trace_kthread(); -#endif - // Run the self-tests. 
rcu_tasks_initiate_self_tests(); @@ -2281,3 +1592,16 @@ core_initcall(rcu_init_tasks_generic); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Tracing variant of Tasks RCU. This variant is designed to be used +// to protect tracing hooks, including those of BPF. This variant +// is implemented via a straightforward mapping onto SRCU-fast. + +DEFINE_SRCU_FAST(rcu_tasks_trace_srcu_struct); +EXPORT_SYMBOL_GPL(rcu_tasks_trace_srcu_struct); + +#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ |
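
With the old implementation gone, the only per-flavor state left in tasks.h is the DEFINE_SRCU_FAST() instance added at the end of the diff above. The wrappers that map the Tasks Trace API onto it live in headers outside the kernel/ directory covered by this diffstat, so the sketch below is an assumption about their general shape rather than the series' actual definitions: the my_*() names are placeholders, and the SRCU-fast reader calls are assumed to match those of recent kernels.

```c
/*
 * Assumption-labeled sketch: how the Tasks Trace operations can reduce to
 * SRCU-fast operations on rcu_tasks_trace_srcu_struct.  The my_*() names
 * are placeholders; the real wrappers added by this series are defined
 * outside the kernel/ diffstat shown here and may differ in detail.
 */
#include <linux/srcu.h>

extern struct srcu_struct rcu_tasks_trace_srcu_struct;

/* Reader: SRCU-fast avoids read-side memory barriers on the fast path. */
static inline struct srcu_ctr __percpu *my_tasks_trace_read_lock(void)
{
	return srcu_read_lock_fast(&rcu_tasks_trace_srcu_struct);
}

static inline void my_tasks_trace_read_unlock(struct srcu_ctr __percpu *scp)
{
	srcu_read_unlock_fast(&rcu_tasks_trace_srcu_struct, scp);
}

/* Updater: a Tasks Trace grace period becomes an SRCU grace period. */
static inline void my_synchronize_tasks_trace(void)
{
	synchronize_srcu(&rcu_tasks_trace_srcu_struct);
}

static inline void my_call_tasks_trace(struct rcu_head *rhp, rcu_callback_t func)
{
	call_srcu(&rcu_tasks_trace_srcu_struct, rhp, func);
}
```

Centralizing the real work in SRCU-fast is what allows the diff above to delete the IPI-based quiescent-state scanning, the holdout lists, and the TASKS_TRACE_RCU_READ_MB heavyweight-reader mode, leaving only the srcu_struct definition behind.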
