From be8ee18152b0523752f3a44900363838bd1573bb Mon Sep 17 00:00:00 2001 From: Atul Kumar Pant Date: Sat, 18 Jan 2025 14:09:27 +0530 Subject: sched_ext: Fixes typos in comments Fixes some spelling errors in the comments. Signed-off-by: Atul Kumar Pant Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8857c0709bdd..283d7f1addc5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -416,7 +416,7 @@ struct sched_ext_ops { /** * @update_idle: Update the idle state of a CPU - * @cpu: CPU to udpate the idle state for + * @cpu: CPU to update the idle state for * @idle: whether entering or exiting the idle state * * This operation is called when @rq's CPU goes or leaves the idle @@ -1214,7 +1214,7 @@ static bool scx_kf_allowed_if_unlocked(void) /** * nldsq_next_task - Iterate to the next task in a non-local DSQ - * @dsq: user dsq being interated + * @dsq: user dsq being iterated * @cur: current position, %NULL to start iteration * @rev: walk backwards * @@ -2078,7 +2078,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) /* * list_add_tail() must be used. scx_ops_bypass() depends on tasks being - * appened to the runnable_list. + * appended to the runnable_list. */ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); } @@ -2480,7 +2480,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, /* * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly * banging on the same DSQ on a large NUMA system to the point where switching - * to the bypass mode can take a long time. Inject artifical delays while the + * to the bypass mode can take a long time. Inject artificial delays while the * bypass mode is switching to guarantee timely completion. */ static void scx_ops_breather(struct rq *rq) @@ -3144,7 +3144,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) * * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used * to implement the default task ordering. The older the timestamp, the higher - * prority the task - the global FIFO ordering matching the default scheduling + * priority the task - the global FIFO ordering matching the default scheduling * behavior. * * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to @@ -4590,7 +4590,7 @@ static int scx_cgroup_init(void) cgroup_warned_missing_idle = false; /* - * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and init, all online cgroups are initialized. */ rcu_read_lock(); -- cgit v1.2.3 From 2279563e3a8cac367b267b09c15cf1e39c06c5cc Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 22 Jan 2025 10:05:25 +0100 Subject: sched_ext: Include task weight in the error state dump Report the task weight when dumping the task state during an error exit. Moreover, adjust the output format to display dsq_vtime, slice, and weight on the same line. This can help identify whether certain tasks were excessively prioritized or de-prioritized due to large niceness gaps. 
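As a rough illustration of why the weight field is useful in the dump, here is a userspace sketch (not kernel code); it assumes the usual convention that nice 0 maps to weight 1024 and each nice step scales the weight by roughly 1.25x, which only approximates the kernel's exact sched_prio_to_weight table:

#include <math.h>
#include <stdio.h>

/* Approximate task weight for a given nice value: nice 0 -> 1024,
 * each nice level changes the weight by about a factor of 1.25. */
static unsigned int approx_weight(int nice)
{
	return (unsigned int)(1024.0 * pow(1.25, -nice) + 0.5);
}

int main(void)
{
	int nice;

	/* A gap of 10 nice levels is roughly a 10x weight ratio, which is
	 * what the new weight= field makes visible in the error dump. */
	for (nice = -10; nice <= 10; nice += 5)
		printf("nice %3d -> weight ~%u\n", nice, approx_weight(nice));
	return 0;
}

Build with "cc -o weight weight.c -lm"; comparing the printed values against the weight= entries in a dump makes large niceness gaps easy to spot.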
Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 283d7f1addc5..7081c7be5f62 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5277,9 +5277,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); - dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu", - p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, - p->scx.dsq_vtime, p->scx.slice); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", + p->scx.dsq_vtime, p->scx.slice, p->scx.weight); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); if (SCX_HAS_OP(dump_task)) { -- cgit v1.2.3 From e76946110137703c16423baf6ee177b751a34b7e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 23 Jan 2025 16:25:35 +0800 Subject: workqueue: Put the pwq after detaching the rescuer from the pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 68f83057b913("workqueue: Reap workers via kthread_stop() and remove detach_completion") adds code to reap the normal workers but mistakenly does not handle the rescuer and also removes the code waiting for the rescuer in put_unbound_pool(), which caused a use-after-free bug reported by Cheung Wall. To avoid the use-after-free bug, the pool’s reference must be held until the detachment is complete. Therefore, move the code that puts the pwq after detaching the rescuer from the pool. Reported-by: cheung wall Cc: cheung wall Link: https://lore.kernel.org/lkml/CAKHoSAvP3iQW+GwmKzWjEAOoPvzeWeoMO0Gz7Pp3_4kxt-RMoA@mail.gmail.com/ Fixes: 68f83057b913("workqueue: Reap workers via kthread_stop() and remove detach_completion") Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33a23c7b2274..ccad33001c58 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3516,12 +3516,6 @@ repeat: } } - /* - * Put the reference grabbed by send_mayday(). @pool won't - * go away while we're still attached to it. - */ - put_pwq(pwq); - /* * Leave this pool. Notify regular workers; otherwise, we end up * with 0 concurrency and stalling the execution. @@ -3532,6 +3526,12 @@ repeat: worker_detach_from_pool(rescuer); + /* + * Put the reference grabbed by send_mayday(). @pool might + * go away any time after it. + */ + put_pwq_unlocked(pwq); + raw_spin_lock_irq(&wq_mayday_lock); } -- cgit v1.2.3 From d6f3e7d564b2309e1f17e709a70eca78d7ca2bb8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Jan 2025 12:22:12 -1000 Subject: sched_ext: Fix incorrect autogroup migration detection scx_move_task() is called from sched_move_task() and tells the BPF scheduler that cgroup migration is being committed. sched_move_task() is used by both cgroup and autogroup migrations and scx_move_task() tried to filter out autogroup migrations by testing the destination cgroup and PF_EXITING but this is not enough. 
In fact, without explicitly tagging the thread which is doing the cgroup migration, there is no good way to tell apart scx_move_task() invocations for racing migration to the root cgroup and an autogroup migration. This led to scx_move_task() incorrectly ignoring a migration from non-root cgroup to an autogroup of the root cgroup triggering the following warning: WARNING: CPU: 7 PID: 1 at kernel/sched/ext.c:3725 scx_cgroup_can_attach+0x196/0x340 ... Call Trace: cgroup_migrate_execute+0x5b1/0x700 cgroup_attach_task+0x296/0x400 __cgroup_procs_write+0x128/0x140 cgroup_procs_write+0x17/0x30 kernfs_fop_write_iter+0x141/0x1f0 vfs_write+0x31d/0x4a0 __x64_sys_write+0x72/0xf0 do_syscall_64+0x82/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix it by adding an argument to sched_move_task() that indicates whether the moving is for a cgroup or autogroup migration. After the change, scx_move_task() is called only for cgroup migrations and renamed to scx_cgroup_move_task(). Link: https://github.com/sched-ext/scx/issues/370 Fixes: 819513666966 ("sched_ext: Add cgroup support") Cc: stable@vger.kernel.org # v6.12+ Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/sched/autogroup.c | 4 ++-- kernel/sched/core.c | 7 ++++--- kernel/sched/ext.c | 15 +-------------- kernel/sched/ext.h | 4 ++-- kernel/sched/sched.h | 2 +- 5 files changed, 10 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index db68a964e34e..c4a3ccf6a8ac 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -150,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p) * see this thread after that: we can no longer use signal->autogroup. * See the PF_EXITING check in task_wants_autogroup(). */ - sched_move_task(p); + sched_move_task(p, true); } static void @@ -182,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * sched_autogroup_exit_task(). */ for_each_thread(p, t) - sched_move_task(t); + sched_move_task(t, true); unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 901170708e2a..e77897a62442 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9042,7 +9042,7 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect * its new group. 
*/ -void sched_move_task(struct task_struct *tsk) +void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; @@ -9071,7 +9071,8 @@ void sched_move_task(struct task_struct *tsk) put_prev_task(rq, tsk); sched_change_group(tsk, group); - scx_move_task(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -9172,7 +9173,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); + sched_move_task(task, false); scx_cgroup_finish_attach(); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7081c7be5f62..c7b159f48834 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4323,24 +4323,11 @@ err: return ops_sanitize_err("cgroup_prep_move", ret); } -void scx_move_task(struct task_struct *p) +void scx_cgroup_move_task(struct task_struct *p) { if (!scx_cgroup_enabled) return; - /* - * We're called from sched_move_task() which handles both cgroup and - * autogroup moves. Ignore the latter. - * - * Also ignore exiting tasks, because in the exit path tasks transition - * from the autogroup to the root group, so task_group_is_autogroup() - * alone isn't able to catch exiting autogroup tasks. This is safe for - * cgroup_move(), because cgroup migrations never happen for PF_EXITING - * tasks. - */ - if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) - return; - /* * @p must have ops.cgroup_prep_move() called on it and thus * cgrp_moving_from set. diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 4d022d17ac7d..1079b56b0f7a 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -73,7 +73,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); -void scx_move_task(struct task_struct *p); +void scx_cgroup_move_task(struct task_struct *p); void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); @@ -82,7 +82,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle); static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static inline void scx_move_task(struct task_struct *p) {} +static inline void scx_cgroup_move_task(struct task_struct *p) {} static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 38e0e323dda2..b93c8c3dc05a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -572,7 +572,7 @@ extern void sched_online_group(struct task_group *tg, extern void sched_destroy_group(struct task_group *tg); extern void sched_release_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); +extern void sched_move_task(struct task_struct *tsk, bool for_autogroup); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -- cgit v1.2.3 From 
1626e5ef0b00386a4fd083fa7c46c8edbd75f9b4 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 27 Jan 2025 23:06:16 +0100 Subject: sched_ext: Fix lock imbalance in dispatch_to_local_dsq() While performing the rq locking dance in dispatch_to_local_dsq(), we may trigger the following lock imbalance condition, in particular when multiple tasks are rapidly changing CPU affinity (i.e., running a `stress-ng --race-sched 0`): [ 13.413579] ===================================== [ 13.413660] WARNING: bad unlock balance detected! [ 13.413729] 6.13.0-virtme #15 Not tainted [ 13.413792] ------------------------------------- [ 13.413859] kworker/1:1/80 is trying to release lock (&rq->__lock) at: [ 13.413954] [] dispatch_to_local_dsq+0x108/0x1a0 [ 13.414111] but there are no more locks to release! [ 13.414176] [ 13.414176] other info that might help us debug this: [ 13.414258] 1 lock held by kworker/1:1/80: [ 13.414318] #0: ffff8b66feb41698 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x20/0x90 [ 13.414612] [ 13.414612] stack backtrace: [ 13.415255] CPU: 1 UID: 0 PID: 80 Comm: kworker/1:1 Not tainted 6.13.0-virtme #15 [ 13.415505] Workqueue: 0x0 (events) [ 13.415567] Sched_ext: dsp_local_on (enabled+all), task: runnable_at=-2ms [ 13.415570] Call Trace: [ 13.415700] [ 13.415744] dump_stack_lvl+0x78/0xe0 [ 13.415806] ? dispatch_to_local_dsq+0x108/0x1a0 [ 13.415884] print_unlock_imbalance_bug+0x11b/0x130 [ 13.415965] ? dispatch_to_local_dsq+0x108/0x1a0 [ 13.416226] lock_release+0x231/0x2c0 [ 13.416326] _raw_spin_unlock+0x1b/0x40 [ 13.416422] dispatch_to_local_dsq+0x108/0x1a0 [ 13.416554] flush_dispatch_buf+0x199/0x1d0 [ 13.416652] balance_one+0x194/0x370 [ 13.416751] balance_scx+0x61/0x1e0 [ 13.416848] prev_balance+0x43/0xb0 [ 13.416947] __pick_next_task+0x6b/0x1b0 [ 13.417052] __schedule+0x20d/0x1740 This happens because dispatch_to_local_dsq() is racing with dispatch_dequeue() and, when the latter wins, we incorrectly assume that the task has been moved to dst_rq. Fix by properly tracking the currently locked rq. Fixes: 4d3ca89bdd31 ("sched_ext: Refactor consume_remote_task()") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c7b159f48834..a6d6d6dadde5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2575,6 +2575,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, { struct rq *src_rq = task_rq(p); struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); +#ifdef CONFIG_SMP + struct rq *locked_rq = rq; +#endif /* * We're synchronized against dequeue through DISPATCHING. 
As @p can't @@ -2611,8 +2614,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); /* switch to @src_rq lock */ - if (rq != src_rq) { - raw_spin_rq_unlock(rq); + if (locked_rq != src_rq) { + raw_spin_rq_unlock(locked_rq); + locked_rq = src_rq; raw_spin_rq_lock(src_rq); } @@ -2630,6 +2634,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } else { move_remote_task_to_local_dsq(p, enq_flags, src_rq, dst_rq); + /* task has been moved to dst_rq, which is now locked */ + locked_rq = dst_rq; } /* if the destination CPU is idle, wake it up */ @@ -2638,8 +2644,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } /* switch back to @rq lock */ - if (rq != dst_rq) { - raw_spin_rq_unlock(dst_rq); + if (locked_rq != rq) { + raw_spin_rq_unlock(locked_rq); raw_spin_rq_lock(rq); } #else /* CONFIG_SMP */ -- cgit v1.2.3 From b69bb476dee99d564d65d418e9a20acca6f32c3f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 30 Jan 2025 16:05:42 -0800 Subject: cgroup: fix race between fork and cgroup.kill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tejun reported the following race between fork() and cgroup.kill at [1]. Tejun: I was looking at cgroup.kill implementation and wondering whether there could be a race window. So, __cgroup_kill() does the following: k1. Set CGRP_KILL. k2. Iterate tasks and deliver SIGKILL. k3. Clear CGRP_KILL. The copy_process() does the following: c1. Copy a bunch of stuff. c2. Grab siglock. c3. Check fatal_signal_pending(). c4. Commit to forking. c5. Release siglock. c6. Call cgroup_post_fork() which puts the task on the css_set and tests CGRP_KILL. The intention seems to be that either a forking task gets SIGKILL and terminates on c3 or it sees CGRP_KILL on c6 and kills the child. However, I don't see what guarantees that k3 can't happen before c6. ie. After a forking task passes c5, k2 can take place and then before the forking task reaches c6, k3 can happen. Then, nobody would send SIGKILL to the child. What am I missing? This is indeed a race. One way to fix this race is by taking cgroup_threadgroup_rwsem in write mode in __cgroup_kill() as the fork() side takes cgroup_threadgroup_rwsem in read mode from cgroup_can_fork() to cgroup_post_fork(). However, that would be heavy-handed as this adds one more potential stall scenario for cgroup.kill which is usually called under extreme situations like memory pressure. To fix this race, let's maintain a sequence number per cgroup which gets incremented on __cgroup_kill() call. On the fork() side, the cgroup_can_fork() will cache the sequence number locally and recheck it against the cgroup's sequence number at cgroup_post_fork() site. If the sequence numbers mismatch, it means __cgroup_kill() has been called and we should send SIGKILL to the newly created task. 
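For readers unfamiliar with the pattern, here is a minimal userspace sketch of the sequence-number scheme described above (an assumed simplification, not the kernel code; the real implementation keeps kill_seq per cgroup under css_set_lock and stashes the snapshot in kernel_clone_args):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint kill_seq;	/* stands in for the per-cgroup kill_seq */

/* Killer side: bump the sequence first, then deliver SIGKILL to the
 * tasks currently in the cgroup. */
static void cgroup_kill(void)
{
	atomic_fetch_add(&kill_seq, 1);
	/* ... iterate member tasks and send SIGKILL ... */
}

/* Fork side: snapshot early (cgroup_css_set_fork), recheck late
 * (cgroup_post_fork); a mismatch means a kill ran inside the window. */
static unsigned int fork_snapshot(void)
{
	return atomic_load(&kill_seq);
}

static bool child_must_die(unsigned int snap)
{
	return atomic_load(&kill_seq) != snap;
}

int main(void)
{
	unsigned int snap = fork_snapshot();

	cgroup_kill();	/* the kill lands inside the fork window */
	printf("kill the new child: %s\n", child_must_die(snap) ? "yes" : "no");
	return 0;
}

Because the counter only ever moves forward, the forking task cannot miss a kill the way it could with a flag that is set and then cleared again.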
Reported-by: Tejun Heo Closes: https://lore.kernel.org/all/Z5QHE2Qn-QZ6M-KW@slm.duckdns.org/ [1] Fixes: 661ee6280931 ("cgroup: introduce cgroup.kill") Cc: stable@vger.kernel.org # v5.14+ Signed-off-by: Shakeel Butt Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9061bd55436..afc665b7b1fe 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4013,7 +4013,7 @@ static void __cgroup_kill(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - set_bit(CGRP_KILL, &cgrp->flags); + cgrp->kill_seq++; spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); @@ -4029,10 +4029,6 @@ static void __cgroup_kill(struct cgroup *cgrp) send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); - - spin_lock_irq(&css_set_lock); - clear_bit(CGRP_KILL, &cgrp->flags); - spin_unlock_irq(&css_set_lock); } static void cgroup_kill(struct cgroup *cgrp) @@ -6488,6 +6484,10 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); + if (kargs->cgrp) + kargs->kill_seq = kargs->cgrp->kill_seq; + else + kargs->kill_seq = cset->dfl_cgrp->kill_seq; spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { @@ -6668,6 +6668,7 @@ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + unsigned int cgrp_kill_seq = 0; unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; @@ -6681,10 +6682,13 @@ void cgroup_post_fork(struct task_struct *child, /* init tasks are special, only link regular threads */ if (likely(child->pid)) { - if (kargs->cgrp) + if (kargs->cgrp) { cgrp_flags = kargs->cgrp->flags; - else + cgrp_kill_seq = kargs->cgrp->kill_seq; + } else { cgrp_flags = cset->dfl_cgrp->flags; + cgrp_kill_seq = cset->dfl_cgrp->kill_seq; + } WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; @@ -6719,7 +6723,7 @@ void cgroup_post_fork(struct task_struct *child, * child down right after we finished preparing it for * userspace. */ - kill = test_bit(CGRP_KILL, &cgrp_flags); + kill = kargs->kill_seq != cgrp_kill_seq; } spin_unlock_irq(&css_set_lock); -- cgit v1.2.3 From db5fd3cf8bf41b84b577b8ad5234ea95f327c9be Mon Sep 17 00:00:00 2001 From: Muhammad Adeel Date: Fri, 7 Feb 2025 14:24:32 +0000 Subject: cgroup: Remove steal time from usage_usec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CPU usage time is the time when user, system or both are using the CPU. Steal time is the time when CPU is waiting to be run by the Hypervisor. It should not be added to the CPU usage time, hence removing it from the usage_usec entry. 
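As a sketch of the accounting change (an assumed simplification of the usage aggregation; the field grouping only roughly mirrors the kernel's CPUTIME_* buckets):

#include <stdio.h>

/* Simplified view of the per-CPU time buckets involved here. */
struct cpu_times {
	unsigned long long user;	/* roughly CPUTIME_USER (+ nice) */
	unsigned long long sys;		/* roughly CPUTIME_SYSTEM (+ irq/softirq) */
	unsigned long long steal;	/* CPUTIME_STEAL: waiting on the hypervisor */
};

/* usage_usec now reflects time the CPU actually executed work:
 * user + system, with steal time no longer folded in. */
static unsigned long long usage(const struct cpu_times *t)
{
	return t->user + t->sys;
}

int main(void)
{
	struct cpu_times t = { .user = 700, .sys = 200, .steal = 100 };

	printf("usage=%llu (steal %llu excluded)\n", usage(&t), t.steal);
	return 0;
}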
Fixes: 936f2a70f2077 ("cgroup: add cpu.stat file to root cgroup") Acked-by: Axel Busch Acked-by: Michal Koutný Signed-off-by: Muhammad Adeel Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 5877974ece92..aac91466279f 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -590,7 +590,6 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; - cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; -- cgit v1.2.3 From bcc6244e13b4d4903511a1ea84368abf925031c0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 29 Jan 2025 20:53:03 +0100 Subject: sched: Clarify wake_up_q()'s write to task->wake_q.next Clarify that wake_up_q() does an atomic write to task->wake_q.next, after which a concurrent __wake_q_add() can immediately overwrite task->wake_q.next again. Signed-off-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250129-sched-wakeup-prettier-v1-1-2f51f5f663fa@google.com --- kernel/sched/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e5a6bf587f9..8931d9b1e895 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1055,9 +1055,10 @@ void wake_up_q(struct wake_q_head *head) struct task_struct *task; task = container_of(node, struct task_struct, wake_q); - /* Task can safely be re-inserted now: */ node = node->next; - task->wake_q.next = NULL; + /* pairs with cmpxchg_relaxed() in __wake_q_add() */ + WRITE_ONCE(task->wake_q.next, NULL); + /* Task can safely be re-inserted now. */ /* * wake_up_process() executes a full barrier, which pairs with -- cgit v1.2.3 From 2fa0fbeb69edd367b7c44f484e8dc5a5a1a311ef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 7 Feb 2025 10:58:23 -1000 Subject: sched_ext: Implement auto local dispatching of migration disabled tasks Migration disabled tasks are special and pinned to their previous CPUs. They tripped up some unsuspecting BPF schedulers as their ->nr_cpus_allowed may not agree with the bits set in ->cpus_ptr. Make it easier for BPF schedulers by automatically dispatching them to the pinned local DSQs by default. If a BPF scheduler wants to handle migration disabled tasks explicitly, it can set SCX_OPS_ENQ_MIGRATION_DISABLED. Signed-off-by: Tejun Heo Acked-by: Andrea Righi --- kernel/sched/ext.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a6d6d6dadde5..efdbf4d85a21 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -122,6 +122,19 @@ enum scx_ops_flags { */ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + /* + * A migration disabled task can only execute on its current CPU. By + * default, such tasks are automatically put on the CPU's local DSQ with + * the default slice on enqueue. If this ops flag is set, they also go + * through ops.enqueue(). + * + * A migration disabled task never invokes ops.select_cpu() as it can + * only select the current CPU. Also, p->cpus_ptr will only contain its + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr + * and thus may disagree with cpumask_weight(p->cpus_ptr). 
+ */ + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + /* * CPU cgroup support flags */ @@ -130,6 +143,7 @@ enum scx_ops_flags { SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | SCX_OPS_SWITCH_PARTIAL | SCX_OPS_HAS_CGROUP_WEIGHT, }; @@ -882,6 +896,7 @@ static bool scx_warned_zero_slice; static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); @@ -2014,6 +2029,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, unlikely(p->flags & PF_EXITING)) goto local; + /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ + if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && + is_migration_disabled(p)) + goto local; + if (!SCX_HAS_OP(enqueue)) goto global; @@ -5052,6 +5072,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable(&scx_has_op[i]); static_branch_disable(&scx_ops_enq_last); static_branch_disable(&scx_ops_enq_exiting); + static_branch_disable(&scx_ops_enq_migration_disabled); static_branch_disable(&scx_ops_cpu_preempt); static_branch_disable(&scx_builtin_idle_enabled); synchronize_rcu(); @@ -5661,6 +5682,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable(&scx_ops_enq_exiting); + if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) + static_branch_enable(&scx_ops_enq_migration_disabled); if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); -- cgit v1.2.3 From 32966821574cd2917bd60f2554f435fe527f4702 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 7 Feb 2025 10:59:06 -1000 Subject: sched_ext: Fix migration disabled handling in targeted dispatches A dispatch operation that can target a specific local DSQ - scx_bpf_dsq_move_to_local() or scx_bpf_dsq_move() - checks whether the task can be migrated to the target CPU using task_can_run_on_remote_rq(). If the task can't be migrated to the targeted CPU, it is bounced through a global DSQ. task_can_run_on_remote_rq() assumes that the task is on a CPU that's different from the targeted CPU but the callers don't uphold the assumption and may call the function when the task is already on the target CPU. When such a task has migration disabled, task_can_run_on_remote_rq() ends up returning %false incorrectly, unnecessarily bouncing the task to a global DSQ. Fix it by updating the callers to only call task_can_run_on_remote_rq() when the task is on a different CPU than the target CPU. As this is a bit subtle, for clarity and documentation: - Make task_can_run_on_remote_rq() trigger SCHED_WARN_ON() if the task is on the same CPU as the target CPU. - is_migration_disabled() test in task_can_run_on_remote_rq() cannot trigger if the task is on a different CPU than the target CPU as the preceding task_allowed_on_cpu() test should fail beforehand. Convert the test into SCHED_WARN_ON(). 
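A compact sketch of the calling convention this patch establishes follows (hypothetical stand-in types and names; the real helper also checks the task's cpumask and whether the remote rq is online):

#include <stdbool.h>
#include <stdio.h>

struct rq { int cpu; };
struct task { bool migration_disabled; };

/* Stand-in for task_can_run_on_remote_rq(): after the fix it may assume
 * the task is NOT already on the target CPU. */
static bool can_run_on_remote(const struct task *p, const struct rq *dst)
{
	return !p->migration_disabled;
}

/* Caller-side rule: consult the helper only when the dispatch really is
 * remote; same-CPU dispatches go straight to the target local DSQ. */
static const char *pick_dsq(const struct task *p, struct rq *src, struct rq *dst)
{
	if (src != dst && !can_run_on_remote(p, dst))
		return "global DSQ (fallback)";
	return "target local DSQ";
}

int main(void)
{
	struct rq cpu0 = { .cpu = 0 };
	struct task p = { .migration_disabled = true };

	/* Migration disabled task dispatched to its own CPU: previously
	 * bounced to a global DSQ, now lands on the local DSQ. */
	printf("%s\n", pick_dsq(&p, &cpu0, &cpu0));
	return 0;
}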
Signed-off-by: Tejun Heo Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") Fixes: 0366017e0973 ("sched_ext: Use task_can_run_on_remote_rq() test in dispatch_to_local_dsq()") Cc: stable@vger.kernel.org # v6.12+ --- kernel/sched/ext.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index efdbf4d85a21..e01144340d67 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2333,12 +2333,16 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, * * - The BPF scheduler is bypassed while the rq is offline and we can always say * no to the BPF scheduler initiated migrations while offline. + * + * The caller must ensure that @p and @rq are on different CPUs. */ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { int cpu = cpu_of(rq); + SCHED_WARN_ON(task_cpu(p) == cpu); + /* * We don't require the BPF scheduler to avoid dispatching to offline * CPUs mostly for convenience but also because CPUs can go offline @@ -2352,8 +2356,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, return false; } - if (unlikely(is_migration_disabled(p))) - return false; + /* + * If @p has migration disabled, @p->cpus_ptr only contains its current + * CPU and the above task_allowed_on_cpu() test should have failed. + */ + SCHED_WARN_ON(is_migration_disabled(p)); if (!scx_rq_online(rq)) return false; @@ -2457,7 +2464,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); - if (!task_can_run_on_remote_rq(p, dst_rq, true)) { + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dst_dsq = find_global_dsq(p); dst_rq = src_rq; } @@ -2611,7 +2619,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } #ifdef CONFIG_SMP - if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dispatch_enqueue(find_global_dsq(p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; -- cgit v1.2.3 From f3f08c3acfb8860e07a22814a344e83c99ad7398 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 10 Feb 2025 09:27:09 -1000 Subject: sched_ext: Fix incorrect assumption about migration disabled tasks in task_can_run_on_remote_rq() While fixing migration disabled task handling, 32966821574c ("sched_ext: Fix migration disabled handling in targeted dispatches") assumed that a migration disabled task's ->cpus_ptr would only have the pinned CPU. While this is eventually true for migration disabled tasks that are switched out, ->cpus_ptr update is performed by migrate_disable_switch() which is called right before context_switch() in __scheduler(). However, the task is enqueued earlier during pick_next_task() via put_prev_task_scx(), so there is a race window where another CPU can see the task on a DSQ. If the CPU tries to dispatch the migration disabled task while in that window, task_allowed_on_cpu() will succeed and task_can_run_on_remote_rq() will subsequently trigger SCHED_WARN(is_migration_disabled()). WARNING: CPU: 8 PID: 1837 at kernel/sched/ext.c:2466 task_can_run_on_remote_rq+0x12e/0x140 Sched_ext: layered (enabled+all), task: runnable_at=-10ms RIP: 0010:task_can_run_on_remote_rq+0x12e/0x140 ... 
consume_dispatch_q+0xab/0x220 scx_bpf_dsq_move_to_local+0x58/0xd0 bpf_prog_84dd17b0654b6cf0_layered_dispatch+0x290/0x1cfa bpf__sched_ext_ops_dispatch+0x4b/0xab balance_one+0x1fe/0x3b0 balance_scx+0x61/0x1d0 prev_balance+0x46/0xc0 __pick_next_task+0x73/0x1c0 __schedule+0x206/0x1730 schedule+0x3a/0x160 __do_sys_sched_yield+0xe/0x20 do_syscall_64+0xbb/0x1e0 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fix it by converting the SCHED_WARN() back to a regular failure path. Also, perform the migration disabled test before task_allowed_on_cpu() test so that BPF schedulers which fail to handle migration disabled tasks can be noticed easily. While at it, adjust scx_ops_error() message for !task_allowed_on_cpu() case for brevity and consistency. Signed-off-by: Tejun Heo Fixes: 32966821574c ("sched_ext: Fix migration disabled handling in targeted dispatches") Acked-by: Andrea Righi Reported-by: Jake Hillion --- kernel/sched/ext.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e01144340d67..54edd0e2132a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2343,6 +2343,25 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, SCHED_WARN_ON(task_cpu(p) == cpu); + /* + * If @p has migration disabled, @p->cpus_ptr is updated to contain only + * the pinned CPU in migrate_disable_switch() while @p is being switched + * out. However, put_prev_task_scx() is called before @p->cpus_ptr is + * updated and thus another CPU may see @p on a DSQ inbetween leading to + * @p passing the below task_allowed_on_cpu() check while migration is + * disabled. + * + * Test the migration disabled state first as the race window is narrow + * and the BPF scheduler failing to check migration disabled state can + * easily be masked if task_allowed_on_cpu() is done first. + */ + if (unlikely(is_migration_disabled(p))) { + if (trigger_error) + scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", + p->comm, p->pid, task_cpu(p), cpu); + return false; + } + /* * We don't require the BPF scheduler to avoid dispatching to offline * CPUs mostly for convenience but also because CPUs can go offline @@ -2351,17 +2370,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, */ if (!task_allowed_on_cpu(p, cpu)) { if (trigger_error) - scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", - cpu_of(rq), p->comm, p->pid); + scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", + cpu, p->comm, p->pid); return false; } - /* - * If @p has migration disabled, @p->cpus_ptr only contains its current - * CPU and the above task_allowed_on_cpu() test should have failed. - */ - SCHED_WARN_ON(is_migration_disabled(p)); - if (!scx_rq_online(rq)) return false; -- cgit v1.2.3 From 4cf7d58620bfc2ebe934e3dfa97208f13f14ab8b Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Sun, 9 Feb 2025 09:46:50 +0530 Subject: genirq: Remove unused CONFIG_GENERIC_PENDING_IRQ_CHIPFLAGS CONFIG_GENERIC_PENDING_IRQ_CHIPFLAGS is not used anymore, hence remove it. 
Fixes: f94a18249b7f ("genirq: Remove IRQ_MOVE_PCNTXT and related code") Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250209041655.331470-7-apatel@ventanamicro.com --- kernel/irq/Kconfig | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5432418c0fea..875f25ed6f71 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -31,10 +31,6 @@ config GENERIC_IRQ_EFFECTIVE_AFF_MASK config GENERIC_PENDING_IRQ bool -# Deduce delayed migration from top-level interrupt chip flags -config GENERIC_PENDING_IRQ_CHIPFLAGS - bool - # Support for generic irq migrating off cpu before the cpu is offline. config GENERIC_IRQ_MIGRATION bool -- cgit v1.2.3 From f5717c93a1b999970f3a64d771a1a9ee68cc37d0 Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Wed, 12 Feb 2025 21:09:35 +0800 Subject: sched_ext: Use SCX_CALL_OP_TASK in task_tick_scx Now when we use scx_bpf_task_cgroup() in ops.tick() to get the cgroup of the current task, the following error will occur: scx_foo[3795244] triggered exit kind 1024: runtime error (called on a task not being operated on) The reason is that we are using SCX_CALL_OP() instead of SCX_CALL_OP_TASK() when calling ops.tick(), which triggers the error during the subsequent scx_kf_allowed_on_arg_tasks() check. SCX_CALL_OP_TASK() was first introduced in commit 36454023f50b ("sched_ext: Track tasks that are subjects of the in-flight SCX operation") to ensure task's rq lock is held when accessing task's sched_group. Since ops.tick() is marked as SCX_KF_TERMINAL and task_tick_scx() is protected by the rq lock, we can use SCX_CALL_OP_TASK() to avoid the above issue. Similarly, the same changes should be made for ops.disable() and ops.exit_task(), as they are also protected by task_rq_lock() and it's safe to access the task's task_group. Fixes: 36454023f50b ("sched_ext: Track tasks that are subjects of the in-flight SCX operation") Signed-off-by: Chuyi Zhou Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 54edd0e2132a..5a81d9a1e31f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3899,7 +3899,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) curr->scx.slice = 0; touch_core_sched(rq, curr); } else if (SCX_HAS_OP(tick)) { - SCX_CALL_OP(SCX_KF_REST, tick, curr); + SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr); } if (!curr->scx.slice) @@ -4046,7 +4046,7 @@ static void scx_ops_disable_task(struct task_struct *p) WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); if (SCX_HAS_OP(disable)) - SCX_CALL_OP(SCX_KF_REST, disable, p); + SCX_CALL_OP_TASK(SCX_KF_REST, disable, p); scx_set_task_state(p, SCX_TASK_READY); } @@ -4075,7 +4075,7 @@ static void scx_ops_exit_task(struct task_struct *p) } if (SCX_HAS_OP(exit_task)) - SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); + SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args); scx_set_task_state(p, SCX_TASK_NONE); } -- cgit v1.2.3 From 9ba0e1755a40f9920ad0f4168031291b3eb58d7b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Feb 2025 13:19:57 -0500 Subject: ring-buffer: Unlock resize on mmap error Memory mapping the tracing ring buffer will disable resizing the buffer. 
But if there's an error in the memory mapping like an invalid parameter, the function exits out without re-enabling the resizing of the ring buffer, preventing the ring buffer from being resized after that. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250213131957.530ec3c5@gandalf.local.home Fixes: 117c39200d9d7 ("ring-buffer: Introducing ring-buffer mapping functions") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b8e0ae15ca5b..07b421115692 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7126,6 +7126,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, kfree(cpu_buffer->subbuf_ids); cpu_buffer->subbuf_ids = NULL; rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); } unlock: -- cgit v1.2.3 From 60b8f711143de7cd9c0f55be0fe7eb94b19eb5c7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Feb 2025 13:41:32 -0500 Subject: tracing: Have the error of __tracing_resize_ring_buffer() passed to user Currently if __tracing_resize_ring_buffer() returns an error, the tracing_resize_ringbuffer() returns -ENOMEM. But it may not be a memory issue that caused the function to fail. If the ring buffer is memory mapped, then the resizing of the ring buffer will be disabled. But if the user tries to resize the buffer, it will get an -ENOMEM returned, which is confusing because there is plenty of memory. The actual error returned was -EBUSY, which would make much more sense to the user. Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250213134132.7e4505d7@gandalf.local.home Fixes: 117c39200d9d7 ("ring-buffer: Introducing ring-buffer mapping functions") Signed-off-by: Steven Rostedt (Google) Reviewed-by: Masami Hiramatsu (Google) --- kernel/trace/trace.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1496a5ac33ae..25ff37aab00f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5977,8 +5977,6 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id) { - int ret; - guard(mutex)(&trace_types_lock); if (cpu_id != RING_BUFFER_ALL_CPUS) { @@ -5987,11 +5985,7 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr, return -EINVAL; } - ret = __tracing_resize_ring_buffer(tr, size, cpu_id); - if (ret < 0) - ret = -ENOMEM; - - return ret; + return __tracing_resize_ring_buffer(tr, size, cpu_id); } static void update_last_data(struct trace_array *tr) -- cgit v1.2.3 From f5b95f1fa2ef3a03f49eeec658ba97e721412b32 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Feb 2025 10:28:20 -0500 Subject: ring-buffer: Validate the persistent meta data subbuf array The meta data for a mapped ring buffer contains an array of indexes of all the subbuffers. The first entry is the reader page, and the rest of the entries lay out the order of the subbuffers in how the ring buffer link list is to be created. The validator currently makes sure that all the entries are within the range of 0 and nr_subbufs. But it does not check if there are any duplicates. While working on the ring buffer, I corrupted this array, where I added duplicates. 
The validator did not catch it and created the ring buffer link list on top of it. Luckily, the corruption was only that the reader page was also in the writer path and only presented corrupted data but did not crash the kernel. But if there were duplicates in the writer side, then it could corrupt the ring buffer link list and cause a crash. Create a bitmask array with the size of the number of subbuffers. Then clear it. When walking through the subbuf array checking to see if the entries are within the range, test if its bit is already set in the subbuf_mask. If it is, then there is duplicates and fail the validation. If not, set the corresponding bit and continue. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250214102820.7509ddea@gandalf.local.home Fixes: c76883f18e59b ("ring-buffer: Add test if range of boot buffer is valid") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 07b421115692..0419d41a2060 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1672,7 +1672,8 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) * must be the same. */ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, - struct trace_buffer *buffer, int nr_pages) + struct trace_buffer *buffer, int nr_pages, + unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; @@ -1680,6 +1681,9 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, unsigned long buffers_end; int i; + if (!subbuf_mask) + return false; + /* Check the meta magic and meta struct size */ if (meta->magic != RING_BUFFER_META_MAGIC || meta->struct_size != sizeof(*meta)) { @@ -1712,6 +1716,8 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, subbuf = rb_subbufs_from_meta(meta); + bitmap_clear(subbuf_mask, 0, meta->nr_subbufs); + /* Is the meta buffers and the subbufs themselves have correct data? 
*/ for (i = 0; i < meta->nr_subbufs; i++) { if (meta->buffers[i] < 0 || @@ -1725,6 +1731,12 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, return false; } + if (test_bit(meta->buffers[i], subbuf_mask)) { + pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu); + return false; + } + + set_bit(meta->buffers[i], subbuf_mask); subbuf = (void *)subbuf + subbuf_size; } @@ -1889,17 +1901,22 @@ static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) { struct ring_buffer_meta *meta; + unsigned long *subbuf_mask; unsigned long delta; void *subbuf; int cpu; int i; + /* Create a mask to test the subbuf array */ + subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); + /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); - if (rb_meta_valid(meta, cpu, buffer, nr_pages)) { + if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; @@ -1943,6 +1960,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf += meta->subbuf_size; } } + bitmap_free(subbuf_mask); } static void *rbm_start(struct seq_file *m, loff_t *pos) -- cgit v1.2.3 From 129fe718819cc5e24ea2f489db9ccd4371f0c6f6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Feb 2025 11:55:47 -0500 Subject: tracing: Do not allow mmap() of persistent ring buffer When trying to mmap a trace instance buffer that is attached to reserve_mem, it would crash: BUG: unable to handle page fault for address: ffffe97bd00025c8 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 2862f3067 P4D 2862f3067 PUD 0 Oops: Oops: 0000 [#1] PREEMPT_RT SMP PTI CPU: 4 UID: 0 PID: 981 Comm: mmap-rb Not tainted 6.14.0-rc2-test-00003-g7f1a5e3fbf9e-dirty #233 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:validate_page_before_insert+0x5/0xb0 Code: e2 01 89 d0 c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 <48> 8b 46 08 a8 01 75 67 66 90 48 89 f0 8b 50 34 85 d2 74 76 48 89 RSP: 0018:ffffb148c2f3f968 EFLAGS: 00010246 RAX: ffff9fa5d3322000 RBX: ffff9fa5ccff9c08 RCX: 00000000b879ed29 RDX: ffffe97bd00025c0 RSI: ffffe97bd00025c0 RDI: ffff9fa5ccff9c08 RBP: ffffb148c2f3f9f0 R08: 0000000000000004 R09: 0000000000000004 R10: 0000000000000000 R11: 0000000000000200 R12: 0000000000000000 R13: 00007f16a18d5000 R14: ffff9fa5c48db6a8 R15: 0000000000000000 FS: 00007f16a1b54740(0000) GS:ffff9fa73df00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffe97bd00025c8 CR3: 00000001048c6006 CR4: 0000000000172ef0 Call Trace: ? __die_body.cold+0x19/0x1f ? __die+0x2e/0x40 ? page_fault_oops+0x157/0x2b0 ? search_module_extables+0x53/0x80 ? validate_page_before_insert+0x5/0xb0 ? kernelmode_fixup_or_oops.isra.0+0x5f/0x70 ? __bad_area_nosemaphore+0x16e/0x1b0 ? bad_area_nosemaphore+0x16/0x20 ? do_kern_addr_fault+0x77/0x90 ? exc_page_fault+0x22b/0x230 ? asm_exc_page_fault+0x2b/0x30 ? validate_page_before_insert+0x5/0xb0 ? 
vm_insert_pages+0x151/0x400 __rb_map_vma+0x21f/0x3f0 ring_buffer_map+0x21b/0x2f0 tracing_buffers_mmap+0x70/0xd0 __mmap_region+0x6f0/0xbd0 mmap_region+0x7f/0x130 do_mmap+0x475/0x610 vm_mmap_pgoff+0xf2/0x1d0 ksys_mmap_pgoff+0x166/0x200 __x64_sys_mmap+0x37/0x50 x64_sys_call+0x1670/0x1d70 do_syscall_64+0xbb/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f The reason was that the code that maps the ring buffer pages to user space has: page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); And uses that in: vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); But virt_to_page() does not work with vmap()'d memory which is what the persistent ring buffer has. It is rather trivial to allow this, but for now just disable mmap() of instances that have their ring buffer from the reserve_mem option. If an mmap() is performed on a persistent buffer it will return -ENODEV just like it would if the .mmap field wasn't defined in the file_operations structure. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250214115547.0d7287d3@gandalf.local.home Fixes: 9b7bdf6f6ece6 ("tracing: Have trace_printk not use binary prints if boot buffer") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 25ff37aab00f..0e6d517e74e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8279,6 +8279,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) struct trace_iterator *iter = &info->iter; int ret = 0; + /* Currently the boot mapped buffer is not supported for mmap */ + if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) + return -ENODEV; + ret = get_snapshot_map(iter->tr); if (ret) return ret; -- cgit v1.2.3 From 97937834ae876f29565415ab15f1284666dc6be3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Feb 2025 12:35:12 -0500 Subject: ring-buffer: Update pages_touched to reflect persistent buffer content The pages_touched field represents the number of subbuffers in the ring buffer that have content that can be read. This is used in accounting of "dirty_pages" and "buffer_percent" to allow the user to wait for the buffer to be filled to a certain amount before it reads the buffer in blocking mode. The persistent buffer never updated this value so it was set to zero, and this accounting would take it as it had no content. This would cause user space to wait for content even though there's enough content in the ring buffer that satisfies the buffer_percent. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250214123512.0631436e@gandalf.local.home Fixes: 5f3b6e839f3ce ("ring-buffer: Validate boot range memory events") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0419d41a2060..bb6089c2951e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1850,6 +1850,11 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->cpu); goto invalid; } + + /* If the buffer has content, update pages_touched */ + if (ret) + local_inc(&cpu_buffer->pages_touched); + entries += ret; entry_bytes += local_read(&head_page->page->commit); local_set(&cpu_buffer->head_page->entries, ret); -- cgit v1.2.3
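To make the effect concrete, here is a small userspace model of the accounting described above (an assumed simplification of the buffer_percent wake-up check built on pages_touched; names and arithmetic are approximate, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Rough model of "is the buffer full enough to wake blocking readers?":
 * dirty pages are the touched pages that have not been read yet, and the
 * reader is woken once they cover buffer_percent of the buffer. */
static bool full_enough(unsigned long pages_touched, unsigned long pages_read,
			unsigned long nr_pages, unsigned int buffer_percent)
{
	unsigned long dirty = pages_touched > pages_read ?
			      pages_touched - pages_read : 0;

	return dirty * 100 >= (unsigned long)buffer_percent * nr_pages;
}

int main(void)
{
	/* Persistent buffer before the fix: content restored from the
	 * previous boot, but pages_touched left at 0, so readers waited. */
	printf("before: %s\n", full_enough(0, 0, 8, 50) ? "wake" : "wait");

	/* After the fix, pages_touched reflects the restored sub-buffers. */
	printf("after:  %s\n", full_enough(6, 0, 8, 50) ? "wake" : "wait");
	return 0;
}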