From a5b98009f16d8a5fb4a8ff9a193f5735515c38fa Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 14 Apr 2026 14:15:43 +0800 Subject: sched/psi: fix race between file release and pressure write A potential race condition exists between pressure write and cgroup file release regarding the priv member of struct kernfs_open_file, which triggers the uaf reported in [1]. Consider the following scenario involving execution on two separate CPUs: CPU0 CPU1 ==== ==== vfs_rmdir() kernfs_iop_rmdir() cgroup_rmdir() cgroup_kn_lock_live() cgroup_destroy_locked() cgroup_addrm_files() cgroup_rm_file() kernfs_remove_by_name() kernfs_remove_by_name_ns() vfs_write() __kernfs_remove() new_sync_write() kernfs_drain() kernfs_fop_write_iter() kernfs_drain_open_files() cgroup_file_write() kernfs_release_file() pressure_write() cgroup_file_release() ctx = of->priv; kfree(ctx); of->priv = NULL; cgroup_kn_unlock() cgroup_kn_lock_live() cgroup_get(cgrp) cgroup_kn_unlock() if (ctx->psi.trigger) // here, trigger uaf for ctx, that is of->priv The cgroup_rmdir() is protected by the cgroup_mutex, it also safeguards the memory deallocation of of->priv performed within cgroup_file_release(). However, the operations involving of->priv executed within pressure_write() are not entirely covered by the protection of cgroup_mutex. Consequently, if the code in pressure_write(), specifically the section handling the ctx variable executes after cgroup_file_release() has completed, a uaf vulnerability involving of->priv is triggered. Therefore, the issue can be resolved by extending the scope of the cgroup_mutex lock within pressure_write() to encompass all code paths involving of->priv, thereby properly synchronizing the race condition occurring between cgroup_file_release() and pressure_write(). And, if an live kn lock can be successfully acquired while executing the pressure write operation, it indicates that the cgroup deletion process has not yet reached its final stage; consequently, the priv pointer within open_file cannot be NULL. Therefore, the operation to retrieve the ctx value must be moved to a point *after* the live kn lock has been successfully acquired. In another situation, specifically after entering cgroup_kn_lock_live() but before acquiring cgroup_mutex, there exists a different class of race condition: CPU0: write memory.pressure CPU1: write cgroup.pressure=0 =========================== ============================= kernfs_fop_write_iter() kernfs_get_active_of(of) pressure_write() cgroup_kn_lock_live(memory.pressure) cgroup_tryget(cgrp) kernfs_break_active_protection(kn) ... blocks on cgroup_mutex cgroup_pressure_write() cgroup_kn_lock_live(cgroup.pressure) cgroup_file_show(memory.pressure, false) kernfs_show(false) kernfs_drain_open_files() cgroup_file_release(of) kfree(ctx) of->priv = NULL cgroup_kn_unlock() ... acquires cgroup_mutex ctx = of->priv; // may now be NULL if (ctx->psi.trigger) // NULL dereference Consequently, there is a possibility that of->priv is NULL, the pressure write needs to check for this. Now that the scope of the cgroup_mutex has been expanded, the original explicit cgroup_get/put operations are no longer necessary, this is because acquiring/releasing the live kn lock inherently executes a cgroup get/put operation. [1] BUG: KASAN: slab-use-after-free in pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011 Call Trace: pressure_write+0xa4/0x210 kernel/cgroup/cgroup.c:4011 cgroup_file_write+0x36f/0x790 kernel/cgroup/cgroup.c:4311 kernfs_fop_write_iter+0x3b0/0x540 fs/kernfs/file.c:352 Allocated by task 9352: cgroup_file_open+0x90/0x3a0 kernel/cgroup/cgroup.c:4256 kernfs_fop_open+0x9eb/0xcb0 fs/kernfs/file.c:724 do_dentry_open+0x83d/0x13e0 fs/open.c:949 Freed by task 9353: cgroup_file_release+0xd6/0x100 kernel/cgroup/cgroup.c:4283 kernfs_release_file fs/kernfs/file.c:764 [inline] kernfs_drain_open_files+0x392/0x720 fs/kernfs/file.c:834 kernfs_drain+0x470/0x600 fs/kernfs/dir.c:525 Fixes: 0e94682b73bf ("psi: introduce psi monitor") Reported-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=33e571025d88efd1312c Tested-by: syzbot+33e571025d88efd1312c@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1f084ee71443..3243c2087ee3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3934,33 +3934,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { - struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_file_ctx *ctx; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; + ssize_t ret = 0; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - cgroup_get(cgrp); - cgroup_kn_unlock(of->kn); + ctx = of->priv; + if (!ctx) { + ret = -ENODEV; + goto out_unlock; + } /* Allow only one trigger per file descriptor */ if (ctx->psi.trigger) { - cgroup_put(cgrp); - return -EBUSY; + ret = -EBUSY; + goto out_unlock; } psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { - cgroup_put(cgrp); - return PTR_ERR(new); + ret = PTR_ERR(new); + goto out_unlock; } smp_store_release(&ctx->psi.trigger, new); - cgroup_put(cgrp); + +out_unlock: + cgroup_kn_unlock(of->kn); + if (ret) + return ret; return nbytes; } -- cgit v1.2.3 From c802f460dd485c1332b5a35e7adcfb2bc22536a2 Mon Sep 17 00:00:00 2001 From: cuitao Date: Tue, 14 Apr 2026 09:53:27 +0800 Subject: cgroup/rdma: fix integer overflow in rdmacg_try_charge() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The expression `rpool->resources[index].usage + 1` is computed in int arithmetic before being assigned to s64 variable `new`. When usage equals INT_MAX (the default "max" value), the addition overflows to INT_MIN. This negative value then passes the `new > max` check incorrectly, allowing a charge that should be rejected and corrupting usage to negative. Fix by casting usage to s64 before the addition so the arithmetic is done in 64-bit. Fixes: 39d3e7584a68 ("rdmacg: Added rdma cgroup controller") Signed-off-by: cuitao Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 9967fb25c563..4fdab4cf49e0 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -283,7 +283,7 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg, ret = PTR_ERR(rpool); goto err; } else { - new = rpool->resources[index].usage + 1; + new = (s64)rpool->resources[index].usage + 1; if (new > rpool->resources[index].max) { ret = -EAGAIN; goto err; -- cgit v1.2.3 From 41d701ddc36d5301b44ea79529f3cf03c541c1e1 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Fri, 17 Apr 2026 11:37:41 +0800 Subject: cgroup/cpuset: record DL BW alloc CPU for attach rollback cpuset_can_attach() allocates DL bandwidth only when migrating deadline tasks to a disjoint CPU mask, but cpuset_cancel_attach() rolls back based only on nr_migrate_dl_tasks. This makes the DL bandwidth alloc/free paths asymmetric: rollback can call dl_bw_free() even when no dl_bw_alloc() was done. Rollback also needs to undo the reservation against the same CPU/root domain that was charged. Record the CPU used by dl_bw_alloc() and use that state in cpuset_cancel_attach(). If no allocation happened, dl_bw_cpu stays at -1 and rollback skips dl_bw_free(). If allocation did happen, bandwidth is returned to the same CPU/root domain. Successful attach paths are unchanged. This only fixes failed attach rollback accounting. Fixes: 2ef269ef1ac0 ("cgroup/cpuset: Free DL BW in case can_attach() fails") Signed-off-by: Guopeng Zhang Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 5 +++++ kernel/cgroup/cpuset.c | 13 +++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index fd7d19842ded..bb4e692bea30 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -168,6 +168,11 @@ struct cpuset { int nr_deadline_tasks; int nr_migrate_dl_tasks; u64 sum_migrate_dl_bw; + /* + * CPU used for temporary DL bandwidth allocation during attach; + * -1 if no DL bandwidth was allocated in the current attach. + */ + int dl_bw_cpu; /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1335e437098e..e3a081a07c6d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -288,6 +288,7 @@ struct cpuset top_cpuset = { .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, + .dl_bw_cpu = -1, }; /** @@ -579,6 +580,8 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) if (!trial) return NULL; + trial->dl_bw_cpu = -1; + /* Setup cpumask pointer array */ cpumask_var_t *pmask[4] = { &trial->cpus_allowed, @@ -2980,6 +2983,7 @@ static void reset_migrate_dl_data(struct cpuset *cs) { cs->nr_migrate_dl_tasks = 0; cs->sum_migrate_dl_bw = 0; + cs->dl_bw_cpu = -1; } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ @@ -3056,6 +3060,8 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) reset_migrate_dl_data(cs); goto out_unlock; } + + cs->dl_bw_cpu = cpu; } out_success: @@ -3080,12 +3086,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) mutex_lock(&cpuset_mutex); dec_attach_in_progress_locked(cs); - if (cs->nr_migrate_dl_tasks) { - int cpu = cpumask_any(cs->effective_cpus); + if (cs->dl_bw_cpu >= 0) + dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw); - dl_bw_free(cpu, cs->sum_migrate_dl_bw); + if (cs->nr_migrate_dl_tasks) reset_migrate_dl_data(cs); - } mutex_unlock(&cpuset_mutex); } -- cgit v1.2.3 From 87019cb6c26178cef8fb9f9265b6ab7c4bda5262 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2026 05:33:41 -1000 Subject: sched_ext: Mark scx_sched_hash insecure_elasticity scx_sched_hash is inserted into under scx_sched_lock (raw_spinlock_irq) in scx_link_sched(). rhashtable's sync grow path calls get_random_u32() and does a GFP_ATOMIC allocation; both acquire regular spinlocks, which is unsafe under raw_spinlock_t. Set insecure_elasticity to skip the sync grow. v2: - Dropped dsq_hash changes. Insertion is not under raw_spin_lock. - Switched from no_sync_grow flag to insecure_elasticity. Fixes: 25037af712eb ("sched_ext: Add rhashtable lookup for sub-schedulers") Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 012ca8bd70fb..7edd46f3ac43 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -32,6 +32,7 @@ static const struct rhashtable_params scx_sched_hash_params = { .key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id), .key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id), .head_offset = offsetof(struct scx_sched, hash_node), + .insecure_elasticity = true, /* inserted under scx_sched_lock */ }; static struct rhashtable scx_sched_hash; -- cgit v1.2.3 From 2d2b026c3ea792a0c91d4acf4430d8b65bedf271 Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Mon, 20 Apr 2026 17:28:47 +0800 Subject: sched_ext: Deny SCX kfuncs to non-SCX struct_ops programs scx_kfunc_context_filter() currently allows non-SCX struct_ops programs (e.g. tcp_congestion_ops) to call SCX unlocked kfuncs. This is wrong for two reasons: - It is semantically incorrect: a TCP congestion control program has no business calling SCX kfuncs such as scx_bpf_kick_cpu(). - With CONFIG_EXT_SUB_SCHED=y, kfuncs like scx_bpf_kick_cpu() call scx_prog_sched(aux), which invokes bpf_prog_get_assoc_struct_ops(aux) and casts the result to struct sched_ext_ops * before reading ops->priv. For a non-SCX struct_ops program the returned pointer is the kdata of that struct_ops type, which is far smaller than sched_ext_ops, making the read an out-of-bounds access (confirmed with KASAN). Extend the filter to cover scx_kfunc_set_any and scx_kfunc_set_idle as well, and deny all SCX kfuncs for any struct_ops program that is not the SCX struct_ops. This addresses both issues: the semantic contract is enforced at the verifier level, and the runtime out-of-bounds access becomes unreachable. Fixes: d1d3c1c6ae36 ("sched_ext: Add verifier-time kfunc context filter") Suggested-by: Tejun Heo Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 32 ++++++++++++++++++-------------- kernel/sched/ext_idle.c | 1 + kernel/sched/ext_idle.h | 1 + 3 files changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7edd46f3ac43..d66fea57ee69 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9480,6 +9480,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_any, + .filter = scx_kfunc_context_filter, }; /* @@ -9527,13 +9528,12 @@ static const u32 scx_kf_allow_flags[] = { }; /* - * Verifier-time filter for context-sensitive SCX kfuncs. Registered via the - * .filter field on each per-group btf_kfunc_id_set. The BPF core invokes this - * for every kfunc call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or + * Verifier-time filter for SCX kfuncs. Registered via the .filter field on + * each per-group btf_kfunc_id_set. The BPF core invokes this for every kfunc + * call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the - * kfunc - so the filter must short-circuit on kfuncs it doesn't govern (e.g. - * scx_kfunc_ids_any) by falling through to "allow" when none of the - * context-sensitive sets contain the kfunc. + * kfunc - so the filter must short-circuit on kfuncs it doesn't govern by + * falling through to "allow" when none of the SCX sets contain the kfunc. */ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) { @@ -9542,18 +9542,21 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); + bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id); + bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id); u32 moff, flags; - /* Not a context-sensitive kfunc (e.g. from scx_kfunc_ids_any) - allow. */ - if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release)) + /* Not an SCX kfunc - allow. */ + if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || + in_cpu_release || in_idle || in_any)) return 0; /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */ if (prog->type == BPF_PROG_TYPE_SYSCALL) - return (in_unlocked || in_select_cpu) ? 0 : -EACCES; + return (in_unlocked || in_select_cpu || in_idle || in_any) ? 0 : -EACCES; if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) - return -EACCES; + return (in_any || in_idle) ? 0 : -EACCES; /* * add_subprog_and_kfunc() collects all kfunc calls, including dead code @@ -9566,14 +9569,15 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) return 0; /* - * Non-SCX struct_ops: only unlocked kfuncs are safe. The other - * context-sensitive kfuncs assume the rq lock is held by the SCX - * dispatch path, which doesn't apply to other struct_ops users. + * Non-SCX struct_ops: SCX kfuncs are not permitted. */ if (prog->aux->st_ops != &bpf_sched_ext_ops) - return in_unlocked ? 0 : -EACCES; + return -EACCES; /* SCX struct_ops: check the per-op allow list. */ + if (in_any || in_idle) + return 0; + moff = prog->aux->attach_st_ops_member_off; flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)]; diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 443d12a3df67..c43d62d90e40 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -1467,6 +1467,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_idle) static const struct btf_kfunc_id_set scx_kfunc_set_idle = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_idle, + .filter = scx_kfunc_context_filter, }; /* diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index dc35f850481e..8d169d3bbdf9 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -12,6 +12,7 @@ struct sched_ext_ops; +extern struct btf_id_set8 scx_kfunc_ids_idle; extern struct btf_id_set8 scx_kfunc_ids_select_cpu; void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); -- cgit v1.2.3 From 4e3d7c89e15ac5dbf45b7d7a49bb374650c03339 Mon Sep 17 00:00:00 2001 From: zhidao su Date: Thu, 23 Apr 2026 10:58:32 +0800 Subject: sched_ext: Fix local_dsq_post_enq() to use task's scheduler in sub-sched local_dsq_post_enq() calls call_task_dequeue() with scx_root instead of the scheduler instance actually managing the task. When CONFIG_EXT_SUB_SCHED is enabled, tasks may be managed by a sub-scheduler whose ops.dequeue() callback differs from root's. Using scx_root causes the wrong scheduler's ops.dequeue() to be consulted: sub-sched tasks dispatched to a local DSQ via scx_bpf_dsq_move_to_local() will have SCX_TASK_IN_CUSTODY cleared but the sub-scheduler's ops.dequeue() is never invoked, violating the custody exit semantics. Fix by adding a 'struct scx_sched *sch' parameter to local_dsq_post_enq() and move_local_task_to_local_dsq(), and propagating the correct scheduler from their callers dispatch_enqueue(), move_task_between_dsqs(), and consume_dispatch_q(). This is consistent with dispatch_enqueue()'s non-local path which already passes 'sch' directly to call_task_dequeue() for global/bypass DSQs. Fixes: ebf1ccff79c4 ("sched_ext: Fix ops.dequeue() semantics") Signed-off-by: zhidao su Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d66fea57ee69..1f670028bf19 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1389,13 +1389,13 @@ static void call_task_dequeue(struct scx_sched *sch, struct rq *rq, p->scx.flags &= ~SCX_TASK_IN_CUSTODY; } -static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, - u64 enq_flags) +static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq, + struct task_struct *p, u64 enq_flags) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); bool preempt = false; - call_task_dequeue(scx_root, rq, p, 0); + call_task_dequeue(sch, rq, p, 0); /* * If @rq is in balance, the CPU is already vacant and looking for the @@ -1519,7 +1519,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, * concurrently in a non-atomic way. */ if (is_local) { - local_dsq_post_enq(dsq, p, enq_flags); + local_dsq_post_enq(sch, dsq, p, enq_flags); } else { /* * Task on global/bypass DSQ: leave custody, task on @@ -2130,7 +2130,8 @@ static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_fl schedule_reenq_local(rq, 0); } -static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, +static void move_local_task_to_local_dsq(struct scx_sched *sch, + struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct rq *dst_rq) { @@ -2150,7 +2151,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, dsq_inc_nr(dst_dsq, p, enq_flags); p->scx.dsq = dst_dsq; - local_dsq_post_enq(dst_dsq, p, enq_flags); + local_dsq_post_enq(sch, dst_dsq, p, enq_flags); } /** @@ -2371,7 +2372,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, /* @p is going from a non-local DSQ to a local DSQ */ if (src_rq == dst_rq) { task_unlink_from_dsq(p, src_dsq); - move_local_task_to_local_dsq(p, enq_flags, + move_local_task_to_local_dsq(sch, p, enq_flags, src_dsq, dst_rq); raw_spin_unlock(&src_dsq->lock); } else { @@ -2424,7 +2425,7 @@ retry: if (rq == task_rq) { task_unlink_from_dsq(p, dsq); - move_local_task_to_local_dsq(p, enq_flags, dsq, rq); + move_local_task_to_local_dsq(sch, p, enq_flags, dsq, rq); raw_spin_unlock(&dsq->lock); return true; } -- cgit v1.2.3 From 13e786b64bd3fd81c7eb22aa32bf8305c32f2ccf Mon Sep 17 00:00:00 2001 From: Petr Malat Date: Thu, 23 Apr 2026 04:48:26 -0500 Subject: cgroup: Increment nr_dying_subsys_* from rmdir context Incrementing nr_dying_subsys_* in offline_css(), which is executed by cgroup_offline_wq worker, leads to a race where user can see the value to be 0 if he reads cgroup.stat after calling rmdir and before the worker executes. This makes the user wrongly expect resources released by the removed cgroup to be available for a new assignment. Increment nr_dying_subsys_* from kill_css(), which is called from the cgroup_rmdir() context. Fixes: ab0312526867 ("cgroup: Show # of subsystem CSSes in cgroup.stat") Signed-off-by: Petr Malat Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3243c2087ee3..c928dea9dea6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5724,16 +5724,6 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); - - css->cgroup->nr_dying_subsys[ss->id]++; - /* - * Parent css and cgroup cannot be freed until after the freeing - * of child css, see css_free_rwork_fn(). - */ - while ((css = css->parent)) { - css->nr_descendants--; - css->cgroup->nr_dying_subsys[ss->id]++; - } } /** @@ -6045,6 +6035,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref) */ static void kill_css(struct cgroup_subsys_state *css) { + struct cgroup_subsys *ss = css->ss; + lockdep_assert_held(&cgroup_mutex); if (css->flags & CSS_DYING) @@ -6081,6 +6073,16 @@ static void kill_css(struct cgroup_subsys_state *css) * css is confirmed to be seen as killed on all CPUs. */ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); + + css->cgroup->nr_dying_subsys[ss->id]++; + /* + * Parent css and cgroup cannot be freed until after the freeing + * of child css, see css_free_rwork_fn(). + */ + while ((css = css->parent)) { + css->nr_descendants--; + css->cgroup->nr_dying_subsys[ss->id]++; + } } /** -- cgit v1.2.3 From 510a27055446b8f0d29487ca8b8d2033dc2b6ca6 Mon Sep 17 00:00:00 2001 From: Richard Cheng Date: Fri, 24 Apr 2026 18:02:21 +0800 Subject: sched_ext: sync disable_irq_work in bpf_scx_unreg() When unregistered my self-written scx scheduler, the following panic occurs. [ 229.923133] Kernel text patching generated an invalid instruction at 0xffff80009bc2c1f8! [ 229.923146] Internal error: Oops - BRK: 00000000f2000100 [#1] SMP [ 230.077871] CPU: 48 UID: 0 PID: 1760 Comm: kworker/u583:7 Not tainted 7.0.0+ #3 PREEMPT(full) [ 230.086677] Hardware name: NVIDIA GB200 NVL/P3809-BMC, BIOS 02.05.12 20251107 [ 230.093972] Workqueue: events_unbound bpf_map_free_deferred [ 230.099675] Sched_ext: invariant_0.1.0_aarch64_unknown_linux_gnu_debug (disabling), task: runnable_at=-174ms [ 230.116843] pc : 0xffff80009bc2c1f8 [ 230.120406] lr : dequeue_task_scx+0x270/0x2d0 [ 230.217749] Call trace: [ 230.228515] 0xffff80009bc2c1f8 (P) [ 230.232077] dequeue_task+0x84/0x188 [ 230.235728] sched_change_begin+0x1dc/0x250 [ 230.240000] __set_cpus_allowed_ptr_locked+0x17c/0x240 [ 230.245250] __set_cpus_allowed_ptr+0x74/0xf0 [ 230.249701] ___migrate_enable+0x4c/0xa0 [ 230.253707] bpf_map_free_deferred+0x1a4/0x1b0 [ 230.258246] process_one_work+0x184/0x540 [ 230.262342] worker_thread+0x19c/0x348 [ 230.266170] kthread+0x13c/0x150 [ 230.269465] ret_from_fork+0x10/0x20 [ 230.281393] Code: d4202000 d4202000 d4202000 d4202000 (d4202000) [ 230.287621] ---[ end trace 0000000000000000 ]--- [ 231.160046] Kernel panic - not syncing: Oops - BRK: Fatal exception in interrupt The root cause is that the JIT page backing ops->quiescent() is freed before all callers of that function have stopped. The expected ordering during teardown is: bitmap_zero(sch->has_op) + synchronize_rcu() -> guarantees no CPU will ever call sch->ops.* again -> only THEN free the BPF struct_ops JIT page bpf_scx_unreg() is supposed to enforce the order, but after commit f4a6c506d118 ("sched_ext: Always bounce scx_disable() through irq_work"), disable_work is no longer queued directly, causing kthread_flush_work() to be a noop. Thus, the caller drops the struct_ops map too early and poisoned with AARCH64_BREAK_FAULT before disable_workfn ever execute. So the subsequent dequeue_task() still sees SCX_HAS_OP(sch, quiescent) as true and calls ops.quiescent, which hit on the poisoned page and BRK panic. Add a helper scx_flush_disable_work() so the future use cases that want to flush disable_work can use it. Also amend the call for scx_root_enable_workfn() and scx_sub_enable_workfn() which have similar pattern in the error path. Fixes: f4a6c506d118 ("sched_ext: Always bounce scx_disable() through irq_work") Signed-off-by: Richard Cheng Reviewed-by: Andrea Righi Reviewed-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1f670028bf19..a018034dd81c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5923,6 +5923,20 @@ static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) irq_work_queue(&sch->disable_irq_work); } +/** + * scx_flush_disable_work - flush the disable work and wait for it to finish + * @sch: the scheduler + * + * sch->disable_work might still not queued, causing kthread_flush_work() + * as a noop. Syncing the irq_work first is required to guarantee the + * kthread work has been queued before waiting for it. + */ +static void scx_flush_disable_work(struct scx_sched *sch) +{ + irq_work_sync(&sch->disable_irq_work); + kthread_flush_work(&sch->disable_work); +} + static void dump_newline(struct seq_buf *s) { trace_sched_ext_dump(""); @@ -6823,7 +6837,7 @@ err_disable: * completion. sch's base reference will be put by bpf_scx_unreg(). */ scx_error(sch, "scx_root_enable() failed (%d)", ret); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); cmd->ret = 0; } @@ -7090,7 +7104,7 @@ err_unlock_and_disable: percpu_up_write(&scx_fork_rwsem); err_disable: mutex_unlock(&scx_enable_mutex); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); cmd->ret = 0; } @@ -7351,7 +7365,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link) struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); scx_disable(sch, SCX_EXIT_UNREG); - kthread_flush_work(&sch->disable_work); + scx_flush_disable_work(sch); RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); } -- cgit v1.2.3 From bd2d76455b65aab77652823919db128a8e585825 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 10:14:32 -1000 Subject: sched_ext: Defer scx_hardlockup() out of NMI scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(), which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing it from NMI context can lead to deadlocks. The hardlockup handler is best-effort recovery and the disable path it triggers runs off of irq_work anyway. Move the handle_lockup() call into an irq_work so it runs in IRQ context. Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support") Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a018034dd81c..34de1c9b7a7c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4940,6 +4940,25 @@ void scx_softlockup(u32 dur_s) smp_processor_id(), dur_s); } +/* + * scx_hardlockup() runs from NMI and eventually calls scx_claim_exit(), + * which takes scx_sched_lock. scx_sched_lock isn't NMI-safe and grabbing + * it from NMI context can lead to deadlocks. Defer via irq_work; the + * disable path runs off irq_work anyway. + */ +static atomic_t scx_hardlockup_cpu = ATOMIC_INIT(-1); + +static void scx_hardlockup_irq_workfn(struct irq_work *work) +{ + int cpu = atomic_xchg(&scx_hardlockup_cpu, -1); + + if (cpu >= 0 && handle_lockup("hard lockup - CPU %d", cpu)) + printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", + cpu); +} + +static DEFINE_IRQ_WORK(scx_hardlockup_irq_work, scx_hardlockup_irq_workfn); + /** * scx_hardlockup - sched_ext hardlockup handler * @@ -4948,17 +4967,19 @@ void scx_softlockup(u32 dur_s) * Try kicking out the current scheduler in an attempt to recover the system to * a good state before taking more drastic actions. * - * Returns %true if sched_ext is enabled and abort was initiated, which may - * resolve the reported hardlockup. %false if sched_ext is not enabled or - * someone else already initiated abort. + * Queues an irq_work; the handle_lockup() call happens in IRQ context (see + * scx_hardlockup_irq_workfn). + * + * Returns %true if sched_ext is enabled and the work was queued, %false + * otherwise. */ bool scx_hardlockup(int cpu) { - if (!handle_lockup("hard lockup - CPU %d", cpu)) + if (!rcu_access_pointer(scx_root)) return false; - printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", - cpu); + atomic_cmpxchg(&scx_hardlockup_cpu, -1, cpu); + irq_work_queue(&scx_hardlockup_irq_work); return true; } -- cgit v1.2.3 From 411d3ef1a70589755e3beed2f5bf1f8aa0c27d1a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Unregister sub_kset on scheduler disable When ops.sub_attach is set, scx_alloc_and_add_sched() creates sub_kset as a child of &sch->kobj, which pins the parent with its own reference. The disable paths never call kset_unregister(), so the final kobject_put() in bpf_scx_unreg() leaves a stale reference and scx_kobj_release() never runs, leaking the whole struct scx_sched on every load/unload cycle. Unregister sub_kset in scx_root_disable() and scx_sub_disable() before kobject_del(&sch->kobj). Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support") Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 34de1c9b7a7c..7f991ecb1398 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5721,6 +5721,8 @@ static void scx_sub_disable(struct scx_sched *sch) if (sch->ops.exit) SCX_CALL_OP(sch, exit, NULL, sch->exit_info); + if (sch->sub_kset) + kset_unregister(sch->sub_kset); kobject_del(&sch->kobj); } #else /* CONFIG_EXT_SUB_SCHED */ @@ -5852,6 +5854,10 @@ static void scx_root_disable(struct scx_sched *sch) * could observe an object of the same name still in the hierarchy when * the next scheduler is loaded. */ +#ifdef CONFIG_EXT_SUB_SCHED + if (sch->sub_kset) + kset_unregister(sch->sub_kset); +#endif kobject_del(&sch->kobj); free_kick_syncs(); -- cgit v1.2.3 From 4fda9f0e7c950da4fe03cedeb2ac818edf5d03e9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Guard scx_dsq_move() against NULL kit->dsq after failed iter_new bpf_iter_scx_dsq_new() clears kit->dsq on failure and bpf_iter_scx_dsq_{next,destroy}() guard against that. scx_dsq_move() doesn't - it dereferences kit->dsq immediately, so a BPF program that calls scx_bpf_dsq_move[_vtime]() after a failed iter_new oopses the kernel. Return false if kit->dsq is NULL. Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") Cc: stable@vger.kernel.org # v6.12+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7f991ecb1398..68c67113204f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -8076,12 +8076,22 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, struct task_struct *p, u64 dsq_id, u64 enq_flags) { struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; - struct scx_sched *sch = src_dsq->sched; + struct scx_sched *sch; struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; bool in_balance; unsigned long flags; + /* + * The verifier considers an iterator slot initialized on any + * KF_ITER_NEW return, so a BPF program may legally reach here after + * bpf_iter_scx_dsq_new() failed and left @kit->dsq NULL. + */ + if (unlikely(!src_dsq)) + return false; + + sch = src_dsq->sched; + if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags)) return false; -- cgit v1.2.3 From da2d81b4118a74e65d2335e221a38d665902a98c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Skip tasks with stale task_rq in bypass_lb_cpu() bypass_lb_cpu() transfers tasks between per-CPU bypass DSQs without migrating them - task_cpu() only updates when the donee later consumes the task via move_remote_task_to_local_dsq(). If the LB timer fires again before consumption and the new DSQ becomes a donor, @p is still on the previous CPU and task_rq(@p) != donor_rq. @p can't be moved without its own rq locked. Skip such tasks. Fixes: 95d1df610cdc ("sched_ext: Implement load balancer for bypass mode") Cc: stable@vger.kernel.org # v6.19+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 68c67113204f..f8500ce37b22 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5023,6 +5023,15 @@ resume: if (cpumask_empty(donee_mask)) break; + /* + * If an earlier pass placed @p on @donor_dsq from a different + * CPU and the donee hasn't consumed it yet, @p is still on the + * previous CPU and task_rq(@p) != @donor_rq. @p can't be moved + * without its rq locked. Skip. + */ + if (task_rq(p) != donor_rq) + continue; + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); if (donee >= nr_cpu_ids) continue; -- cgit v1.2.3 From 21a5a97ba47842ef0c52d6c89e501dce27806550 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Don't disable tasks in scx_sub_enable_workfn() abort path scx_sub_enable_workfn()'s prep loop calls __scx_init_task(sch, p, false) without transitioning task state, then sets SCX_TASK_SUB_INIT. If prep fails partway, the abort path runs __scx_disable_and_exit_task(sch, p) on the marked tasks. Task state is still the parent's ENABLED, so that dispatches to the SCX_TASK_ENABLED arm and calls scx_disable_task(sch, p) - i.e. child->ops.disable() - for tasks on which child->ops.enable() never ran. A BPF sub-scheduler allocating per-task state in enable/freeing in disable would operate on uninitialized state. The dying-task branch in scx_disable_and_exit_task() has the same problem, and scx_enabling_sub_sched was cleared before the abort cleanup loop - a task exiting during cleanup tripped the WARN and skipped both ops.exit_task and the SCX_TASK_SUB_INIT clear, leaking per-task resources and leaving the task stuck. Introduce scx_sub_init_cancel_task() that calls ops.exit_task with cancelled=true - matching what the top-level init path does when init_task itself returns -errno. Use it in the abort loop and in the dying-task branch. scx_enabling_sub_sched now stays set until the abort loop finishes clearing SUB_INIT, so concurrent exits hitting the dying-task branch can still find @sch. That branch also clears SCX_TASK_SUB_INIT unconditionally when seen, leaving the task unmarked even if the WARN fires. Fixes: 337ec00b1d9c ("sched_ext: Implement cgroup sub-sched enabling and disabling") Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f8500ce37b22..dd0539ab9ba8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3633,6 +3633,22 @@ static void __scx_disable_and_exit_task(struct scx_sched *sch, SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); } +/* + * Undo a completed __scx_init_task(sch, p, false) when scx_enable_task() never + * ran. The task state has not been transitioned, so this mirrors the + * SCX_TASK_INIT branch in __scx_disable_and_exit_task(). + */ +static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct *p) +{ + struct scx_exit_task_args args = { .cancelled = true }; + + lockdep_assert_held(&p->pi_lock); + lockdep_assert_rq_held(task_rq(p)); + + if (SCX_HAS_OP(sch, exit_task)) + SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); +} + static void scx_disable_and_exit_task(struct scx_sched *sch, struct task_struct *p) { @@ -3641,11 +3657,12 @@ static void scx_disable_and_exit_task(struct scx_sched *sch, /* * If set, @p exited between __scx_init_task() and scx_enable_task() in * scx_sub_enable() and is initialized for both the associated sched and - * its parent. Disable and exit for the child too. + * its parent. Exit for the child too - scx_enable_task() never ran for + * it, so undo only init_task. */ - if ((p->scx.flags & SCX_TASK_SUB_INIT) && - !WARN_ON_ONCE(!scx_enabling_sub_sched)) { - __scx_disable_and_exit_task(scx_enabling_sub_sched, p); + if (p->scx.flags & SCX_TASK_SUB_INIT) { + if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) + scx_sub_init_cancel_task(scx_enabling_sub_sched, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; } @@ -7124,16 +7141,23 @@ out_unlock: abort: put_task_struct(p); scx_task_iter_stop(&sti); - scx_enabling_sub_sched = NULL; + /* + * Undo __scx_init_task() for tasks we marked. scx_enable_task() never + * ran for @sch on them, so calling scx_disable_task() here would invoke + * ops.disable() without a matching ops.enable(). scx_enabling_sub_sched + * must stay set until SUB_INIT is cleared from every marked task - + * scx_disable_and_exit_task() reads it when a task exits concurrently. + */ scx_task_iter_start(&sti, sch->cgrp); while ((p = scx_task_iter_next_locked(&sti))) { if (p->scx.flags & SCX_TASK_SUB_INIT) { - __scx_disable_and_exit_task(sch, p); + scx_sub_init_cancel_task(sch, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; } } scx_task_iter_stop(&sti); + scx_enabling_sub_sched = NULL; err_unlock_and_disable: /* we'll soon enter disable path, keep bypass on */ scx_cgroup_unlock(); -- cgit v1.2.3 From 80afd4c84bc8f5e80145ce35279f5ce53f6043db Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Read scx_root under scx_cgroup_ops_rwsem in cgroup setters scx_group_set_{weight,idle,bandwidth}() cache scx_root before acquiring scx_cgroup_ops_rwsem, so the pointer can be stale by the time the op runs. If the loaded scheduler is disabled and freed (via RCU work) and another is enabled between the naked load and the rwsem acquire, the reader sees scx_cgroup_enabled=true (the new scheduler's) but dereferences the freed one - UAF on SCX_HAS_OP(sch, ...) / SCX_CALL_OP(sch, ...). scx_cgroup_enabled is toggled only under scx_cgroup_ops_rwsem write (scx_cgroup_{init,exit}), so reading scx_root inside the rwsem read section correlates @sch with the enabled snapshot. Fixes: a5bd6ba30b33 ("sched_ext: Use cgroup_lock/unlock() to synchronize against cgroup operations") Cc: stable@vger.kernel.org # v6.18+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index dd0539ab9ba8..f6d22636a4de 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4343,9 +4343,10 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) void scx_group_set_weight(struct task_group *tg, unsigned long weight) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && tg->scx.weight != weight) @@ -4358,9 +4359,10 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void scx_group_set_idle(struct task_group *tg, bool idle) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle); @@ -4374,9 +4376,10 @@ void scx_group_set_idle(struct task_group *tg, bool idle) void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; percpu_down_read(&scx_cgroup_ops_rwsem); + sch = scx_root; if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && (tg->scx.bw_period_us != period_us || -- cgit v1.2.3 From cc2a387d330d1fc51a9b7f211a7e5d39c9f0ab94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Resolve caller's scheduler in scx_bpf_destroy_dsq() / scx_bpf_dsq_nr_queued() scx_bpf_create_dsq() resolves the calling scheduler via scx_prog_sched(aux) and inserts the new DSQ into that scheduler's dsq_hash. Its inverse scx_bpf_destroy_dsq() and the query helper scx_bpf_dsq_nr_queued() were hard-coded to rcu_dereference(scx_root), so a sub-scheduler could only destroy or query DSQs in the root scheduler's hash - never its own. If the root had a DSQ with the same id, the sub-sched silently destroyed it and the root aborted on the next dispatch ("invalid DSQ ID 0x0.."). Take a const struct bpf_prog_aux *aux via KF_IMPLICIT_ARGS and resolve the scheduler with scx_prog_sched(aux), matching scx_bpf_create_dsq(). Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support") Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f6d22636a4de..cc5df32db8ff 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -8722,11 +8722,12 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux /** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Return the number of tasks in the DSQ matching @dsq_id. If not found, * -%ENOENT is returned. */ -__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) +__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct scx_dispatch_q *dsq; @@ -8734,7 +8735,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) preempt_disable(); - sch = rcu_dereference_sched(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) { ret = -ENODEV; goto out; @@ -8766,21 +8767,21 @@ out: /** * scx_bpf_destroy_dsq - Destroy a custom DSQ * @dsq_id: DSQ to destroy + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is * empty and no further tasks are dispatched to it. Ignored if called on a DSQ * which doesn't exist. Can be called from any online scx_ops operations. */ -__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) +__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id, const struct bpf_prog_aux *aux) { struct scx_sched *sch; - rcu_read_lock(); - sch = rcu_dereference(scx_root); + guard(rcu)(); + sch = scx_prog_sched(aux); if (sch) destroy_dsq(sch, dsq_id); - rcu_read_unlock(); } /** @@ -9534,8 +9535,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_any) BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) -BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) -BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) +BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) -- cgit v1.2.3 From 2f2ea77092660b53bfcbc4acc590b57ce9ab5dce Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:35 -1000 Subject: sched_ext: Use dsq->first_task instead of list_empty() in dispatch_enqueue() FIFO-tail dispatch_enqueue()'s FIFO-tail path used list_empty(&dsq->list) to decide whether to set dsq->first_task on enqueue. dsq->list can contain parked BPF iterator cursors (SCX_DSQ_LNODE_ITER_CURSOR), so list_empty() is not a reliable "no real task" check. If the last real task is unlinked while a cursor is parked, first_task becomes NULL; the next FIFO-tail enqueue then sees list_empty() == false and skips the first_task update, leaving scx_bpf_dsq_peek() returning NULL for a non-empty DSQ. Test dsq->first_task directly, which already tracks only real tasks and is maintained under dsq->lock. Fixes: 44f5c8ec5b9a ("sched_ext: Add lockless peek operation for DSQs") Cc: stable@vger.kernel.org # v6.19+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi Cc: Ryan Newton --- kernel/sched/ext.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index cc5df32db8ff..8a2a90659c65 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1495,11 +1495,13 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) rcu_assign_pointer(dsq->first_task, p); } else { - bool was_empty; - - was_empty = list_empty(&dsq->list); + /* + * dsq->list can contain parked BPF iterator cursors, so + * list_empty() here isn't a reliable proxy for "no real + * task in the DSQ". Test dsq->first_task directly. + */ list_add_tail(&p->scx.dsq_list.node, &dsq->list); - if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + if (!dsq->first_task && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) rcu_assign_pointer(dsq->first_task, p); } } -- cgit v1.2.3 From 7fb39e4eb4c3db52e4707a6a1cd45362f7e803f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Save and restore scx_locked_rq across SCX_CALL_OP SCX_CALL_OP{,_RET}() unconditionally clears scx_locked_rq_state to NULL on exit. Correct at the top level, but ops can recurse via scx_bpf_sub_dispatch(): a parent's ops.dispatch calls the helper, which invokes the child's ops.dispatch under another SCX_CALL_OP. When the inner call returns, the NULL clobbers the outer's state. The parent's BPF then calls kfuncs like scx_bpf_cpuperf_set() which read scx_locked_rq()==NULL and re-acquire the already-held rq. Snapshot scx_locked_rq_state on entry and restore on exit. Rename the rq parameter to locked_rq across all SCX_CALL_OP* macros so the snapshot local can be typed as 'struct rq *' without colliding with the parameter token in the expansion. SCX_CALL_OP_TASK{,_RET}() and SCX_CALL_OP_2TASKS_RET() funnel through the two base macros and inherit the fix. Fixes: 4f8b122848db ("sched_ext: Add basic building blocks for nested sub-scheduler dispatching") Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8a2a90659c65..26968d0a6752 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -470,24 +470,35 @@ static inline void update_locked_rq(struct rq *rq) __this_cpu_write(scx_locked_rq_state, rq); } -#define SCX_CALL_OP(sch, op, rq, args...) \ +/* + * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not + * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit. + */ +#define SCX_CALL_OP(sch, op, locked_rq, args...) \ do { \ - if (rq) \ - update_locked_rq(rq); \ + struct rq *__prev_locked_rq; \ + \ + if (locked_rq) { \ + __prev_locked_rq = scx_locked_rq(); \ + update_locked_rq(locked_rq); \ + } \ (sch)->ops.op(args); \ - if (rq) \ - update_locked_rq(NULL); \ + if (locked_rq) \ + update_locked_rq(__prev_locked_rq); \ } while (0) -#define SCX_CALL_OP_RET(sch, op, rq, args...) \ +#define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \ ({ \ + struct rq *__prev_locked_rq; \ __typeof__((sch)->ops.op(args)) __ret; \ \ - if (rq) \ - update_locked_rq(rq); \ + if (locked_rq) { \ + __prev_locked_rq = scx_locked_rq(); \ + update_locked_rq(locked_rq); \ + } \ __ret = (sch)->ops.op(args); \ - if (rq) \ - update_locked_rq(NULL); \ + if (locked_rq) \ + update_locked_rq(__prev_locked_rq); \ __ret; \ }) @@ -499,39 +510,39 @@ do { \ * those subject tasks. * * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held - - * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock - * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if - * kf_tasks[] is set, @p's scheduler-protected fields are stable. + * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's + * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. + * So if kf_tasks[] is set, @p's scheduler-protected fields are stable. * * kf_tasks[] can not stack, so task-based SCX ops must not nest. The * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants * while a previous one is still in progress. */ -#define SCX_CALL_OP_TASK(sch, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...) \ do { \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - SCX_CALL_OP((sch), op, rq, task, ##args); \ + SCX_CALL_OP((sch), op, locked_rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ } while (0) -#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...) \ ({ \ __typeof__((sch)->ops.op(task, ##args)) __ret; \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - __ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ __ret; \ }) -#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...) \ +#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...) \ ({ \ __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ - __ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args); \ current->scx.kf_tasks[0] = NULL; \ current->scx.kf_tasks[1] = NULL; \ __ret; \ -- cgit v1.2.3 From 207d76a372fb1bb324eadc8cb5bcaa0a8da7cefd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Pass held rq to SCX_CALL_OP() for dump_cpu/dump_task scx_dump_state() walks CPUs with rq_lock_irqsave() held and invokes ops.dump_cpu / ops.dump_task with NULL locked_rq, leaving scx_locked_rq_state NULL. If the BPF callback calls a kfunc that re-acquires rq based on scx_locked_rq() - e.g. scx_bpf_cpuperf_set(cpu) - it re-acquires the already-held rq. Pass the held rq to SCX_CALL_OP(). Thread it into scx_dump_task() too. The pre-loop ops.dump call runs before rq_lock_irqsave() so keeps rq=NULL. Fixes: 07814a9439a3 ("sched_ext: Print debug dump after an error exit") Cc: stable@vger.kernel.org # v6.12+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 26968d0a6752..73d629559d6d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6117,9 +6117,8 @@ static void ops_dump_exit(void) scx_dump_data.cpu = -1; } -static void scx_dump_task(struct scx_sched *sch, - struct seq_buf *s, struct scx_dump_ctx *dctx, - struct task_struct *p, char marker) +static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_dump_ctx *dctx, + struct rq *rq, struct task_struct *p, char marker) { static unsigned long bt[SCX_EXIT_BT_LEN]; struct scx_sched *task_sch = scx_task_sched(p); @@ -6160,7 +6159,7 @@ static void scx_dump_task(struct scx_sched *sch, if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); - SCX_CALL_OP(sch, dump_task, NULL, dctx, p); + SCX_CALL_OP(sch, dump_task, rq, dctx, p); ops_dump_exit(); } @@ -6284,8 +6283,7 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, used = seq_buf_used(&ns); if (SCX_HAS_OP(sch, dump_cpu)) { ops_dump_init(&ns, " "); - SCX_CALL_OP(sch, dump_cpu, NULL, - &dctx, cpu, idle); + SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle); ops_dump_exit(); } @@ -6308,11 +6306,11 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, if (rq->curr->sched_class == &ext_sched_class && (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) - scx_dump_task(sch, &s, &dctx, rq->curr, '*'); + scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*'); list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) if (dump_all_tasks || scx_task_on_sched(sch, p)) - scx_dump_task(sch, &s, &dctx, p, ' '); + scx_dump_task(sch, &s, &dctx, rq, p, ' '); next: rq_unlock_irqrestore(rq, &rf); } -- cgit v1.2.3 From 4155fb489fa175ec74eedde7d02219cf2fe74303 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Pass held rq to SCX_CALL_OP() for core_sched_before scx_prio_less() runs from core-sched's pick_next_task() path with rq locked but invokes ops.core_sched_before() with NULL locked_rq, leaving scx_locked_rq_state NULL. If the BPF callback calls a kfunc that re-acquires rq based on scx_locked_rq() - e.g. scx_bpf_cpuperf_set(cpu) - it re-acquires the already-held rq. Pass task_rq(a). Fixes: 7b0888b7cc19 ("sched_ext: Implement core-sched support") Cc: stable@vger.kernel.org # v6.12+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 73d629559d6d..ba977154273c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3198,7 +3198,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && !scx_bypassing(sch_a, task_cpu(a))) return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, - NULL, + task_rq(a), (struct task_struct *)a, (struct task_struct *)b); else -- cgit v1.2.3 From d292aa00de1aea72961f94c0db43f6b5c72684c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Make bypass LB cpumasks per-scheduler scx_bypass_lb_{donee,resched}_cpumask were file-scope statics shared by all scheduler instances. With CONFIG_EXT_SUB_SCHED, multiple sched instances each arm their own bypass_lb_timer; concurrent bypass_lb_node() calls RMW the global cpumasks with no lock, corrupting donee/resched decisions. Move the cpumasks into struct scx_sched, allocate them alongside the timer in scx_alloc_and_add_sched(), free them in scx_sched_free_rcu_work(). Fixes: 95d1df610cdc ("sched_ext: Implement load balancer for bypass mode") Cc: stable@vger.kernel.org # v6.19+ Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 33 +++++++++++++++++++-------------- kernel/sched/ext_internal.h | 2 ++ 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ba977154273c..e07f8c46e399 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -53,8 +53,6 @@ DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); static DEFINE_RAW_SPINLOCK(scx_bypass_lock); -static cpumask_var_t scx_bypass_lb_donee_cpumask; -static cpumask_var_t scx_bypass_lb_resched_cpumask; static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -4747,6 +4745,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work) irq_work_sync(&sch->disable_irq_work); kthread_destroy_worker(sch->helper); timer_shutdown_sync(&sch->bypass_lb_timer); + free_cpumask_var(sch->bypass_lb_donee_cpumask); + free_cpumask_var(sch->bypass_lb_resched_cpumask); #ifdef CONFIG_EXT_SUB_SCHED kfree(sch->cgrp_path); @@ -5123,8 +5123,8 @@ resume: static void bypass_lb_node(struct scx_sched *sch, int node) { const struct cpumask *node_mask = cpumask_of_node(node); - struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; - struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; + struct cpumask *donee_mask = sch->bypass_lb_donee_cpumask; + struct cpumask *resched_mask = sch->bypass_lb_resched_cpumask; u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; u32 nr_target, nr_donor_target; u32 before_min = U32_MAX, before_max = 0; @@ -6520,6 +6520,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); + + if (!alloc_cpumask_var(&sch->bypass_lb_donee_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_stop_helper; + } + if (!alloc_cpumask_var(&sch->bypass_lb_resched_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_free_lb_cpumask; + } sch->ops = *ops; rcu_assign_pointer(ops->priv, sch); @@ -6529,14 +6538,14 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, char *buf = kzalloc(PATH_MAX, GFP_KERNEL); if (!buf) { ret = -ENOMEM; - goto err_stop_helper; + goto err_free_lb_resched; } cgroup_path(cgrp, buf, PATH_MAX); sch->cgrp_path = kstrdup(buf, GFP_KERNEL); kfree(buf); if (!sch->cgrp_path) { ret = -ENOMEM; - goto err_stop_helper; + goto err_free_lb_resched; } sch->cgrp = cgrp; @@ -6571,10 +6580,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #endif /* CONFIG_EXT_SUB_SCHED */ return sch; -#ifdef CONFIG_EXT_SUB_SCHED +err_free_lb_resched: + free_cpumask_var(sch->bypass_lb_resched_cpumask); +err_free_lb_cpumask: + free_cpumask_var(sch->bypass_lb_donee_cpumask); err_stop_helper: kthread_destroy_worker(sch->helper); -#endif err_free_pcpu: for_each_possible_cpu(cpu) { if (cpu == bypass_fail_cpu) @@ -9761,12 +9772,6 @@ static int __init scx_init(void) return ret; } - if (!alloc_cpumask_var(&scx_bypass_lb_donee_cpumask, GFP_KERNEL) || - !alloc_cpumask_var(&scx_bypass_lb_resched_cpumask, GFP_KERNEL)) { - pr_err("sched_ext: Failed to allocate cpumasks\n"); - return -ENOMEM; - } - return 0; } __initcall(scx_init); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 62ce4eaf6a3f..a075732d4430 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -1075,6 +1075,8 @@ struct scx_sched { struct irq_work disable_irq_work; struct kthread_work disable_work; struct timer_list bypass_lb_timer; + cpumask_var_t bypass_lb_donee_cpumask; + cpumask_var_t bypass_lb_resched_cpumask; struct rcu_work rcu_work; /* all ancestors including self */ -- cgit v1.2.3 From c0e8ddc76d54402171787414b1b8eb387812f1f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Align cgroup #ifdef guards with SUB_SCHED vs GROUP_SCHED Two EXT_GROUP_SCHED/SUB_SCHED guards are misclassified: - scx_root_enable_workfn()'s cgroup_get(cgrp) and the err_put_cgrp unwind in scx_alloc_and_add_sched() are under `#if GROUP || SUB`, but the matching cgroup_put() in scx_sched_free_rcu_work() is inside `#ifdef SUB` only (via sch->cgrp, stored only under SUB). GROUP-only would leak a reference on every root-sched enable. - sch_cgroup() / set_cgroup_sched() live under `#if GROUP || SUB` but touch SUB-only fields (sch->cgrp, cgroup->scx_sched). GROUP-only wouldn't compile. GROUP needs CGROUP_SCHED; SUB needs only CGROUPS. CGROUPS=y/CGROUP_SCHED=n gives the reachable GROUP=n, SUB=y combination; GROUP=y, SUB=n isn't reachable today (SUB is def_bool y under CGROUPS). Neither miscategorization triggers a real bug in any reachable config, but keep the guards honest: - Narrow cgroup_get and err_put_cgrp to `#ifdef SUB` (matches the free-side put). - Move sch_cgroup() and set_cgroup_sched() to a separate `#ifdef SUB` block with no-op stubs for the !SUB case; keep root_cgroup() and scx_cgroup_{ lock,unlock}() under `#if GROUP || SUB` since those only need cgroup core. Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support") Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e07f8c46e399..e2898d60315b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4413,21 +4413,6 @@ static struct cgroup *root_cgroup(void) return &cgrp_dfl_root.cgrp; } -static struct cgroup *sch_cgroup(struct scx_sched *sch) -{ - return sch->cgrp; -} - -/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ -static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) -{ - struct cgroup *pos; - struct cgroup_subsys_state *css; - - cgroup_for_each_live_descendant_pre(pos, css, cgrp) - rcu_assign_pointer(pos->scx_sched, sch); -} - static void scx_cgroup_lock(void) { #ifdef CONFIG_EXT_GROUP_SCHED @@ -4445,12 +4430,30 @@ static void scx_cgroup_unlock(void) } #else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ static struct cgroup *root_cgroup(void) { return NULL; } -static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } -static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} static void scx_cgroup_lock(void) {} static void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ +#ifdef CONFIG_EXT_SUB_SCHED +static struct cgroup *sch_cgroup(struct scx_sched *sch) +{ + return sch->cgrp; +} + +/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) +{ + struct cgroup *pos; + struct cgroup_subsys_state *css; + + cgroup_for_each_live_descendant_pre(pos, css, cgrp) + rcu_assign_pointer(pos->scx_sched, sch); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} +#endif /* CONFIG_EXT_SUB_SCHED */ + /* * Omitted operations: * @@ -6604,7 +6607,7 @@ err_free_ei: err_free_sch: kfree(sch); err_put_cgrp: -#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +#ifdef CONFIG_EXT_SUB_SCHED cgroup_put(cgrp); #endif return ERR_PTR(ret); @@ -6695,7 +6698,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (ret) goto err_unlock; -#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +#ifdef CONFIG_EXT_SUB_SCHED cgroup_get(cgrp); #endif sch = scx_alloc_and_add_sched(ops, cgrp, NULL); -- cgit v1.2.3 From ea7c716a24aebe887e0990649ab697bd698cc325 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Refuse cross-task select_cpu_from_kfunc calls select_cpu_from_kfunc() skipped pi_lock for @p when called from ops.select_cpu() or another rq-locked SCX op, assuming the held lock protects @p. scx_bpf_select_cpu_dfl() / __scx_bpf_select_cpu_and() accept an arbitrary KF_RCU task_struct, so a caller in e.g. ops.select_cpu(p1) or ops.enqueue(p1) can pass some other p2 - the held pi_lock / rq lock is p1's, not p2's - and reading p2->cpus_ptr / nr_cpus_allowed races with set_cpus_allowed_ptr() and migrate_disable_switch() on another CPU. Abort the scheduler on cross-task calls in both branches: for ops.select_cpu() use scx_kf_arg_task_ok() to verify @p is the wake-up task recorded in current->scx.kf_tasks[] by SCX_CALL_OP_TASK_RET(); for other rq-locked SCX ops compare task_rq(p) against scx_locked_rq(). v2: Switch the in_select_cpu cross-task check from direct_dispatch_task comparison to scx_kf_arg_task_ok(). The former spuriously rejects when ops.select_cpu() calls scx_bpf_dsq_insert() first, then calls scx_bpf_select_cpu_*() on the same task. (Andrea Righi) Fixes: 0022b328504d ("sched_ext: Decouple kfunc unlocked-context check from kf_mask") Reported-by: Chris Mason Signed-off-by: Tejun Heo Cc: Andrea Righi --- kernel/sched/ext_idle.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index c43d62d90e40..7468560a6d80 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -927,14 +927,24 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, * Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq * lock or @p's pi_lock. Three cases: * - * - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock. + * - inside ops.select_cpu(): try_to_wake_up() holds the wake-up + * task's pi_lock; the wake-up task is recorded in kf_tasks[0] + * by SCX_CALL_OP_TASK_RET(). * - other rq-locked SCX op: scx_locked_rq() points at the held rq. * - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops): * nothing held, take pi_lock ourselves. + * + * In the first two cases, BPF schedulers may pass an arbitrary task + * that the held lock doesn't cover. Refuse those. */ if (this_rq()->scx.in_select_cpu) { + if (!scx_kf_arg_task_ok(sch, p)) + return -EINVAL; lockdep_assert_held(&p->pi_lock); - } else if (!scx_locked_rq()) { + } else if (scx_locked_rq()) { + if (task_rq(p) != scx_locked_rq()) + goto cross_task; + } else { raw_spin_lock_irqsave(&p->pi_lock, irq_flags); we_locked = true; } @@ -960,6 +970,11 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); return cpu; + +cross_task: + scx_error(sch, "select_cpu kfunc called cross-task on %s[%d]", + p->comm, p->pid); + return -EINVAL; } /** -- cgit v1.2.3 From 05b4a9a9bc37f1fa289a8f07b4fbfc3ae681b650 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Reject NULL-sch callers in scx_bpf_task_set_slice/dsq_vtime scx_prog_sched(aux) returns NULL for TRACING / SYSCALL BPF progs that have no struct_ops association when the root scheduler has sub_attach set. scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() pass that NULL into scx_task_on_sched(sch, p), which under CONFIG_EXT_SUB_SCHED is rcu_access_pointer(p->scx.sched) == sch. For any non-scx task p->scx.sched is NULL, so NULL == NULL returns true and the authority gate is bypassed - a privileged but non-struct_ops-associated prog can poke p->scx.slice / p->scx.dsq_vtime on arbitrary tasks. Reject !sch up front so the gate only admits callers with a resolved scheduler. Fixes: 245d09c594ea ("sched_ext: Enforce scheduler ownership when updating slice and dsq_vtime") Reported-by: Chris Mason Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e2898d60315b..f333fd0cb83f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -8640,7 +8640,7 @@ __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, guard(rcu)(); sch = scx_prog_sched(aux); - if (unlikely(!scx_task_on_sched(sch, p))) + if (unlikely(!sch || !scx_task_on_sched(sch, p))) return false; p->scx.slice = slice; @@ -8663,7 +8663,7 @@ __bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, guard(rcu)(); sch = scx_prog_sched(aux); - if (unlikely(!scx_task_on_sched(sch, p))) + if (unlikely(!sch || !scx_task_on_sched(sch, p))) return false; p->scx.dsq_vtime = vtime; -- cgit v1.2.3 From deb7b2f93d0129b79425f830a1e5e7e1bb2c4973 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Apr 2026 14:31:36 -1000 Subject: sched_ext: Release cpus_read_lock on scx_link_sched() failure in root enable scx_root_enable_workfn() takes cpus_read_lock() before scx_link_sched(sch), but the `if (ret) goto err_disable` on failure skips the matching cpus_read_unlock() - all other err_disable gotos along this path drop the lock first. scx_link_sched() only returns non-zero on the sub-sched path (parent != NULL), so the leak path is unreachable via the root caller today. Still, the unwind is out of line with the surrounding paths. Drop cpus_read_lock() before goto err_disable. v2: Correct Fixes: tag (Andrea Righi). Fixes: 25037af712eb ("sched_ext: Add rhashtable lookup for sub-schedulers") Reported-by: Chris Mason Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f333fd0cb83f..9eda20e5fdb8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6736,8 +6736,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) rcu_assign_pointer(scx_root, sch); ret = scx_link_sched(sch); - if (ret) + if (ret) { + cpus_read_unlock(); goto err_disable; + } scx_idle_enable(ops); -- cgit v1.2.3 From 0562b572ce591858749fc2f9477567e7e5c5d99f Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Wed, 15 Apr 2026 19:37:38 +0000 Subject: liveupdate: fix return value on session allocation failure When session allocation fails during deserialization, the global 'err' variable was not updated before returning. This caused subsequent calls to luo_session_deserialize() to incorrectly report success. Ensure 'err' is set to the error code from PTR_ERR(session). This ensures that an error is correctly returned to userspace when it attempts to open /dev/liveupdate in the new kernel if deserialization failed. Link: https://lore.kernel.org/20260415193738.515491-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav (Google) Cc: David Matlack Cc: Mike Rapoport Cc: Samiullah Khawaja Signed-off-by: Andrew Morton --- kernel/liveupdate/luo_session.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c index a3327a28fc1f..7a42385dabe2 100644 --- a/kernel/liveupdate/luo_session.c +++ b/kernel/liveupdate/luo_session.c @@ -514,11 +514,12 @@ int luo_session_deserialize(void) { struct luo_session_header *sh = &luo_session_global.incoming; static bool is_deserialized; - static int err; + static int saved_err; + int err; /* If has been deserialized, always return the same error code */ if (is_deserialized) - return err; + return saved_err; is_deserialized = true; if (!sh->active) @@ -547,7 +548,8 @@ int luo_session_deserialize(void) pr_warn("Failed to allocate session [%.*s] during deserialization %pe\n", (int)sizeof(sh->ser[i].name), sh->ser[i].name, session); - return PTR_ERR(session); + err = PTR_ERR(session); + goto save_err; } err = luo_session_insert(sh, session); @@ -555,7 +557,7 @@ int luo_session_deserialize(void) pr_warn("Failed to insert session [%s] %pe\n", session->name, ERR_PTR(err)); luo_session_free(session); - return err; + goto save_err; } scoped_guard(mutex, &session->mutex) { @@ -565,7 +567,7 @@ int luo_session_deserialize(void) if (err) { pr_warn("Failed to deserialize files for session [%s] %pe\n", session->name, ERR_PTR(err)); - return err; + goto save_err; } } @@ -574,6 +576,9 @@ int luo_session_deserialize(void) sh->ser = NULL; return 0; +save_err: + saved_err = err; + return err; } int luo_session_serialize(void) -- cgit v1.2.3 From 9ec95329894864170a1a7685b9a11b739393131a Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 10 Apr 2026 02:03:03 -0700 Subject: kho: fix error handling in kho_add_subtree() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix two error handling issues in kho_add_subtree(), where it doesn't handle the error path correctly. 1. If fdt_setprop() fails after the subnode has been created, the subnode is not removed. This leaves an incomplete node in the FDT (missing "preserved-data" or "blob-size" properties). 2. The fdt_setprop() return value (an FDT error code) is stored directly in err and returned to the caller, which expects -errno. Fix both by storing fdt_setprop() results in fdt_err, jumping to a new out_del_node label that removes the subnode on failure, and only setting err = 0 on the success path, otherwise returning -ENOMEM (instead of FDT_ERR_ errors that would come from fdt_setprop). No user-visible changes. This patch fixes error handling in the KHO (Kexec HandOver) subsystem, which is used to preserve data across kexec reboots. The fix only affects a rare failure path during kexec preparation — specifically when the kernel runs out of space in the Flattened Device Tree buffer while registering preserved memory regions. In the unlikely event that this error path was triggered, the old code would leave a malformed node in the device tree and return an incorrect error code to the calling subsystem, which could lead to confusing log messages or incorrect recovery decisions. With this fix, the incomplete node is properly cleaned up and the appropriate errno value is propagated, this error code is not returned to the user. Link: https://lore.kernel.org/20260410-kho_fix_send-v2-1-1b4debf7ee08@debian.org Fixes: 3dc92c311498 ("kexec: add Kexec HandOver (KHO) generation helpers") Signed-off-by: Breno Leitao Suggested-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Breno Leitao Cc: Pasha Tatashin Cc: Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 94762de1fe5f..18509d8082ea 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -762,19 +762,24 @@ int kho_add_subtree(const char *name, void *blob, size_t size) goto out_pack; } - err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME, - &phys, sizeof(phys)); - if (err < 0) - goto out_pack; + fdt_err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_PROP_NAME, + &phys, sizeof(phys)); + if (fdt_err < 0) + goto out_del_node; - err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_SIZE_PROP_NAME, - &size_u64, sizeof(size_u64)); - if (err < 0) - goto out_pack; + fdt_err = fdt_setprop(root_fdt, off, KHO_SUB_TREE_SIZE_PROP_NAME, + &size_u64, sizeof(size_u64)); + if (fdt_err < 0) + goto out_del_node; WARN_ON_ONCE(kho_debugfs_blob_add(&kho_out.dbg, name, blob, size, false)); + err = 0; + goto out_pack; + +out_del_node: + fdt_del_node(root_fdt, off); out_pack: fdt_pack(root_fdt); -- cgit v1.2.3 From c5cd6fd75b6a55761337c9e965dd5ad02485d00d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Apr 2026 13:22:22 +0200 Subject: sched/fair: Fix the negative lag increase fix Vincent reported that my rework of his original patch lost a little something. Specifically it got the return value wrong; it should not compare against the old se->vlag, but rather against the current value. Since the thing that matters is if the effective vruntime of an entity is affected and the thing needs repositioning or not. Fixes: 059258b0d424 ("sched/fair: Prevent negative lag increase during delayed dequeue") Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Vincent Guittot Link: https://patch.msgid.link/20260423094107.GT3102624%40noisy.programming.kicks-ass.net --- kernel/sched/fair.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 69361c63353a..615861d96357 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -847,13 +847,19 @@ static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avrunt * Similarly, check that the entity didn't gain positive lag when DELAY_ZERO * is set. * - * Return true if the lag has been adjusted. + * Return true if the vlag has been modified. Specifically: + * + * se->vlag != avg_vruntime() - se->vruntime + * + * This can be due to clamping in entity_lag() or clamping due to + * sched_delayed. Either way, when vlag is modified and the entity is + * retained, the tree needs to be adjusted. */ static __always_inline bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { - s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq)); - bool ret; + u64 avruntime = avg_vruntime(cfs_rq); + s64 vlag = entity_lag(cfs_rq, se, avruntime); WARN_ON_ONCE(!se->on_rq); @@ -863,10 +869,9 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) if (sched_feat(DELAY_ZERO)) vlag = min(vlag, 0); } - ret = (vlag == se->vlag); se->vlag = vlag; - return ret; + return avruntime - vlag != se->vruntime; } /* -- cgit v1.2.3 From ac8e69e693631689d74d8f1ebee6f84f737f797f Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 22 Apr 2026 11:34:00 +0200 Subject: sched/fair: Fix wakeup_preempt_fair() vs delayed dequeue Similar to how pick_next_entity() must dequeue delayed entities, so too must wakeup_preempt_fair(). Any delayed task being found means it is eligible and hence past the 0-lag point, ready for removal. Worse, by not removing delayed entities from consideration, it can skew the preemption decision, with the end result that a short slice wakeup will not result in a preemption. tip/sched/core tip/sched/core +this patch cyclictest slice (ms) (default)2.8 8 8 hackbench slice (ms) (default)2.8 20 20 Total Samples | 22559 22595 22683 Average (us) | 157 64( 59%) 59( 8%) Median (P50) (us) | 57 57( 0%) 58(- 2%) 90th Percentile (us) | 64 60( 6%) 60( 0%) 99th Percentile (us) | 2407 67( 97%) 67( 0%) 99.9th Percentile (us) | 3400 2288( 33%) 727( 68%) Maximum (us) | 5037 9252(-84%) 7461( 19%) Fixes: f12e148892ed ("sched/fair: Prepare pick_next_task() for delayed dequeue") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260422093400.319251-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 615861d96357..728965851842 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1104,7 +1104,7 @@ static inline void cancel_protect_slice(struct sched_entity *se) * * Which allows tree pruning through eligibility. */ -static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool protect) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *se = __pick_first_entity(cfs_rq); @@ -1175,11 +1175,6 @@ found: return best; } -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) -{ - return __pick_eevdf(cfs_rq, true); -} - struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -5754,11 +5749,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * -pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) +pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect) { struct sched_entity *se; - se = pick_eevdf(cfs_rq); + se = pick_eevdf(cfs_rq, protect); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); /* @@ -9032,7 +9027,7 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f { enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; - struct sched_entity *se = &donor->se, *pse = &p->se; + struct sched_entity *nse, *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; @@ -9143,11 +9138,17 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f } pick: + nse = pick_next_entity(rq, cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT); + /* If @p has become the most eligible task, force preemption */ + if (nse == pse) + goto preempt; + /* - * If @p has become the most eligible task, force preemption. + * Because p is enqueued, nse being null can only mean that we + * dequeued a delayed task. */ - if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse) - goto preempt; + if (!nse) + goto pick; if (sched_feat(RUN_TO_PARITY)) update_protect_slice(cfs_rq, se); @@ -9184,7 +9185,7 @@ again: throttled |= check_cfs_rq_runtime(cfs_rq); - se = pick_next_entity(rq, cfs_rq); + se = pick_next_entity(rq, cfs_rq, true); if (!se) goto again; cfs_rq = group_cfs_rq(se); -- cgit v1.2.3 From 3da56dc063cd77b9c0b40add930767fab4e389f3 Mon Sep 17 00:00:00 2001 From: Zicheng Qu Date: Fri, 24 Apr 2026 07:11:13 +0000 Subject: sched/fair: Clear rel_deadline when initializing forked entities A yield-triggered crash can happen when a newly forked sched_entity enters the fair class with se->rel_deadline unexpectedly set. The failing sequence is: 1. A task is forked while se->rel_deadline is still set. 2. __sched_fork() initializes vruntime, vlag and other sched_entity state, but does not clear rel_deadline. 3. On the first enqueue, enqueue_entity() calls place_entity(). 4. Because se->rel_deadline is set, place_entity() treats se->deadline as a relative deadline and converts it to an absolute deadline by adding the current vruntime. 5. However, the forked entity's deadline is not a valid inherited relative deadline for this new scheduling instance, so the conversion produces an abnormally large deadline. 6. If the task later calls sched_yield(), yield_task_fair() advances se->vruntime to se->deadline. 7. The inflated vruntime is then used by the following enqueue path, where the vruntime-derived key can overflow when multiplied by the entity weight. 8. This corrupts cfs_rq->sum_w_vruntime, breaks EEVDF eligibility calculation, and can eventually make all entities appear ineligible. pick_next_entity() may then return NULL unexpectedly, leading to a later NULL dereference. A captured trace shows the effect clearly. Before yield, the entity's vruntime was around: 9834017729983308 After yield_task_fair() executed: se->vruntime = se->deadline the vruntime jumped to: 19668035460670230 and the deadline was later advanced further to: 19668035463470230 This shows that the deadline had already become abnormally large before yield_task_fair() copied it into vruntime. rel_deadline is only meaningful when se->deadline really carries a relative deadline that still needs to be placed against vruntime. A freshly forked sched_entity should not inherit or retain this state. Clear se->rel_deadline in __sched_fork(), together with the other sched_entity runtime state, so that the first enqueue does not interpret the new entity's deadline as a stale relative deadline. Fixes: 82e9d0456e06 ("sched/fair: Avoid re-setting virtual deadline on 'migrations'") Analyzed-by: Hui Tang Analyzed-by: Zhang Qiao Signed-off-by: Zicheng Qu Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260424071113.1199600-1-quzicheng@huawei.com --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da20fb6ea25a..b8871449d3c6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4458,6 +4458,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; p->se.vlag = 0; + p->se.rel_deadline = 0; INIT_LIST_HEAD(&p->se.group_node); /* A delayed task cannot be in clone(). */ -- cgit v1.2.3 From 163f8b7f9a84086c67c76aeadc04e6d43e32df6e Mon Sep 17 00:00:00 2001 From: Kuba Piecuch Date: Tue, 28 Apr 2026 12:46:01 +0000 Subject: sched_ext: Call wakeup_preempt() in local_dsq_post_enq() There are several edge cases (see linked thread) where an IMMED task can be left lingering on a local DSQ if an RT task swoops in at the wrong time. All of these edge cases are due to rq->next_class being idle even after dispatching a task to rq's local DSQ. We should bump rq->next_class to &ext_sched_class as soon as we've inserted a task into the local DSQ. To optimize the common case of rq->next_class == &ext_sched_class, only call wakeup_preempt() if rq->next_class is below EXT. If next_class is EXT or above, wakeup_preempt() is a no-op anyway. This lets us also simplify the preempt_curr() logic a bit since wakeup_preempt() will call preempt_curr() for us if next_class is below EXT. Link: https://lore.kernel.org/all/DHZPHUFXB4N3.2RY28MUEWBNYK@google.com/ Signed-off-by: Kuba Piecuch Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 9eda20e5fdb8..cac0b18239fe 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1402,14 +1402,51 @@ static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq struct task_struct *p, u64 enq_flags) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); - bool preempt = false; call_task_dequeue(sch, rq, p, 0); + /* + * Note that @rq's lock may be dropped between this enqueue and @p + * actually getting on CPU. This gives higher-class tasks (e.g. RT) + * an opportunity to wake up on @rq and prevent @p from running. + * Here are some concrete examples: + * + * Example 1: + * + * We dispatch two tasks from a single ops.dispatch(): + * - First, a local task to this CPU's local DSQ; + * - Second, a local/remote task to a remote CPU's local DSQ. + * We must drop the local rq lock in order to finish the second + * dispatch. In that time, an RT task can wake up on the local rq. + * + * Example 2: + * + * We dispatch a local/remote task to a remote CPU's local DSQ. + * We must drop the remote rq lock before the dispatched task can run, + * which gives an RT task an opportunity to wake up on the remote rq. + * + * Both examples work the same if we replace dispatching with moving + * the tasks from a user-created DSQ. + * + * We must detect these wakeups so that we can re-enqueue IMMED tasks + * from @rq's local DSQ. scx_wakeup_preempt() serves exactly this + * purpose, but for it to be invoked, we must ensure that we bump + * @rq->next_class to &ext_sched_class if it's currently idle. + * + * wakeup_preempt() does the bumping, and since we only invoke it if + * @rq->next_class is below &ext_sched_class, it will also + * resched_curr(rq). + */ + if (sched_class_above(p->sched_class, rq->next_class)) + wakeup_preempt(rq, p, 0); + /* * If @rq is in balance, the CPU is already vacant and looking for the * next task to run. No need to preempt or trigger resched after moving * @p into its local DSQ. + * Note that the wakeup_preempt() above may have already triggered + * a resched if @rq->next_class was idle. It's harmless, since + * need_resched is cleared immediately after task pick. */ if (rq->scx.flags & SCX_RQ_IN_BALANCE) return; @@ -1417,11 +1454,8 @@ static void local_dsq_post_enq(struct scx_sched *sch, struct scx_dispatch_q *dsq if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && rq->curr->sched_class == &ext_sched_class) { rq->curr->scx.slice = 0; - preempt = true; - } - - if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class)) resched_curr(rq); + } } static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, -- cgit v1.2.3 From d99f7a32f09dccbe396187370ec1a74a31b73d7e Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Wed, 29 Apr 2026 01:36:12 +0800 Subject: sched_ext: Fix scx_flush_disable_work() UAF race scx_flush_disable_work() calls irq_work_sync() followed by kthread_flush_work() to ensure that the disable kthread work has fully completed before bpf_scx_unreg() frees the SCX scheduler. However, a concurrent scx_vexit() (e.g., triggered by a watchdog stall) creates a race window between scx_claim_exit() and irq_work_queue(): CPU A (scx_vexit (watchdog)) CPU B (bpf_scx_unreg) ---- ---- scx_claim_exit() atomic_try_cmpxchg(NONE->kind) stack_trace_save() vscnprintf() scx_disable() scx_claim_exit() -> FAIL scx_flush_disable_work() irq_work_sync() // no-op: not queued yet kthread_flush_work() // no-op: not queued yet kobject_put(&sch->kobj) -> free %sch irq_work_queue() -> UAF on %sch scx_disable_irq_workfn() kthread_queue_work() -> UAF The root cause is that CPU B's scx_flush_disable_work() returns after syncing an irq_work that has not yet been queued, while CPU A is still executing the code between scx_claim_exit() and irq_work_queue(). Loop until exit_kind reaches SCX_EXIT_DONE or SCX_EXIT_NONE, draining disable_irq_work and disable_work in each pass. This ensures that any work queued after the previous check is caught, while also correctly handling cases where no disable was triggered (e.g., the scx_sub_enable_workfn() abort path). Fixes: 510a27055446 ("sched_ext: sync disable_irq_work in bpf_scx_unreg()") Reported-by: https://sashiko.dev/#/patchset/20260424100221.32407-1-icheng%40nvidia.com Suggested-by: Tejun Heo Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index cac0b18239fe..9483be03a4ca 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6039,8 +6039,13 @@ static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) */ static void scx_flush_disable_work(struct scx_sched *sch) { - irq_work_sync(&sch->disable_irq_work); - kthread_flush_work(&sch->disable_work); + int kind; + + do { + irq_work_sync(&sch->disable_irq_work); + kthread_flush_work(&sch->disable_work); + kind = atomic_read(&sch->exit_kind); + } while (kind != SCX_EXIT_NONE && kind != SCX_EXIT_DONE); } static void dump_newline(struct seq_buf *s) -- cgit v1.2.3 From 3b75dd76e64a04771861bb5647951c264919e563 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 20 Apr 2026 06:25:09 -0700 Subject: tracing: branch: Fix inverted check on stat tracer registration init_annotated_branch_stats() and all_annotated_branch_stats() check the return value of register_stat_tracer() with "if (!ret)", but register_stat_tracer() returns 0 on success and a negative errno on failure. The inverted check causes the warning to be printed on every successful registration, e.g.: Warning: could not register annotated branches stats while leaving real failures silent. The initcall also returned a hard-coded 1 instead of the actual error. Invert the check and propagate ret so that the warning fires on real errors and the initcall reports the correct status. Cc: Mathieu Desnoyers Cc: Ingo Molnar Cc: Frederic Weisbecker Link: https://patch.msgid.link/20260420-tracing-v1-1-d8f4cd0d6af1@debian.org Fixes: 002bb86d8d42 ("tracing/ftrace: separate events tracing and stats tracing engine") Signed-off-by: Breno Leitao Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt --- kernel/trace/trace_branch.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6809b370e991..d1564db95a8f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -373,10 +373,10 @@ __init static int init_annotated_branch_stats(void) int ret; ret = register_stat_tracer(&annotated_branch_stats); - if (!ret) { + if (ret) { printk(KERN_WARNING "Warning: could not register " "annotated branches stats\n"); - return 1; + return ret; } return 0; } @@ -438,10 +438,10 @@ __init static int all_annotated_branch_stats(void) int ret; ret = register_stat_tracer(&all_branch_stats); - if (!ret) { + if (ret) { printk(KERN_WARNING "Warning: could not register " "all branches stats\n"); - return 1; + return ret; } return 0; } -- cgit v1.2.3 From bc7304f3ae20972d11db6e0b1b541c63feda5f05 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 28 Apr 2026 12:34:25 +0200 Subject: futex: Prevent lockup in requeue-PI during signal/ timeout wakeup During wait-requeue-pi (task A) and requeue-PI (task B) the following race can happen: Task A Task B futex_wait_requeue_pi() futex_setup_timer() futex_do_wait() futex_requeue() CLASS(hb, hb1)(&key1); CLASS(hb, hb2)(&key2); *timeout* futex_requeue_pi_wakeup_sync() requeue_state = Q_REQUEUE_PI_IGNORE *blocks on hb->lock* futex_proxy_trylock_atomic() futex_requeue_pi_prepare() Q_REQUEUE_PI_IGNORE => -EAGAIN double_unlock_hb(hb1, hb2) *retry* Task B acquires both hb locks and attempts to acquire the PI-lock of the top most waiter (task B). Task A is leaving early due to a signal/ timeout and started removing itself from the queue. It updates its requeue_state but can not remove it from the list because this requires the hb lock which is owned by task B. Usually task A is able to swoop the lock after task B unlocked it. However if task B is of higher priority then task A may not be able to wake up in time and acquire the lock before task B gets it again. Especially on a UP system where A is never scheduled. As a result task A blocks on the lock and task B busy loops, trying to make progress but live locks the system instead. Tragic. This can be fixed by removing the top most waiter from the list in this case. This allows task B to grab the next top waiter (if any) in the next iteration and make progress. Remove the top most waiter if futex_requeue_pi_prepare() fails. Let the waiter conditionally remove itself from the list in handle_early_requeue_pi_wakeup(). Fixes: 07d91ef510fb1 ("futex: Prevent requeue_pi() lock nesting issue on RT") Reported-by: Moritz Klammler Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260428103425.dywXyPd3@linutronix.de Closes: https://lore.kernel.org/all/VE1PR06MB6894BE61C173D802365BE19DFF4CA@VE1PR06MB6894.eurprd06.prod.outlook.com --- kernel/futex/requeue.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index d818b4d47f1b..b597cb3d17fc 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -319,8 +319,11 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, return -EINVAL; /* Ensure that this does not race against an early wakeup */ - if (!futex_requeue_pi_prepare(top_waiter, NULL)) + if (!futex_requeue_pi_prepare(top_waiter, NULL)) { + plist_del(&top_waiter->list, &hb1->chain); + futex_hb_waiters_dec(hb1); return -EAGAIN; + } /* * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit @@ -722,10 +725,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, /* * We were woken prior to requeue by a timeout or a signal. - * Unqueue the futex_q and determine which it was. + * Conditionally unqueue the futex_q and determine which it was. */ - plist_del(&q->list, &hb->chain); - futex_hb_waiters_dec(hb); + if (!plist_node_empty(&q->list)) { + plist_del(&q->list, &hb->chain); + futex_hb_waiters_dec(hb); + } /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; -- cgit v1.2.3 From b2aa3b4d64e460ac606f386c24e7d8a873ce6f1a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Apr 2026 12:23:02 -0400 Subject: tracing/probes: Limit size of event probe to 3K There currently isn't a max limit an event probe can be. One could make an event greater than PAGE_SIZE, which makes the event useless because if it's bigger than the max event that can be recorded into the ring buffer, then it will never be recorded. A event probe should never need to be greater than 3K, so make that the max size. As long as the max is less than the max that can be recorded onto the ring buffer, it should be fine. Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Acked-by: Masami Hiramatsu (Google) Fixes: 93ccae7a22274 ("tracing/kprobes: Support basic types on dynamic events") Link: https://patch.msgid.link/20260428122302.706610ba@gandalf.local.home Signed-off-by: Steven Rostedt --- kernel/trace/trace_probe.c | 6 ++++++ kernel/trace/trace_probe.h | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e1c73065dae5..e0d3a0da26af 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1523,6 +1523,12 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, parg->offset = *size; *size += parg->type->size * (parg->count ?: 1); + if (*size > MAX_PROBE_EVENT_SIZE) { + ret = -E2BIG; + trace_probe_log_err(ctx->offset, EVENT_TOO_BIG); + goto fail; + } + if (parg->count) { len = strlen(parg->type->fmttype) + 6; parg->fmt = kmalloc(len, GFP_KERNEL); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 9fc56c937130..262d8707a3df 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -38,6 +38,7 @@ #define MAX_BTF_ARGS_LEN 128 #define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX +#define MAX_PROBE_EVENT_SIZE 3072 /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -561,7 +562,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_TYPE4STR, "This type does not fit for string."),\ C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ C(TOO_MANY_ARGS, "Too many arguments are specified"), \ - C(TOO_MANY_EARGS, "Too many entry arguments specified"), + C(TOO_MANY_EARGS, "Too many entry arguments specified"), \ + C(EVENT_TOO_BIG, "Event too big (too many fields?)"), #undef C #define C(a, b) TP_ERR_##a -- cgit v1.2.3 From ee9dce44362b2d8132c32964656ab6dff7dfbc6a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 1 May 2026 12:41:23 -0700 Subject: futex: Drop CLONE_THREAD requirement for private default hash alloc Currently need_futex_hash_allocate_default() depends on strict pthread semantics, abusing CLONE_THREAD. This breaks the non-concurrency assumptions when doing the mm->futex_ref pcpu allocations, leading to bugs[0] when sharing the mm in other ways; ie: BUG: KASAN: slab-use-after-free in futex_hash_put ... where the +1 bias can end up on a percpu counter that mm->futex_ref no longer points at. Loosen the check to cover any CLONE_VM clone, except vfork(). Excluding vfork keeps the existing paths untouched (no overhead), and we can't race in the first place: either the parent is suspended and the child runs alone, or mm->futex_ref is already allocated from an earlier CLONE_VM. Link: https://lore.kernel.org/all/CAL_bE8LsmCQ-FAtYDuwbJhOkt9p2wwYQwAbMh=PifC=VsiBM6A@mail.gmail.com/ [0] Fixes: d9b05321e21e ("futex: Move futex_hash_free() back to __mmput()") Reported-by: Yiming Qian Signed-off-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- kernel/fork.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f1ad69c6dc2d..5f3fdfdb14c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1951,9 +1951,11 @@ static void rv_task_fork(struct task_struct *p) static bool need_futex_hash_allocate_default(u64 clone_flags) { - if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM)) - return false; - return true; + /* + * Allocate a default futex hash for any sibling that will + * share the parent's mm, except vfork. + */ + return (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM; } /* @@ -2380,10 +2382,6 @@ __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cancel_cgroup; - /* - * Allocate a default futex hash for the user process once the first - * thread spawns. - */ if (need_futex_hash_allocate_default(clone_flags)) { retval = futex_hash_allocate_default(); if (retval) -- cgit v1.2.3