author     Linus Torvalds <torvalds@linux-foundation.org>  2026-05-05 15:22:04 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2026-05-05 15:22:04 -0700
commit     de95ad90fb19e4b7778a0c27115a4639c7c8b186 (patch)
tree       1d74306e2964932ccd0b787ea743dc74587683f5 /kernel
parent     50fb0bcc9d7da23e0f0fd5359b4f9ceb0aa337d2 (diff)
parent     b34c82777a2c0648ee053595f4b290fd5249b093 (diff)
Merge tag 'sched_ext-for-7.1-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext fixes from Tejun Heo:

 - Fix idle CPU selection returning prev_cpu outside the task's cpus_ptr
   when the BPF caller's allowed mask was wider. Marked for stable
   backport.

 - Two opposite-direction gaps in scx_task_iter's cgroup-scoped mode
   versus the global mode:

     - Tasks past exit_signals() are filtered by the cgroup walk but
       kept by the global one, so a sub-scheduler enable abort leaked
       __scx_init_task() state. Add a CSS_TASK_ITER_WITH_DEAD flag to
       cgroup's task iterator (scx_task_iter is its only user) and use
       it.

     - Tasks past sched_ext_dead() are still returned, tripping
       WARN_ON_ONCE() in callers or making them touch torn-down state.
       Mark and skip them under the per-task rq lock.

* tag 'sched_ext-for-7.1-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: idle: Recheck prev_cpu after narrowing allowed mask
  sched_ext: Skip past-sched_ext_dead() tasks in scx_task_iter_next_locked()
  cgroup, sched_ext: Include exiting tasks in cgroup iter
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup/cgroup.c    8
-rw-r--r--  kernel/sched/ext.c       39
-rw-r--r--  kernel/sched/ext_idle.c  12
3 files changed, 40 insertions(+), 19 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 45c0b1ed687a..383af2b233b3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5067,10 +5067,12 @@ repeat:
task = list_entry(it->task_pos, struct task_struct, cg_list);
/*
- * Hide tasks that are exiting but not yet removed. Keep zombie
- * leaders with live threads visible.
+ * Hide tasks that are exiting but not yet removed by default. Keep
+ * zombie leaders with live threads visible. Users that need to walk
+ * every existing task can opt out via CSS_TASK_ITER_WITH_DEAD.
*/
- if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
+ if (!(it->flags & CSS_TASK_ITER_WITH_DEAD) &&
+ (task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
goto repeat;
if (it->flags & CSS_TASK_ITER_PROCS) {
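
For reference, a hedged sketch of how a kernel-side caller would opt in to the
new behavior. css_task_iter_start()/css_task_iter_next()/css_task_iter_end()
are the existing cgroup iteration API; CSS_TASK_ITER_WITH_DEAD is the flag
this series adds; the wrapper name walk_all_css_tasks() is invented for
illustration and is not part of the patch.

#include <linux/cgroup.h>
#include <linux/sched.h>

/*
 * Visit every task attached to @css, including tasks past exit_signals()
 * that the default iteration hides.
 */
static void walk_all_css_tasks(struct cgroup_subsys_state *css)
{
        struct css_task_iter it;
        struct task_struct *p;

        css_task_iter_start(css, CSS_TASK_ITER_WITH_DEAD, &it);
        while ((p = css_task_iter_next(&it))) {
                /* e.g. record per-task state; @p may be PF_EXITING here */
        }
        css_task_iter_end(&it);
}
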
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 345aa11b84b2..38d90baf78cf 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -766,7 +766,8 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
iter->cgrp = cgrp;
iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self);
- css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
+ css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
+ &iter->css_iter);
return;
}
#endif
@@ -866,7 +867,8 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
iter->css_pos = css_next_descendant_pre(iter->css_pos,
&iter->cgrp->self);
if (iter->css_pos)
- css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
+ css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD,
+ &iter->css_iter);
}
return NULL;
}
@@ -926,16 +928,27 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
*
* Test for idle_sched_class as only init_tasks are on it.
*/
- if (p->sched_class != &idle_sched_class)
- break;
- }
- if (!p)
- return NULL;
+ if (p->sched_class == &idle_sched_class)
+ continue;
- iter->rq = task_rq_lock(p, &iter->rf);
- iter->locked_task = p;
+ iter->rq = task_rq_lock(p, &iter->rf);
+ iter->locked_task = p;
- return p;
+ /*
+ * cgroup_task_dead() removes dead tasks from cset->tasks after
+ * sched_ext_dead(), so cgroup iteration may see tasks which have
+ * already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS is set by
+ * sched_ext_dead() under @p's rq lock. Test it to avoid visiting
+ * tasks which are already dead from the SCX point of view.
+ */
+ if (p->scx.flags & SCX_TASK_OFF_TASKS) {
+ __scx_task_iter_rq_unlock(iter);
+ continue;
+ }
+
+ return p;
+ }
+ return NULL;
}
/**
@@ -3848,6 +3861,11 @@ void sched_ext_dead(struct task_struct *p)
/*
* @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
* ENABLED transitions can't race us. Disable ops for @p.
+ *
+ * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see
+ * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup iteration
+ * is only used from sub-scheduler paths, which require the root scheduler to
+ * be enabled, and root enable transitions every live task to at least READY.
*/
if (scx_get_task_state(p) != SCX_TASK_NONE) {
struct rq_flags rf;
@@ -3855,6 +3873,7 @@ void sched_ext_dead(struct task_struct *p)
rq = task_rq_lock(p, &rf);
scx_disable_and_exit_task(scx_task_sched(p), p);
+ p->scx.flags |= SCX_TASK_OFF_TASKS;
task_rq_unlock(rq, p, &rf);
}
}
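
The interplay between the two ext.c hunks above is a mark-and-skip handshake:
sched_ext_dead() sets SCX_TASK_OFF_TASKS under the task's rq lock, and
scx_task_iter_next_locked() tests it under the same lock, so a task is either
visited while its SCX state is intact or skipped entirely. Below is a minimal
userspace analogue of that pattern, not kernel code: a pthread mutex stands in
for the per-task rq lock and a plain int for SCX_TASK_OFF_TASKS.

#include <pthread.h>
#include <stdio.h>

struct fake_task {
        pthread_mutex_t lock;   /* stands in for the per-task rq lock */
        int off_tasks;          /* stands in for SCX_TASK_OFF_TASKS */
};

static void mark_dead(struct fake_task *p)
{
        pthread_mutex_lock(&p->lock);
        /* tear-down work would happen here, still under the lock */
        p->off_tasks = 1;
        pthread_mutex_unlock(&p->lock);
}

static void visit(struct fake_task *p)
{
        pthread_mutex_lock(&p->lock);
        if (p->off_tasks) {             /* already dead from the iterator's POV */
                pthread_mutex_unlock(&p->lock);
                return;
        }
        printf("visiting live task\n"); /* safe: state can't be torn down here */
        pthread_mutex_unlock(&p->lock);
}

int main(void)
{
        struct fake_task t = { .lock = PTHREAD_MUTEX_INITIALIZER };

        visit(&t);      /* visited */
        mark_dead(&t);
        visit(&t);      /* skipped */
        return 0;
}
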
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 7468560a6d80..6e1980763270 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -466,12 +466,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
preempt_disable();
/*
- * Check whether @prev_cpu is still within the allowed set. If not,
- * we can still try selecting a nearby CPU.
- */
- is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
-
- /*
* Determine the subset of CPUs usable by @p within @cpus_allowed.
*/
if (allowed != p->cpus_ptr) {
@@ -488,6 +482,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
}
/*
+ * Check whether @prev_cpu is still within the allowed set. If not,
+ * we can still try selecting a nearby CPU.
+ */
+ is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
+
+ /*
* This is necessary to protect llc_cpus.
*/
rcu_read_lock();
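
The ext_idle.c change above is purely an ordering fix: prev_cpu must be tested
against the allowed mask only after that mask has been narrowed to the task's
own cpus_ptr, otherwise a CPU the task may not run on can be reported as
usable. A tiny userspace illustration of the same ordering issue, with made-up
mask values and no relation to the real cpumask API:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned long cpus_ptr = 0x0fUL;  /* task may run on CPUs 0-3 */
        unsigned long allowed  = 0xffUL;  /* caller passed a wider mask */
        int prev_cpu = 6;                 /* outside the task's own mask */

        /* Buggy order: test before narrowing - CPU 6 looks allowed. */
        bool buggy = allowed & (1UL << prev_cpu);

        /* Fixed order: narrow to the task's mask first, then test. */
        allowed &= cpus_ptr;
        bool fixed = allowed & (1UL << prev_cpu);

        printf("before narrowing: %d, after narrowing: %d\n", buggy, fixed);
        return 0;
}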