From 517a44d18537ef8ab888f71197c80116c14cee0a Mon Sep 17 00:00:00 2001
From: Zqiang
Date: Mon, 8 Dec 2025 19:23:19 +0800
Subject: sched_ext: Fix memleak of sch->helper objects

Use kthread_destroy_worker() to release the sch->helper objects and fix
the following kmemleak:

unreferenced object 0xffff888121ec7b00 (size 128):
  comm "scx_simple", pid 1197, jiffies 4295884415
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 ad 4e ad de  .............N..
    ff ff ff ff 00 00 00 00 ff ff ff ff ff ff ff ff  ................
  backtrace (crc 587b3352):
    kmemleak_alloc+0x62/0xa0
    __kmalloc_cache_noprof+0x28d/0x3e0
    kthread_create_worker_on_node+0xd5/0x1f0
    scx_enable.isra.210+0x6c2/0x25b0
    bpf_scx_reg+0x12/0x20
    bpf_struct_ops_link_create+0x2c3/0x3b0
    __sys_bpf+0x3102/0x4b00
    __x64_sys_bpf+0x79/0xc0
    x64_sys_call+0x15d9/0x1dd0
    do_syscall_64+0xf0/0x470
    entry_SYSCALL_64_after_hwframe+0x77/0x7f

Fixes: bff3b5aec1b7 ("sched_ext: Move disable machinery into scx_sched")
Cc: stable@vger.kernel.org # v6.16+
Signed-off-by: Zqiang
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 05f5a49e9649..073b669869cb 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3575,7 +3575,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	int node;
 
 	irq_work_sync(&sch->error_irq_work);
-	kthread_stop(sch->helper->task);
+	kthread_destroy_worker(sch->helper);
 
 	free_percpu(sch->pcpu);
@@ -4786,7 +4786,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 	return sch;
 
 err_stop_helper:
-	kthread_stop(sch->helper->task);
+	kthread_destroy_worker(sch->helper);
err_free_pcpu:
 	free_percpu(sch->pcpu);
err_free_gdsqs:
--
cgit v1.2.3
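For reference, the leak comes from an asymmetry in the kthread_worker API: kthread_create_worker() kmalloc()s the struct kthread_worker (visible in the backtrace above) in addition to spawning the backing task, while kthread_stop() only stops the task and leaves that allocation behind. kthread_destroy_worker() flushes the worker, stops the task, and frees the struct. Below is a minimal sketch of the correctly paired lifecycle; the module code and names are hypothetical, not the sched_ext code from the patch:

```c
#include <linux/err.h>
#include <linux/kthread.h>

/* Hypothetical helper worker, analogous to sch->helper. */
static struct kthread_worker *example_helper;

static int example_start(void)
{
	/* Allocates the kthread_worker and spawns its backing task. */
	example_helper = kthread_create_worker(0, "example_helper");
	if (IS_ERR(example_helper))
		return PTR_ERR(example_helper);
	return 0;
}

static void example_stop(void)
{
	/*
	 * Flushes pending work, stops the backing task, and frees the
	 * kthread_worker itself. Calling kthread_stop(example_helper->task)
	 * alone would stop the task but leak the struct - the kmemleak
	 * fixed above.
	 */
	kthread_destroy_worker(example_helper);
}
```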
From 12b5cd99a05f7cbc2ceb88b3b9601d404ef2236a Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Sat, 6 Dec 2025 02:22:03 +0000
Subject: sched/ext: Avoid NULL ptr dereference when ->put_prev_task() is
 called with NULL next

Early on, while trying to get sched_ext and proxy-exec working together, I
kept tripping over a NULL pointer dereference in put_prev_task_scx() on the
line:

  if (sched_class_above(&ext_sched_class, next->sched_class)) {

This was due to put_prev_task() passing a NULL next when calling:

  prev->sched_class->put_prev_task(rq, prev, NULL);

put_prev_task_scx() already guards against a NULL next in the switch_class
case, but has no such guard around the sched_class_above() check.

I can't say I understand why this doesn't usually trip without proxy-exec.
Newer kernels also have far fewer put_prev_task() calls, and I can't easily
reproduce the issue now even with proxy-exec. But one put_prev_task() call
is still left in core.c that looks like it could trip this, so I wanted to
send this out for consideration.

tj: put_prev_task() can be called with NULL @next; however, when @p is
queued, that doesn't happen, so this condition shouldn't currently be
triggerable. The connection isn't straightforward or necessarily reliable,
so add the NULL check even if it can't currently be triggered.

Link: http://lkml.kernel.org/r/20251206022218.1541878-1-jstultz@google.com
Signed-off-by: John Stultz
Reviewed-by: Andrea Righi
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 073b669869cb..bd74b371f52d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2402,7 +2402,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 	 * ops.enqueue() that @p is the only one available for this cpu,
 	 * which should trigger an explicit follow-up scheduling event.
 	 */
-	if (sched_class_above(&ext_sched_class, next->sched_class)) {
+	if (next && sched_class_above(&ext_sched_class, next->sched_class)) {
 		WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
 		do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
 	} else {
--
cgit v1.2.3

From 9f769637a93fac81689b80df6855f545839cf999 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 9 Dec 2025 11:04:33 -1000
Subject: sched_ext: Fix bypass depth leak on scx_enable() failure

scx_enable() calls scx_bypass(true) to initialize in bypass mode and then
scx_bypass(false) on success to exit. If scx_enable() fails during task
initialization - e.g. scx_cgroup_init() or scx_init_task() returns an
error - it jumps to err_disable while bypass is still active.
scx_disable_workfn() then calls scx_bypass(true/false) for its own bypass,
leaving the bypass depth at 1 instead of 0. This causes the system to
remain permanently in bypass mode after a failed scx_enable().

Failures after task initialization is complete - e.g.
scx_tryset_enable_state() at the end - already call scx_bypass(false)
before reaching the error path and are not affected. This only affects a
subset of failure modes.

Fix it by tracking whether scx_enable() called scx_bypass(true) in a bool
and having scx_disable_workfn() call an extra scx_bypass(false) to clear
it. This is a temporary measure as the bypass depth will be moved into the
sched instance, which will make this tracking unnecessary.

Fixes: 8c2090c504e9 ("sched_ext: Initialize in bypass mode")
Cc: stable@vger.kernel.org # v6.12+
Reported-by: Chris Mason
Reviewed-by: Emil Tsalapatis
Link: https://lore.kernel.org/stable/286e6f7787a81239e1ce2989b52391ce%40kernel.org
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index bd74b371f52d..c4465ccefea4 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -41,6 +41,13 @@ static bool scx_init_task_enabled;
 static bool scx_switching_all;
 DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
 
+/*
+ * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass
+ * depth on enable failure. Will be removed when bypass depth is moved into the
+ * sched instance.
+ */
+static bool scx_bypassed_for_enable;
+
 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
@@ -4318,6 +4325,11 @@ static void scx_disable_workfn(struct kthread_work *work)
 	scx_dsp_max_batch = 0;
 	free_kick_syncs();
 
+	if (scx_bypassed_for_enable) {
+		scx_bypassed_for_enable = false;
+		scx_bypass(false);
+	}
+
 	mutex_unlock(&scx_enable_mutex);
 
 	WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
@@ -4970,6 +4982,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	 * Init in bypass mode to guarantee forward progress.
 	 */
 	scx_bypass(true);
+	scx_bypassed_for_enable = true;
 
 	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
 		if (((void (**)(void))ops)[i])
@@ -5067,6 +5080,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	scx_task_iter_stop(&sti);
 	percpu_up_write(&scx_fork_rwsem);
 
+	scx_bypassed_for_enable = false;
 	scx_bypass(false);
 
 	if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
--
cgit v1.2.3
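The invariant being restored is that every scx_bypass(true) must be matched by exactly one scx_bypass(false), or the depth counter never returns to zero and bypass stays engaged forever. A self-contained userspace sketch of the depth accounting, using illustrative stand-in functions rather than the kernel code, shows how the tracking bool rebalances a failed enable:

```c
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's bypass machinery. */
static int bypass_depth;
static bool bypassed_for_enable;

static void bypass(bool on)
{
	bypass_depth += on ? 1 : -1;
}

/* Mirrors scx_disable_workfn(): its own bypass pair, plus the fix. */
static void disable_workfn(void)
{
	bypass(true);
	/* ... teardown would happen here ... */
	bypass(false);

	/* The fix: balance the bypass left over from a failed enable. */
	if (bypassed_for_enable) {
		bypassed_for_enable = false;
		bypass(false);
	}
}

/* Mirrors scx_enable(): init in bypass mode, optionally fail mid-init. */
static int enable(bool fail_mid_init)
{
	bypass(true);
	bypassed_for_enable = true;

	if (fail_mid_init) {
		disable_workfn();	/* the err_disable path */
		return -1;
	}

	bypassed_for_enable = false;
	bypass(false);
	return 0;
}

int main(void)
{
	enable(true);
	/* Without the fix this prints 1, i.e. bypass stuck on. */
	printf("bypass_depth after failed enable: %d\n", bypass_depth);
	return 0;
}
```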
From 530b6637c79e728d58f1d9b66bd4acf4b735b86d Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 11 Dec 2025 15:45:03 -1000
Subject: sched_ext: Factor out local_dsq_post_enq() from dispatch_enqueue()

Factor out local_dsq_post_enq(), which performs post-enqueue handling for
local DSQs - triggering resched_curr() if SCX_ENQ_PREEMPT is specified or
if the current CPU is idle. No functional change.

This will be used by the next patch to fix move_local_task_to_local_dsq().

Cc: stable@vger.kernel.org # v6.12+
Reviewed-by: Andrea Righi
Reviewed-by: Emil Tsalapatis
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index c4465ccefea4..c78efa99406f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -982,6 +982,22 @@ static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
 	__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
 }
 
+static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p,
+			       u64 enq_flags)
+{
+	struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+	bool preempt = false;
+
+	if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
+	    rq->curr->sched_class == &ext_sched_class) {
+		rq->curr->scx.slice = 0;
+		preempt = true;
+	}
+
+	if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class))
+		resched_curr(rq);
+}
+
 static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
 			     struct task_struct *p, u64 enq_flags)
 {
@@ -1093,22 +1109,10 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
 	if (enq_flags & SCX_ENQ_CLEAR_OPSS)
 		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
 
-	if (is_local) {
-		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
-		bool preempt = false;
-
-		if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
-		    rq->curr->sched_class == &ext_sched_class) {
-			rq->curr->scx.slice = 0;
-			preempt = true;
-		}
-
-		if (preempt || sched_class_above(&ext_sched_class,
-						 rq->curr->sched_class))
-			resched_curr(rq);
-	} else {
+	if (is_local)
+		local_dsq_post_enq(dsq, p, enq_flags);
+	else
 		raw_spin_unlock(&dsq->lock);
-	}
 }
 
 static void task_unlink_from_dsq(struct task_struct *p,
--
cgit v1.2.3

From f5e1e5ec204da11fa87fdf006d451d80ce06e118 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Thu, 11 Dec 2025 15:45:04 -1000
Subject: sched_ext: Fix missing post-enqueue handling in
 move_local_task_to_local_dsq()

move_local_task_to_local_dsq() is used when moving a task from a non-local
DSQ to a local DSQ on the same CPU. It directly manipulates the local DSQ
without going through dispatch_enqueue() and was missing the post-enqueue
handling that triggers preemption when SCX_ENQ_PREEMPT is set or the idle
task is running. The function is used by move_task_between_dsqs(), which
backs scx_bpf_dsq_move() and may be called while the CPU is busy.

Add a local_dsq_post_enq() call to move_local_task_to_local_dsq(). As the
dispatch path doesn't need post-enqueue handling, add an SCX_RQ_IN_BALANCE
early exit to keep consume_dispatch_q() behavior unchanged and avoid
triggering an unnecessary resched when scx_bpf_dsq_move() is used from the
dispatch path.

Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()")
Cc: stable@vger.kernel.org # v6.12+
Reviewed-by: Andrea Righi
Reviewed-by: Emil Tsalapatis
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index c78efa99406f..695503a2f7d1 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -988,6 +988,14 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p
 	struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
 	bool preempt = false;
 
+	/*
+	 * If @rq is in balance, the CPU is already vacant and looking for the
+	 * next task to run. No need to preempt or trigger resched after moving
+	 * @p into its local DSQ.
+	 */
+	if (rq->scx.flags & SCX_RQ_IN_BALANCE)
+		return;
+
 	if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
 	    rq->curr->sched_class == &ext_sched_class) {
 		rq->curr->scx.slice = 0;
@@ -1636,6 +1644,8 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
 	dsq_mod_nr(dst_dsq, 1);
 	p->scx.dsq = dst_dsq;
+
+	local_dsq_post_enq(dst_dsq, p, enq_flags);
 }
 
 /**
--
cgit v1.2.3
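To see where this matters from the BPF side: scx_bpf_dsq_move() can be called both while the target CPU is busy (where the newly added preemption handling is needed) and from ops.dispatch() (where the SCX_RQ_IN_BALANCE early exit suppresses the redundant resched). Below is a minimal dispatch-path fragment modeled on the in-tree sample schedulers; it assumes the scx common BPF headers and a hypothetical user-created SHARED_DSQ, and is a sketch rather than a loadable scheduler:

```c
#include <scx/common.bpf.h>

#define SHARED_DSQ 0	/* assumed: created via scx_bpf_create_dsq() in ops.init() */

void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_struct *p;

	/*
	 * Move one task from the shared DSQ to this CPU's local DSQ. This
	 * runs from the balance path, so the SCX_RQ_IN_BALANCE early exit
	 * added above keeps the move from raising a pointless resched on a
	 * CPU that is already looking for its next task.
	 */
	bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL, 0))
			break;
	}
}
```

The same kfunc called from, say, a timer or ops.enqueue() path while the CPU is running a task is the case the fix addresses: there, local_dsq_post_enq() now triggers the preemption or resched that was previously skipped.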
From bb27226f0d00588ac53be8825e021ae80aa43371 Mon Sep 17 00:00:00 2001
From: Zqiang
Date: Mon, 15 Dec 2025 19:29:40 +0800
Subject: sched_ext: Remove unused code in do_pick_task_scx()

The kick_idle variable is no longer used, so remove it along with the
associated code in do_pick_task_scx().

Signed-off-by: Zqiang
Reviewed-by: Andrea Righi
Reviewed-by: Emil Tsalapatis
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 695503a2f7d1..94164f2dec6d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2446,7 +2446,7 @@ static struct task_struct *
 do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
 {
 	struct task_struct *prev = rq->curr;
-	bool keep_prev, kick_idle = false;
+	bool keep_prev;
 	struct task_struct *p;
 
 	/* see kick_cpus_irq_workfn() */
@@ -2488,12 +2488,8 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
 		refill_task_slice_dfl(rcu_dereference_sched(scx_root), p);
 	} else {
 		p = first_local_task(rq);
-		if (!p) {
-			if (kick_idle)
-				scx_kick_cpu(rcu_dereference_sched(scx_root),
-					     cpu_of(rq), SCX_KICK_IDLE);
+		if (!p)
 			return NULL;
-		}
 
 		if (unlikely(!p->scx.slice)) {
 			struct scx_sched *sch = rcu_dereference_sched(scx_root);
--
cgit v1.2.3