From 54e96258a6930909b690fd7e8889749231ba8085 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 6 Oct 2025 15:35:36 -1000 Subject: sched_ext: Mark scx_bpf_dsq_move_set_[slice|vtime]() with KF_RCU scx_bpf_dsq_move_set_slice() and scx_bpf_dsq_move_set_vtime() take a DSQ iterator argument which has to be valid. Mark them with KF_RCU. Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") Cc: stable@vger.kernel.org # v6.12+ Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2b0e88206d07..fc353b8d69f7 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5688,8 +5688,8 @@ BTF_KFUNCS_START(scx_kfunc_ids_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_dispatch) @@ -5820,8 +5820,8 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_unlocked) BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) -- cgit v1.2.3 From efeeaac9ae9763f9c953e69633c86bc3031e39b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Oct 2025 13:56:23 -1000 Subject: sched_ext: Sync error_irq_work before freeing scx_sched By the time scx_sched_free_rcu_work() runs, the scx_sched is no longer reachable. However, a previously queued error_irq_work may still be pending or running. Ensure it completes before proceeding with teardown. Fixes: bff3b5aec1b7 ("sched_ext: Move disable machinery into scx_sched") Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fc353b8d69f7..a79dfd0f743a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3471,7 +3471,9 @@ static void scx_sched_free_rcu_work(struct work_struct *work) struct scx_dispatch_q *dsq; int node; + irq_work_sync(&sch->error_irq_work); kthread_stop(sch->helper->task); + free_percpu(sch->pcpu); for_each_node_state(node, N_POSSIBLE) -- cgit v1.2.3 From a8ad873113d3fe01f9b5d737d4b0570fa36826b0 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Fri, 10 Oct 2025 12:12:50 -0700 Subject: sched_ext: defer queue_balance_callback() until after ops.dispatch The sched_ext code calls queue_balance_callback() during enqueue_task() to defer operations that drop multiple locks until we can unpin them. The call assumes that the rq lock is held until the callbacks are invoked, and the pending callbacks will not be visible to any other threads. This is enforced by a WARN_ON_ONCE() in rq_pin_lock(). However, balance_one() may actually drop the lock during a BPF dispatch call. Another thread may win the race to get the rq lock and see the pending callback. 
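Reduced to a rough userspace sketch, the deferral pattern that closes this window looks as follows (plain C with pthreads; fake_rq, cb_pending and the helper names are invented stand-ins for the rq lock, the pending flag and queue_balance_callback(), not the kernel code itself):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_rq {
        pthread_mutex_t lock;
        bool cb_pending;        /* analogue of a "callback pending" rq flag */
};

static void queue_callback(struct fake_rq *rq)
{
        (void)rq;
        /* stand-in for queue_balance_callback(); only call this while the
         * lock is stably held, never inside a section that may drop it */
        printf("callback queued\n");
}

static void dispatch(struct fake_rq *rq)
{
        /* the dispatch path may drop and retake the lock, so anything
         * already hanging off the rq becomes visible to other threads */
        pthread_mutex_unlock(&rq->lock);
        /* ... other threads may take rq->lock here ... */
        pthread_mutex_lock(&rq->lock);
}

static void balance(struct fake_rq *rq)
{
        pthread_mutex_lock(&rq->lock);

        rq->cb_pending = true;          /* remember the request, do not queue yet */
        dispatch(rq);

        if (rq->cb_pending) {           /* lock is held for good now */
                queue_callback(rq);
                rq->cb_pending = false;
        }
        pthread_mutex_unlock(&rq->lock);
}

int main(void)
{
        struct fake_rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER, .cb_pending = false };

        balance(&rq);
        return 0;
}
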
To avoid this, sched_ext must only queue the callback after the dispatch calls have completed. CPU 0 CPU 1 CPU 2 scx_balance() rq_unpin_lock() scx_balance_one() |= IN_BALANCE scx_enqueue() ops.dispatch() rq_unlock() rq_lock() queue_balance_callback() rq_unlock() [WARN] rq_pin_lock() rq_lock() &= ~IN_BALANCE rq_repin_lock() Changelog v2-> v1 (https://lore.kernel.org/sched-ext/aOgOxtHCeyRT_7jn@gpd4) - Fixed explanation in patch description (Andrea) - Fixed scx_rq mask state updates (Andrea) - Added Reviewed-by tag from Andrea Reported-by: Jakub Kicinski Signed-off-by: Emil Tsalapatis (Meta) Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 29 +++++++++++++++++++++++++++-- kernel/sched/sched.h | 1 + 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a79dfd0f743a..1352e6a5b089 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -780,13 +780,23 @@ static void schedule_deferred(struct rq *rq) if (rq->scx.flags & SCX_RQ_IN_WAKEUP) return; + /* Don't do anything if there already is a deferred operation. */ + if (rq->scx.flags & SCX_RQ_BAL_PENDING) + return; + /* * If in balance, the balance callbacks will be called before rq lock is * released. Schedule one. + * + * + * We can't directly insert the callback into the + * rq's list: The call can drop its lock and make the pending balance + * callback visible to unrelated code paths that call rq_pin_lock(). + * + * Just let balance_one() know that it must do it itself. */ if (rq->scx.flags & SCX_RQ_IN_BALANCE) { - queue_balance_callback(rq, &rq->scx.deferred_bal_cb, - deferred_bal_cb_workfn); + rq->scx.flags |= SCX_RQ_BAL_CB_PENDING; return; } @@ -2003,6 +2013,19 @@ static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq) dspc->cursor = 0; } +static inline void maybe_queue_balance_callback(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING)) + return; + + queue_balance_callback(rq, &rq->scx.deferred_bal_cb, + deferred_bal_cb_workfn); + + rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING; +} + static int balance_one(struct rq *rq, struct task_struct *prev) { struct scx_sched *sch = scx_root; @@ -2150,6 +2173,8 @@ static int balance_scx(struct rq *rq, struct task_struct *prev, #endif rq_repin_lock(rq, rf); + maybe_queue_balance_callback(rq); + return ret; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1f5d07067f60..3f7fab3d7960 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -784,6 +784,7 @@ enum scx_rq_flags { SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ + SCX_RQ_BAL_CB_PENDING = 1 << 6, /* must queue a cb after dispatching */ SCX_RQ_IN_WAKEUP = 1 << 16, SCX_RQ_IN_BALANCE = 1 << 17, -- cgit v1.2.3 From 14c1da3895a116f4e32c20487046655f26d3999b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Oct 2025 13:43:26 -1000 Subject: sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during boot because it exceeds the 32,768 byte percpu allocator limit. Restructure to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU pointing to its own kvzalloc'd array. Move allocation from boot time to scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only consumed when sched_ext is active. Use RCU to guard against racing with free. 
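In userspace miniature, the publish/tear-down pattern looks roughly like this (C11 atomics stand in for rcu_assign_pointer()/rcu_dereference(); the grace-period deferral that call_rcu() provides is only noted in a comment, and all names are invented):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4       /* stand-in for the set of possible CPUs */

static _Atomic(unsigned long *) pseqs[NR_CPUS];

static int alloc_all(unsigned long nr_cpu_ids)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                unsigned long *p = calloc(nr_cpu_ids, sizeof(*p));

                if (!p)
                        return -1;
                /* rcu_assign_pointer() in the kernel version */
                atomic_store(&pseqs[cpu], p);
        }
        return 0;
}

static void free_all(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                /* rcu_replace_pointer(..., NULL, ...) in the kernel */
                unsigned long *old = atomic_exchange(&pseqs[cpu], NULL);

                /* the kernel defers this via call_rcu() so a reader that
                 * already fetched the pointer can finish with it first */
                free(old);
        }
}

static void reader(int cpu)
{
        /* rcu_dereference_bh() plus the NULL check in the kernel */
        unsigned long *p = atomic_load(&pseqs[cpu]);

        if (!p) {
                printf("cpu%d: no pseqs, bailing out\n", cpu);
                return;
        }
        printf("cpu%d: seq[0] = %lu\n", cpu, p[0]);
}

int main(void)
{
        if (alloc_all(8))
                return 1;
        reader(0);
        free_all();
        reader(0);      /* sees NULL rather than a stale pointer */
        return 0;
}
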
Arrays are freed via call_rcu() and kick_cpus_irq_workfn() uses rcu_dereference_bh() with a NULL check. While at it, rename to scx_kick_pseqs for brevity and update comments to clarify these are pick_task sequence numbers. v2: RCU protect scx_kick_seqs to manage kick_cpus_irq_workfn() racing against disable as per Andrea. v3: Fix bugs notcied by Andrea. Reported-by: Phil Auld Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb Cc: Andrea Righi Reviewed-by: Emil Tsalapatis Reviewed-by: Phil Auld Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 79 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1352e6a5b089..c645d47124e7 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -67,8 +67,19 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; -/* for %SCX_KICK_WAIT */ -static unsigned long __percpu *scx_kick_cpus_pnt_seqs; +/* + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated + * lazily when enabling and freed when disabling to avoid waste when sched_ext + * isn't active. + */ +struct scx_kick_pseqs { + struct rcu_head rcu; + unsigned long seqs[]; +}; + +static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); /* * Direct dispatch marker. @@ -3877,6 +3888,27 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) } } +static void free_kick_pseqs_rcu(struct rcu_head *rcu) +{ + struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu); + + kvfree(pseqs); +} + +static void free_kick_pseqs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); + struct scx_kick_pseqs *to_free; + + to_free = rcu_replace_pointer(*pseqs, NULL, true); + if (to_free) + call_rcu(&to_free->rcu, free_kick_pseqs_rcu); + } +} + static void scx_disable_workfn(struct kthread_work *work) { struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); @@ -4013,6 +4045,7 @@ static void scx_disable_workfn(struct kthread_work *work) free_percpu(scx_dsp_ctx); scx_dsp_ctx = NULL; scx_dsp_max_batch = 0; + free_kick_pseqs(); mutex_unlock(&scx_enable_mutex); @@ -4375,6 +4408,33 @@ static void scx_vexit(struct scx_sched *sch, irq_work_queue(&sch->error_irq_work); } +static int alloc_kick_pseqs(void) +{ + int cpu; + + /* + * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size + * can exceed percpu allocator limits on large machines. 
+ */ + for_each_possible_cpu(cpu) { + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); + struct scx_kick_pseqs *new_pseqs; + + WARN_ON_ONCE(rcu_access_pointer(*pseqs)); + + new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids), + GFP_KERNEL, cpu_to_node(cpu)); + if (!new_pseqs) { + free_kick_pseqs(); + return -ENOMEM; + } + + rcu_assign_pointer(*pseqs, new_pseqs); + } + + return 0; +} + static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) { struct scx_sched *sch; @@ -4517,15 +4577,19 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_lock(&scx_enable_mutex); + ret = alloc_kick_pseqs(); + if (ret) + goto err_unlock; + if (scx_enable_state() != SCX_DISABLED) { ret = -EBUSY; - goto err_unlock; + goto err_free_pseqs; } sch = scx_alloc_and_add_sched(ops); if (IS_ERR(sch)) { ret = PTR_ERR(sch); - goto err_unlock; + goto err_free_pseqs; } /* @@ -4728,6 +4792,8 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) return 0; +err_free_pseqs: + free_kick_pseqs(); err_unlock: mutex_unlock(&scx_enable_mutex); return ret; @@ -5109,10 +5175,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) { struct rq *this_rq = this_rq(); struct scx_rq *this_scx = &this_rq->scx; - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); + struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs); bool should_wait = false; + unsigned long *pseqs; s32 cpu; + if (unlikely(!pseqs_pcpu)) { + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs"); + return; + } + + pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs; + for_each_cpu(cpu, this_scx->cpus_to_kick) { should_wait |= kick_one_cpu(cpu, this_rq, pseqs); cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); @@ -5235,11 +5309,6 @@ void __init init_sched_ext_class(void) scx_idle_init_masks(); - scx_kick_cpus_pnt_seqs = - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, - __alignof__(scx_kick_cpus_pnt_seqs[0])); - BUG_ON(!scx_kick_cpus_pnt_seqs); - for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); int n = cpu_to_node(cpu); -- cgit v1.2.3 From 05e63305c85c88141500f0a2fb02afcfba9396e1 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 13 Oct 2025 22:36:34 +0200 Subject: sched_ext: Fix scx_kick_pseqs corruption on concurrent scheduler loads If we load a BPF scheduler while another scheduler is already running, alloc_kick_pseqs() would be called again, overwriting the previously allocated arrays. Fix by moving the alloc_kick_pseqs() call after the scx_enable_state() check, ensuring that the arrays are only allocated when a scheduler can actually be loaded. 
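The corrected ordering, boiled down to a userspace sketch (invented names; the real code does this under scx_enable_mutex with goto-based unwinding, as the diff below shows):

#include <errno.h>
#include <stdio.h>

enum state { DISABLED, ENABLED };

static enum state cur_state = DISABLED;

static int alloc_resources(void)
{
        printf("allocating per-CPU arrays\n");
        return 0;
}

static int enable(void)
{
        int ret;

        /* check the state first ... */
        if (cur_state != DISABLED)
                return -EBUSY;  /* nothing allocated, nothing to undo */

        /* ... and only then allocate, so a second enable attempt cannot
         * overwrite arrays the already-running instance still uses */
        ret = alloc_resources();
        if (ret)
                return ret;

        cur_state = ENABLED;
        return 0;
}

int main(void)
{
        printf("first enable:  %d\n", enable());
        printf("second enable: %d\n", enable());        /* -EBUSY, no realloc */
        return 0;
}
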
Fixes: 14c1da3895a11 ("sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c645d47124e7..12c9c3595692 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4577,15 +4577,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_lock(&scx_enable_mutex); - ret = alloc_kick_pseqs(); - if (ret) - goto err_unlock; - if (scx_enable_state() != SCX_DISABLED) { ret = -EBUSY; - goto err_free_pseqs; + goto err_unlock; } + ret = alloc_kick_pseqs(); + if (ret) + goto err_unlock; + sch = scx_alloc_and_add_sched(ops); if (IS_ERR(sch)) { ret = PTR_ERR(sch); -- cgit v1.2.3 From a3c4a0a42e61aad1056a3d33fd603c1ae66d4288 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Thu, 16 Oct 2025 11:11:26 -0700 Subject: sched_ext: fix flag check for deferred callbacks When scheduling the deferred balance callbacks, check SCX_RQ_BAL_CB_PENDING instead of SCX_RQ_BAL_PENDING. This way schedule_deferred() properly tests whether there is already a pending request for queue_balance_callback() to be invoked at the end of .balance(). Fixes: a8ad873113d3 ("sched_ext: defer queue_balance_callback() until after ops.dispatch") Signed-off-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 12c9c3595692..ecb251e883ea 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -792,7 +792,7 @@ static void schedule_deferred(struct rq *rq) return; /* Don't do anything if there already is a deferred operation. */ - if (rq->scx.flags & SCX_RQ_BAL_PENDING) + if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING) return; /* -- cgit v1.2.3 From 103541e6a5854b08a25e4caa61e990af1009a52e Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 2 Oct 2025 08:22:35 +0000 Subject: rv: Fully convert enabled_monitors to use list_head as iterator The callbacks in enabled_monitors_seq_ops are inconsistent. Some treat the iterator as struct rv_monitor *, while others treat the iterator as struct list_head *. This causes a wrong type cast and crashes the system as reported by Nathan. Convert everything to use struct list_head * as iterator. This also makes enabled_monitors consistent with available_monitors. 
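The pattern, shrunk to a self-contained userspace example: the cursor handed between the callbacks is always a struct list_head *, and the containing entry is recovered with container_of() only where it is needed (the singly-linked list below is a stripped-down stand-in, not the kernel's <linux/list.h>, and the monitor names are arbitrary):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct list_head {
        struct list_head *next;
};

struct monitor {
        const char *name;
        bool enabled;
        struct list_head list;
};

/* always returns a cursor (&mon->list) or NULL, never a struct monitor * */
static struct list_head *next_enabled(struct list_head *cur)
{
        for (cur = cur->next; cur; cur = cur->next) {
                struct monitor *mon = container_of(cur, struct monitor, list);

                if (mon->enabled)
                        return cur;
        }
        return NULL;
}

int main(void)
{
        struct monitor a = { .name = "mon_a", .enabled = true  };
        struct monitor b = { .name = "mon_b", .enabled = false };
        struct monitor c = { .name = "mon_c", .enabled = true  };
        struct list_head head = { .next = &a.list };
        struct list_head *cur;

        a.list.next = &b.list;
        b.list.next = &c.list;
        c.list.next = NULL;

        /* start and next use the same cursor type, as in the fixed code */
        for (cur = next_enabled(&head); cur; cur = next_enabled(cur))
                printf("%s\n", container_of(cur, struct monitor, list)->name);

        return 0;
}
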
Fixes: de090d1ccae1 ("rv: Fix wrong type cast in enabled_monitors_next()") Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/linux-trace-kernel/20250923002004.GA2836051@ax162/ Signed-off-by: Nam Cao Cc: stable@vger.kernel.org Reviewed-by: Gabriele Monaco Link: https://lore.kernel.org/r/20251002082235.973099-1-namcao@linutronix.de Signed-off-by: Gabriele Monaco --- kernel/trace/rv/rv.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 48338520376f..43e9ea473cda 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -501,7 +501,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) list_for_each_entry_continue(mon, &rv_monitors_list, list) { if (mon->enabled) - return mon; + return &mon->list; } return NULL; @@ -509,7 +509,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) { - struct rv_monitor *mon; + struct list_head *head; loff_t l; mutex_lock(&rv_interface_lock); @@ -517,15 +517,15 @@ static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) if (list_empty(&rv_monitors_list)) return NULL; - mon = list_entry(&rv_monitors_list, struct rv_monitor, list); + head = &rv_monitors_list; for (l = 0; l <= *pos; ) { - mon = enabled_monitors_next(m, mon, &l); - if (!mon) + head = enabled_monitors_next(m, head, &l); + if (!head) break; } - return mon; + return head; } /* -- cgit v1.2.3 From 3d62f95bd8450cebb4a4741bf83949cd54edd4a3 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 2 Oct 2025 08:23:17 +0000 Subject: rv: Make rtapp/pagefault monitor depends on CONFIG_MMU There is no page fault without MMU. Compiling the rtapp/pagefault monitor without CONFIG_MMU fails as page fault tracepoints' definitions are not available. Make rtapp/pagefault monitor depends on CONFIG_MMU. Fixes: 9162620eb604 ("rv: Add rtapp_pagefault monitor") Signed-off-by: Nam Cao Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509260455.6Z9Vkty4-lkp@intel.com/ Cc: stable@vger.kernel.org Reviewed-by: Gabriele Monaco Link: https://lore.kernel.org/r/20251002082317.973839-1-namcao@linutronix.de Signed-off-by: Gabriele Monaco --- kernel/trace/rv/monitors/pagefault/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/rv/monitors/pagefault/Kconfig b/kernel/trace/rv/monitors/pagefault/Kconfig index 5e16625f1653..0e013f00c33b 100644 --- a/kernel/trace/rv/monitors/pagefault/Kconfig +++ b/kernel/trace/rv/monitors/pagefault/Kconfig @@ -5,6 +5,7 @@ config RV_MON_PAGEFAULT select RV_LTL_MONITOR depends on RV_MON_RTAPP depends on X86 || RISCV + depends on MMU default y select LTL_MON_EVENTS_ID bool "pagefault monitor" -- cgit v1.2.3 From 39a9ed0fb6dac58547afdf9b6cb032d326a3698f Mon Sep 17 00:00:00 2001 From: Haofeng Li Date: Wed, 15 Oct 2025 14:17:53 +0800 Subject: timekeeping: Fix aux clocks sysfs initialization loop bound The loop in tk_aux_sysfs_init() uses `i <= MAX_AUX_CLOCKS` as the termination condition, which results in 9 iterations (i=0 to 8) when MAX_AUX_CLOCKS is defined as 8. However, the kernel is designed to support only up to 8 auxiliary clocks. This off-by-one error causes the creation of a 9th sysfs entry that exceeds the intended auxiliary clock range. Fix the loop bound to use `i < MAX_AUX_CLOCKS` to ensure exactly 8 auxiliary clock entries are created, matching the design specification. 
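The off-by-one in miniature (plain C; MAX_AUX_CLOCKS assumed to be 8, per the description above):

#include <stdio.h>

#define MAX_AUX_CLOCKS 8        /* assumed value */

int main(void)
{
        int wrong = 0, right = 0;

        for (int i = 0; i <= MAX_AUX_CLOCKS; i++)       /* 0..8: 9 entries */
                wrong++;
        for (int i = 0; i < MAX_AUX_CLOCKS; i++)        /* 0..7: 8 entries */
                right++;

        printf("'<=' creates %d sysfs entries, '<' creates %d\n", wrong, right);
        return 0;
}
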
Fixes: 7b95663a3d96 ("timekeeping: Provide interface to control auxiliary clocks") Signed-off-by: Haofeng Li Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/tencent_2376993D9FC06A3616A4F981B3DE1C599607@qq.com --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b6974fce800c..3a4d3b2e3f74 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -3070,7 +3070,7 @@ static int __init tk_aux_sysfs_init(void) return -ENOMEM; } - for (int i = 0; i <= MAX_AUX_CLOCKS; i++) { + for (int i = 0; i < MAX_AUX_CLOCKS; i++) { char id[2] = { [0] = '0' + i, }; struct kobject *clk = kobject_create_and_add(id, auxo); -- cgit v1.2.3 From 0e4a169d1a2b630c607416d9e3739d80e176ed67 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 21 Oct 2025 05:35:22 +0000 Subject: sched/fair: Start a cfs_rq on throttled hierarchy with PELT clock throttled Matteo reported hitting the assert_list_leaf_cfs_rq() warning from enqueue_task_fair() post commit fe8d238e646e ("sched/fair: Propagate load for throttled cfs_rq") which transitioned to using cfs_rq_pelt_clock_throttled() check for leaf cfs_rq insertions in propagate_entity_cfs_rq(). The "cfs_rq->pelt_clock_throttled" flag is used to indicate if the hierarchy has its PELT frozen. If a cfs_rq's PELT is marked frozen, all its descendants should have their PELT frozen too or weird things can happen as a result of children accumulating PELT signals when the parents have their PELT clock stopped. Another side effect of this is the loss of integrity of the leaf cfs_rq list. As debugged by Aaron, consider the following hierarchy: root(#) / \ A(#) B(*) | C <--- new cgroup | D <--- new cgroup # - Already on leaf cfs_rq list * - Throttled with PELT frozen The newly created cgroups don't have their "pelt_clock_throttled" signal synced with cgroup B. Next, the following series of events occur: 1. online_fair_sched_group() for cgroup D will call propagate_entity_cfs_rq(). (Same can happen if a throttled task is moved to cgroup C and enqueue_task_fair() returns early.) propagate_entity_cfs_rq() adds the cfs_rq of cgroup C to "rq->tmp_alone_branch" since its PELT clock is not marked throttled and cfs_rq of cgroup B is not on the list. cfs_rq of cgroup B is skipped since its PELT is throttled. root cfs_rq already exists on cfs_rq leading to list_add_leaf_cfs_rq() returning early. The cfs_rq of cgroup C is left dangling on the "rq->tmp_alone_branch". 2. A new task wakes up on cgroup A. Since the whole hierarchy is already on the leaf cfs_rq list, list_add_leaf_cfs_rq() keeps returning early without any modifications to "rq->tmp_alone_branch". The final assert_list_leaf_cfs_rq() in enqueue_task_fair() sees the dangling reference to cgroup C's cfs_rq in "rq->tmp_alone_branch". !!! Splat !!! Syncing the "pelt_clock_throttled" indicator with parent cfs_rq is not enough since the new cfs_rq is not yet enqueued on the hierarchy. A dequeue on other subtree on the throttled hierarchy can freeze the PELT clock for the parent hierarchy without setting the indicators for this newly added cfs_rq which was never enqueued. Since there are no tasks on the new hierarchy, start a cfs_rq on a throttled hierarchy with its PELT clock throttled. The first enqueue, or the distribution (whichever happens first) will unfreeze the PELT clock and queue the cfs_rq on the leaf cfs_rq list. 
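Stripped of the scheduler machinery, the invariant being enforced is small (userspace sketch with invented types; throttle_count and pelt_clock_throttled mirror the cfs_rq fields touched in the hunk below):

#include <stdbool.h>
#include <stdio.h>

struct group {
        int throttle_count;     /* throttled levels above or including us */
        bool pelt_clock_throttled;
};

/* mirrors what the sync_throttle() change below does for a new, empty child */
static void sync_new_child(struct group *child, const struct group *parent)
{
        child->throttle_count = parent->throttle_count;

        /* not queued anywhere yet: join the hierarchy with the clock
         * already frozen; the first enqueue or distribution unfreezes it */
        if (child->throttle_count)
                child->pelt_clock_throttled = true;
}

int main(void)
{
        struct group parent = { .throttle_count = 1, .pelt_clock_throttled = true };
        struct group child  = { 0 };

        sync_new_child(&child, &parent);
        printf("child PELT clock throttled: %d\n", child.pelt_clock_throttled);
        return 0;
}
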
While at it, add an assert_list_leaf_cfs_rq() in propagate_entity_cfs_rq() to catch such cases in the future. Closes: https://lore.kernel.org/lkml/58a587d694f33c2ea487c700b0d046fa@codethink.co.uk/ Fixes: e1fad12dcb66 ("sched/fair: Switch to task based throttle model") Reported-by: Matteo Martelli Suggested-by: Aaron Lu Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Aaron Lu Tested-by: Aaron Lu Tested-by: Matteo Martelli Link: https://patch.msgid.link/20251021053522.37583-1-kprateek.nayak@amd.com --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cee1793e8277..25970dbbb279 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6437,6 +6437,16 @@ static void sync_throttle(struct task_group *tg, int cpu) cfs_rq->throttle_count = pcfs_rq->throttle_count; cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); + + /* + * It is not enough to sync the "pelt_clock_throttled" indicator + * with the parent cfs_rq when the hierarchy is not queued. + * Always join a throttled hierarchy with PELT clock throttled + * and leaf it to the first enqueue, or distribution to + * unthrottle the PELT clock. + */ + if (cfs_rq->throttle_count) + cfs_rq->pelt_clock_throttled = 1; } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@ -13187,6 +13197,8 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) if (!cfs_rq_pelt_clock_throttled(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } + + assert_list_leaf_cfs_rq(rq_of(cfs_rq)); } #else /* !CONFIG_FAIR_GROUP_SCHED: */ static void propagate_entity_cfs_rq(struct sched_entity *se) { } -- cgit v1.2.3 From 5d7e45dd670e42df4836afeaa9baf9d41ca4b434 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 23 Oct 2025 16:48:59 +0100 Subject: genirq/chip: Add buslock back in to irq_set_handler() The locking was changed from a buslock to a plain lock, but the patch description states there was no functional change. Assuming this was accidental so reverting to using the buslock. Fixes: 5cd05f3e2315 ("genirq/chip: Rework irq_set_handler() variants") Signed-off-by: Charles Keepax Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251023154901.1333755-2-ckeepax@opensource.cirrus.com --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3ffa0d80ddd1..d1917b28761a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1030,7 +1030,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { - scoped_irqdesc_get_and_lock(irq, 0) + scoped_irqdesc_get_and_buslock(irq, 0) __irq_do_set_handler(scoped_irqdesc, handle, is_chained, name); } EXPORT_SYMBOL_GPL(__irq_set_handler); -- cgit v1.2.3 From 56363e25f79fe83e63039c5595b8cd9814173d37 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 23 Oct 2025 16:49:00 +0100 Subject: genirq/manage: Add buslock back in to __disable_irq_nosync() The locking was changed from a buslock to a plain lock, but the patch description states there was no functional change. Assuming this was accidental so reverting to using the buslock. 
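In spirit, the two guard flavours differ only in whether the irq_chip bus hooks bracket the descriptor lock, which matters for chips sitting behind slow buses such as I2C or SPI. A userspace analogy (the for-loop trick is just one common way to build such a scope; the real scoped_irqdesc_* macros also handle descriptor lookup and reference counting):

#include <stdio.h>

static void bus_lock(void)    { printf("chip bus lock\n"); }
static void bus_unlock(void)  { printf("chip bus sync/unlock\n"); }
static void desc_lock(void)   { printf("desc raw lock\n"); }
static void desc_unlock(void) { printf("desc raw unlock\n"); }

/* plain variant: descriptor lock only */
#define scoped_desc_lock() \
        for (int _once = (desc_lock(), 1); _once; \
             desc_unlock(), _once = 0)

/* bus variant: bus hooks bracket the descriptor lock */
#define scoped_desc_buslock() \
        for (int _once = (bus_lock(), desc_lock(), 1); _once; \
             desc_unlock(), bus_unlock(), _once = 0)

int main(void)
{
        scoped_desc_buslock() {
                /* __disable_irq()-style work; the bus side is synced on
                 * scope exit, after the raw lock has been dropped */
                printf("  disable irq\n");
        }
        return 0;
}
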
Fixes: 1b7444446724 ("genirq/manage: Rework __disable_irq_nosync()") Signed-off-by: Charles Keepax Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251023154901.1333755-3-ckeepax@opensource.cirrus.com --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c94837382037..7d68fb5dc242 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -659,7 +659,7 @@ void __disable_irq(struct irq_desc *desc) static int __disable_irq_nosync(unsigned int irq) { - scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { __disable_irq(scoped_irqdesc); return 0; } -- cgit v1.2.3 From ef3330b99c01bda53f2a189b58bed8f6b7397f28 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 23 Oct 2025 16:49:01 +0100 Subject: genirq/manage: Add buslock back in to enable_irq() The locking was changed from a buslock to a plain lock, but the patch description states there was no functional change. Assuming this was accidental so reverting to using the buslock. Fixes: bddd10c55407 ("genirq/manage: Rework enable_irq()") Signed-off-by: Charles Keepax Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251023154901.1333755-4-ckeepax@opensource.cirrus.com --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7d68fb5dc242..400856abf672 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -789,7 +789,7 @@ void __enable_irq(struct irq_desc *desc) */ void enable_irq(unsigned int irq) { - scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq)) -- cgit v1.2.3 From 53abe3e1c154628cc74e33a1bfcd865656e433a5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 15 Oct 2025 11:19:34 +0200 Subject: sched: Remove never used code in mm_cid_get() Clang is not happy with set but unused variable (this is visible with `make W=1` build: kernel/sched/sched.h:3744:18: error: variable 'cpumask' set but not used [-Werror,-Wunused-but-set-variable] It seems like the variable was never used along with the assignment that does not have side effects as far as I can see. Remove those altogether. Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") Signed-off-by: Andy Shevchenko Tested-by: Eric Biggers Reviewed-by: Breno Leitao Signed-off-by: Linus Torvalds --- kernel/sched/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1f5d07067f60..361f9101cef9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3740,11 +3740,9 @@ static inline int mm_cid_get(struct rq *rq, struct task_struct *t, struct mm_struct *mm) { struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - struct cpumask *cpumask; int cid; lockdep_assert_rq_held(rq); - cpumask = mm_cidmask(mm); cid = __this_cpu_read(pcpu_cid->cid); if (mm_cid_is_valid(cid)) { mm_cid_snapshot_time(rq, mm); -- cgit v1.2.3
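A minimal reproducer for that class of warning, for reference (plain userspace C; the file name and compiler invocation are only an example, the flag itself is the one quoted in the diagnostic above, e.g. clang -Wunused-but-set-variable -c demo.c):

/* demo.c: 'cursor' is assigned but its value is never read */
static int lookup(int *table, int idx)
{
        int *cursor;
        int value;

        cursor = table;         /* assignment without side effects */
        value = table[idx];
        return value;
}

int main(void)
{
        int t[2] = { 1, 2 };

        return lookup(t, 0);
}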