From e3dfd64c1f344ebec9397719244c27b360255855 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 13 Sep 2024 09:23:40 -0700
Subject: perf: Fix missing RCU reader protection in perf_event_clear_cpumask()

Running rcutorture scenario TREE05, the below warning is triggered.

[   32.604594] WARNING: suspicious RCU usage
[   32.605928] 6.11.0-rc5-00040-g4ba4f1afb6a9 #55238 Not tainted
[   32.607812] -----------------------------
[   32.609140] kernel/events/core.c:13946 RCU-list traversed in non-reader section!!
[   32.611595] other info that might help us debug this:
[   32.614247] rcu_scheduler_active = 2, debug_locks = 1
[   32.616392] 3 locks held by cpuhp/4/35:
[   32.617687]  #0: ffffffffb666a650 (cpu_hotplug_lock){++++}-{0:0}, at: cpuhp_thread_fun+0x4e/0x200
[   32.620563]  #1: ffffffffb666cd20 (cpuhp_state-down){+.+.}-{0:0}, at: cpuhp_thread_fun+0x4e/0x200
[   32.623412]  #2: ffffffffb677c288 (pmus_lock){+.+.}-{3:3}, at: perf_event_exit_cpu_context+0x32/0x2f0

In perf_event_clear_cpumask(), uses list_for_each_entry_rcu() without an
obvious RCU read-side critical section.

Either pmus_srcu or pmus_lock is good enough to protect the pmus list.
In the current context, pmus_lock is already held. The
list_for_each_entry_rcu() is not required.

Fixes: 4ba4f1afb6a9 ("perf: Generic hotplug support for a PMU with a scope")
Closes: https://lore.kernel.org/lkml/2b66dff8-b827-494b-b151-1ad8d56f13e6@paulmck-laptop/
Closes: https://lore.kernel.org/oe-lkp/202409131559.545634cc-oliver.sang@intel.com
Reported-by: "Paul E. McKenney" <paulmck@kernel.org>
Reported-by: kernel test robot <oliver.sang@intel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: "Paul E. McKenney" <paulmck@kernel.org>
Link: https://lore.kernel.org/r/20240913162340.2142976-1-kan.liang@linux.intel.com
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index cdd09769e6c5..df27d08a7232 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -13959,7 +13959,7 @@ static void perf_event_clear_cpumask(unsigned int cpu)
 	}
 
 	/* migrate */
-	list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+	list_for_each_entry(pmu, &pmus, entry) {
 		if (pmu->scope == PERF_PMU_SCOPE_NONE ||
 		    WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
 			continue;
-- 
cgit v1.2.3


From b55945c500c5723992504aa03b362fab416863a6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 23 Oct 2024 11:36:41 +0200
Subject: sched: Fix pick_next_task_fair() vs try_to_wake_up() race

Syzkaller robot reported KCSAN tripping over the
ASSERT_EXCLUSIVE_WRITER(p->on_rq) in __block_task().

The report noted that both pick_next_task_fair() and try_to_wake_up()
were concurrently trying to write to the same p->on_rq, violating the
assertion -- even though both paths hold rq->__lock.

The logical consequence is that both code paths end up holding a
different rq->__lock. And looking through ttwu(), this is possible
when the __block_task() 'p->on_rq = 0' store is visible to the ttwu()
'p->on_rq' load, which then assumes the task is not queued and
continues to migrate it.

Rearrange things such that __block_task() releases @p with the store
and no code thereafter will use @p again.

Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue")
Reported-by: syzbot+0ec1e96c2cdf5c0e512a@syzkaller.appspotmail.com
Reported-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Marco Elver <elver@google.com>
Link: https://lkml.kernel.org/r/20241023093641.GE16066@noisy.programming.kicks-ass.net
---
 kernel/sched/fair.c  | 21 ++++++++++++++-------
 kernel/sched/sched.h | 34 ++++++++++++++++++++++++++++++++--
 2 files changed, 46 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c157d4860a3b..879614686188 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5625,8 +5625,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
 	struct sched_entity *se = pick_eevdf(cfs_rq);
 	if (se->sched_delayed) {
 		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-		SCHED_WARN_ON(se->sched_delayed);
-		SCHED_WARN_ON(se->on_rq);
+		/*
+		 * Must not reference @se again, see __block_task().
+		 */
 		return NULL;
 	}
 	return se;
@@ -7176,7 +7177,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		/* Fix-up what dequeue_task_fair() skipped */
 		hrtick_update(rq);
 
-		/* Fix-up what block_task() skipped. */
+		/*
+		 * Fix-up what block_task() skipped.
+		 *
+		 * Must be last, @p might not be valid after this.
+		 */
 		__block_task(rq, p);
 	}
 
@@ -7193,12 +7198,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
 		util_est_dequeue(&rq->cfs, p);
 
-	if (dequeue_entities(rq, &p->se, flags) < 0) {
-		util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
+	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+	if (dequeue_entities(rq, &p->se, flags) < 0)
 		return false;
-	}
 
-	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+	/*
+	 * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
+	 */
+
 	hrtick_update(rq);
 	return true;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 081519ffab46..9f9d1cc390b1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2769,8 +2769,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
 
 static inline void __block_task(struct rq *rq, struct task_struct *p)
 {
-	WRITE_ONCE(p->on_rq, 0);
-	ASSERT_EXCLUSIVE_WRITER(p->on_rq);
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible++;
 
@@ -2778,6 +2776,38 @@ static inline void __block_task(struct rq *rq, struct task_struct *p)
 		atomic_inc(&rq->nr_iowait);
 		delayacct_blkio_start();
 	}
+
+	ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+
+	/*
+	 * The moment this write goes through, ttwu() can swoop in and migrate
+	 * this task, rendering our rq->__lock ineffective.
+	 *
+	 * __schedule()				try_to_wake_up()
+	 *   LOCK rq->__lock			  LOCK p->pi_lock
+	 *   pick_next_task()
+	 *     pick_next_task_fair()
+	 *       pick_next_entity()
+	 *         dequeue_entities()
+	 *           __block_task()
+	 *             RELEASE p->on_rq = 0	  if (p->on_rq && ...)
+	 *					    break;
+	 *
+	 *					  ACQUIRE (after ctrl-dep)
+	 *
+	 *					  cpu = select_task_rq();
+	 *					  set_task_cpu(p, cpu);
+	 *					  ttwu_queue()
+	 *					    ttwu_do_activate()
+	 *					      LOCK rq->__lock
+	 *					      activate_task()
+	 *					        STORE p->on_rq = 1
+	 *   UNLOCK rq->__lock
+	 *
+	 * Callers must ensure to not reference @p after this -- we no longer
+	 * own it.
+	 */
+	smp_store_release(&p->on_rq, 0);
 }
 
 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
-- 
cgit v1.2.3


From 9c70b2a33cd2aa6a5a59c5523ef053bd42265209 Mon Sep 17 00:00:00 2001
From: Shawn Wang <shawnwang@linux.alibaba.com>
Date: Fri, 25 Oct 2024 10:22:08 +0800
Subject: sched/numa: Fix the potential null pointer dereference in
 task_numa_work()

When running stress-ng-vm-segv test, we found a null pointer dereference
error in task_numa_work(). Here is the backtrace:

  [323676.066985] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020
  ......
  [323676.067108] CPU: 35 PID: 2694524 Comm: stress-ng-vm-se
  ......
  [323676.067113] pstate: 23401009 (nzCv daif +PAN -UAO +TCO +DIT +SSBS BTYPE=--)
  [323676.067115] pc : vma_migratable+0x1c/0xd0
  [323676.067122] lr : task_numa_work+0x1ec/0x4e0
  [323676.067127] sp : ffff8000ada73d20
  [323676.067128] x29: ffff8000ada73d20 x28: 0000000000000000 x27: 000000003e89f010
  [323676.067130] x26: 0000000000080000 x25: ffff800081b5c0d8 x24: ffff800081b27000
  [323676.067133] x23: 0000000000010000 x22: 0000000104d18cc0 x21: ffff0009f7158000
  [323676.067135] x20: 0000000000000000 x19: 0000000000000000 x18: ffff8000ada73db8
  [323676.067138] x17: 0001400000000000 x16: ffff800080df40b0 x15: 0000000000000035
  [323676.067140] x14: ffff8000ada73cc8 x13: 1fffe0017cc72001 x12: ffff8000ada73cc8
  [323676.067142] x11: ffff80008001160c x10: ffff000be639000c x9 : ffff8000800f4ba4
  [323676.067145] x8 : ffff000810375000 x7 : ffff8000ada73974 x6 : 0000000000000001
  [323676.067147] x5 : 0068000b33e26707 x4 : 0000000000000001 x3 : ffff0009f7158000
  [323676.067149] x2 : 0000000000000041 x1 : 0000000000004400 x0 : 0000000000000000
  [323676.067152] Call trace:
  [323676.067153]  vma_migratable+0x1c/0xd0
  [323676.067155]  task_numa_work+0x1ec/0x4e0
  [323676.067157]  task_work_run+0x78/0xd8
  [323676.067161]  do_notify_resume+0x1ec/0x290
  [323676.067163]  el0_svc+0x150/0x160
  [323676.067167]  el0t_64_sync_handler+0xf8/0x128
  [323676.067170]  el0t_64_sync+0x17c/0x180
  [323676.067173] Code: d2888001 910003fd f9000bf3 aa0003f3 (f9401000)
  [323676.067177] SMP: stopping secondary CPUs
  [323676.070184] Starting crashdump kernel...

stress-ng-vm-segv in stress-ng is used to stress test the SIGSEGV error
handling function of the system, which tries to cause a SIGSEGV error on
return from unmapping the whole address space of the child process.

Normally this program will not cause kernel crashes. But before the
munmap system call returns to user mode, a potential task_numa_work()
for numa balancing could be added and executed. In this scenario, since the
child process has no vma after munmap, the vma_next() in task_numa_work()
will return a null pointer even if the vma iterator restarts from 0.

Recheck the vma pointer before dereferencing it in task_numa_work().

Fixes: 214dbc428137 ("sched: convert to vma iterator")
Signed-off-by: Shawn Wang <shawnwang@linux.alibaba.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org # v6.2+
Link: https://lkml.kernel.org/r/20241025022208.125527-1-shawnwang@linux.alibaba.com
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 879614686188..2d16c8545c71 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3369,7 +3369,7 @@ retry_pids:
 		vma = vma_next(&vmi);
 	}
 
-	do {
+	for (; vma; vma = vma_next(&vmi)) {
 		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
 			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
 			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
@@ -3491,7 +3491,7 @@ retry_pids:
 		 */
 		if (vma_pids_forced)
 			break;
-	} for_each_vma(vmi, vma);
+	}
 
 	/*
 	 * If no VMAs are remaining and VMAs were skipped due to the PID
-- 
cgit v1.2.3


From b5413156bad91dc2995a5c4eab1b05e56914638a Mon Sep 17 00:00:00 2001
From: Benjamin Segall <bsegall@google.com>
Date: Fri, 25 Oct 2024 18:35:35 -0700
Subject: posix-cpu-timers: Clear TICK_DEP_BIT_POSIX_TIMER on clone

When cloning a new thread, its posix_cputimers are not inherited, and
are cleared by posix_cputimers_init(). However, this does not clear the
tick dependency it creates in tsk->tick_dep_mask, and the handler does
not reach the code to clear the dependency if there were no timers to
begin with.

Thus if a thread has a cputimer running before clone/fork, all
descendants will prevent nohz_full unless they create a cputimer of
their own.

Fix this by entirely clearing the tick_dep_mask in copy_process().
(There is currently no inherited state that needs a tick dependency)

Process-wide timers do not have this problem because fork does not copy
signal_struct as a baseline, it creates one from scratch.

Fixes: b78783000d5c ("posix-cpu-timers: Migrate to use new tick dependency mask model")
Signed-off-by: Ben Segall <bsegall@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/all/xm26o737bq8o.fsf@google.com
---
 kernel/fork.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 89ceb4a68af2..6fa9fe62e01e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,6 +105,7 @@
 #include <linux/rseq.h>
 #include <uapi/linux/pidfd.h>
 #include <linux/pidfs.h>
+#include <linux/tick.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -2292,6 +2293,7 @@ __latent_entropy struct task_struct *copy_process(
 	acct_clear_integrals(p);
 
 	posix_cputimers_init(&p->posix_cputimers);
+	tick_dep_init_task(p);
 
 	p->io_context = NULL;
 	audit_set_context(p, NULL);
-- 
cgit v1.2.3


From 5f994f534120f47432092fb36f5cb0c7a80ed2bf Mon Sep 17 00:00:00 2001
From: Jinjie Ruan <ruanjinjie@huawei.com>
Date: Sat, 26 Oct 2024 14:36:39 +0800
Subject: genirq/msi: Fix off-by-one error in msi_domain_alloc()

The error path in msi_domain_alloc(), frees the already allocated MSI
interrupts in a loop, but the loop condition terminates when the index
reaches zero, which fails to free the first allocated MSI interrupt at
index zero.

Check for >= 0 so that msi[0] is freed as well.

Fixes: f3cf8bb0d6c3 ("genirq: Add generic msi irq domain support")
Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20241026063639.10711-1-ruanjinjie@huawei.com
---
 kernel/irq/msi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3a24d6b5f559..396a067a8a56 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -718,7 +718,7 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
 		ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
 		if (ret < 0) {
 			if (ops->msi_free) {
-				for (i--; i > 0; i--)
+				for (i--; i >= 0; i--)
 					ops->msi_free(domain, info, virq + i);
 			}
 			irq_domain_free_irqs_top(domain, virq, nr_irqs);
-- 
cgit v1.2.3


From 5db91545ef8150c45a526675ef99e8998b648a41 Mon Sep 17 00:00:00 2001
From: Aboorva Devarajan <aboorvad@linux.ibm.com>
Date: Sat, 26 Oct 2024 00:20:20 +0530
Subject: sched: Pass correct scheduling policy to __setscheduler_class

Commit 98442f0ccd82 ("sched: Fix delayed_dequeue vs
switched_from_fair()") overlooked that __setscheduler_prio(), now
__setscheduler_class() relies on p->policy for task_should_scx(), and
moved the call before __setscheduler_params() updates it, causing it
to be using the old p->policy value.

Resolve this by changing task_should_scx() to take the policy itself
instead of a task pointer, such that __sched_setscheduler() can pass
in the updated policy.

Fixes: 98442f0ccd82 ("sched: Fix delayed_dequeue vs switched_from_fair()")
Signed-off-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/core.c     | 8 ++++----
 kernel/sched/ext.c      | 8 ++++----
 kernel/sched/ext.h      | 2 +-
 kernel/sched/sched.h    | 2 +-
 kernel/sched/syscalls.c | 2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dbfb5717d6af..719e0ed1e976 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4711,7 +4711,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	if (rt_prio(p->prio)) {
 		p->sched_class = &rt_sched_class;
 #ifdef CONFIG_SCHED_CLASS_EXT
-	} else if (task_should_scx(p)) {
+	} else if (task_should_scx(p->policy)) {
 		p->sched_class = &ext_sched_class;
 #endif
 	} else {
@@ -7025,7 +7025,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
 }
 EXPORT_SYMBOL(default_wake_function);
 
-const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
+const struct sched_class *__setscheduler_class(int policy, int prio)
 {
 	if (dl_prio(prio))
 		return &dl_sched_class;
@@ -7034,7 +7034,7 @@ const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
 		return &rt_sched_class;
 
 #ifdef CONFIG_SCHED_CLASS_EXT
-	if (task_should_scx(p))
+	if (task_should_scx(policy))
 		return &ext_sched_class;
 #endif
 
@@ -7142,7 +7142,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		queue_flag &= ~DEQUEUE_MOVE;
 
 	prev_class = p->sched_class;
-	next_class = __setscheduler_class(p, prio);
+	next_class = __setscheduler_class(p->policy, prio);
 
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5900b06fd036..40bdfe84e4f0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4256,14 +4256,14 @@ static const struct kset_uevent_ops scx_uevent_ops = {
  * Used by sched_fork() and __setscheduler_prio() to pick the matching
  * sched_class. dl/rt are already handled.
  */
-bool task_should_scx(struct task_struct *p)
+bool task_should_scx(int policy)
 {
 	if (!scx_enabled() ||
 	    unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
 		return false;
 	if (READ_ONCE(scx_switching_all))
 		return true;
-	return p->policy == SCHED_EXT;
+	return policy == SCHED_EXT;
 }
 
 /**
@@ -4493,7 +4493,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
-		p->sched_class = __setscheduler_class(p, p->prio);
+		p->sched_class = __setscheduler_class(p->policy, p->prio);
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
@@ -5204,7 +5204,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
 		p->scx.slice = SCX_SLICE_DFL;
-		p->sched_class = __setscheduler_class(p, p->prio);
+		p->sched_class = __setscheduler_class(p->policy, p->prio);
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 246019519231..b1675bb59fc4 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -18,7 +18,7 @@ bool scx_can_stop_tick(struct rq *rq);
 void scx_rq_activate(struct rq *rq);
 void scx_rq_deactivate(struct rq *rq);
 int scx_check_setscheduler(struct task_struct *p, int policy);
-bool task_should_scx(struct task_struct *p);
+bool task_should_scx(int policy);
 void init_sched_ext_class(void);
 
 static inline u32 scx_cpuperf_target(s32 cpu)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9f9d1cc390b1..6c54a57275cc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3830,7 +3830,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
 
 extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
 extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
-extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio);
+extern const struct sched_class *__setscheduler_class(int policy, int prio);
 extern void set_load_weight(struct task_struct *p, bool update_load);
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 0470bcc3d204..24f9f90b6574 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -707,7 +707,7 @@ change:
 	}
 
 	prev_class = p->sched_class;
-	next_class = __setscheduler_class(p, newprio);
+	next_class = __setscheduler_class(policy, newprio);
 
 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-- 
cgit v1.2.3


From 69d5e722be949a1e2409c3f2865ba6020c279db6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 30 Oct 2024 11:49:34 +0100
Subject: sched/ext: Fix scx vs sched_delayed

Commit 98442f0ccd82 ("sched: Fix delayed_dequeue vs
switched_from_fair()") forgot about scx :/

Fixes: 98442f0ccd82 ("sched: Fix delayed_dequeue vs switched_from_fair()")
Reported-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lkml.kernel.org/r/20241030104934.GK14555@noisy.programming.kicks-ass.net
---
 kernel/sched/ext.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 40bdfe84e4f0..721a75489ff5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4489,11 +4489,16 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
 		const struct sched_class *old_class = p->sched_class;
+		const struct sched_class *new_class =
+			__setscheduler_class(p->policy, p->prio);
 		struct sched_enq_and_set_ctx ctx;
 
+		if (old_class != new_class && p->se.sched_delayed)
+			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
-		p->sched_class = __setscheduler_class(p->policy, p->prio);
+		p->sched_class = new_class;
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
@@ -5199,12 +5204,17 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
 		const struct sched_class *old_class = p->sched_class;
+		const struct sched_class *new_class =
+			__setscheduler_class(p->policy, p->prio);
 		struct sched_enq_and_set_ctx ctx;
 
+		if (old_class != new_class && p->se.sched_delayed)
+			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
 		p->scx.slice = SCX_SLICE_DFL;
-		p->sched_class = __setscheduler_class(p->policy, p->prio);
+		p->sched_class = new_class;
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
-- 
cgit v1.2.3


From fa17cb4b3b42618aeed1e0bce80cc55106561718 Mon Sep 17 00:00:00 2001
From: Kalesh Singh <kaleshsingh@google.com>
Date: Wed, 30 Oct 2024 10:17:49 -0700
Subject: tracing: Document tracefs gid mount option

Commit ee7f3666995d ("tracefs: Have new files inherit the ownership of
their parent") and commit 48b27b6b5191 ("tracefs: Set all files to the
same group ownership as the mount option") introduced a new gid mount
option that allows specifying a group to apply to all entries in tracefs.

Document this in the tracing readme.

Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Ali Zahraee <ahzahraee@gmail.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Link: https://lore.kernel.org/20241030171928.4168869-3-kaleshsingh@google.com
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a8f52b6527ca..2b64b3ec67d9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5501,6 +5501,10 @@ static const struct file_operations tracing_iter_fops = {
 
 static const char readme_msg[] =
 	"tracing mini-HOWTO:\n\n"
+	"By default tracefs removes all OTH file permission bits.\n"
+	"When mounting tracefs an optional group id can be specified\n"
+	"which adds the group to every directory and file in tracefs:\n\n"
+	"\t e.g. mount -t tracefs [-o [gid=<gid>]] nodev /sys/kernel/tracing\n\n"
 	"# echo 0 > tracing_on : quick way to disable tracing\n"
 	"# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
 	" Important files:\n"
-- 
cgit v1.2.3