From 7dadeaa6e851e7d67733f3e24fc53ee107781d0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Dec 2025 15:25:10 +0100 Subject: sched: Further restrict the preemption modes The introduction of PREEMPT_LAZY was for multiple reasons: - PREEMPT_RT suffered from over-scheduling, hurting performance compared to !PREEMPT_RT. - the introduction of (more) features that rely on preemption; like folio_zero_user() which can do large memset() without preemption checks. (Xen already had a horrible hack to deal with long running hypercalls) - the endless and uncontrolled sprinkling of cond_resched() -- mostly cargo cult or in response to poor to replicate workloads. By moving to a model that is fundamentally preemptable these things become managable and avoid needing to introduce more horrible hacks. Since this is a requirement; limit PREEMPT_NONE to architectures that do not support preemption at all. Further limit PREEMPT_VOLUNTARY to those architectures that do not yet have PREEMPT_LAZY support (with the eventual goal to make this the empty set and completely remove voluntary preemption and cond_resched() -- notably VOLUNTARY is already limited to !ARCH_NO_PREEMPT.) This leaves up-to-date architectures (arm64, loongarch, powerpc, riscv, s390, x86) with only two preemption models: full and lazy. While Lazy has been the recommended setting for a while, not all distributions have managed to make the switch yet. Force things along. Keep the patch minimal in case of hard to address regressions that might pop up. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://patch.msgid.link/20251219101502.GB1132199@noisy.programming.kicks-ass.net --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 41caa22e0680..5f9b77195159 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -243,7 +243,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { - int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; + int i = (IS_ENABLED(CONFIG_PREEMPT_RT) || IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY)) * 2; int j; /* Count entries in NULL terminated preempt_modes */ -- cgit v1.2.3 From 4fe82cf3024a4bdd2571d584efc25598533d5c96 Mon Sep 17 00:00:00 2001 From: Fushuai Wang Date: Sat, 17 Jan 2026 22:56:14 +0800 Subject: sched/debug: Convert copy_from_user() + kstrtouint() to kstrtouint_from_user() Using kstrtouint_from_user() instead of copy_from_user() + kstrtouint() makes the code simpler and less error-prone. Suggested-by: Yury Norov Signed-off-by: Fushuai Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yury Norov Link: https://patch.msgid.link/20260117145615.53455-2-fushuai.wang@linux.dev --- kernel/sched/debug.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5f9b77195159..929fdf09e8e9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -172,18 +172,12 @@ static const struct file_operations sched_feat_fops = { static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[16]; unsigned int scaling; + int ret; - if (cnt > 15) - cnt = 15; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - if (kstrtouint(buf, 10, &scaling)) - return -EINVAL; + ret = kstrtouint_from_user(ubuf, cnt, 10, &scaling); + if (ret) + return ret; if (scaling >= SCHED_TUNABLESCALING_END) return -EINVAL; -- cgit v1.2.3 From 6080fb211672aec6ce8f2f5a2e0b4eae736f2027 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 26 Jan 2026 10:59:00 +0100 Subject: sched/debug: Fix updating of ppos on server write ops Updating "ppos" on error conditions does not make much sense. The pattern is to return the error code directly without modifying the position, or modify the position on success and return the number of bytes written. Since on success, the return value of apply is 0, there is no point in modifying ppos either. Fix it by removing all this and just returning error code or number of bytes written on success. Signed-off-by: Joel Fernandes Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Reviewed-by: Andrea Righi Acked-by: Tejun Heo Tested-by: Christian Loehle Link: https://patch.msgid.link/20260126100050.3854740-3-arighi@nvidia.com --- kernel/sched/debug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 929fdf09e8e9..ed9254da5116 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -339,8 +339,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu long cpu = (long) ((struct seq_file *) filp->private_data)->private; struct rq *rq = cpu_rq(cpu); u64 runtime, period; + int retval = 0; size_t err; - int retval; u64 value; err = kstrtoull_from_user(ubuf, cnt, 10, &value); @@ -374,8 +374,6 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu dl_server_stop(&rq->fair_server); retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); - if (retval) - cnt = retval; if (!runtime) printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", @@ -383,6 +381,9 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu if (rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); + + if (retval < 0) + return retval; } *ppos += cnt; -- cgit v1.2.3 From 68ec89d0e99156803bdea3c986c0198624e40ea2 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 26 Jan 2026 10:59:01 +0100 Subject: sched/debug: Stop and start server based on if it was active Currently the DL server interface for applying parameters checks CFS-internals to identify if the server is active. This is error-prone and makes it difficult when adding new servers in the future. Fix it, by using dl_server_active() which is also used by the DL server code to determine if the DL server was started. Signed-off-by: Joel Fernandes Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Reviewed-by: Andrea Righi Acked-by: Tejun Heo Tested-by: Christian Loehle Link: https://patch.msgid.link/20260126100050.3854740-4-arighi@nvidia.com --- kernel/sched/debug.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ed9254da5116..41e389569d06 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -348,6 +348,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return err; scoped_guard (rq_lock_irqsave, rq) { + bool is_active; + runtime = rq->fair_server.dl_runtime; period = rq->fair_server.dl_period; @@ -370,8 +372,11 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return -EINVAL; } - update_rq_clock(rq); - dl_server_stop(&rq->fair_server); + is_active = dl_server_active(&rq->fair_server); + if (is_active) { + update_rq_clock(rq); + dl_server_stop(&rq->fair_server); + } retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); @@ -379,7 +384,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", cpu_of(rq)); - if (rq->cfs.h_nr_queued) + if (is_active && runtime) dl_server_start(&rq->fair_server); if (retval < 0) -- cgit v1.2.3 From 76d12132ba459ab929cb66eb2030c666aacdb69a Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 26 Jan 2026 10:59:03 +0100 Subject: sched/debug: Add support to change sched_ext server params When a sched_ext server is loaded, tasks in the fair class are automatically moved to the sched_ext class. Add support to modify the ext server parameters similar to how the fair server parameters are modified. Re-use common code between ext and fair servers as needed. Co-developed-by: Andrea Righi Signed-off-by: Andrea Righi Signed-off-by: Joel Fernandes Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Tested-by: Christian Loehle Link: https://patch.msgid.link/20260126100050.3854740-6-arighi@nvidia.com --- kernel/sched/debug.c | 157 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 133 insertions(+), 24 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 41e389569d06..59e650f9d436 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -330,14 +330,16 @@ enum dl_param { DL_PERIOD, }; -static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ -static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ +static unsigned long dl_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ +static unsigned long dl_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ -static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos, enum dl_param param) +static ssize_t sched_server_write_common(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, enum dl_param param, + void *server) { long cpu = (long) ((struct seq_file *) filp->private_data)->private; struct rq *rq = cpu_rq(cpu); + struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server; u64 runtime, period; int retval = 0; size_t err; @@ -350,8 +352,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu scoped_guard (rq_lock_irqsave, rq) { bool is_active; - runtime = rq->fair_server.dl_runtime; - period = rq->fair_server.dl_period; + runtime = dl_se->dl_runtime; + period = dl_se->dl_period; switch (param) { case DL_RUNTIME: @@ -367,25 +369,25 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu } if (runtime > period || - period > fair_server_period_max || - period < fair_server_period_min) { + period > dl_server_period_max || + period < dl_server_period_min) { return -EINVAL; } - is_active = dl_server_active(&rq->fair_server); + is_active = dl_server_active(dl_se); if (is_active) { update_rq_clock(rq); - dl_server_stop(&rq->fair_server); + dl_server_stop(dl_se); } - retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); + retval = dl_server_apply_params(dl_se, runtime, period, 0); if (!runtime) - printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", - cpu_of(rq)); + printk_deferred("%s server disabled in CPU %d, system may crash due to starvation.\n", + server == &rq->fair_server ? "Fair" : "Ext", cpu_of(rq)); if (is_active && runtime) - dl_server_start(&rq->fair_server); + dl_server_start(dl_se); if (retval < 0) return retval; @@ -395,36 +397,42 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return cnt; } -static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) +static size_t sched_server_show_common(struct seq_file *m, void *v, enum dl_param param, + void *server) { - unsigned long cpu = (unsigned long) m->private; - struct rq *rq = cpu_rq(cpu); + struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server; u64 value; switch (param) { case DL_RUNTIME: - value = rq->fair_server.dl_runtime; + value = dl_se->dl_runtime; break; case DL_PERIOD: - value = rq->fair_server.dl_period; + value = dl_se->dl_period; break; } seq_printf(m, "%llu\n", value); return 0; - } static ssize_t sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME, + &rq->fair_server); } static int sched_fair_server_runtime_show(struct seq_file *m, void *v) { - return sched_fair_server_show(m, v, DL_RUNTIME); + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_RUNTIME, &rq->fair_server); } static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) @@ -440,16 +448,57 @@ static const struct file_operations fair_server_runtime_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_CLASS_EXT +static ssize_t +sched_ext_server_runtime_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_RUNTIME, + &rq->ext_server); +} + +static int sched_ext_server_runtime_show(struct seq_file *m, void *v) +{ + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_RUNTIME, &rq->ext_server); +} + +static int sched_ext_server_runtime_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_ext_server_runtime_show, inode->i_private); +} + +static const struct file_operations ext_server_runtime_fops = { + .open = sched_ext_server_runtime_open, + .write = sched_ext_server_runtime_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static ssize_t sched_fair_server_period_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD, + &rq->fair_server); } static int sched_fair_server_period_show(struct seq_file *m, void *v) { - return sched_fair_server_show(m, v, DL_PERIOD); + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server); } static int sched_fair_server_period_open(struct inode *inode, struct file *filp) @@ -465,6 +514,40 @@ static const struct file_operations fair_server_period_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_CLASS_EXT +static ssize_t +sched_ext_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD, + &rq->ext_server); +} + +static int sched_ext_server_period_show(struct seq_file *m, void *v) +{ + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + + return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server); +} + +static int sched_ext_server_period_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_ext_server_period_show, inode->i_private); +} + +static const struct file_operations ext_server_period_fops = { + .open = sched_ext_server_period_open, + .write = sched_ext_server_period_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_SCHED_CLASS_EXT */ + static struct dentry *debugfs_sched; static void debugfs_fair_server_init(void) @@ -488,6 +571,29 @@ static void debugfs_fair_server_init(void) } } +#ifdef CONFIG_SCHED_CLASS_EXT +static void debugfs_ext_server_init(void) +{ + struct dentry *d_ext; + unsigned long cpu; + + d_ext = debugfs_create_dir("ext_server", debugfs_sched); + if (!d_ext) + return; + + for_each_possible_cpu(cpu) { + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%lu", cpu); + d_cpu = debugfs_create_dir(buf, d_ext); + + debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops); + debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops); + } +} +#endif /* CONFIG_SCHED_CLASS_EXT */ + static __init int sched_init_debug(void) { struct dentry __maybe_unused *numa; @@ -526,6 +632,9 @@ static __init int sched_init_debug(void) debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); debugfs_fair_server_init(); +#ifdef CONFIG_SCHED_CLASS_EXT + debugfs_ext_server_init(); +#endif return 0; } -- cgit v1.2.3 From 5a40a9bb56d455e7548ba4b6d7787918323cbaf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Feb 2026 11:05:12 +0100 Subject: sched/debug: Fix dl_server (re)start conditions There are two problems with sched_server_write_common() that can cause the dl_server to malfunction upon attempting to change the parameters: 1) when, after having disabled the dl_server by setting runtime=0, it is enabled again while tasks are already enqueued. In this case is_active would still be 0 and dl_server_start() would not be called. 2) when dl_server_apply_params() would fail, runtime is not applied and does not reflect the new state. Instead have dl_server_start() check its actual dl_runtime, and have sched_server_write_common() unconditionally (re)start the dl_server. It will automatically stop if there isn't anything to do, so spurious activation is harmless -- while failing to start it is a problem. While there, move the printk out of the locked region and make it symmetric, also printing on enable. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260203103407.GK1282955@noisy.programming.kicks-ass.net --- kernel/sched/debug.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'kernel/sched/debug.c') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 59e650f9d436..b24f40f05019 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -338,9 +338,9 @@ static ssize_t sched_server_write_common(struct file *filp, const char __user *u void *server) { long cpu = (long) ((struct seq_file *) filp->private_data)->private; - struct rq *rq = cpu_rq(cpu); struct sched_dl_entity *dl_se = (struct sched_dl_entity *)server; - u64 runtime, period; + u64 old_runtime, runtime, period; + struct rq *rq = cpu_rq(cpu); int retval = 0; size_t err; u64 value; @@ -350,9 +350,7 @@ static ssize_t sched_server_write_common(struct file *filp, const char __user *u return err; scoped_guard (rq_lock_irqsave, rq) { - bool is_active; - - runtime = dl_se->dl_runtime; + old_runtime = runtime = dl_se->dl_runtime; period = dl_se->dl_period; switch (param) { @@ -374,25 +372,23 @@ static ssize_t sched_server_write_common(struct file *filp, const char __user *u return -EINVAL; } - is_active = dl_server_active(dl_se); - if (is_active) { - update_rq_clock(rq); - dl_server_stop(dl_se); - } - + update_rq_clock(rq); + dl_server_stop(dl_se); retval = dl_server_apply_params(dl_se, runtime, period, 0); - - if (!runtime) - printk_deferred("%s server disabled in CPU %d, system may crash due to starvation.\n", - server == &rq->fair_server ? "Fair" : "Ext", cpu_of(rq)); - - if (is_active && runtime) - dl_server_start(dl_se); + dl_server_start(dl_se); if (retval < 0) return retval; } + if (!!old_runtime ^ !!runtime) { + pr_info("%s server %sabled on CPU %d%s.\n", + server == &rq->fair_server ? "Fair" : "Ext", + runtime ? "en" : "dis", + cpu_of(rq), + runtime ? "" : ", system may malfunction due to starvation"); + } + *ppos += cnt; return cnt; } -- cgit v1.2.3