From 5202c25dd17c54cd4c21f266d9a51b644d7cd682 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:00 +0200 Subject: sched/smp: Always define sched_domains_mutex_lock()/unlock(), def_root_domain and sched_domains_mutex Simplify the scheduler by making CONFIG_SMP=y primitives and data structures unconditional. Unconditionally build kernel/sched/topology.c and the main sched-domains locking primitives. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-20-mingo@kernel.org --- include/linux/sched.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4f78a64beb52..aa54d75034ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -395,15 +395,10 @@ enum uclamp_id { UCLAMP_CNT }; -#ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; extern void sched_domains_mutex_lock(void); extern void sched_domains_mutex_unlock(void); -#else -static inline void sched_domains_mutex_lock(void) { } -static inline void sched_domains_mutex_unlock(void) { } -#endif struct sched_param { int sched_priority; -- cgit v1.2.3 From cac5cefbade90ff0bb0b393d301fa3b5234cf056 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:01 +0200 Subject: sched/smp: Make SMP unconditional Simplify the scheduler by making CONFIG_SMP=y primitives and data structures unconditional. Introduce transitory wrappers for functionality not yet converted to SMP. Note that this patch is pretty large, because there's no clear separation between various aspects of the SMP scheduler; it's basically a huge block of #ifdef CONFIG_SMP. A fair amount of it has to be switched on for it to boot and work on UP systems. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-21-mingo@kernel.org --- include/linux/preempt.h | 9 --------- include/linux/sched.h | 42 ------------------------------------------ include/linux/sched/deadline.h | 4 ---- include/linux/sched/idle.h | 4 ---- include/linux/sched/nohz.h | 4 ++-- include/linux/sched/topology.h | 32 -------------------------------- 6 files changed, 2 insertions(+), 93 deletions(-) (limited to 'include') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index b0af8d4ef6e6..1fad1c8a4c76 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -369,8 +369,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, #endif -#ifdef CONFIG_SMP - /* * Migrate-Disable and why it is undesired.
* @@ -429,13 +427,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, extern void migrate_disable(void); extern void migrate_enable(void); -#else - -static inline void migrate_disable(void) { } -static inline void migrate_enable(void) { } - -#endif /* CONFIG_SMP */ - /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section * diff --git a/include/linux/sched.h b/include/linux/sched.h index aa54d75034ea..376befdec4b0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -599,7 +599,6 @@ struct sched_entity { unsigned long runnable_weight; #endif -#ifdef CONFIG_SMP /* * Per entity load average tracking. * @@ -607,7 +606,6 @@ struct sched_entity { * collide with read-mostly values above. */ struct sched_avg avg; -#endif }; struct sched_rt_entity { @@ -837,7 +835,6 @@ struct task_struct { struct alloc_tag *alloc_tag; #endif -#ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; @@ -853,7 +850,6 @@ struct task_struct { */ int recent_used_cpu; int wake_cpu; -#endif int on_rq; int prio; @@ -912,9 +908,7 @@ struct task_struct { cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; -#ifdef CONFIG_SMP unsigned short migration_disabled; -#endif unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU @@ -946,10 +940,8 @@ struct task_struct { struct sched_info sched_info; struct list_head tasks; -#ifdef CONFIG_SMP struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; -#endif struct mm_struct *mm; struct mm_struct *active_mm; @@ -1843,7 +1835,6 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpu extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); -#ifdef CONFIG_SMP /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); @@ -1861,33 +1852,6 @@ extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); -#else -static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ -} -static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - /* Opencoded cpumask_test_cpu(0, new_mask) to avoid dependency on cpumask.h */ - if ((*cpumask_bits(new_mask) & 1) == 0) - return -EINVAL; - return 0; -} -static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) -{ - if (src->user_cpus_ptr) - return -EINVAL; - return 0; -} -static inline void release_user_cpus_ptr(struct task_struct *p) -{ - WARN_ON(p->user_cpus_ptr); -} - -static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) -{ - return 0; -} -#endif extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); @@ -1976,11 +1940,7 @@ extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); -#ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); -#else -static inline void kick_process(struct task_struct *tsk) { } -#endif extern void 
__set_task_comm(struct task_struct *tsk, const char *from, bool exec); #define set_task_comm(tsk, from) ({ \ @@ -2225,7 +2185,6 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #define TASK_SIZE_OF(tsk) TASK_SIZE #endif -#ifdef CONFIG_SMP static inline bool owner_on_cpu(struct task_struct *owner) { /* @@ -2237,7 +2196,6 @@ static inline bool owner_on_cpu(struct task_struct *owner) /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); -#endif /* CONFIG_SMP */ #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index f9aabbc9d22e..c40115d4e34d 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -29,15 +29,11 @@ static inline bool dl_time_before(u64 a, u64 b) return (s64)(a - b) < 0; } -#ifdef CONFIG_SMP - struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); extern void dl_clear_root_domain_cpu(int cpu); -#endif /* CONFIG_SMP */ - extern u64 dl_cookie; extern bool dl_bw_visited(int cpu, u64 cookie); diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index 439f6029d3b9..8465ff1f20d1 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -11,11 +11,7 @@ enum cpu_idle_type { CPU_MAX_IDLE_TYPES }; -#ifdef CONFIG_SMP extern void wake_up_if_idle(int cpu); -#else -static inline void wake_up_if_idle(int cpu) { } -#endif /* * Idle thread specific functions to determine the need_resched diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 6d67e9a5af6b..0db7f67935fe 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -6,7 +6,7 @@ * This is the interface between the scheduler and nohz/dynticks: */ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON extern void nohz_balance_enter_idle(int cpu); extern int get_nohz_timer_target(void); #else @@ -23,7 +23,7 @@ static inline void calc_load_nohz_remote(struct rq *rq) { } static inline void calc_load_nohz_stop(void) { } #endif /* CONFIG_NO_HZ_COMMON */ -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) +#ifdef CONFIG_NO_HZ_COMMON extern void wake_up_nohz_cpu(int cpu); #else static inline void wake_up_nohz_cpu(int cpu) { } diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 198bb5cc1774..e54e7fa76ba6 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -9,7 +9,6 @@ /* * sched-domains (multiprocessor balancing) declarations: */ -#ifdef CONFIG_SMP /* Generate SD flag indexes */ #define SD_FLAG(name, mflags) __##name, @@ -200,37 +199,6 @@ extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); # define SD_INIT_NAME(type) .name = #type -#else /* CONFIG_SMP */ - -struct sched_domain_attr; - -static inline void -partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ -} - -static inline bool cpus_equal_capacity(int this_cpu, int that_cpu) -{ - return true; -} - -static inline bool cpus_share_cache(int this_cpu, int that_cpu) -{ - return true; -} - -static inline bool cpus_share_resources(int this_cpu, int that_cpu) -{ - return true; -} - -static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) -{ -} - -#endif /* !CONFIG_SMP */ - #if defined(CONFIG_ENERGY_MODEL) && 
defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern void rebuild_sched_domains_energy(void); #else -- cgit v1.2.3 From 06ddd17521bf11a3e7f59dafdf5c148f29467d2c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:02 +0200 Subject: sched/smp: Always define is_percpu_thread() and scheduler_ipi() Simplify the scheduler by making the CONFIG_SMP=y primitives of is_percpu_thread() and scheduler_ipi() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-22-mingo@kernel.org --- include/linux/sched.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 376befdec4b0..eec6b225e9d1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1765,12 +1765,8 @@ extern struct pid *cad_pid; static __always_inline bool is_percpu_thread(void) { -#ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); -#else - return true; -#endif } /* Per-process atomic flags. */ @@ -1967,7 +1963,6 @@ extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec buf; \ }) -#ifdef CONFIG_SMP static __always_inline void scheduler_ipi(void) { /* @@ -1977,9 +1972,6 @@ static __always_inline void scheduler_ipi(void) */ preempt_fold_need_resched(); } -#else -static inline void scheduler_ipi(void) { } -#endif extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); -- cgit v1.2.3 From 1f25730e5a780b33f78e3ea23e64d3f75e0b2042 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:07 +0200 Subject: sched/smp: Use the SMP version of sched_exec() Simplify the scheduler by making the CONFIG_SMP=y sched_exec() code unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-27-mingo@kernel.org --- include/linux/sched/task.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ca1db4b92c32..c517dbc242f7 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -109,11 +109,7 @@ int kernel_wait(pid_t pid, int *stat); extern void free_task(struct task_struct *tsk); /* sched_exec is called by processes performing an exec */ -#ifdef CONFIG_SMP extern void sched_exec(void); -#else -#define sched_exec() {} -#endif static inline struct task_struct *get_task_struct(struct task_struct *t) { -- cgit v1.2.3 From 570c8efd5eb79c3725ba439ce105ed1bedc5acd9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 May 2025 17:28:00 +0200 Subject: sched/psi: Optimize psi_group_change() cpu_clock() usage Dietmar reported that commit 3840cbe24cf0 ("sched: psi: fix bogus pressure spikes from aggregation race") caused a regression for him on a high context switch rate benchmark (schbench) due to the now-repeating cpu_clock() calls. In particular, the problem is that get_recent_times() will extrapolate the current state to 'now'. But if an update uses a timestamp from before the start of the update, it is possible to get two reads with inconsistent results.
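As background, the aggregator's read side snapshots per-CPU state with the classic seqcount retry loop before extrapolating that snapshot to 'now'. A minimal sketch of the pattern, assuming the <linux/seqlock.h> API and a hypothetical helper name -- not the actual kernel/sched/psi.c code:

#include <linux/psi_types.h>
#include <linux/seqlock.h>
#include <linux/string.h>

/*
 * Illustrative only: snapshot one group-CPU's task counts, retrying
 * whenever a concurrent writer bumped the sequence count meanwhile.
 * The seqcount is a parameter because this commit moves it out of
 * struct psi_group_cpu and makes it per-CPU.
 */
static void snapshot_group_tasks(struct psi_group_cpu *groupc,
				 seqcount_t *seq, unsigned int *snap)
{
	unsigned int gen;

	do {
		gen = read_seqcount_begin(seq);
		memcpy(snap, groupc->tasks, sizeof(groupc->tasks));
	} while (read_seqcount_retry(seq, gen));
}

The retry loop only guarantees a self-consistent snapshot; it cannot help when the update that produced the snapshot was stamped with a stale timestamp, and that is exactly what the extrapolation trips over.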
Such an update is effectively back-dated. (Note that this all hard-relies on the clock being synchronized across CPUs -- if this is not the case, all bets are off.) Combine this problem with the fact that there are per-group-per-cpu seqcounts, and the commit in question pushed the clock read into the group iteration, causing tree-depth cpu_clock() calls. On architectures where cpu_clock() has appreciable overhead, this hurts. Instead move to a per-cpu seqcount, which allows us to have a single clock read for all group updates, increasing internal consistency and lowering update overhead. This comes at the cost of a longer update side (proportional to the tree depth) which can cause the read side to retry more often. Fixes: 3840cbe24cf0 ("sched: psi: fix bogus pressure spikes from aggregation race") Reported-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/20250522084844.GC31726@noisy.programming.kicks-ass.net --- include/linux/psi_types.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index f1fd3a8044e0..dd10c22299ab 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -84,11 +84,9 @@ enum psi_aggregators { struct psi_group_cpu { /* 1st cacheline updated by the scheduler */ - /* Aggregator needs to know of concurrent changes */ - seqcount_t seq ____cacheline_aligned_in_smp; - /* States of the tasks belonging to this group */ - unsigned int tasks[NR_PSI_TASK_COUNTS]; + unsigned int tasks[NR_PSI_TASK_COUNTS] + ____cacheline_aligned_in_smp; /* Aggregate pressure state derived from the tasks */ u32 state_mask; -- cgit v1.2.3 From cccb45d7c4295bbfeba616582d0249f2d21e6df5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 May 2025 11:19:30 +0200 Subject: sched/deadline: Less aggressive dl_server handling Chris reported that commit 5f6bd380c7bd ("sched/rt: Remove default bandwidth control") caused a significant dip in his favourite benchmark of the day. Simply disabling dl_server cured things. His workload hammers the 0->1, 1->0 transitions, and the dl_server_{start,stop}() overhead kills it -- fairly obviously a bad idea in hindsight and all that. Change things around to only disable the dl_server when there has not been a fair task around for a whole period. Since the default period is 1 second, this ensures the benchmark never trips this, overhead gone. Fixes: 557a6bfc662c ("sched/fair: Add trivial fair server") Reported-by: Chris Mason Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20250702121158.465086194@infradead.org --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index eec6b225e9d1..4802fcf738cd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -698,6 +698,7 @@ struct sched_dl_entity { unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; + unsigned int dl_server_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its -- cgit v1.2.3 From 74eec63661d46a7153d04c2e0249eeb76cc76d44 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:26 +0200 Subject: sched/fair: Fix NO_RUN_TO_PARITY case EEVDF expects the scheduler to allocate a time quantum to the selected entity and then pick a new entity for the next quantum.
Although this notion of time quantum is not strictly doable in our case, we can ensure a minimum runtime for each task most of the time and pick a new entity after a minimum time has elapsed. Reuse the slice protection of run-to-parity to ensure such a runtime quantum. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250708165630.1948751-3-vincent.guittot@linaro.org --- include/linux/sched.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4802fcf738cd..55921385927d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -583,7 +583,15 @@ struct sched_entity { u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; - s64 vlag; + union { + /* + * When !@on_rq this field is vlag. + * When cfs_rq->curr == se (which implies @on_rq) + * this field is vprot. See protect_slice(). + */ + s64 vlag; + u64 vprot; + }; u64 slice; u64 nr_migrations; -- cgit v1.2.3 From e075f4360931263f5ec006ea5dadc065e5e98eb8 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 10 Jul 2025 18:57:07 +0800 Subject: smpboot: introduce SDTL_INIT() helper to tidy sched topology setup Define a small SDTL_INIT(maskfn, flagsfn, name) macro and use it to build the sched_domain_topology_level array. Purely a cleanup; behaviour is unchanged. Suggested-by: Thomas Gleixner Signed-off-by: Li Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250710105715.66594-2-me@linux.beauty --- include/linux/sched/topology.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index e54e7fa76ba6..0d5daaa277b7 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -196,8 +196,8 @@ struct sched_domain_topology_level { extern void __init set_sched_topology(struct sched_domain_topology_level *tl); extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); - -# define SD_INIT_NAME(type) .name = #type +#define SDTL_INIT(maskfn, flagsfn, dname) ((struct sched_domain_topology_level) \ { .mask = maskfn, .sd_flags = flagsfn, .name = #dname }) #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern void rebuild_sched_domains_energy(void); -- cgit v1.2.3 From 1eec89a671413ce38df9fe9e70f5130a9eb79a59 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Fri, 11 Jul 2025 11:20:30 +0530 Subject: sched/topology: Remove sched_domain_topology_level::flags Support for overlapping domains added in commit e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans") also allowed forcefully setting SD_OVERLAP for !NUMA domains via FORCE_SD_OVERLAP sched_feat(). Since NUMA domains had to be presumed overlapping to ensure correct behavior, "sched_domain_topology_level::flags" was introduced. NUMA domains added the SDTL_OVERLAP flag, which would ensure SD_OVERLAP was always added during build_sched_domains() for these domains, even when FORCE_SD_OVERLAP was off. The condition for adding the SD_OVERLAP flag in the aforementioned commit was as follows:

if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
	sd->flags |= SD_OVERLAP;

The FORCE_SD_OVERLAP debug feature was removed in commit af85596c74de ("sched/topology: Remove FORCE_SD_OVERLAP") which left the NUMA domains as the exclusive users of SDTL_OVERLAP, SD_OVERLAP, and SD_NUMA flags.
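To make the redundancy concrete: with NUMA levels being the only overlapping ones, the build-time propagation plus later test collapses into a single flag check. A before/after sketch (shapes only, assuming build_overlap_sched_groups() as the consumer; not the verbatim kernel/sched/topology.c hunks):

/* Before: the topology level marks the domain, the domain is tested later */
if (tl->flags & SDTL_OVERLAP)
	sd->flags |= SD_OVERLAP;
/* ... and later ... */
if (sd->flags & SD_OVERLAP)
	build_overlap_sched_groups(sd, cpu);

/* After: NUMA-ness itself identifies the only overlapping level */
if (sd->flags & SD_NUMA)
	build_overlap_sched_groups(sd, cpu);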
Get rid of SDTL_OVERLAP and SD_OVERLAP as they have become redundant and instead rely on SD_NUMA to detect the only overlapping domain currently supported. Since SDTL_OVERLAP was the only user of "tl->flags", get rid of "sched_domain_topology_level::flags" too. Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/ba4dbdf8-bc37-493d-b2e0-2efb00ea3e19@amd.com --- include/linux/sched/sd_flags.h | 8 -------- include/linux/sched/topology.h | 3 --- 2 files changed, 11 deletions(-) (limited to 'include') diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index b04a5d04dee9..42839cfa2778 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -153,14 +153,6 @@ SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) */ SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS) -/* - * sched_groups of this level overlap - * - * SHARED_PARENT: Set for all NUMA levels above NODE. - * NEEDS_GROUPS: Overlaps can only exist with more than one group. - */ -SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) - /* * Cross-node balancing * diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 0d5daaa277b7..5263746b63e8 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -175,8 +175,6 @@ bool cpus_share_resources(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); -#define SDTL_OVERLAP 0x01 - struct sd_data { struct sched_domain *__percpu *sd; struct sched_domain_shared *__percpu *sds; @@ -187,7 +185,6 @@ struct sd_data { struct sched_domain_topology_level { sched_domain_mask_f mask; sched_domain_flags_f sd_flags; - int flags; int numa_level; struct sd_data data; char *name; -- cgit v1.2.3 From 25c411fce735dda29de26f58d3fce52d4824380c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Sat, 12 Jul 2025 03:33:42 +0000 Subject: sched: Add CONFIG_SCHED_PROXY_EXEC & boot argument to enable/disable Add a CONFIG_SCHED_PROXY_EXEC option, along with a boot argument sched_proxy_exec= that can be used to disable the feature at boot time if CONFIG_SCHED_PROXY_EXEC was enabled. Also use this option to allow rq->donor to be different from rq->curr. Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-2-jstultz@google.com --- include/linux/sched.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 54a91261e99b..f225b6b1baa3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1656,6 +1656,19 @@ struct task_struct { randomized_struct_fields_end } __attribute__ ((aligned (64))); +#ifdef CONFIG_SCHED_PROXY_EXEC +DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); +static inline bool sched_proxy_exec(void) +{ + return static_branch_likely(&__sched_proxy_exec); +} +#else +static inline bool sched_proxy_exec(void) +{ + return false; +} +#endif + #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) -- cgit v1.2.3 From 44e4e0297c3c01987399bb9973f4d22a096a62c2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 12 Jul 2025 03:33:43 +0000 Subject: locking/mutex: Rework task_struct::blocked_on Track the blocked-on relation for mutexes, to allow following this relation at schedule time.
task
  | blocked-on
  v
mutex
  | owner
  v
task

This all will be used for tracking blocked-task/mutex chains with the proxy-execution patch in a similar fashion to how priority inheritance is done with rt_mutexes. For serialization, blocked-on is only set by the task itself (current). Both setting and clearing it (the latter potentially by others) are done while holding the mutex::wait_lock. [minor changes while rebasing] [jstultz: Fix blocked_on tracking in __mutex_lock_common in error paths] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Juri Lelli Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-3-jstultz@google.com --- include/linux/sched.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f225b6b1baa3..33ad240ec900 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1230,10 +1230,7 @@ struct task_struct { struct rt_mutex_waiter *pi_blocked_on; #endif -#ifdef CONFIG_DEBUG_MUTEXES - /* Mutex deadlock detection: */ - struct mutex_waiter *blocked_on; -#endif + struct mutex *blocked_on; /* lock we're blocked on */ #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* -- cgit v1.2.3 From a4f0b6fef4b08e9928449206390133e48ac185a7 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Sat, 12 Jul 2025 03:33:44 +0000 Subject: locking/mutex: Add p->blocked_on wrappers for correctness checks This lets us assert mutex::wait_lock is held whenever we access p->blocked_on, as well as warn us for unexpected state changes. [fix conflicts, call in more places] [jstultz: tweaked commit subject, reworked a good bit] Signed-off-by: Valentin Schneider Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-4-jstultz@google.com --- include/linux/sched.h | 64 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 33ad240ec900..5b4e1cd52e27 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -2129,6 +2130,67 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock); __cond_resched_rwlock_write(lock); \ }) +#ifndef CONFIG_PREEMPT_RT +static inline struct mutex *__get_task_blocked_on(struct task_struct *p) +{ + struct mutex *m = p->blocked_on; + + if (m) + lockdep_assert_held_once(&m->wait_lock); + return m; +} + +static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + WARN_ON_ONCE(!m); + /* The task should only be setting itself as blocked */ + WARN_ON_ONCE(p != current); + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * Check ensure we don't overwrite existing mutex value + * with a different mutex. Note, setting it to the same + * lock repeatedly is ok.
+ */ + WARN_ON_ONCE(p->blocked_on && p->blocked_on != m); + p->blocked_on = m; +} + +static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + guard(raw_spinlock_irqsave)(&m->wait_lock); + __set_task_blocked_on(p, m); +} + +static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + WARN_ON_ONCE(!m); + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * There may be cases where we re-clear already cleared + * blocked_on relationships, but make sure we are not + * clearing the relationship with a different lock. + */ + WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); + p->blocked_on = NULL; +} + +static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + guard(raw_spinlock_irqsave)(&m->wait_lock); + __clear_task_blocked_on(p, m); +} +#else +static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) +{ +} + +static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) +{ +} +#endif /* !CONFIG_PREEMPT_RT */ + static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); @@ -2168,8 +2230,6 @@ extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); -#include - /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. -- cgit v1.2.3 From 8671bad873ebeb082afcf7b4501395c374da6023 Mon Sep 17 00:00:00 2001 From: "Luis Claudio R. Goncalves" Date: Mon, 7 Jul 2025 11:03:59 -0300 Subject: sched: Do not call __put_task_struct() on rt if pi_blocked_on is set With PREEMPT_RT enabled, some of the calls to put_task_struct() coming from rt_mutex_adjust_prio_chain() could happen in preemptible context and with a mutex enqueued. That could lead to this sequence:

  rt_mutex_adjust_prio_chain()
    put_task_struct()
      __put_task_struct()
        sched_ext_free()
          spin_lock_irqsave()
            rtlock_lock() ---> TRIGGERS lockdep_assert(!current->pi_blocked_on);

This is not a SCHED_EXT bug. The first cleanup function called by __put_task_struct() is sched_ext_free() and it happens to take a (RT) spin_lock, which, in the scenario described above, would trigger the lockdep assertion of "!current->pi_blocked_on". Crystal Wood was able to identify the problem as __put_task_struct() being called during rt_mutex_adjust_prio_chain(), in the context of a process with a mutex enqueued. Instead of adding more complex conditions to decide when to directly call __put_task_struct() and when to defer the call, unconditionally resort to the deferred call on PREEMPT_RT to simplify the code. Fixes: 893cdaaa3977 ("sched: avoid false lockdep splat in put_task_struct()") Suggested-by: Crystal Wood Signed-off-by: Luis Claudio R.
Goncalves Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Wander Lairson Costa Reviewed-by: Valentin Schneider Reviewed-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/aGvTz5VaPFyj0pBV@uudg.org --- include/linux/sched/task.h | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index c517dbc242f7..ea41795a352b 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -131,24 +131,17 @@ static inline void put_task_struct(struct task_struct *t) return; /* - * In !RT, it is always safe to call __put_task_struct(). - * Under RT, we can only call it in preemptible context. - */ - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { - static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP); - - lock_map_acquire_try(&put_task_map); - __put_task_struct(t); - lock_map_release(&put_task_map); - return; - } - - /* - * under PREEMPT_RT, we can't call put_task_struct + * Under PREEMPT_RT, we can't call __put_task_struct * in atomic context because it will indirectly - * acquire sleeping locks. + * acquire sleeping locks. The same is true if the + * current process has a mutex enqueued (blocked on + * a PI chain). + * + * In !RT, it is always safe to call __put_task_struct(). + * Though, in order to simplify the code, resort to the + * deferred call too. * - * call_rcu() will schedule delayed_put_task_struct_rcu() + * call_rcu() will schedule __put_task_struct_rcu_cb() * to be called in process context. * * __put_task_struct() is called when @@ -161,7 +154,7 @@ static inline void put_task_struct(struct task_struct *t) * * delayed_free_task() also uses ->rcu, but it is only called * when it fails to fork a process. Therefore, there is no - * way it can conflict with put_task_struct(). + * way it can conflict with __put_task_struct(). */ call_rcu(&t->rcu, __put_task_struct_rcu_cb); } -- cgit v1.2.3 From 1b5f1454091e9e9fb5c944b3161acf4ec0894d0d Mon Sep 17 00:00:00 2001 From: Feng Lee <379943137@qq.com> Date: Mon, 21 Jul 2025 16:04:35 +0800 Subject: sched/idle: Remove play_idle() play_idle() is no longer in use, so delete it. Signed-off-by: Feng Lee <379943137@qq.com> Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/tencent_C3E0BD9B812C27A30FC49F1EA6A4B1352707@qq.com --- include/linux/cpu.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 6378370a952f..8b1abbf5b6d2 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -187,11 +187,6 @@ static inline void arch_cpu_finalize_init(void) { } void play_idle_precise(u64 duration_ns, u64 latency_ns); -static inline void play_idle(unsigned long duration_us) -{ - play_idle_precise(duration_us * NSEC_PER_USEC, U64_MAX); -} - #ifdef CONFIG_HOTPLUG_CPU void cpuhp_report_idle_dead(void); #else -- cgit v1.2.3
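Any remaining out-of-tree caller of the removed wrapper can open-code the conversion the deleted inline performed; assuming duration_us holds the old argument, the equivalent call is:

	play_idle_precise(duration_us * NSEC_PER_USEC, U64_MAX);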