From 7ab02bd36eb444654183ad6c5b15211ddfa32a8f Mon Sep 17 00:00:00 2001 From: "Nysal Jan K.A." Date: Mon, 3 Mar 2025 11:34:50 +0530 Subject: sched/membarrier: Fix redundant load of membarrier_state On architectures where ARCH_HAS_SYNC_CORE_BEFORE_USERMODE is not selected, sync_core_before_usermode() is a no-op. In membarrier_mm_sync_core_before_usermode() the compiler does not eliminate redundant branches and load of mm->membarrier_state for this case as the atomic_read() cannot be optimized away. Here's a snippet of the code generated for finish_task_switch() on powerpc prior to this change: 1b786c: ld r26,2624(r30) # mm = rq->prev_mm; ....... 1b78c8: cmpdi cr7,r26,0 1b78cc: beq cr7,1b78e4 1b78d0: ld r9,2312(r13) # current 1b78d4: ld r9,1888(r9) # current->mm 1b78d8: cmpd cr7,r26,r9 1b78dc: beq cr7,1b7a70 1b78e0: hwsync 1b78e4: cmplwi cr7,r27,128 ....... 1b7a70: lwz r9,176(r26) # atomic_read(&mm->membarrier_state) 1b7a74: b 1b78e0 This was found while analyzing "perf c2c" reports on kernels prior to commit c1753fd02a00 ("mm: move mm_count into its own cache line") where mm_count was false sharing with membarrier_state. There is a minor improvement in the size of finish_task_switch(). The following are results from bloat-o-meter for ppc64le: GCC 7.5.0 --------- add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-32 (-32) Function old new delta finish_task_switch 884 852 -32 GCC 12.2.1 ---------- add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-32 (-32) Function old new delta finish_task_switch.isra 852 820 -32 LLVM 17.0.6 ----------- add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-36 (-36) Function old new delta rt_mutex_schedule 120 104 -16 finish_task_switch 792 772 -20 Results on aarch64: GCC 14.1.1 ---------- add/remove: 0/2 grow/shrink: 1/1 up/down: 4/-60 (-56) Function old new delta get_nohz_timer_target 352 356 +4 e843419@0b02_0000d7e7_408 8 - -8 e843419@01bb_000021d2_868 8 - -8 finish_task_switch.isra 592 548 -44 Signed-off-by: Nysal Jan K.A. Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Reviewed-by: Michael Ellerman Reviewed-by: Segher Boessenkool Link: https://lore.kernel.org/r/20250303060457.531293-1-nysal@linux.ibm.com --- include/linux/sched/mm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 928a626725e6..b13474825130 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -531,6 +531,13 @@ enum { static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { + /* + * The atomic_read() below prevents CSE. The following should + * help the compiler generate more efficient code on architectures + * where sync_core_before_usermode() is a no-op. + */ + if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE)) + return; if (current->mm != mm) return; if (likely(!(atomic_read(&mm->membarrier_state) & -- cgit v1.2.3 From 8bdc5daaa01e3054647d394d354762210ad88f17 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 14 Mar 2025 17:08:02 +0100 Subject: sched: Add a generic function to return the preemption string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The individual architectures often add the preemption model to the begin of the backtrace. This is the case on X86 or ARM64 for the "die" case but not for regular warning. With the addition of DYNAMIC_PREEMPT for PREEMPT_RT we end up with CONFIG_PREEMPT and CONFIG_PREEMPT_RT set simultaneously. That means that everyone who tried to add that piece of information gets it wrong for PREEMPT_RT because PREEMPT is checked first. Provide a generic function which returns the current scheduling model considering LAZY preempt and the current state of PREEMPT_DYNAMIC. The resulting strings are: ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓ ┃ Model ┃ -RT -DYN ┃ +RT -DYN ┃ -RT +DYN ┃ +RT +DYN ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩ │NONE │ NONE │ n/a │ PREEMPT(none) │ n/a │ ├───────────┼──────────────┼───────────────────┼────────────────────┼───────────────────┤ │VOLUNTARY │ VOLUNTARY │ n/a │ PREEMPT(voluntary) │ n/a │ ├───────────┼──────────────┼───────────────────┼────────────────────┼───────────────────┤ │FULL │ PREEMPT │ PREEMPT_RT │ PREEMPT(full) │ PREEMPT_{RT,full} │ ├───────────┼──────────────┼───────────────────┼────────────────────┼───────────────────┤ │LAZY │ PREEMPT_LAZY │ PREEMPT_{RT,LAZY} │ PREEMPT(lazy) │ PREEMPT_{RT,lazy} │ └───────────┴──────────────┴───────────────────┴────────────────────┴───────────────────┘ [ The dynamic building of the string can lead to an empty string if the function is invoked simultaneously on two CPUs. ] Co-developed-by: "Peter Zijlstra (Intel)" Signed-off-by: "Peter Zijlstra (Intel)" Co-developed-by: "Steven Rostedt (Google)" Signed-off-by: "Steven Rostedt (Google)" Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20250314160810.2373416-2-bigeasy@linutronix.de --- include/linux/preempt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index ca86235ac15c..3e9808f2b549 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -515,6 +515,8 @@ static inline bool preempt_model_rt(void) return IS_ENABLED(CONFIG_PREEMPT_RT); } +extern const char *preempt_model_str(void); + /* * Does the preemption model allow non-cooperative preemption? * -- cgit v1.2.3 From 56209334dda1832c0a919e1d74768c6d0f3b2ca9 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 13 Mar 2025 18:03:32 +0100 Subject: sched/topology: Wrappers for sched_domains_mutex Create wrappers for sched_domains_mutex so that it can transparently be used on both CONFIG_SMP and !CONFIG_SMP, as some function will need to do. Fixes: 53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug") Reported-by: Jon Hunter Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Waiman Long Tested-by: Jon Hunter Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/Z9MP5Oq9RB8jBs3y@jlelli-thinkpadt14gen4.remote.csb --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9632e3318e0d..0785268c76f8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -382,6 +382,11 @@ enum uclamp_id { #ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; +extern void sched_domains_mutex_lock(void); +extern void sched_domains_mutex_unlock(void); +#else +static inline void sched_domains_mutex_lock(void) { } +static inline void sched_domains_mutex_unlock(void) { } #endif struct sched_param { -- cgit v1.2.3 From 45007c6fb5860cf63556a9cadc87c8984927e23d Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 13 Mar 2025 18:05:46 +0100 Subject: sched/deadline: Generalize unique visiting of root domains Bandwidth checks and updates that work on root domains currently employ a cookie mechanism for efficiency. This mechanism is very much tied to when root domains are first created and initialized. Generalize the cookie mechanism so that it can be used also later at runtime while updating root domains. Also, additionally guard it with sched_domains_mutex, since domains need to be stable while updating them (and it will be required for further dynamic changes). Fixes: 53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug") Reported-by: Jon Hunter Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Waiman Long Tested-by: Jon Hunter Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/Z9MQaiXPvEeW_v7x@jlelli-thinkpadt14gen4.remote.csb --- include/linux/sched/deadline.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 3a912ab42bb5..6ec578600b24 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -37,4 +37,7 @@ extern void dl_clear_root_domain(struct root_domain *rd); #endif /* CONFIG_SMP */ +extern u64 dl_cookie; +extern bool dl_bw_visited(int cpu, u64 cookie); + #endif /* _LINUX_SCHED_DEADLINE_H */ -- cgit v1.2.3 From 2ff899e3516437354204423ef0a94994717b8e6a Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 13 Mar 2025 18:10:21 +0100 Subject: sched/deadline: Rebuild root domain accounting after every update Rebuilding of root domains accounting information (total_bw) is currently broken on some cases, e.g. suspend/resume on aarch64. Problem is that the way we keep track of domain changes and try to add bandwidth back is convoluted and fragile. Fix it by simplify things by making sure bandwidth accounting is cleared and completely restored after root domains changes (after root domains are again stable). To be sure we always call dl_rebuild_rd_accounting while holding cpuset_mutex we also add cpuset_reset_sched_domains() wrapper. Fixes: 53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug") Reported-by: Jon Hunter Co-developed-by: Waiman Long Signed-off-by: Waiman Long Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/Z9MRfeJKJUOyUSto@jlelli-thinkpadt14gen4.remote.csb --- include/linux/cpuset.h | 6 ++++++ include/linux/sched/deadline.h | 1 + include/linux/sched/topology.h | 2 ++ 3 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 835e7b793f6a..17cc90d900f9 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -128,6 +128,7 @@ extern bool current_cpuset_is_being_rebound(void); extern void rebuild_sched_domains(void); extern void cpuset_print_current_mems_allowed(void); +extern void cpuset_reset_sched_domains(void); /* * read_mems_allowed_begin is required when making decisions involving @@ -264,6 +265,11 @@ static inline void rebuild_sched_domains(void) partition_sched_domains(1, NULL, NULL); } +static inline void cpuset_reset_sched_domains(void) +{ + partition_sched_domains(1, NULL, NULL); +} + static inline void cpuset_print_current_mems_allowed(void) { } diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 6ec578600b24..f9aabbc9d22e 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -34,6 +34,7 @@ static inline bool dl_time_before(u64 a, u64 b) struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); +extern void dl_clear_root_domain_cpu(int cpu); #endif /* CONFIG_SMP */ diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7f3dbafe1817..1622232bd08b 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -166,6 +166,8 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd) return to_cpumask(sd->span); } +extern void dl_rebuild_rd_accounting(void); + extern void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new); -- cgit v1.2.3 From d128130f486b4aa86086655af0fbb943b26b0003 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 13 Mar 2025 18:12:43 +0100 Subject: sched/topology: Stop exposing partition_sched_domains_locked The are no callers of partition_sched_domains_locked() outside topology.c. Stop exposing such function. Suggested-by: Waiman Long Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Waiman Long Tested-by: Jon Hunter Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/Z9MSC96a8FcqWV3G@jlelli-thinkpadt14gen4.remote.csb --- include/linux/sched/topology.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 1622232bd08b..96e69bfc3c8a 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -168,10 +168,6 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd) extern void dl_rebuild_rd_accounting(void); -extern void partition_sched_domains_locked(int ndoms_new, - cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new); - extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new); @@ -212,12 +208,6 @@ extern void __init set_sched_topology(struct sched_domain_topology_level *tl); struct sched_domain_attr; -static inline void -partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ -} - static inline void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) -- cgit v1.2.3 From 34929a070b7fd06c386080c926b61ee844e6ad34 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 13 Mar 2025 18:13:29 +0100 Subject: include/{topology,cpuset}: Move dl_rebuild_rd_accounting to cpuset.h dl_rebuild_rd_accounting() is defined in cpuset.c, so it makes more sense to move related declarations to cpuset.h. Implement the move. Suggested-by: Waiman Long Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Waiman Long Tested-by: Jon Hunter Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/Z9MSOVMpU7jpVrMU@jlelli-thinkpadt14gen4.remote.csb --- include/linux/cpuset.h | 5 +++++ include/linux/sched/topology.h | 2 -- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 17cc90d900f9..5466c96a33db 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -125,6 +125,7 @@ static inline int cpuset_do_page_mem_spread(void) extern bool current_cpuset_is_being_rebound(void); +extern void dl_rebuild_rd_accounting(void); extern void rebuild_sched_domains(void); extern void cpuset_print_current_mems_allowed(void); @@ -260,6 +261,10 @@ static inline bool current_cpuset_is_being_rebound(void) return false; } +static inline void dl_rebuild_rd_accounting(void) +{ +} + static inline void rebuild_sched_domains(void) { partition_sched_domains(1, NULL, NULL); diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 96e69bfc3c8a..51f7b8169515 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -166,8 +166,6 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd) return to_cpumask(sd->span); } -extern void dl_rebuild_rd_accounting(void); - extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new); -- cgit v1.2.3 From dd5bdaf2b72da81d57f4f99e518af80002b6562e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 17 Mar 2025 11:42:54 +0100 Subject: sched/debug: Make CONFIG_SCHED_DEBUG functionality unconditional All the big Linux distros enable CONFIG_SCHED_DEBUG, because the various features it provides help not just with kernel development, but with system administration and user-space software development as well. Reflect this reality and enable this functionality unconditionally. Signed-off-by: Ingo Molnar Tested-by: Shrikanth Hegde Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Valentin Schneider Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250317104257.3496611-4-mingo@kernel.org --- include/linux/energy_model.h | 2 -- include/linux/sched/debug.h | 2 -- include/linux/sched/topology.h | 4 ---- 3 files changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 78318d49276d..65efc0f5ea2e 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -240,9 +240,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, struct em_perf_state *ps; int i; -#ifdef CONFIG_SCHED_DEBUG WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); -#endif if (!sum_util) return 0; diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index b5035afa2396..35ed4577a6cc 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -35,12 +35,10 @@ extern void show_stack(struct task_struct *task, unsigned long *sp, extern void sched_show_task(struct task_struct *p); -#ifdef CONFIG_SCHED_DEBUG struct seq_file; extern void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); -#endif /* Attach to any functions which should be ignored in wchan output. */ #define __sched __section(".sched.text") diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 51f7b8169515..7b4301b7235f 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -25,16 +25,12 @@ enum { }; #undef SD_FLAG -#ifdef CONFIG_SCHED_DEBUG - struct sd_flag_debug { unsigned int meta_flags; char *name; }; extern const struct sd_flag_debug sd_flag_debug[]; -#endif - #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) { -- cgit v1.2.3 From 3785c7dbae0f733f13f8857beaaada5d7dc63e02 Mon Sep 17 00:00:00 2001 From: Yujun Dong Date: Mon, 30 Dec 2024 22:16:24 +0800 Subject: cpuidle, sched: Use smp_mb__after_atomic() in current_clr_polling() In architectures that use the polling bit, current_clr_polling() employs smp_mb() to ensure that the clearing of the polling bit is visible to other cores before checking TIF_NEED_RESCHED. However, smp_mb() can be costly. Given that clear_bit() is an atomic operation, replacing smp_mb() with smp_mb__after_atomic() is appropriate. Many architectures implement smp_mb__after_atomic() as a lighter-weight barrier compared to smp_mb(), leading to performance improvements. For instance, on x86, smp_mb__after_atomic() is a no-op. This change eliminates a smp_mb() instruction in the cpuidle wake-up path, saving several CPU cycles and thereby reducing wake-up latency. Architectures that do not use the polling bit will retain the original smp_mb() behavior to ensure that existing dependencies remain unaffected. Signed-off-by: Yujun Dong Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241230141624.155356-1-yujundong@pascal-lab.net --- include/linux/sched/idle.h | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index e670ac282333..439f6029d3b9 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -79,6 +79,21 @@ static __always_inline bool __must_check current_clr_polling_and_test(void) return unlikely(tif_need_resched()); } +static __always_inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. + */ + smp_mb__after_atomic(); /* paired with resched_curr() */ + + preempt_fold_need_resched(); +} + #else static inline void __current_set_polling(void) { } static inline void __current_clr_polling(void) { } @@ -91,21 +106,15 @@ static inline bool __must_check current_clr_polling_and_test(void) { return unlikely(tif_need_resched()); } -#endif static __always_inline void current_clr_polling(void) { __current_clr_polling(); - /* - * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. - * Once the bit is cleared, we'll get IPIs with every new - * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also - * fold. - */ smp_mb(); /* paired with resched_curr() */ preempt_fold_need_resched(); } +#endif #endif /* _LINUX_SCHED_IDLE_H */ -- cgit v1.2.3