From 7ab02bd36eb444654183ad6c5b15211ddfa32a8f Mon Sep 17 00:00:00 2001
From: "Nysal Jan K.A." <nysal@linux.ibm.com>
Date: Mon, 3 Mar 2025 11:34:50 +0530
Subject: sched/membarrier: Fix redundant load of membarrier_state

On architectures where ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
is not selected, sync_core_before_usermode() is a no-op.
In membarrier_mm_sync_core_before_usermode() the compiler does not
eliminate redundant branches and load of mm->membarrier_state
for this case as the atomic_read() cannot be optimized away.

Here's a snippet of the code generated for finish_task_switch() on powerpc
prior to this change:

  1b786c:   ld      r26,2624(r30)   # mm = rq->prev_mm;
  .......
  1b78c8:   cmpdi   cr7,r26,0
  1b78cc:   beq     cr7,1b78e4 <finish_task_switch+0xd0>
  1b78d0:   ld      r9,2312(r13)    # current
  1b78d4:   ld      r9,1888(r9)     # current->mm
  1b78d8:   cmpd    cr7,r26,r9
  1b78dc:   beq     cr7,1b7a70 <finish_task_switch+0x25c>
  1b78e0:   hwsync
  1b78e4:   cmplwi  cr7,r27,128
  .......
  1b7a70:   lwz     r9,176(r26)     # atomic_read(&mm->membarrier_state)
  1b7a74:   b       1b78e0 <finish_task_switch+0xcc>

This was found while analyzing "perf c2c" reports on kernels prior
to commit c1753fd02a00 ("mm: move mm_count into its own cache line")
where mm_count was false sharing with membarrier_state.

There is a minor improvement in the size of finish_task_switch().
The following are results from bloat-o-meter for ppc64le:

  GCC 7.5.0
  ---------
  add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-32 (-32)
  Function                                     old     new   delta
  finish_task_switch                           884     852     -32

  GCC 12.2.1
  ----------
  add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-32 (-32)
  Function                                     old     new   delta
  finish_task_switch.isra                      852     820     -32

  LLVM 17.0.6
  -----------
  add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-36 (-36)
  Function                                     old     new   delta
  rt_mutex_schedule                            120     104     -16
  finish_task_switch                           792     772     -20

Results on aarch64:

  GCC 14.1.1
  ----------
  add/remove: 0/2 grow/shrink: 1/1 up/down: 4/-60 (-56)
  Function                                     old     new   delta
  get_nohz_timer_target                        352     356      +4
  e843419@0b02_0000d7e7_408                      8       -      -8
  e843419@01bb_000021d2_868                      8       -      -8
  finish_task_switch.isra                      592     548     -44

Signed-off-by: Nysal Jan K.A. <nysal@linux.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Segher Boessenkool <segher@kernel.crashing.org>
Link: https://lore.kernel.org/r/20250303060457.531293-1-nysal@linux.ibm.com
---
 include/linux/sched/mm.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 928a626725e6..b13474825130 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -531,6 +531,13 @@ enum {
 
 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 {
+	/*
+	 * The atomic_read() below prevents CSE. The following should
+	 * help the compiler generate more efficient code on architectures
+	 * where sync_core_before_usermode() is a no-op.
+	 */
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE))
+		return;
 	if (current->mm != mm)
 		return;
 	if (likely(!(atomic_read(&mm->membarrier_state) &
-- 
cgit v1.2.3


From 8bdc5daaa01e3054647d394d354762210ad88f17 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 14 Mar 2025 17:08:02 +0100
Subject: sched: Add a generic function to return the preemption string
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The individual architectures often add the preemption model to the begin
of the backtrace. This is the case on X86 or ARM64 for the "die" case
but not for regular warning. With the addition of DYNAMIC_PREEMPT for
PREEMPT_RT we end up with CONFIG_PREEMPT and CONFIG_PREEMPT_RT set
simultaneously. That means that everyone who tried to add that piece of
information gets it wrong for PREEMPT_RT because PREEMPT is checked
first.

Provide a generic function which returns the current scheduling model
considering LAZY preempt and the current state of PREEMPT_DYNAMIC.

The resulting strings are:
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃   Model   ┃  -RT -DYN    ┃     +RT -DYN      ┃     -RT +DYN       ┃     +RT +DYN      ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│NONE       │ NONE         │ n/a               │ PREEMPT(none)      │ n/a               │
├───────────┼──────────────┼───────────────────┼────────────────────┼───────────────────┤
│VOLUNTARY  │ VOLUNTARY    │ n/a               │ PREEMPT(voluntary) │ n/a               │
├───────────┼──────────────┼───────────────────┼────────────────────┼───────────────────┤
│FULL       │ PREEMPT      │ PREEMPT_RT        │ PREEMPT(full)      │ PREEMPT_{RT,full} │
├───────────┼──────────────┼───────────────────┼────────────────────┼───────────────────┤
│LAZY       │ PREEMPT_LAZY │ PREEMPT_{RT,LAZY} │ PREEMPT(lazy)      │ PREEMPT_{RT,lazy} │
└───────────┴──────────────┴───────────────────┴────────────────────┴───────────────────┘

[ The dynamic building of the string can lead to an empty string if the
  function is invoked simultaneously on two CPUs. ]

Co-developed-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Co-developed-by: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Signed-off-by: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Link: https://lore.kernel.org/r/20250314160810.2373416-2-bigeasy@linutronix.de
---
 include/linux/preempt.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index ca86235ac15c..3e9808f2b549 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -515,6 +515,8 @@ static inline bool preempt_model_rt(void)
 	return IS_ENABLED(CONFIG_PREEMPT_RT);
 }
 
+extern const char *preempt_model_str(void);
+
 /*
  * Does the preemption model allow non-cooperative preemption?
  *
-- 
cgit v1.2.3


From 56209334dda1832c0a919e1d74768c6d0f3b2ca9 Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@redhat.com>
Date: Thu, 13 Mar 2025 18:03:32 +0100
Subject: sched/topology: Wrappers for sched_domains_mutex

Create wrappers for sched_domains_mutex so that it can transparently be
used on both CONFIG_SMP and !CONFIG_SMP, as some function will need to
do.

Fixes: 53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug")
Reported-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Waiman Long <longman@redhat.com>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/Z9MP5Oq9RB8jBs3y@jlelli-thinkpadt14gen4.remote.csb
---
 include/linux/sched.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9632e3318e0d..0785268c76f8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -382,6 +382,11 @@ enum uclamp_id {
 #ifdef CONFIG_SMP
 extern struct root_domain def_root_domain;
 extern struct mutex sched_domains_mutex;
+extern void sched_domains_mutex_lock(void);
+extern void sched_domains_mutex_unlock(void);
+#else
+static inline void sched_domains_mutex_lock(void) { }
+static inline void sched_domains_mutex_unlock(void) { }
 #endif
 
 struct sched_param {
-- 
cgit v1.2.3


From 45007c6fb5860cf63556a9cadc87c8984927e23d Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@redhat.com>
Date: Thu, 13 Mar 2025 18:05:46 +0100
Subject: sched/deadline: Generalize unique visiting of root domains

Bandwidth checks and updates that work on root domains currently employ
a cookie mechanism for efficiency. This mechanism is very much tied to
when root domains are first created and initialized.

Generalize the cookie mechanism so that it can be used also later at
runtime while updating root domains. Also, additionally guard it with
sched_domains_mutex, since domains need to be stable while updating them
(and it will be required for further dynamic changes).

Fixes: 53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug")
Reported-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Waiman Long <longman@redhat.com>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/Z9MQaiXPvEeW_v7x@jlelli-thinkpadt14gen4.remote.csb
---
 include/linux/sched/deadline.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 3a912ab42bb5..6ec578600b24 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -37,4 +37,7 @@ extern void dl_clear_root_domain(struct root_domain *rd);
 
 #endif /* CONFIG_SMP */
 
+extern u64 dl_cookie;
+extern bool dl_bw_visited(int cpu, u64 cookie);
+
 #endif /* _LINUX_SCHED_DEADLINE_H */
-- 
cgit v1.2.3


From 2ff899e3516437354204423ef0a94994717b8e6a Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@redhat.com>
Date: Thu, 13 Mar 2025 18:10:21 +0100
Subject: sched/deadline: Rebuild root domain accounting after every update

Rebuilding of root domains accounting information (total_bw) is
currently broken on some cases, e.g. suspend/resume on aarch64. Problem
is that the way we keep track of domain changes and try to add bandwidth
back is convoluted and fragile.

Fix it by simplify things by making sure bandwidth accounting is cleared
and completely restored after root domains changes (after root domains
are again stable).

To be sure we always call dl_rebuild_rd_accounting while holding
cpuset_mutex we also add cpuset_reset_sched_domains() wrapper.

Fixes: 53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug")
Reported-by: Jon Hunter <jonathanh@nvidia.com>
Co-developed-by: Waiman Long <llong@redhat.com>
Signed-off-by: Waiman Long <llong@redhat.com>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/Z9MRfeJKJUOyUSto@jlelli-thinkpadt14gen4.remote.csb
---
 include/linux/cpuset.h         | 6 ++++++
 include/linux/sched/deadline.h | 1 +
 include/linux/sched/topology.h | 2 ++
 3 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 835e7b793f6a..17cc90d900f9 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -128,6 +128,7 @@ extern bool current_cpuset_is_being_rebound(void);
 extern void rebuild_sched_domains(void);
 
 extern void cpuset_print_current_mems_allowed(void);
+extern void cpuset_reset_sched_domains(void);
 
 /*
  * read_mems_allowed_begin is required when making decisions involving
@@ -264,6 +265,11 @@ static inline void rebuild_sched_domains(void)
 	partition_sched_domains(1, NULL, NULL);
 }
 
+static inline void cpuset_reset_sched_domains(void)
+{
+	partition_sched_domains(1, NULL, NULL);
+}
+
 static inline void cpuset_print_current_mems_allowed(void)
 {
 }
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 6ec578600b24..f9aabbc9d22e 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -34,6 +34,7 @@ static inline bool dl_time_before(u64 a, u64 b)
 struct root_domain;
 extern void dl_add_task_root_domain(struct task_struct *p);
 extern void dl_clear_root_domain(struct root_domain *rd);
+extern void dl_clear_root_domain_cpu(int cpu);
 
 #endif /* CONFIG_SMP */
 
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7f3dbafe1817..1622232bd08b 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -166,6 +166,8 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 	return to_cpumask(sd->span);
 }
 
+extern void dl_rebuild_rd_accounting(void);
+
 extern void partition_sched_domains_locked(int ndoms_new,
 					   cpumask_var_t doms_new[],
 					   struct sched_domain_attr *dattr_new);
-- 
cgit v1.2.3


From d128130f486b4aa86086655af0fbb943b26b0003 Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@redhat.com>
Date: Thu, 13 Mar 2025 18:12:43 +0100
Subject: sched/topology: Stop exposing partition_sched_domains_locked

The are no callers of partition_sched_domains_locked() outside
topology.c.

Stop exposing such function.

Suggested-by: Waiman Long <llong@redhat.com>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Waiman Long <longman@redhat.com>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/Z9MSC96a8FcqWV3G@jlelli-thinkpadt14gen4.remote.csb
---
 include/linux/sched/topology.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 1622232bd08b..96e69bfc3c8a 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -168,10 +168,6 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 
 extern void dl_rebuild_rd_accounting(void);
 
-extern void partition_sched_domains_locked(int ndoms_new,
-					   cpumask_var_t doms_new[],
-					   struct sched_domain_attr *dattr_new);
-
 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 				    struct sched_domain_attr *dattr_new);
 
@@ -212,12 +208,6 @@ extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
 
 struct sched_domain_attr;
 
-static inline void
-partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
-			       struct sched_domain_attr *dattr_new)
-{
-}
-
 static inline void
 partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			struct sched_domain_attr *dattr_new)
-- 
cgit v1.2.3


From 34929a070b7fd06c386080c926b61ee844e6ad34 Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@redhat.com>
Date: Thu, 13 Mar 2025 18:13:29 +0100
Subject: include/{topology,cpuset}: Move dl_rebuild_rd_accounting to cpuset.h

dl_rebuild_rd_accounting() is defined in cpuset.c, so it makes more
sense to move related declarations to cpuset.h.

Implement the move.

Suggested-by: Waiman Long <llong@redhat.com>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Waiman Long <llong@redhat.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Waiman Long <longman@redhat.com>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/Z9MSOVMpU7jpVrMU@jlelli-thinkpadt14gen4.remote.csb
---
 include/linux/cpuset.h         | 5 +++++
 include/linux/sched/topology.h | 2 --
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 17cc90d900f9..5466c96a33db 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -125,6 +125,7 @@ static inline int cpuset_do_page_mem_spread(void)
 
 extern bool current_cpuset_is_being_rebound(void);
 
+extern void dl_rebuild_rd_accounting(void);
 extern void rebuild_sched_domains(void);
 
 extern void cpuset_print_current_mems_allowed(void);
@@ -260,6 +261,10 @@ static inline bool current_cpuset_is_being_rebound(void)
 	return false;
 }
 
+static inline void dl_rebuild_rd_accounting(void)
+{
+}
+
 static inline void rebuild_sched_domains(void)
 {
 	partition_sched_domains(1, NULL, NULL);
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 96e69bfc3c8a..51f7b8169515 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -166,8 +166,6 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 	return to_cpumask(sd->span);
 }
 
-extern void dl_rebuild_rd_accounting(void);
-
 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 				    struct sched_domain_attr *dattr_new);
 
-- 
cgit v1.2.3


From dd5bdaf2b72da81d57f4f99e518af80002b6562e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 17 Mar 2025 11:42:54 +0100
Subject: sched/debug: Make CONFIG_SCHED_DEBUG functionality unconditional

All the big Linux distros enable CONFIG_SCHED_DEBUG, because
the various features it provides help not just with kernel
development, but with system administration and user-space
software development as well.

Reflect this reality and enable this functionality
unconditionally.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20250317104257.3496611-4-mingo@kernel.org
---
 include/linux/energy_model.h   | 2 --
 include/linux/sched/debug.h    | 2 --
 include/linux/sched/topology.h | 4 ----
 3 files changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 78318d49276d..65efc0f5ea2e 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -240,9 +240,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	struct em_perf_state *ps;
 	int i;
 
-#ifdef CONFIG_SCHED_DEBUG
 	WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n");
-#endif
 
 	if (!sum_util)
 		return 0;
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
index b5035afa2396..35ed4577a6cc 100644
--- a/include/linux/sched/debug.h
+++ b/include/linux/sched/debug.h
@@ -35,12 +35,10 @@ extern void show_stack(struct task_struct *task, unsigned long *sp,
 
 extern void sched_show_task(struct task_struct *p);
 
-#ifdef CONFIG_SCHED_DEBUG
 struct seq_file;
 extern void proc_sched_show_task(struct task_struct *p,
 				 struct pid_namespace *ns, struct seq_file *m);
 extern void proc_sched_set_task(struct task_struct *p);
-#endif
 
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched		__section(".sched.text")
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 51f7b8169515..7b4301b7235f 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -25,16 +25,12 @@ enum {
 };
 #undef SD_FLAG
 
-#ifdef CONFIG_SCHED_DEBUG
-
 struct sd_flag_debug {
 	unsigned int meta_flags;
 	char *name;
 };
 extern const struct sd_flag_debug sd_flag_debug[];
 
-#endif
-
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
 {
-- 
cgit v1.2.3


From 3785c7dbae0f733f13f8857beaaada5d7dc63e02 Mon Sep 17 00:00:00 2001
From: Yujun Dong <yujundong@pascal-lab.net>
Date: Mon, 30 Dec 2024 22:16:24 +0800
Subject: cpuidle, sched: Use smp_mb__after_atomic() in current_clr_polling()

In architectures that use the polling bit, current_clr_polling() employs
smp_mb() to ensure that the clearing of the polling bit is visible to
other cores before checking TIF_NEED_RESCHED.

However, smp_mb() can be costly. Given that clear_bit() is an atomic
operation, replacing smp_mb() with smp_mb__after_atomic() is appropriate.

Many architectures implement smp_mb__after_atomic() as a lighter-weight
barrier compared to smp_mb(), leading to performance improvements.
For instance, on x86, smp_mb__after_atomic() is a no-op. This change
eliminates a smp_mb() instruction in the cpuidle wake-up path, saving
several CPU cycles and thereby reducing wake-up latency.

Architectures that do not use the polling bit will retain the original
smp_mb() behavior to ensure that existing dependencies remain unaffected.

Signed-off-by: Yujun Dong <yujundong@pascal-lab.net>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20241230141624.155356-1-yujundong@pascal-lab.net
---
 include/linux/sched/idle.h | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index e670ac282333..439f6029d3b9 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -79,6 +79,21 @@ static __always_inline bool __must_check current_clr_polling_and_test(void)
 	return unlikely(tif_need_resched());
 }
 
+static __always_inline void current_clr_polling(void)
+{
+	__current_clr_polling();
+
+	/*
+	 * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
+	 * Once the bit is cleared, we'll get IPIs with every new
+	 * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
+	 * fold.
+	 */
+	smp_mb__after_atomic(); /* paired with resched_curr() */
+
+	preempt_fold_need_resched();
+}
+
 #else
 static inline void __current_set_polling(void) { }
 static inline void __current_clr_polling(void) { }
@@ -91,21 +106,15 @@ static inline bool __must_check current_clr_polling_and_test(void)
 {
 	return unlikely(tif_need_resched());
 }
-#endif
 
 static __always_inline void current_clr_polling(void)
 {
 	__current_clr_polling();
 
-	/*
-	 * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
-	 * Once the bit is cleared, we'll get IPIs with every new
-	 * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
-	 * fold.
-	 */
 	smp_mb(); /* paired with resched_curr() */
 
 	preempt_fold_need_resched();
 }
+#endif
 
 #endif /* _LINUX_SCHED_IDLE_H */
-- 
cgit v1.2.3