sched/fair: Get rid of sched_domains_curr_level hack for tl->cpumask()

Leon [1] and Vinicius [2] noted a topology_span_sane() warning during their testing starting from v6.16-rc1. Debug that followed pointed to the tl->mask() for the NODE domain being incorrectly resolved to that of the highest NUMA domain. tl->mask() for NODE is set to the sd_numa_mask() which depends on the global "sched_domains_curr_level" hack. "sched_domains_curr_level" is set to the "tl->numa_level" during tl traversal in build_sched_domains() calling sd_init() but was not reset before topology_span_sane(). Since "tl->numa_level" still reflected the old value from build_sched_domains(), topology_span_sane() for the NODE domain trips when the span of the last NUMA domain overlaps. Instead of replicating the "sched_domains_curr_level" hack, get rid of it entirely and instead, pass the entire "sched_domain_topology_level" object to tl->cpumask() function to prevent such mishap in the future. sd_numa_mask() now directly references "tl->numa_level" instead of relying on the global "sched_domains_curr_level" hack to index into sched_domains_numa_masks[]. The original warning was reproducible on the following NUMA topology reported by Leon: $ sudo numactl -H available: 5 nodes (0-4) node 0 cpus: 0 1 node 0 size: 2927 MB node 0 free: 1603 MB node 1 cpus: 2 3 node 1 size: 3023 MB node 1 free: 3008 MB node 2 cpus: 4 5 node 2 size: 3023 MB node 2 free: 3007 MB node 3 cpus: 6 7 node 3 size: 3023 MB node 3 free: 3002 MB node 4 cpus: 8 9 node 4 size: 3022 MB node 4 free: 2718 MB node distances: node 0 1 2 3 4 0: 10 39 38 37 36 1: 39 10 38 37 36 2: 38 38 10 37 36 3: 37 37 37 10 36 4: 36 36 36 36 10 The above topology can be mimicked using the following QEMU cmd that was used to reproduce the warning and test the fix: sudo qemu-system-x86_64 -enable-kvm -cpu host \ -m 20G -smp cpus=10,sockets=10 -machine q35 \ -object memory-backend-ram,size=4G,id=m0 \ -object memory-backend-ram,size=4G,id=m1 \ -object memory-backend-ram,size=4G,id=m2 \ -object memory-backend-ram,size=4G,id=m3 \ -object memory-backend-ram,size=4G,id=m4 \ -numa node,cpus=0-1,memdev=m0,nodeid=0 \ -numa node,cpus=2-3,memdev=m1,nodeid=1 \ -numa node,cpus=4-5,memdev=m2,nodeid=2 \ -numa node,cpus=6-7,memdev=m3,nodeid=3 \ -numa node,cpus=8-9,memdev=m4,nodeid=4 \ -numa dist,src=0,dst=1,val=39 \ -numa dist,src=0,dst=2,val=38 \ -numa dist,src=0,dst=3,val=37 \ -numa dist,src=0,dst=4,val=36 \ -numa dist,src=1,dst=0,val=39 \ -numa dist,src=1,dst=2,val=38 \ -numa dist,src=1,dst=3,val=37 \ -numa dist,src=1,dst=4,val=36 \ -numa dist,src=2,dst=0,val=38 \ -numa dist,src=2,dst=1,val=38 \ -numa dist,src=2,dst=3,val=37 \ -numa dist,src=2,dst=4,val=36 \ -numa dist,src=3,dst=0,val=37 \ -numa dist,src=3,dst=1,val=37 \ -numa dist,src=3,dst=2,val=37 \ -numa dist,src=3,dst=4,val=36 \ -numa dist,src=4,dst=0,val=36 \ -numa dist,src=4,dst=1,val=36 \ -numa dist,src=4,dst=2,val=36 \ -numa dist,src=4,dst=3,val=36 \ ... [ prateek: Moved common functions to include/linux/sched/topology.h, reuse the common bits for s390 and ppc, commit message ] Closes: https://lore.kernel.org/lkml/20250610110701.GA256154@unreal/ [1] Fixes: ccf74128d66c ("sched/topology: Assert non-NUMA topology masks don't (partially) overlap") # ce29a7da84cd, f55dac1dafb3 Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reported-by: Leon Romanovsky <leon@kernel.org> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Valentin Schneider <vschneid@redhat.com> Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com> Tested-by: Valentin Schneider <vschneid@redhat.com> # x86 Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com> # powerpc Link: https://lore.kernel.org/lkml/a3de98387abad28592e6ab591f3ff6107fe01dc1.1755893468.git.tim.c.chen@linux.intel.com/ [2]
author: Peter Zijlstra <peterz@infradead.org> 2025-08-25 12:02:44 +0000
committer: Peter Zijlstra <peterz@infradead.org> 2025-09-03 10:03:12 +0200
commit: 661f951e371cc134ea31c84238dbdc9a898b8403 (patch)
tree: 0fcdfb9c78d79229269eeda5dde97e8c28c51512 /include/linux/sched
parent: 8fd5485fb4f3d9da3977fd783fcb8e5452463420 (diff)
1 files changed, 27 insertions, 1 deletions
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 5263746b63e8..a3a24e115d44 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -30,11 +30,19 @@ struct sd_flag_debug {
 };
 extern const struct sd_flag_debug sd_flag_debug[];
 
+struct sched_domain_topology_level;
+
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
 {
 	return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
 }
+
+static inline const
+struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+	return cpu_smt_mask(cpu);
+}
 #endif
 
 #ifdef CONFIG_SCHED_CLUSTER
@@ -42,6 +50,12 @@ static inline int cpu_cluster_flags(void)
 {
 	return SD_CLUSTER | SD_SHARE_LLC;
 }
+
+static inline const
+struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+	return cpu_clustergroup_mask(cpu);
+}
 #endif
 
 #ifdef CONFIG_SCHED_MC
@@ -49,8 +63,20 @@ static inline int cpu_core_flags(void)
 {
 	return SD_SHARE_LLC;
 }
+
+static inline const
+struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+	return cpu_coregroup_mask(cpu);
+}
 #endif
 
+static inline const
+struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu)
+{
+	return cpu_node_mask(cpu);
+}
+
 #ifdef CONFIG_NUMA
 static inline int cpu_numa_flags(void)
 {
@@ -172,7 +198,7 @@ bool cpus_equal_capacity(int this_cpu, int that_cpu);
 bool cpus_share_cache(int this_cpu, int that_cpu);
 bool cpus_share_resources(int this_cpu, int that_cpu);
 
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl, int cpu);
 typedef int (*sched_domain_flags_f)(void);
 
 struct sd_data {
author	Peter Zijlstra <peterz@infradead.org>	2025-08-25 12:02:44 +0000
committer	Peter Zijlstra <peterz@infradead.org>	2025-09-03 10:03:12 +0200
commit	661f951e371cc134ea31c84238dbdc9a898b8403 (patch)
tree	0fcdfb9c78d79229269eeda5dde97e8c28c51512 /include/linux/sched
parent	8fd5485fb4f3d9da3977fd783fcb8e5452463420 (diff)