From df0d98475954d655571979aa061ecb07d7e00392 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Wed, 1 Apr 2026 14:52:13 -0700 Subject: sched/cache: Introduce infrastructure for cache-aware load balancing Adds infrastructure to enable cache-aware load balancing, which improves cache locality by grouping tasks that share resources within the same cache domain. This reduces cache misses and improves overall data access efficiency. In this initial implementation, threads belonging to the same process are treated as entities that likely share working sets. The mechanism tracks per-process CPU occupancy across cache domains and attempts to migrate threads toward cache-hot domains where their process already has active threads, thereby enhancing locality. This provides a basic model for cache affinity. While the current code targets the last-level cache (LLC), the approach could be extended to other domain types such as clusters (L2) or node-internal groupings. At present, the mechanism selects the CPU within an LLC that has the highest recent runtime. Subsequent patches in this series will use this information in the load-balancing path to guide task placement toward preferred LLCs. In the future, more advanced policies could be integrated through NUMA balancing -- for example, migrating a task to its preferred LLC when spare capacity exists, or swapping tasks across LLCs to improve cache affinity. Grouping of tasks could also be generalized from that of a process to be that of a NUMA group, or be user configurable. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/6269a53221b9439b9ca00d18a9d1946fb64d8cff.1775065312.git.tim.c.chen@linux.intel.com --- kernel/fork.c | 6 ++ kernel/sched/core.c | 6 ++ kernel/sched/fair.c | 266 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 14 +++ 4 files changed, 292 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 079802cb6100..61042bc3482d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -724,6 +724,7 @@ void __mmdrop(struct mm_struct *mm) cleanup_lazy_tlbs(mm); WARN_ON_ONCE(mm == current->active_mm); + mm_destroy_sched(mm); mm_free_pgd(mm); mm_free_id(mm); destroy_context(mm); @@ -1125,6 +1126,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (mm_alloc_cid(mm, p)) goto fail_cid; + if (mm_alloc_sched(mm)) + goto fail_sched; + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS)) goto fail_pcpu; @@ -1134,6 +1138,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, return mm; fail_pcpu: + mm_destroy_sched(mm); +fail_sched: mm_destroy_cid(mm); fail_cid: destroy_context(mm); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 49cd5d217161..7e0b55e7ef5c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4434,6 +4434,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; + init_sched_mm(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -8962,6 +8963,11 @@ void __init sched_init(void) rq->core_cookie = 0UL; #endif +#ifdef CONFIG_SCHED_CACHE + raw_spin_lock_init(&rq->cpu_epoch_lock); + rq->cpu_epoch_next = jiffies; +#endif + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); } diff --git a/kernel/sched/fair.c 
b/kernel/sched/fair.c index 12890ef16603..c9cd064223e5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1321,6 +1321,8 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } +static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec); + static s64 update_se(struct rq *rq, struct sched_entity *se) { u64 now = rq_clock_task(rq); @@ -1343,6 +1345,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) trace_sched_stat_runtime(running, delta_exec); account_group_exec_runtime(running, delta_exec); + account_mm_sched(rq, running, delta_exec); /* cgroup time is always accounted against the donor */ cgroup_account_cputime(donor, delta_exec); @@ -1364,6 +1367,267 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) static void set_next_buddy(struct sched_entity *se); +#ifdef CONFIG_SCHED_CACHE + +/* + * XXX numbers come from a place the sun don't shine -- probably wants to be SD + * tunable or so. + */ +#define EPOCH_PERIOD (HZ / 100) /* 10 ms */ +#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ + +static int llc_id(int cpu) +{ + if (cpu < 0) + return -1; + + return per_cpu(sd_llc_id, cpu); +} + +void mm_init_sched(struct mm_struct *mm, + struct sched_cache_time __percpu *_pcpu_sched) +{ + unsigned long epoch = 0; + int i; + + for_each_possible_cpu(i) { + struct sched_cache_time *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); + struct rq *rq = cpu_rq(i); + + pcpu_sched->runtime = 0; + /* a slightly stale cpu epoch is acceptible */ + pcpu_sched->epoch = rq->cpu_epoch; + epoch = rq->cpu_epoch; + } + + raw_spin_lock_init(&mm->sc_stat.lock); + mm->sc_stat.epoch = epoch; + mm->sc_stat.cpu = -1; + + /* + * The update to mm->sc_stat should not be reordered + * before initialization to mm's other fields, in case + * the readers may get invalid mm_sched_epoch, etc. 
+ */ + smp_store_release(&mm->sc_stat.pcpu_sched, _pcpu_sched); +} + +/* because why would C be fully specified */ +static __always_inline void __shr_u64(u64 *val, unsigned int n) +{ + if (n >= 64) { + *val = 0; + return; + } + *val >>= n; +} + +static inline void __update_mm_sched(struct rq *rq, + struct sched_cache_time *pcpu_sched) +{ + lockdep_assert_held(&rq->cpu_epoch_lock); + + unsigned long n, now = jiffies; + long delta = now - rq->cpu_epoch_next; + + if (delta > 0) { + n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; + rq->cpu_epoch += n; + rq->cpu_epoch_next += n * EPOCH_PERIOD; + __shr_u64(&rq->cpu_runtime, n); + } + + n = rq->cpu_epoch - pcpu_sched->epoch; + if (n) { + pcpu_sched->epoch += n; + __shr_u64(&pcpu_sched->runtime, n); + } +} + +static unsigned long fraction_mm_sched(struct rq *rq, + struct sched_cache_time *pcpu_sched) +{ + guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); + + __update_mm_sched(rq, pcpu_sched); + + /* + * Runtime is a geometric series (r=0.5) and as such will sum to twice + * the accumulation period, this means the multiplication here should + * not overflow. + */ + return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); +} + +static inline +void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) +{ + struct sched_cache_time *pcpu_sched; + struct mm_struct *mm = p->mm; + unsigned long epoch; + + if (!sched_cache_enabled()) + return; + + if (p->sched_class != &fair_sched_class) + return; + /* + * init_task, kthreads and user thread created + * by user_mode_thread() don't have mm. + */ + if (!mm || !mm->sc_stat.pcpu_sched) + return; + + pcpu_sched = per_cpu_ptr(p->mm->sc_stat.pcpu_sched, cpu_of(rq)); + + scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { + __update_mm_sched(rq, pcpu_sched); + pcpu_sched->runtime += delta_exec; + rq->cpu_runtime += delta_exec; + epoch = rq->cpu_epoch; + } + + /* + * If this process hasn't hit task_cache_work() for a while, invalidate + * its preferred state. 
+ */ + if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT) { + if (mm->sc_stat.cpu != -1) + mm->sc_stat.cpu = -1; + } +} + +static void task_tick_cache(struct rq *rq, struct task_struct *p) +{ + struct callback_head *work = &p->cache_work; + struct mm_struct *mm = p->mm; + unsigned long epoch; + + if (!sched_cache_enabled()) + return; + + if (!mm || !mm->sc_stat.pcpu_sched) + return; + + epoch = rq->cpu_epoch; + /* avoid moving backwards */ + if (time_after_eq(mm->sc_stat.epoch, epoch)) + return; + + guard(raw_spinlock)(&mm->sc_stat.lock); + + if (work->next == work) { + task_work_add(p, work, TWA_RESUME); + WRITE_ONCE(mm->sc_stat.epoch, epoch); + } +} + +static void task_cache_work(struct callback_head *work) +{ + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long curr_m_a_occ = 0; + int cpu, m_a_cpu = -1; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); + + work->next = work; + + if (p->flags & PF_EXITING) + return; + + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + + scoped_guard (cpus_read_lock) { + guard(rcu)(); + + cpumask_copy(cpus, cpu_online_mask); + + for_each_cpu(cpu, cpus) { + /* XXX sched_cluster_active */ + struct sched_domain *sd = per_cpu(sd_llc, cpu); + unsigned long occ, m_occ = 0, a_occ = 0; + int m_cpu = -1, i; + + if (!sd) + continue; + + for_each_cpu(i, sched_domain_span(sd)) { + occ = fraction_mm_sched(cpu_rq(i), + per_cpu_ptr(mm->sc_stat.pcpu_sched, i)); + a_occ += occ; + if (occ > m_occ) { + m_occ = occ; + m_cpu = i; + } + } + + /* + * Compare the accumulated occupancy of each LLC. The + * reason for using accumulated occupancy rather than average + * per CPU occupancy is that it works better in asymmetric LLC + * scenarios. + * For example, if there are 2 threads in a 4CPU LLC and 3 + * threads in an 8CPU LLC, it might be better to choose the one + * with 3 threads. 
However, this would not be the case if the + * occupancy is divided by the number of CPUs in an LLC (i.e., + * if average per CPU occupancy is used). + * Besides, NUMA balancing fault statistics behave similarly: + * the total number of faults per node is compared rather than + * the average number of faults per CPU. This strategy is also + * followed here. + */ + if (a_occ > m_a_occ) { + m_a_occ = a_occ; + m_a_cpu = m_cpu; + } + + if (llc_id(cpu) == llc_id(mm->sc_stat.cpu)) + curr_m_a_occ = a_occ; + + cpumask_andnot(cpus, cpus, sched_domain_span(sd)); + } + } + + if (m_a_occ > (2 * curr_m_a_occ)) { + /* + * Avoid switching sc_stat.cpu too fast. + * The reason to choose 2X is because: + * 1. It is better to keep the preferred LLC stable, + * rather than changing it frequently and cause migrations + * 2. 2X means the new preferred LLC has at least 1 more + * busy CPU than the old one(200% vs 100%, eg) + * 3. 2X is chosen based on test results, as it delivers + * the optimal performance gain so far. + */ + mm->sc_stat.cpu = m_a_cpu; + } + + free_cpumask_var(cpus); +} + +void init_sched_mm(struct task_struct *p) +{ + struct callback_head *work = &p->cache_work; + + init_task_work(work, task_cache_work); + work->next = work; +} + +#else /* CONFIG_SCHED_CACHE */ + +static inline void account_mm_sched(struct rq *rq, struct task_struct *p, + s64 delta_exec) { } + +void init_sched_mm(struct task_struct *p) { } + +static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + +#endif /* CONFIG_SCHED_CACHE */ + /* * Used by other classes to account runtime. 
*/ @@ -13653,6 +13917,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + task_tick_cache(rq, curr); + update_misfit_status(curr, rq); check_update_overutilized_status(task_rq(curr)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c95584191d58..f939d45fe043 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1178,6 +1178,12 @@ struct rq { struct scx_rq scx; struct sched_dl_entity ext_server; #endif +#ifdef CONFIG_SCHED_CACHE + raw_spinlock_t cpu_epoch_lock ____cacheline_aligned; + u64 cpu_runtime; + unsigned long cpu_epoch; + unsigned long cpu_epoch_next; +#endif struct sched_dl_entity fair_server; @@ -4041,6 +4047,14 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_SCHED_MM_CID */ +#ifdef CONFIG_SCHED_CACHE +static inline bool sched_cache_enabled(void) +{ + return false; +} +#endif +extern void init_sched_mm(struct task_struct *p); + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); static inline -- cgit v1.2.3 From b4606faab3188beeacc2287b8a369cca943cc8eb Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 1 Apr 2026 14:52:14 -0700 Subject: sched/cache: Limit the scan number of CPUs when calculating task occupancy When NUMA balancing is enabled, the kernel currently iterates over all online CPUs to aggregate process-wide occupancy data. On large systems, this global scan introduces significant overhead. To reduce scan latency, limit the search to a subset of relevant CPUs: 1. The task's preferred NUMA node. 2. The node where the task is currently running. 3. The node that contains the task's current preferred LLC.. 
While focusing solely on the preferred NUMA node is ideal, a process-wide scan must remain flexible because the "preferred node" is a per-task attribute. Different threads within the same process may have different preferred nodes, causing the process-wide preference to migrate. Maintaining a mask that covers both the preferred and active running nodes ensures accuracy while significantly reducing the number of CPUs inspected. Future work may integrate numa_group to further refine task aggregation. Suggested-by: Madadi Vineeth Reddy Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/57ed5fcec9b242803fe4ea2ce6e7f3de6a6efc6b.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c9cd064223e5..a55ada22e40c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1522,6 +1522,51 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) } } +static void get_scan_cpumasks(cpumask_var_t cpus, struct task_struct *p) +{ +#ifdef CONFIG_NUMA_BALANCING + int cpu, curr_cpu, nid, pref_nid; + + if (!static_branch_likely(&sched_numa_balancing)) + goto out; + + cpu = p->mm->sc_stat.cpu; + if (cpu != -1) + nid = cpu_to_node(cpu); + curr_cpu = task_cpu(p); + + /* + * Scanning in the preferred NUMA node is ideal. However, the NUMA + * preferred node is per-task rather than per-process. It is possible + * for different threads of the process to have distinct preferred + * nodes; consequently, the process-wide preferred LLC may bounce + * between different nodes. As a workaround, maintain the scan + * CPU mask to also cover the process's current preferred LLC and the + * current running node to mitigate the bouncing risk. + * TBD: numa_group should be considered during task aggregation. 
+ */ + pref_nid = p->numa_preferred_nid; + /* honor the task's preferred node */ + if (pref_nid == NUMA_NO_NODE) + goto out; + + cpumask_or(cpus, cpus, cpumask_of_node(pref_nid)); + + /* honor the task's preferred LLC CPU */ + if (cpu != -1 && !cpumask_test_cpu(cpu, cpus) && nid != NUMA_NO_NODE) + cpumask_or(cpus, cpus, cpumask_of_node(nid)); + + /* make sure the task's current running node is included */ + if (!cpumask_test_cpu(curr_cpu, cpus)) + cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu))); + + return; + +out: +#endif + cpumask_copy(cpus, cpu_online_mask); +} + static void task_cache_work(struct callback_head *work) { struct task_struct *p = current; @@ -1544,7 +1589,7 @@ static void task_cache_work(struct callback_head *work) scoped_guard (cpus_read_lock) { guard(rcu)(); - cpumask_copy(cpus, cpu_online_mask); + get_scan_cpumasks(cpus, p); for_each_cpu(cpu, cpus) { /* XXX sched_cluster_active */ -- cgit v1.2.3 From f025ef275388742643a2c33f00a0d9c0af3112ee Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 1 Apr 2026 14:52:15 -0700 Subject: sched/cache: Record per LLC utilization to guide cache aware scheduling decisions When a system becomes busy and a process's preferred LLC is saturated with too many threads, tasks within that LLC migrate frequently. These in LLC migrations introduce latency and degrade performance. To avoid this, task aggregation should be suppressed when the preferred LLC is overloaded, which requires a metric to indicate LLC utilization. Record per LLC utilization/cpu capacity during periodic load balancing. These statistics will be used in later patches to decide whether tasks should be aggregated into their preferred LLC. 
Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/a48151b3d57f2a42a5971aaead1b7f81e69229f4.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a55ada22e40c..6647d465b59e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9992,6 +9992,28 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ return 0; } +#ifdef CONFIG_SCHED_CACHE +static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +{ + struct sched_domain_shared *sd_share; + + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + if (!sd_share) + return false; + + *util = READ_ONCE(sd_share->util_avg); + *cap = READ_ONCE(sd_share->capacity); + + return true; +} +#else +static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +{ + return false; +} +#endif /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -10948,6 +10970,53 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) return check_cpu_capacity(rq, sd); } +#ifdef CONFIG_SCHED_CACHE +/* + * Record the statistics for this scheduler group for later + * use. These values guide load balancing on aggregating tasks + * to a LLC. + */ +static void record_sg_llc_stats(struct lb_env *env, + struct sg_lb_stats *sgs, + struct sched_group *group) +{ + struct sched_domain_shared *sd_share; + int cpu; + + if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE) + return; + + /* Only care about sched domain spanning multiple LLCs */ + if (env->sd->child != rcu_dereference_all(per_cpu(sd_llc, env->dst_cpu))) + return; + + /* + * At this point we know this group spans a LLC domain. 
+ * Record the statistic of this group in its corresponding + * shared LLC domain. + * Note: sd_share cannot be obtained via sd->child->shared, + * because the latter refers to the domain that covers the + * local group. Instead, sd_share should be located using + * the first CPU of the LLC group. + */ + cpu = cpumask_first(sched_group_span(group)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + if (!sd_share) + return; + + if (READ_ONCE(sd_share->util_avg) != sgs->group_util) + WRITE_ONCE(sd_share->util_avg, sgs->group_util); + + if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) + WRITE_ONCE(sd_share->capacity, sgs->group_capacity); +} +#else +static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ +} +#endif + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -11035,6 +11104,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + record_sg_llc_stats(env, sgs, group); /* Computing avg_load makes sense only when group is overloaded */ if (sgs->group_type == group_overloaded) sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / -- cgit v1.2.3 From 23b2b5ccc45ce2a38b9336a916088fffdc4cdfb1 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 1 Apr 2026 14:52:16 -0700 Subject: sched/cache: Introduce helper functions to enforce LLC migration policy Cache-aware scheduling aggregates threads onto their preferred LLC, mainly through load balancing. When the preferred LLC becomes saturated, more threads are still placed there, increasing latency. A mechanism is needed to limit aggregation so that the preferred LLC does not become overloaded. Introduce helper functions can_migrate_llc() and can_migrate_llc_task() to enforce the LLC migration policy: 1. 
Aggregate a task to its preferred LLC if both source and destination LLCs are not too busy, or if doing so will not leave the preferred LLC much more imbalanced than the non-preferred one (>20% utilization difference, a little higher than the default imbalance_pct(17%) of the LLC domain as hysteresis). Later this threshold will be turned into a debugfs tunable. 2. Allow moving a task from an overloaded preferred LLC to a non-preferred LLC if this will not cause the non-preferred LLC to become too imbalanced to cause a later migration back. 3. If both LLCs are too busy, let the generic load balancer spread the tasks. Further (hysteresis) action could be taken in the future to prevent tasks from being migrated into and out of the preferred LLC frequently (back and forth): the threshold for migrating a task out of its preferred LLC should be higher than that for migrating it into the LLC. Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/d19b52589cdceaee5e625980959f4d1982d6d7c9.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6647d465b59e..7860c5bc12d7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9993,6 +9993,38 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ } #ifdef CONFIG_SCHED_CACHE +/* + * The margin used when comparing LLC utilization with CPU capacity. + * It determines the LLC load level where active LLC aggregation is + * done. + * Derived from fits_capacity(). + * + * (default: ~50%, tunable via debugfs) + */ +static bool fits_llc_capacity(unsigned long util, unsigned long max) +{ + u32 aggr_pct = 50; + + /* + * For non-SMT systems (one thread per core), raise the aggregation + * threshold to accommodate more tasks. 
+ */ + if (cpu_smt_num_threads == 1) + aggr_pct = (aggr_pct * 3 / 2); + + return util * 100 < max * aggr_pct; +} + +/* + * The margin used when comparing utilization. + * is 'util1' noticeably greater than 'util2' + * Derived from capacity_greater(). + * Bias is in percentage. + */ +/* Allows dst util to be bigger than src util by up to bias percent */ +#define util_greater(util1, util2) \ + ((util1) * 100 > (util2) * 120) + static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, unsigned long *cap) { @@ -10007,6 +10039,141 @@ static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, return true; } + +/* + * Decision matrix according to the LLC utilization. To + * decide whether we can do task aggregation across LLC. + * + * By default, 50% is the threshold for treating the LLC + * as busy. The reason for choosing 50% is to avoid saturation + * of SMT-2, and it is also a safe cutoff for other SMT-n + * platforms. SMT-1 has a higher threshold because it is + * supposed to accommodate more tasks, see fits_llc_capacity(). + * + * 20% is the utilization imbalance percentage to decide + * if the preferred LLC is busier than the non-preferred LLC. + * 20 is a little higher than the LLC domain's imbalance_pct + * 17. The hysteresis is used to avoid task bouncing between the + * preferred LLC and the non-preferred LLC, and it will + * be turned into a debugfs tunable. + * + * 1. moving towards the preferred LLC, dst is the preferred + * LLC, src is not. + * + * src \ dst 30% 40% 50% 60% + * 30% Y Y Y N + * 40% Y Y Y Y + * 50% Y Y G G + * 60% Y Y G G + * + * 2. moving out of the preferred LLC, src is the preferred + * LLC, dst is not: + * + * src \ dst 30% 40% 50% 60% + * 30% N N N N + * 40% N N N N + * 50% N N G G + * 60% Y N G G + * + * src : src_util + * dst : dst_util + * Y : Yes, migrate + * N : No, do not migrate + * G : let the Generic load balance to even the load. 
+ * + * The intention is that if both LLCs are quite busy, cache aware + * load balance should not be performed, and generic load balance + * should take effect. However, if one is busy and the other is not, + * the preferred LLC capacity(50%) and imbalance criteria(20%) should + * be considered to determine whether LLC aggregation should be + * performed to bias the load towards the preferred LLC. + */ + +/* migration decision, 3 states are orthogonal. */ +enum llc_mig { + mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */ + mig_llc, /* Y: Do LLC preference based migration */ + mig_unrestricted /* G: Don't restrict generic load balance migration */ +}; + +/* + * Check if task can be moved from the source LLC to the + * destination LLC without breaking cache aware preference. + * src_cpu and dst_cpu are arbitrary CPUs within the source + * and destination LLCs, respectively. + */ +static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, + unsigned long tsk_util, + bool to_pref) +{ + unsigned long src_util, dst_util, src_cap, dst_cap; + + if (!get_llc_stats(src_cpu, &src_util, &src_cap) || + !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) + return mig_unrestricted; + + src_util = src_util < tsk_util ? 0 : src_util - tsk_util; + dst_util = dst_util + tsk_util; + + if (!fits_llc_capacity(dst_util, dst_cap) && + !fits_llc_capacity(src_util, src_cap)) + return mig_unrestricted; + + if (to_pref) { + /* + * Don't migrate if we will get preferred LLC too + * heavily loaded and if the dest is much busier + * than the src, in which case migration will + * increase the imbalance too much. + */ + if (!fits_llc_capacity(dst_util, dst_cap) && + util_greater(dst_util, src_util)) + return mig_forbid; + } else { + /* + * Don't migrate if we will leave preferred LLC + * too idle, or if this migration leaves the + * non-preferred LLC within the util_greater() margin + * of preferred LLC, leading to migration again + * back to preferred LLC. 
+ */ + if (fits_llc_capacity(src_util, src_cap) || + !util_greater(src_util, dst_util)) + return mig_forbid; + } + return mig_llc; +} + +/* + * Check if task p can migrate from source LLC to + * destination LLC in terms of cache aware load balance. + */ +static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + struct task_struct *p) +{ + struct mm_struct *mm; + bool to_pref; + int cpu; + + mm = p->mm; + if (!mm) + return mig_unrestricted; + + cpu = mm->sc_stat.cpu; + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + + if (cpus_share_cache(dst_cpu, cpu)) + to_pref = true; + else if (cpus_share_cache(src_cpu, cpu)) + to_pref = false; + else + return mig_unrestricted; + + return can_migrate_llc(src_cpu, dst_cpu, + task_util(p), to_pref); +} + #else static inline bool get_llc_stats(int cpu, unsigned long *util, unsigned long *cap) -- cgit v1.2.3 From b5ea300a17e37eada7a98561fbd34a3054578713 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:17 -0700 Subject: sched/cache: Make LLC id continuous Introduce an index mapping between CPUs and their LLCs. This provides a roughly continuous per LLC index needed for cache-aware load balancing in later patches. The existing per_cpu llc_id usually points to the first CPU of the LLC domain, which is sparse and unsuitable as an array index. Using llc_id directly would waste memory. With the new mapping, CPUs in the same LLC share an approximate continuous id: per_cpu(llc_id, CPU=0...15) = 0 per_cpu(llc_id, CPU=16...31) = 1 per_cpu(llc_id, CPU=32...47) = 2 ... Note that the LLC IDs are allocated via bitmask, so the IDs may be reused during CPU offline->online transitions. 
Suggested-by: Peter Zijlstra (Intel) Originally-by: K Prateek Nayak Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/047ef46339e4db497b54a89940a7ebedf27fcf28.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/core.c | 2 ++ kernel/sched/sched.h | 3 ++ kernel/sched/topology.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 93 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7e0b55e7ef5c..d11e27be7697 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8630,6 +8630,8 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu(); + sched_domains_free_llc_id(cpu); + sched_set_rq_offline(rq, cpu); scx_rq_deactivate(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f939d45fe043..3cb3ab02b1eb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -4053,6 +4053,9 @@ static inline bool sched_cache_enabled(void) return false; } #endif + +void sched_domains_free_llc_id(int cpu); + extern void init_sched_mm(struct task_struct *p); extern u64 avg_vruntime(struct cfs_rq *cfs_rq); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5847b83d9d55..1200670969bb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -19,8 +19,10 @@ void sched_domains_mutex_unlock(void) } /* Protected by sched_domains_mutex: */ +static cpumask_var_t sched_domains_llc_id_allocmask; static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; +int max_lid; static int __init sched_debug_setup(char *str) { @@ -663,7 +665,7 @@ static void destroy_sched_domains(struct sched_domain *sd) */ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); -DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_llc_id) = -1; DEFINE_PER_CPU(int, sd_share_id); DEFINE_PER_CPU(struct sched_domain_shared 
__rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); @@ -692,7 +694,6 @@ static void update_top_cache_domain(int cpu) rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; - per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_CLUSTER); @@ -1790,6 +1791,11 @@ const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu { return cpu_coregroup_mask(cpu); } + +#define llc_mask(cpu) cpu_coregroup_mask(cpu) + +#else +#define llc_mask(cpu) cpumask_of(cpu) #endif const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) @@ -2650,6 +2656,61 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc) } } +static int __sched_domains_alloc_llc_id(void) +{ + int lid, max; + + lockdep_assert_held(&sched_domains_mutex); + + lid = cpumask_first_zero(sched_domains_llc_id_allocmask); + /* + * llc_id space should never grow larger than the + * possible number of CPUs in the system. + */ + if (lid >= nr_cpu_ids) + return -1; + + __cpumask_set_cpu(lid, sched_domains_llc_id_allocmask); + max = cpumask_last(sched_domains_llc_id_allocmask); + if (max > max_lid) + max_lid = max; + + return lid; +} + +static void __sched_domains_free_llc_id(int cpu) +{ + int i, lid, max; + + lockdep_assert_held(&sched_domains_mutex); + + lid = per_cpu(sd_llc_id, cpu); + if (lid == -1 || lid >= nr_cpu_ids) + return; + + per_cpu(sd_llc_id, cpu) = -1; + + for_each_cpu(i, llc_mask(cpu)) { + /* An online CPU owns the llc_id. 
*/ + if (per_cpu(sd_llc_id, i) == lid) + return; + } + + __cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask); + + max = cpumask_last(sched_domains_llc_id_allocmask); + /* shrink max lid to save memory */ + if (max < max_lid) + max_lid = max; +} + +void sched_domains_free_llc_id(int cpu) +{ + sched_domains_mutex_lock(); + __sched_domains_free_llc_id(cpu); + sched_domains_mutex_unlock(); +} + /* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs @@ -2675,6 +2736,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; + int lid; sd = NULL; for_each_sd_topology(tl) { @@ -2688,6 +2750,29 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } + + lid = per_cpu(sd_llc_id, i); + if (lid == -1) { + /* try to reuse the llc_id of its siblings */ + for (int j = cpumask_first(llc_mask(i)); + j < nr_cpu_ids; + j = cpumask_next(j, llc_mask(i))) { + if (i == j) + continue; + + lid = per_cpu(sd_llc_id, j); + + if (lid != -1) { + per_cpu(sd_llc_id, i) = lid; + + break; + } + } + + /* a new LLC is detected */ + if (lid == -1) + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id(); + } } if (WARN_ON(!topology_span_sane(cpu_map))) @@ -2831,6 +2916,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map) { int err; + zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL); zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL); zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); -- cgit v1.2.3 From 47d8696b95f7397fe7cad2d194d550ffe82efc15 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:18 -0700 Subject: sched/cache: Assign preferred LLC ID to processes With cache-aware scheduling enabled, each task is 
assigned a preferred LLC ID. This allows quick identification of the LLC domain where the task prefers to run, similar to numa_preferred_nid in NUMA balancing. Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/f2ceecba5858680349ad4ce9303a2121f0bb7272.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7860c5bc12d7..6e78ecfb560e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1459,11 +1459,43 @@ static unsigned long fraction_mm_sched(struct rq *rq, return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); } +static int get_pref_llc(struct task_struct *p, struct mm_struct *mm) +{ + int mm_sched_llc = -1; + + if (!mm) + return -1; + + if (mm->sc_stat.cpu != -1) { + mm_sched_llc = llc_id(mm->sc_stat.cpu); + +#ifdef CONFIG_NUMA_BALANCING + /* + * Don't assign preferred LLC if it + * conflicts with NUMA balancing. + * This can happen when sched_setnuma() gets + * called, however it is not much of an issue + * because we expect account_mm_sched() to get + * called fairly regularly -- at a higher rate + * than sched_setnuma() at least -- and thus the + * conflict only exists for a short period of time. 
+ */ + if (static_branch_likely(&sched_numa_balancing) && + p->numa_preferred_nid >= 0 && + cpu_to_node(mm->sc_stat.cpu) != p->numa_preferred_nid) + mm_sched_llc = -1; +#endif + } + + return mm_sched_llc; +} + static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) { struct sched_cache_time *pcpu_sched; struct mm_struct *mm = p->mm; + int mm_sched_llc = -1; unsigned long epoch; if (!sched_cache_enabled()) @@ -1495,6 +1527,11 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; } + + mm_sched_llc = get_pref_llc(p, mm); + + if (READ_ONCE(p->preferred_llc) != mm_sched_llc) + WRITE_ONCE(p->preferred_llc, mm_sched_llc); } static void task_tick_cache(struct rq *rq, struct task_struct *p) @@ -1671,6 +1708,12 @@ void init_sched_mm(struct task_struct *p) { } static void task_tick_cache(struct rq *rq, struct task_struct *p) { } +static inline int get_pref_llc(struct task_struct *p, + struct mm_struct *mm) +{ + return -1; +} + #endif /* CONFIG_SCHED_CACHE */ /* -- cgit v1.2.3 From 46afe3af7ead57190b6d362e214814ec804e3b7b Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:19 -0700 Subject: sched/cache: Track LLC-preferred tasks per runqueue For each runqueue, track the number of tasks with an LLC preference and how many of them are running on their preferred LLC. This mirrors nr_numa_running and nr_preferred_running for NUMA balancing, and will be used by cache-aware load balancing in later patches. 
Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/459a37102f3d74a4e09ea58401d2094ac731d044.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/core.c | 5 +++++ kernel/sched/fair.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 8 ++++++++ 3 files changed, 57 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d11e27be7697..eb542c97266a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -539,6 +539,11 @@ void __trace_set_current_state(int state_value) } EXPORT_SYMBOL(__trace_set_current_state); +int task_llc(const struct task_struct *p) +{ + return per_cpu(sd_llc_id, task_cpu(p)); +} + /* * Serialization rules: * diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e78ecfb560e..e66da7a6be3e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1384,6 +1384,30 @@ static int llc_id(int cpu) return per_cpu(sd_llc_id, cpu); } +static void account_llc_enqueue(struct rq *rq, struct task_struct *p) +{ + int pref_llc; + + pref_llc = p->preferred_llc; + if (pref_llc < 0) + return; + + rq->nr_llc_running++; + rq->nr_pref_llc_running += (pref_llc == task_llc(p)); +} + +static void account_llc_dequeue(struct rq *rq, struct task_struct *p) +{ + int pref_llc; + + pref_llc = p->preferred_llc; + if (pref_llc < 0) + return; + + rq->nr_llc_running--; + rq->nr_pref_llc_running -= (pref_llc == task_llc(p)); +} + void mm_init_sched(struct mm_struct *mm, struct sched_cache_time __percpu *_pcpu_sched) { @@ -1490,6 +1514,8 @@ static int get_pref_llc(struct task_struct *p, struct mm_struct *mm) return mm_sched_llc; } +static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); + static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) { @@ -1530,8 +1556,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) 
mm_sched_llc = get_pref_llc(p, mm); - if (READ_ONCE(p->preferred_llc) != mm_sched_llc) + /* task not on rq accounted later in account_entity_enqueue() */ + if (task_running_on_cpu(rq->cpu, p) && + READ_ONCE(p->preferred_llc) != mm_sched_llc) { + account_llc_dequeue(rq, p); WRITE_ONCE(p->preferred_llc, mm_sched_llc); + account_llc_enqueue(rq, p); + } } static void task_tick_cache(struct rq *rq, struct task_struct *p) @@ -1714,6 +1745,10 @@ static inline int get_pref_llc(struct task_struct *p, return -1; } +static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {} + +static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {} + #endif /* CONFIG_SCHED_CACHE */ /* @@ -4200,9 +4235,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (entity_is_task(se)) { + struct task_struct *p = task_of(se); struct rq *rq = rq_of(cfs_rq); - account_numa_enqueue(rq, task_of(se)); + account_numa_enqueue(rq, p); + account_llc_enqueue(rq, p); list_add(&se->group_node, &rq->cfs_tasks); } cfs_rq->nr_queued++; @@ -4213,7 +4250,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); if (entity_is_task(se)) { - account_numa_dequeue(rq_of(cfs_rq), task_of(se)); + struct task_struct *p = task_of(se); + struct rq *rq = rq_of(cfs_rq); + + account_numa_dequeue(rq, p); + account_llc_dequeue(rq, p); list_del_init(&se->group_node); } cfs_rq->nr_queued--; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3cb3ab02b1eb..3c9e92b79041 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1196,6 +1196,12 @@ struct rq { #ifdef CONFIG_NUMA_BALANCING unsigned int numa_migrate_on; #endif + +#ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; +#endif + /* * This is part of a global counter where only the total sum * over all CPUs matters. 
A task can increase this counter on @@ -2077,6 +2083,8 @@ init_numa_balancing(u64 clone_flags, struct task_struct *p) #endif /* !CONFIG_NUMA_BALANCING */ +int task_llc(const struct task_struct *p); + static inline void queue_balance_callback(struct rq *rq, struct balance_callback *head, -- cgit v1.2.3 From a8d0ca0b7f2f7b53565d1e30e509d3d74d1f5460 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:20 -0700 Subject: sched/cache: Introduce per CPU's tasks LLC preference counter The lowest level of sched domain for each CPU is assigned an array where each element tracks the number of tasks preferring a given LLC, indexed from 0 to max_lid. Since each CPU has its dedicated sd, this implies that each CPU will have a dedicated task LLC preference counter. For example, sd->llc_counts[3] = 2 signifies that there are 2 tasks on this runqueue which prefer to run within LLC3. The load balancer can use this information to identify busy runqueues and migrate tasks to their preferred LLC domains. This array will be reallocated at runtime during sched domain rebuild. Introduce the buffer allocation mechanism, and the statistics will be calculated in the subsequent patch. Note: the LLC preference statistics of each CPU are reset on sched domain rebuild and may under count temporarily, until the CPU becomes idle and the count is cleared. This is a trade off to avoid complex data synchronization across sched domain builds. 
Suggested-by: Peter Zijlstra (Intel) Suggested-by: K Prateek Nayak Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/42e79eceb8cd6be8a032401d481d101913bc5703.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/topology.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1200670969bb..8954bf7900ff 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -634,6 +634,11 @@ static void destroy_sched_domain(struct sched_domain *sd) if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) kfree(sd->shared); + +#ifdef CONFIG_SCHED_CACHE + /* only the bottom sd has llc_counts array */ + kfree(sd->llc_counts); +#endif kfree(sd); } @@ -763,10 +768,18 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd && sd_degenerate(sd)) { tmp = sd; sd = sd->parent; - destroy_sched_domain(tmp); + if (sd) { struct sched_group *sg = sd->groups; +#ifdef CONFIG_SCHED_CACHE + /* move buffer to parent as child is being destroyed */ + sd->llc_counts = tmp->llc_counts; + sd->llc_max = tmp->llc_max; + /* make sure destroy_sched_domain() does not free it */ + tmp->llc_counts = NULL; + tmp->llc_max = 0; +#endif /* * sched groups hold the flags of the child sched * domain for convenience. 
Clear such flags since @@ -778,6 +791,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + + destroy_sched_domain(tmp); } sched_domain_debug(sd, cpu); @@ -805,6 +820,49 @@ enum s_alloc { sa_none, }; +#ifdef CONFIG_SCHED_CACHE +static bool alloc_sd_llc(const struct cpumask *cpu_map, + struct s_data *d) +{ + struct sched_domain *sd; + unsigned int *p; + int i; + + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d->sd, i); + if (!sd) + goto err; + + p = kcalloc_node(max_lid + 1, sizeof(unsigned int), + GFP_KERNEL, cpu_to_node(i)); + if (!p) + goto err; + + sd->llc_max = max_lid + 1; + sd->llc_counts = p; + } + + return true; +err: + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d->sd, i); + if (sd) { + kfree(sd->llc_counts); + sd->llc_counts = NULL; + sd->llc_max = 0; + } + } + + return false; +} +#else +static bool alloc_sd_llc(const struct cpumask *cpu_map, + struct s_data *d) +{ + return false; +} +#endif + /* * Return the canonical balance CPU for this group, this is the first CPU * of this group that's also in the balance mask. @@ -2828,6 +2886,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att init_sched_groups_capacity(i, sd); } + alloc_sd_llc(cpu_map, &d); + /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { -- cgit v1.2.3 From 82c960aee304bf286552046b66d5b0b3933b2418 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:21 -0700 Subject: sched/cache: Calculate the percpu sd task LLC preference Calculate the number of tasks' LLC preferences for each runqueue. This statistic is computed during task enqueue and dequeue operations, and is used by the cache-aware load balancing. 
Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/d15a64436d3acd19c5c53344c5e9d3d0b79b3233.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e66da7a6be3e..7d52cf0b85bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1386,6 +1386,7 @@ static int llc_id(int cpu) static void account_llc_enqueue(struct rq *rq, struct task_struct *p) { + struct sched_domain *sd; int pref_llc; pref_llc = p->preferred_llc; @@ -1394,10 +1395,15 @@ static void account_llc_enqueue(struct rq *rq, struct task_struct *p) rq->nr_llc_running++; rq->nr_pref_llc_running += (pref_llc == task_llc(p)); + + sd = rcu_dereference_all(rq->sd); + if (sd && (unsigned int)pref_llc < sd->llc_max) + sd->llc_counts[pref_llc]++; } static void account_llc_dequeue(struct rq *rq, struct task_struct *p) { + struct sched_domain *sd; int pref_llc; pref_llc = p->preferred_llc; @@ -1406,6 +1412,24 @@ static void account_llc_dequeue(struct rq *rq, struct task_struct *p) rq->nr_llc_running--; rq->nr_pref_llc_running -= (pref_llc == task_llc(p)); + + sd = rcu_dereference_all(rq->sd); + if (sd && (unsigned int)pref_llc < sd->llc_max) { + /* + * There is a race condition between dequeue + * and CPU hotplug. After a task has been enqueued + * on CPUx, a CPU hotplug event occurs, and all online + * CPUs (including CPUx) rebuild their sched_domains + * and reset statistics to zero(including sd->llc_counts). + * This can cause temporary undercount and we have to + * check for such underflow in sd->llc_counts. + * + * This undercount is temporary and accurate accounting + * will resume once the rq has a chance to be idle. 
+ */ + if (sd->llc_counts[pref_llc]) + sd->llc_counts[pref_llc]--; + } } void mm_init_sched(struct mm_struct *mm, -- cgit v1.2.3 From 15ad45fb80ca7fe67faf6b51dffce125a801cc5a Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:22 -0700 Subject: sched/cache: Count tasks preferring destination LLC in a sched group During LLC load balancing, tabulate the number of tasks on each runqueue that prefer the LLC containing env->dst_cpu in a sched group. For example, consider a system with 4 LLC sched groups (LLC0 to LLC3) balancing towards LLC3. LLC0 has 3 tasks preferring LLC3, LLC1 has 2, and LLC2 has 1. LLC0, having the most tasks preferring LLC3, is selected as the busiest source to pick tasks from. Within a source LLC, the total number of tasks preferring a destination LLC is computed by summing counts across all CPUs in that LLC. For instance, if LLC0 has CPU0 with 2 tasks and CPU1 with 1 task preferring LLC3, the total for LLC0 is 3. These statistics allow the load balancer to choose tasks from source sched groups that best match their preferred LLCs.
Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/3d8502a33a753c4384b368f97f64ee70b1cea0db.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d52cf0b85bd..cea625c79035 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10834,6 +10834,9 @@ struct sg_lb_stats { unsigned int nr_numa_running; unsigned int nr_preferred_running; #endif +#ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_dst_llc; +#endif }; /* @@ -11328,6 +11331,20 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (cpu_overutilized(i)) sgs->group_overutilized = 1; +#ifdef CONFIG_SCHED_CACHE + if (sched_cache_enabled()) { + struct sched_domain *sd_tmp; + int dst_llc; + + dst_llc = llc_id(env->dst_cpu); + if (llc_id(i) != dst_llc) { + sd_tmp = rcu_dereference_all(rq->sd); + if (sd_tmp && (unsigned int)dst_llc < sd_tmp->llc_max) + sgs->nr_pref_dst_llc += sd_tmp->llc_counts[dst_llc]; + } + } +#endif + /* * No need to call idle_cpu() if nr_running is not 0 */ -- cgit v1.2.3 From 9a5e22fbb0c88bff33458ede98b0fa922fab3831 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:23 -0700 Subject: sched/cache: Check local_group only once in update_sg_lb_stats() There is no need to check the local group twice for both group_asym_packing and group_smt_balance. Adjust the code to facilitate future checks for group types (cache-aware load balancing) as well. No functional changes are expected. 
Suggested-by: Peter Zijlstra (Intel) Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/99a57865c8ae1847087a5c00e92d24351cf3e5a8.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cea625c79035..d3812d18b6d6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11385,14 +11385,16 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; - /* Check if dst CPU is idle and preferred to this group */ - if (!local_group && env->idle && sgs->sum_h_nr_running && - sched_group_asym(env, sgs, group)) - sgs->group_asym_packing = 1; - - /* Check for loaded SMT group to be balanced to dst CPU */ - if (!local_group && smt_balance(env, sgs, group)) - sgs->group_smt_balance = 1; + if (!local_group) { + /* Check if dst CPU is idle and preferred to this group */ + if (env->idle && sgs->sum_h_nr_running && + sched_group_asym(env, sgs, group)) + sgs->group_asym_packing = 1; + + /* Check for loaded SMT group to be balanced to dst CPU */ + if (smt_balance(env, sgs, group)) + sgs->group_smt_balance = 1; + } sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); -- cgit v1.2.3 From f38cc2f0d8a354551d219e7fd95fce3e96868105 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:24 -0700 Subject: sched/cache: Prioritize tasks preferring destination LLC during balancing During LLC load balancing, first check for tasks that prefer the destination LLC and balance them to it before others. Mark source sched groups containing tasks preferring non local LLCs with the group_llc_balance flag. This ensures the load balancer later pulls or pushes these tasks toward their preferred LLCs. 
The priority of group_llc_balance is lower than that of group_overloaded and higher than that of all other group types. This is because group_llc_balance may exacerbate load imbalance, and if the LLC balancing attempt fails, the nr_balance_failed mechanism will trigger other group types to rebalance the load. The load balancer selects the busiest sched_group and migrates tasks to less busy groups to distribute load across CPUs. With cache-aware scheduling enabled, the busiest sched_group is the one with most tasks preferring the destination LLC. If the group has the llc_balance flag set, cache aware load balancing is triggered. Introduce the helper function update_llc_busiest() to identify the sched_group with the most tasks preferring the destination LLC. Suggested-by: K Prateek Nayak Suggested-by: Madadi Vineeth Reddy Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/baa458f45eab3f602af090c6d6af63dc864f5ec6.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d3812d18b6d6..ba4ee9aeea66 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9925,6 +9925,16 @@ enum group_type { * from balancing the load across the system. */ group_imbalanced, + /* + * There are tasks running on non-preferred LLC, possible to move + * them to their preferred LLC without creating too much imbalance. + * The priority of group_llc_balance is lower than that of + * group_overloaded and higher than that of all other group types. + * This is because group_llc_balance may exacerbate load imbalance. + * If the LLC balancing attempt fails, the nr_balance_failed + * mechanism will trigger other group types to rebalance the load. 
+ */ + group_llc_balance, /* * The CPU is overloaded and can't provide expected CPU cycles to all * tasks. @@ -10828,6 +10838,7 @@ struct sg_lb_stats { enum group_type group_type; unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned int group_smt_balance; /* Task on busy SMT be moved */ + unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ unsigned int group_overutilized; /* At least one CPU is overutilized in the group */ #ifdef CONFIG_NUMA_BALANCING @@ -11094,6 +11105,9 @@ group_type group_classify(unsigned int imbalance_pct, if (group_is_overloaded(imbalance_pct, sgs)) return group_overloaded; + if (sgs->group_llc_balance) + return group_llc_balance; + if (sg_imbalanced(group)) return group_imbalanced; @@ -11288,11 +11302,63 @@ static void record_sg_llc_stats(struct lb_env *env, if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) WRITE_ONCE(sd_share->capacity, sgs->group_capacity); } + +/* + * Do LLC balance on sched group that contains LLC, and have tasks preferring + * to run on LLC in idle dst_cpu. + */ +static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + if (!sched_cache_enabled()) + return false; + + if (env->sd->flags & SD_SHARE_LLC) + return false; + + /* + * Skip cache aware tagging if nr_balanced_failed is sufficiently high. + * Threshold of cache_nice_tries is set to 1 higher than nr_balance_failed + * to avoid excessive task migration at the same time. 
+ */ + if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1) + return false; + + if (sgs->nr_pref_dst_llc && + can_migrate_llc(cpumask_first(sched_group_span(group)), + env->dst_cpu, 0, true) == mig_llc) + return true; + + return false; +} + +static bool update_llc_busiest(struct lb_env *env, + struct sg_lb_stats *busiest, + struct sg_lb_stats *sgs) +{ + /* + * There are more tasks that want to run on dst_cpu's LLC. + */ + return sgs->nr_pref_dst_llc > busiest->nr_pref_dst_llc; +} #else static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) { } + +static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + return false; +} + +static bool update_llc_busiest(struct lb_env *env, + struct sg_lb_stats *busiest, + struct sg_lb_stats *sgs) +{ + return false; +} #endif /** @@ -11394,6 +11460,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Check for loaded SMT group to be balanced to dst CPU */ if (smt_balance(env, sgs, group)) sgs->group_smt_balance = 1; + + /* Check for tasks in this group can be moved to their preferred LLC */ + if (llc_balance(env, sgs, group)) + sgs->group_llc_balance = 1; } sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); @@ -11457,6 +11527,10 @@ static bool update_sd_pick_busiest(struct lb_env *env, /* Select the overloaded group with highest avg_load. 
*/ return sgs->avg_load > busiest->avg_load; + case group_llc_balance: + /* Select the group with most tasks preferring dst LLC */ + return update_llc_busiest(env, busiest, sgs); + case group_imbalanced: /* * Select the 1st imbalanced group as we don't have any way to @@ -11719,6 +11793,7 @@ static bool update_pick_idlest(struct sched_group *idlest, return false; break; + case group_llc_balance: case group_imbalanced: case group_asym_packing: case group_smt_balance: @@ -11851,6 +11926,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int return NULL; break; + case group_llc_balance: case group_imbalanced: case group_asym_packing: case group_smt_balance: @@ -12349,7 +12425,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) * group's child domain. */ if (sds.prefer_sibling && local->group_type == group_has_spare && - sibling_imbalance(env, &sds, busiest, local) > 1) + (busiest->group_type == group_llc_balance || + sibling_imbalance(env, &sds, busiest, local) > 1)) goto force_balance; if (busiest->group_type != group_overloaded) { -- cgit v1.2.3 From e4c9a4cb244a273c58e8fd86d7c04e2502822e64 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:25 -0700 Subject: sched/cache: Add migrate_llc_task migration type for cache-aware balancing Introduce a new migration type, migrate_llc_task, to support cache-aware load balancing. After identifying the busiest sched_group (having the most tasks preferring the destination LLC), mark migrations with this type. During load balancing, each runqueue in the busiest sched_group is examined, and the runqueue with the highest number of tasks preferring the destination CPU is selected as the busiest runqueue. 
Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/b9df27c19cc5121ddb2a7d1be7f9d52fec1563dc.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ba4ee9aeea66..68032efd143b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9946,7 +9946,8 @@ enum migration_type { migrate_load = 0, migrate_util, migrate_task, - migrate_misfit + migrate_misfit, + migrate_llc_task }; #define LBF_ALL_PINNED 0x01 @@ -10560,6 +10561,10 @@ static int detach_tasks(struct lb_env *env) env->imbalance = 0; break; + + case migrate_llc_task: + env->imbalance--; + break; } detach_task(p, env); @@ -12179,6 +12184,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return; } +#ifdef CONFIG_SCHED_CACHE + if (busiest->group_type == group_llc_balance) { + /* Move a task that prefer local LLC */ + env->migration_type = migrate_llc_task; + env->imbalance = 1; + return; + } +#endif + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages @@ -12485,7 +12499,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, { struct rq *busiest = NULL, *rq; unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int __maybe_unused busiest_pref_llc = 0; + struct sched_domain __maybe_unused *sd_tmp; unsigned int busiest_nr = 0; + int __maybe_unused dst_llc; int i; for_each_cpu_and(i, sched_group_span(group), env->cpus) { @@ -12613,6 +12630,23 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, break; + case migrate_llc_task: +#ifdef CONFIG_SCHED_CACHE + sd_tmp = rcu_dereference_all(rq->sd); + dst_llc = llc_id(env->dst_cpu); + + if (sd_tmp && (unsigned)dst_llc < sd_tmp->llc_max) { + 
unsigned int this_pref_llc = + sd_tmp->llc_counts[dst_llc]; + + if (busiest_pref_llc < this_pref_llc) { + busiest_pref_llc = this_pref_llc; + busiest = rq; + } + } +#endif + break; + } } @@ -12776,6 +12810,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd case migrate_misfit: __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); break; + case migrate_llc_task: + break; } } -- cgit v1.2.3 From 714059f79ff0ba976cb75360064583c78bbc6f8e Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:26 -0700 Subject: sched/cache: Handle moving single tasks to/from their preferred LLC Cache aware scheduling mainly does two things: 1. Prevent a task from migrating out of its preferred LLC if not necessary. 2. Migrate tasks to their preferred LLC if necessary. For 1: In the generic load balance, if the busiest runqueue has only one task, active balancing may be invoked to move it away. However, this migration might break LLC locality. Prevent regular load balance from migrating a task that prefers the current LLC. The load level and imbalance do not warrant breaking LLC preference per the can_migrate_llc() policy. Here, the benefit of LLC locality outweighs the power efficiency gained from migrating the only runnable task away. Before migration, check whether the task is running on its preferred LLC: Do not move a lone task to another LLC if it would move the task away from its preferred LLC or cause excessive imbalance between LLCs. For 2: On the other hand, if the migration type is migrate_llc_task, it means that there are tasks on the env->src_cpu that want to be migrated to their preferred LLC, launch the active load balance anyway.
Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/9b816d8c27fabf2a9c0e1f61a6b90afe8ec4ad52.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 68032efd143b..bfb6c0c52221 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10293,12 +10293,60 @@ static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu task_util(p), to_pref); } +/* + * Check if active load balance breaks LLC locality in + * terms of cache aware load balance. The load level and + * imbalance do not warrant breaking LLC preference per + * the can_migrate_llc() policy. Here, the benefit of + * LLC locality outweighs the power efficiency gained from + * migrating the only runnable task away. + */ +static inline bool +alb_break_llc(struct lb_env *env) +{ + if (!sched_cache_enabled()) + return false; + + if (cpus_share_cache(env->src_cpu, env->dst_cpu)) + return false; + /* + * All tasks prefer to stay on their current CPU. + * Do not pull a task from its preferred CPU if: + * 1. It is the only task running there(not too imbalance); OR + * 2. Migrating it away from its preferred LLC would violate + * the cache-aware scheduling policy. 
+ */ + if (env->src_rq->nr_pref_llc_running && + env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable) { + unsigned long util = 0; + struct task_struct *cur; + + if (env->src_rq->nr_running <= 1) + return true; + + cur = rcu_dereference_all(env->src_rq->curr); + if (cur) + util = task_util(cur); + + if (can_migrate_llc(env->src_cpu, env->dst_cpu, + util, false) == mig_forbid) + return true; + } + + return false; +} #else static inline bool get_llc_stats(int cpu, unsigned long *util, unsigned long *cap) { return false; } + +static inline bool +alb_break_llc(struct lb_env *env) +{ + return false; +} #endif /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? @@ -12698,6 +12746,9 @@ static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; + if (alb_break_llc(env)) + return 0; + if (asym_active_balance(env)) return 1; @@ -12717,7 +12768,8 @@ static int need_active_balance(struct lb_env *env) return 1; } - if (env->migration_type == migrate_misfit) + if (env->migration_type == migrate_misfit || + env->migration_type == migrate_llc_task) return 1; return 0; -- cgit v1.2.3 From 5b1d5e6db20a6c64ffb95d04578db8c4b0228eea Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 1 Apr 2026 14:52:27 -0700 Subject: sched/cache: Respect LLC preference in task migration and detach During load balancing, make can_migrate_task() consider a task's LLC preference. Prevent a task from being moved out of its preferred LLC. During the regular load balancing, if the task cannot be migrated due to LLC locality, the nr_balance_failed also should not be increased. 
Suggested-by: Peter Zijlstra (Intel) Suggested-by: K Prateek Nayak Co-developed-by: Chen Yu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/53da65f3d59de31e1a1dc59a4093d8dd9d4dc206.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched/sched.h | 13 ++++++++ 2 files changed, 91 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bfb6c0c52221..5f22e5a097cf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9955,6 +9955,7 @@ enum migration_type { #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 #define LBF_ACTIVE_LB 0x10 +#define LBF_LLC_PINNED 0x20 struct lb_env { struct sched_domain *sd; @@ -10267,8 +10268,8 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, * Check if task p can migrate from source LLC to * destination LLC in terms of cache aware load balance. */ -static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, - struct task_struct *p) +static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + struct task_struct *p) { struct mm_struct *mm; bool to_pref; @@ -10335,6 +10336,46 @@ alb_break_llc(struct lb_env *env) return false; } + +/* + * Check if migrating task p from env->src_cpu to + * env->dst_cpu breaks LLC localiy. + */ +static bool migrate_degrades_llc(struct task_struct *p, struct lb_env *env) +{ + if (!sched_cache_enabled()) + return false; + + if (task_has_sched_core(p)) + return false; + /* + * Skip over tasks that would degrade LLC locality; + * only when nr_balanced_failed is sufficiently high do we + * ignore this constraint. + * + * Threshold of cache_nice_tries is set to 1 higher + * than nr_balance_failed to avoid excessive task + * migration at the same time. 
+ */ + if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1) + return false; + + /* + * We know the env->src_cpu has some tasks prefer to + * run on env->dst_cpu, skip the tasks do not prefer + * env->dst_cpu, and find the one that prefers. + */ + if (env->migration_type == migrate_llc_task && + READ_ONCE(p->preferred_llc) != llc_id(env->dst_cpu)) + return true; + + if (can_migrate_llc_task(env->src_cpu, + env->dst_cpu, p) != mig_forbid) + return false; + + return true; +} + #else static inline bool get_llc_stats(int cpu, unsigned long *util, unsigned long *cap) @@ -10347,6 +10388,12 @@ alb_break_llc(struct lb_env *env) { return false; } + +static inline bool +migrate_degrades_llc(struct task_struct *p, struct lb_env *env) +{ + return false; +} #endif /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? @@ -10444,10 +10491,29 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 1; degrades = migrate_degrades_locality(p, env); - if (!degrades) + if (!degrades) { + /* + * If the NUMA locality is not broken, + * further check if migration would hurt + * LLC locality. + */ + if (migrate_degrades_llc(p, env)) { + /* + * If regular load balancing fails to pull a task + * due to LLC locality, this is expected behavior + * and we set LBF_LLC_PINNED so we don't increase + * nr_balance_failed unecessarily. + */ + if (env->migration_type != migrate_llc_task) + env->flags |= LBF_LLC_PINNED; + + return 0; + } + hot = task_hot(p, env); - else + } else { hot = degrades > 0; + } if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (hot) @@ -13067,9 +13133,16 @@ more_balance: * * Similarly for migration_misfit which is not related to * load/util migration, don't pollute nr_balance_failed. + * + * The same for cache aware scheduling's allowance for + * load imbalance. If regular load balance does not + * migrate task due to LLC locality, it is a expected + * behavior and don't pollute nr_balance_failed. 
+ * See can_migrate_task(). */ if (idle != CPU_NEWLY_IDLE && - env.migration_type != migrate_misfit) + env.migration_type != migrate_misfit && + !(env.flags & LBF_LLC_PINNED)) sd->nr_balance_failed++; if (need_active_balance(&env)) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3c9e92b79041..a56619b3761f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1547,6 +1547,14 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); extern void sched_core_get(void); extern void sched_core_put(void); +static inline bool task_has_sched_core(struct task_struct *p) +{ + if (sched_core_disabled()) + return false; + + return !!p->core_cookie; +} + #else /* !CONFIG_SCHED_CORE: */ static inline bool sched_core_enabled(struct rq *rq) @@ -1587,6 +1595,11 @@ static inline bool sched_group_cookie_match(struct rq *rq, return true; } +static inline bool task_has_sched_core(struct task_struct *p) +{ + return false; +} + #endif /* !CONFIG_SCHED_CORE */ #ifdef CONFIG_RT_GROUP_SCHED -- cgit v1.2.3 From d59f4fd1d303987f434bcf0b8191e89ca1d6a67c Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 1 Apr 2026 14:52:30 -0700 Subject: sched/cache: Enable cache aware scheduling for multi LLCs NUMA node Introduce sched_cache_present to enable cache aware scheduling for multi LLCs NUMA node. Cache-aware load balancing should only be enabled if there are more than 1 LLC within 1 NUMA node. sched_cache_present is introduced to indicate whether this platform supports this topology. Test results: The first test platform is a 2 socket Intel Sapphire Rapids with 30 cores per socket. The DRAM interleaving is enabled in the BIOS so it essentially has one NUMA node with two last level caches. There are 60 CPUs associated with each last level cache. The second test platform is an AMD Genoa. There are 4 Nodes and 32 CPUs per node. Each node has 2 CCXs and each CCX has 16 CPUs.
hackbench/schbench/netperf/stream/stress-ng/chacha20 were launched on these two platforms. [TL;DR] Sapphire Rapids: hackbench shows significant improvement when the number of different active threads is below the capacity of an LLC. schbench shows limited wakeup latency improvement. ChaCha20-xiangshan(risc-v simulator) shows good throughput improvement. No obvious difference was observed in netperf/stream/stress-ng in Hmean. Genoa: Significant improvement is observed in hackbench when the active number of threads is lower than the number of CPUs within 1 LLC. On v2, Aaron reported improvement of hackbench/redis when the system is underloaded. ChaCha20-xiangshan shows huge throughput improvement. Phoronix has tested v1 and shows good improvements in 30+ cases[3]. No obvious difference was observed in netperf/stream/stress-ng in Hmean. Detail: Due to length constraints, data without much difference with baseline is not presented. Sapphire Rapids: [hackbench pipe] ================ case load baseline(std%) compare%( std%) threads-pipe-10 1-groups 1.00 ( 1.22) +26.09 ( 1.10) threads-pipe-10 2-groups 1.00 ( 4.90) +22.88 ( 0.18) threads-pipe-10 4-groups 1.00 ( 2.07) +9.00 ( 3.49) threads-pipe-10 8-groups 1.00 ( 8.13) +3.45 ( 3.62) threads-pipe-16 1-groups 1.00 ( 2.11) +26.30 ( 0.08) threads-pipe-16 2-groups 1.00 ( 15.13) -1.77 ( 11.89) threads-pipe-16 4-groups 1.00 ( 4.37) +0.58 ( 7.99) threads-pipe-16 8-groups 1.00 ( 2.88) +2.71 ( 3.50) threads-pipe-2 1-groups 1.00 ( 9.40) +22.07 ( 0.71) threads-pipe-2 2-groups 1.00 ( 9.99) +18.01 ( 0.95) threads-pipe-2 4-groups 1.00 ( 3.98) +24.66 ( 0.96) threads-pipe-2 8-groups 1.00 ( 7.00) +21.83 ( 0.23) threads-pipe-20 1-groups 1.00 ( 1.03) +28.84 ( 0.21) threads-pipe-20 2-groups 1.00 ( 4.42) +31.90 ( 3.15) threads-pipe-20 4-groups 1.00 ( 9.97) +4.56 ( 1.69) threads-pipe-20 8-groups 1.00 ( 1.87) +1.25 ( 0.74) threads-pipe-4 1-groups 1.00 ( 4.48) +25.67 ( 0.78) threads-pipe-4 2-groups 1.00 ( 9.14) +4.91 ( 2.08) threads-pipe-4 4-groups 1.00
( 7.68) +19.36 ( 1.53) threads-pipe-4 8-groups 1.00 ( 10.79) +7.20 ( 12.20) threads-pipe-8 1-groups 1.00 ( 4.69) +21.93 ( 0.03) threads-pipe-8 2-groups 1.00 ( 1.16) +25.29 ( 0.65) threads-pipe-8 4-groups 1.00 ( 2.23) -1.27 ( 3.62) threads-pipe-8 8-groups 1.00 ( 4.65) -3.08 ( 2.75) Note: The default number of fd in hackbench is changed from 20 to various values to ensure that threads fit within a single LLC, especially on AMD systems. Take "threads-pipe-8, 2-groups" for example, the number of fd is 8, and 2 groups are created. [schbench] The 99th percentile wakeup latency shows some improvements when the system is underloaded, while it does not bring much difference as system utilization increases. 99th Wakeup Latencies Base (mean std) Compare (mean std) Change ========================================================================= thread=2 9.00(0.00) 9.00(1.73) 0.00% thread=4 7.33(0.58) 6.33(0.58) +13.64% thread=8 9.00(0.00) 7.67(1.15) +14.78% thread=16 8.67(0.58) 8.67(1.53) 0.00% thread=32 9.00(0.00) 7.00(0.00) +22.22% thread=64 9.33(0.58) 9.67(0.58) -3.64% thread=128 12.00(0.00) 12.00(0.00) 0.00% [chacha20 on simulated risc-v] baseline: Host time spent: 67861ms cache aware scheduling enabled: Host time spent: 54441ms Time reduced by 24% Genoa: [hackbench pipe] The default number of fd is 20, which exceeds the number of CPUs in an LLC. So the fd is adjusted to 2, 4, 6, 8, 20 respectively.
Exclude the result with large run-to-run variance, 10% ~ 50% improvement is observed when the system is underloaded: [hackbench pipe] ================ case load baseline(std%) compare%( std%) threads-pipe-2 1-groups 1.00 ( 2.89) +47.33 ( 1.20) threads-pipe-2 2-groups 1.00 ( 3.88) +39.82 ( 0.61) threads-pipe-2 4-groups 1.00 ( 8.76) +5.57 ( 13.10) threads-pipe-20 1-groups 1.00 ( 4.61) +11.72 ( 1.06) threads-pipe-20 2-groups 1.00 ( 6.18) +14.55 ( 1.47) threads-pipe-20 4-groups 1.00 ( 2.99) +10.16 ( 4.49) threads-pipe-4 1-groups 1.00 ( 4.23) +43.70 ( 2.14) threads-pipe-4 2-groups 1.00 ( 3.68) +8.45 ( 4.04) threads-pipe-4 4-groups 1.00 ( 17.72) +2.42 ( 1.14) threads-pipe-6 1-groups 1.00 ( 3.10) +7.74 ( 3.83) threads-pipe-6 2-groups 1.00 ( 3.42) +14.26 ( 4.53) threads-pipe-6 4-groups 1.00 ( 10.34) +10.94 ( 7.12) threads-pipe-8 1-groups 1.00 ( 4.21) +9.06 ( 4.43) threads-pipe-8 2-groups 1.00 ( 1.88) +3.74 ( 0.58) threads-pipe-8 4-groups 1.00 ( 2.78) +23.96 ( 1.18) [chacha20 on simulated risc-v] Host time spent: 54762ms Host time spent: 28295ms Time reduced by 48% Suggested-by: Libo Chen Suggested-by: Adam Li Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/71972e12ab4f08aff422b31e34df09bdbd94de84.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/sched.h | 4 +++- kernel/sched/topology.c | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a56619b3761f..71f6077da466 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -4069,9 +4069,11 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct #endif /* !CONFIG_SCHED_MM_CID */ #ifdef CONFIG_SCHED_CACHE +DECLARE_STATIC_KEY_FALSE(sched_cache_present); + static inline bool sched_cache_enabled(void) { - return false; + return static_branch_unlikely(&sched_cache_present); } #endif diff 
--git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8954bf7900ff..6a36f8f6b7b1 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -821,6 +821,7 @@ enum s_alloc { }; #ifdef CONFIG_SCHED_CACHE +DEFINE_STATIC_KEY_FALSE(sched_cache_present); static bool alloc_sd_llc(const struct cpumask *cpu_map, struct s_data *d) { @@ -2777,6 +2778,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { enum s_alloc alloc_state = sa_none; + bool has_multi_llcs = false; struct sched_domain *sd; struct s_data d; struct rq *rq = NULL; @@ -2870,8 +2872,11 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att * In presence of higher domains, adjust the * NUMA imbalance stats for the hierarchy. */ - if (IS_ENABLED(CONFIG_NUMA) && sd->parent) - adjust_numa_imbalance(sd); + if (sd->parent) { + if (IS_ENABLED(CONFIG_NUMA)) + adjust_numa_imbalance(sd); + has_multi_llcs = true; + } } } @@ -2912,6 +2917,16 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att ret = 0; error: +#ifdef CONFIG_SCHED_CACHE + /* + * TBD: check before writing to it. sched domain rebuild + * is not in the critical path, leave as-is for now. + */ + if (!ret && has_multi_llcs) + static_branch_enable_cpuslocked(&sched_cache_present); + else + static_branch_disable_cpuslocked(&sched_cache_present); +#endif __free_domain_allocs(&d, alloc_state, cpu_map); return ret; -- cgit v1.2.3 From 067a3135814334a8ea7241faef364cc48c6340bc Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 1 Apr 2026 14:52:31 -0700 Subject: sched/cache: Allow the user space to turn on and off cache aware scheduling Provide a debugfs directory llc_balancing, and a knob named "enabled" under it to allow the user to turn off and on the cache aware scheduling at runtime. 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/0aa56f7fc48db2f8f700cd1aa34dedd0ec88351b.1775065312.git.tim.c.chen@linux.intel.com --- kernel/sched/debug.c | 48 +++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 7 +++++- kernel/sched/topology.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 74c1617cf652..2eae67cd2ba2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -210,6 +210,46 @@ static const struct file_operations sched_scaling_fops = { .release = single_release, }; +#ifdef CONFIG_SCHED_CACHE +static ssize_t +sched_cache_enable_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + bool val; + int ret; + + ret = kstrtobool_from_user(ubuf, cnt, &val); + if (ret) + return ret; + + sysctl_sched_cache_user = val; + + sched_cache_active_set_unlocked(); + + return cnt; +} + +static int sched_cache_enable_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", sysctl_sched_cache_user); + return 0; +} + +static int sched_cache_enable_open(struct inode *inode, + struct file *filp) +{ + return single_open(filp, sched_cache_enable_show, NULL); +} + +static const struct file_operations sched_cache_enable_fops = { + .open = sched_cache_enable_open, + .write = sched_cache_enable_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + #ifdef CONFIG_PREEMPT_DYNAMIC static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, @@ -593,7 +633,7 @@ static void debugfs_ext_server_init(void) static __init int sched_init_debug(void) { - struct dentry __maybe_unused *numa; + struct dentry __maybe_unused *numa, *llc; debugfs_sched = debugfs_create_dir("sched", NULL); @@ -626,6 +666,12 @@ static __init int sched_init_debug(void) 
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_SCHED_CACHE + llc = debugfs_create_dir("llc_balancing", debugfs_sched); + debugfs_create_file("enabled", 0644, llc, NULL, + &sched_cache_enable_fops); +#endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); debugfs_fair_server_init(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 71f6077da466..f499d5dd1130 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -4070,11 +4070,16 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct #ifdef CONFIG_SCHED_CACHE DECLARE_STATIC_KEY_FALSE(sched_cache_present); +DECLARE_STATIC_KEY_FALSE(sched_cache_active); +extern int sysctl_sched_cache_user; static inline bool sched_cache_enabled(void) { - return static_branch_unlikely(&sched_cache_present); + return static_branch_unlikely(&sched_cache_active); } + +extern void sched_cache_active_set_unlocked(void); + #endif void sched_domains_free_llc_id(int cpu); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6a36f8f6b7b1..9fc99346ef4f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -821,7 +821,16 @@ enum s_alloc { }; #ifdef CONFIG_SCHED_CACHE +/* hardware support for cache aware scheduling */ DEFINE_STATIC_KEY_FALSE(sched_cache_present); +/* + * Indicator of whether cache aware scheduling + * is active, used by the scheduler. 
+ */ +DEFINE_STATIC_KEY_FALSE(sched_cache_active); +/* user wants cache aware scheduling [0 or 1] */ +int sysctl_sched_cache_user = 1; + static bool alloc_sd_llc(const struct cpumask *cpu_map, struct s_data *d) { @@ -856,6 +865,60 @@ err: return false; } + +static void _sched_cache_active_set(bool enable, bool locked) +{ + if (enable) { + if (locked) + static_branch_enable_cpuslocked(&sched_cache_active); + else + static_branch_enable(&sched_cache_active); + } else { + if (locked) + static_branch_disable_cpuslocked(&sched_cache_active); + else + static_branch_disable(&sched_cache_active); + } +} + +/* + * Enable/disable cache aware scheduling according to + * user input and the presence of hardware support. + */ +static void sched_cache_active_set(bool locked) +{ + /* hardware does not support */ + if (!static_branch_likely(&sched_cache_present)) { + _sched_cache_active_set(false, locked); + return; + } + + /* + * user wants it or not ? + * TBD: read before writing the static key. + * It is not in the critical path, leave as-is + * for now. 
+ */ + if (sysctl_sched_cache_user) { + _sched_cache_active_set(true, locked); + if (sched_debug()) + pr_info("%s: enabling cache aware scheduling\n", __func__); + } else { + _sched_cache_active_set(false, locked); + if (sched_debug()) + pr_info("%s: disabling cache aware scheduling\n", __func__); + } +} + +static void sched_cache_active_set_locked(void) +{ + return sched_cache_active_set(true); +} + +void sched_cache_active_set_unlocked(void) +{ + return sched_cache_active_set(false); +} #else static bool alloc_sd_llc(const struct cpumask *cpu_map, struct s_data *d) @@ -2926,6 +2989,8 @@ error: static_branch_enable_cpuslocked(&sched_cache_present); else static_branch_disable_cpuslocked(&sched_cache_present); + + sched_cache_active_set_locked(); #endif __free_domain_allocs(&d, alloc_state, cpu_map); -- cgit v1.2.3 From 4ac4d6549a6563878d7c19c154e017f6cb7114d3 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 29 Apr 2026 11:41:37 +0200 Subject: sched: Use trace_call__() to save a static branch The wrapper functions __trace_set_current_state() and __trace_set_need_resched() allow the tracepoints to be called from code outside sched/core.c, those calls are already guarded by a tracepoint_enabled() so there is no need to repeat this check once again inside the call using trace_(). Use the new trace_call__() API to directly call the tracepoint without check. Those helper functions must be called after the appropriate check. 
Signed-off-by: Gabriele Monaco Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260429094227.34087-1-gmonaco@redhat.com --- kernel/sched/core.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b8871449d3c6..b905805bbcbe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -537,10 +537,14 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } /* need a wrapper since we may need to trace from modules */ EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); -/* Call via the helper macro trace_set_current_state. */ +/* + * Call via the helper macro trace_set_current_state. + * Calls to this function MUST be guarded by a + * tracepoint_enabled(sched_set_state_tp) + */ void __trace_set_current_state(int state_value) { - trace_sched_set_state_tp(current, state_value); + trace_call__sched_set_state_tp(current, state_value); } EXPORT_SYMBOL(__trace_set_current_state); @@ -1203,9 +1207,13 @@ static void __resched_curr(struct rq *rq, int tif) } } +/* + * Calls to this function MUST be guarded by a + * tracepoint_enabled(sched_set_need_resched_tp) + */ void __trace_set_need_resched(struct task_struct *curr, int tif) { - trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif); + trace_call__sched_set_need_resched_tp(curr, smp_processor_id(), tif); } EXPORT_SYMBOL_GPL(__trace_set_need_resched); -- cgit v1.2.3 From a2b4cf39d9d333bfeb9262dbaafe3d24d405a5c0 Mon Sep 17 00:00:00 2001 From: Jianyong Wu Date: Wed, 13 May 2026 13:39:12 -0700 Subject: sched/cache: Allow only 1 thread of the process to calculate the LLC occupancy Scanning online CPUs to calculate the occupancy might be time-consuming. Only allow 1 thread of the process to scan the CPUs at the same time, which is similar to what NUMA balance does in task_numa_work(). 
Signed-off-by: Jianyong Wu Signed-off-by: Chen Yu Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/5672b52e588b855b01e5a1a17822f7c6c7237a3d.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5f22e5a097cf..a759ea669d74 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1451,6 +1451,7 @@ void mm_init_sched(struct mm_struct *mm, raw_spin_lock_init(&mm->sc_stat.lock); mm->sc_stat.epoch = epoch; mm->sc_stat.cpu = -1; + mm->sc_stat.next_scan = jiffies; /* * The update to mm->sc_stat should not be reordered @@ -1661,6 +1662,7 @@ out: static void task_cache_work(struct callback_head *work) { + unsigned long next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; unsigned long m_a_occ = 0; @@ -1675,6 +1677,15 @@ static void task_cache_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + next_scan = READ_ONCE(mm->sc_stat.next_scan); + if (time_before(now, next_scan)) + return; + + /* only 1 thread is allowed to scan */ + if (!try_cmpxchg(&mm->sc_stat.next_scan, &next_scan, + now + EPOCH_PERIOD)) + return; + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) return; -- cgit v1.2.3 From deee5e27d5b608323c04dc99979e55f944016a13 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:13 -0700 Subject: sched/cache: Disable cache aware scheduling for processes with high thread counts A performance regression was observed by Prateek when running hackbench with many threads per process (high fd count). To avoid this, processes with a large number of active threads are excluded from cache-aware scheduling. With sched_cache enabled, record the number of active threads in each process during the periodic task_cache_work(). 
While iterating over CPUs, if the currently running task belongs to the same process as the task that launched task_cache_work(), increment the active thread count. If the number of active threads within the process exceeds the number of Cores (divided by the SMT number) in the LLC, do not enable cache-aware scheduling. However, on systems with a smaller number of CPUs within 1 LLC, like Power10/Power11 with SMT4 and an LLC size of 4, this check effectively disables cache-aware scheduling for any process. One possible solution suggested by Peter is to use an LLC-mask instead of a single LLC value for preference. Once there are a 'few' LLCs as preference, this constraint becomes a little easier. It could be an enhancement in the future. For users who wish to perform task aggregation regardless, a debugfs knob is provided for tuning in a subsequent change. Suggested-by: K Prateek Nayak Suggested-by: Aaron Lu Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/d076cd21a8e6c6341d1e2d927e118db770ebb650.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 48 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a759ea669d74..808f614fc2d2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1384,6 +1384,12 @@ static int llc_id(int cpu) return per_cpu(sd_llc_id, cpu); } +static bool invalid_llc_nr(struct mm_struct *mm, int cpu) +{ + return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads), + per_cpu(sd_llc_size, cpu)); +} + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) { struct sched_domain *sd; @@ -1452,7 +1458,7 @@ void mm_init_sched(struct mm_struct *mm, mm->sc_stat.epoch = epoch; mm->sc_stat.cpu = -1; mm->sc_stat.next_scan = jiffies; - + mm->sc_stat.nr_running_avg = 0; 
/* * The update to mm->sc_stat should not be reordered * before initialization to mm's other fields, in case @@ -1574,7 +1580,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) * If this process hasn't hit task_cache_work() for a while invalidate * its preferred state. */ - if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT) { + if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || + invalid_llc_nr(mm, cpu_of(rq))) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; } @@ -1660,14 +1667,32 @@ out: cpumask_copy(cpus, cpu_online_mask); } +static inline void update_avg_scale(u64 *avg, u64 sample) +{ + int factor = per_cpu(sd_llc_size, raw_smp_processor_id()); + s64 diff = sample - *avg; + u32 divisor; + + /* + * Scale the divisor based on the number of CPUs contained + * in the LLC. This scaling ensures smaller LLC domains use + * a smaller divisor to achieve more precise sensitivity to + * changes in nr_running, while larger LLC domains are capped + * at a maximum divisor of 8 which is the default smoothing + * factor of EWMA in update_avg(). 
+ */ + divisor = clamp_t(u32, (factor >> 2), 2, 8); + *avg += div64_s64(diff, divisor); +} + static void task_cache_work(struct callback_head *work) { unsigned long next_scan, now = jiffies; - struct task_struct *p = current; + struct task_struct *p = current, *cur; + int cpu, m_a_cpu = -1, nr_running = 0; + unsigned long curr_m_a_occ = 0; struct mm_struct *mm = p->mm; unsigned long m_a_occ = 0; - unsigned long curr_m_a_occ = 0; - int cpu, m_a_cpu = -1; cpumask_var_t cpus; WARN_ON_ONCE(work != &p->cache_work); @@ -1711,6 +1736,11 @@ static void task_cache_work(struct callback_head *work) m_occ = occ; m_cpu = i; } + + cur = rcu_dereference_all(cpu_rq(i)->curr); + if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) && + cur->mm == mm) + nr_running++; } /* @@ -1754,6 +1784,7 @@ static void task_cache_work(struct callback_head *work) mm->sc_stat.cpu = m_a_cpu; } + update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running); free_cpumask_var(cpus); } @@ -10294,6 +10325,13 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) return mig_unrestricted; + /* skip cache aware load balance for too many threads */ + if (invalid_llc_nr(mm, dst_cpu)) { + if (mm->sc_stat.cpu != -1) + mm->sc_stat.cpu = -1; + return mig_unrestricted; + } + if (cpus_share_cache(dst_cpu, cpu)) to_pref = true; else if (cpus_share_cache(src_cpu, cpu)) -- cgit v1.2.3 From 7b34bb1ca324451c84c0a69136ce92e7928cf72b Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:14 -0700 Subject: sched/cache: Skip cache-aware scheduling for single-threaded processes For a single thread, the current wakeup path tends to place it on the same LLC where it was previously running with cache-hot data. There is no need to enable cache-aware scheduling for single-threaded processes for the following reasons: 1. Cache-aware scheduling primarily benefits multi-threaded processes where threads share data. 
Single-threaded processes typically have no inter-thread data sharing and thus gain little. 2. Enabling it incurs the additional overhead of tracking the thread's residency in the LLCs. 3. Bypassing single-threaded processes avoids excessive concentration of such tasks on a single LLC. Nevertheless, this check can be omitted if users explicitly provide hints for such single-threaded workloads where different processes have shared memory, e.g., via prctl() or other interfaces to be added in the future. Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/8a59a13aa58fdb48e410ecb2aabd97fe3ea5d256.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 808f614fc2d2..df21366ba1ca 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1384,8 +1384,12 @@ static int llc_id(int cpu) return per_cpu(sd_llc_id, cpu); } -static bool invalid_llc_nr(struct mm_struct *mm, int cpu) +static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p, + int cpu) { + if (get_nr_threads(p) <= 1) + return true; + return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads), per_cpu(sd_llc_size, cpu)); } @@ -1581,7 +1585,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) * its preferred state. 
*/ if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || - invalid_llc_nr(mm, cpu_of(rq))) { + invalid_llc_nr(mm, p, cpu_of(rq))) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; } @@ -1687,9 +1691,9 @@ static inline void update_avg_scale(u64 *avg, u64 sample) static void task_cache_work(struct callback_head *work) { + int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu; unsigned long next_scan, now = jiffies; struct task_struct *p = current, *cur; - int cpu, m_a_cpu = -1, nr_running = 0; unsigned long curr_m_a_occ = 0; struct mm_struct *mm = p->mm; unsigned long m_a_occ = 0; @@ -1711,6 +1715,14 @@ static void task_cache_work(struct callback_head *work) now + EPOCH_PERIOD)) return; + curr_cpu = task_cpu(p); + if (invalid_llc_nr(mm, p, curr_cpu)) { + if (mm->sc_stat.cpu != -1) + mm->sc_stat.cpu = -1; + + return; + } + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) return; @@ -10326,7 +10338,7 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, return mig_unrestricted; /* skip cache aware load balance for too many threads */ - if (invalid_llc_nr(mm, dst_cpu)) { + if (invalid_llc_nr(mm, p, dst_cpu)) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; return mig_unrestricted; -- cgit v1.2.3 From 7030513a08776b2ca70fccd5dfddf7bb5c5c88ba Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:15 -0700 Subject: sched/cache: Calculate the LLC size and store it in sched_domain Cache aware scheduling needs to know the LLC size that a process can use, so as to avoid memory-intensive tasks from being over-aggregated on a single LLC. Introduce a preparation patch to add get_effective_llc_bytes() to get the LLC size that a CPU can use. The function can be further enhanced by subtracting the LLC cache ways reserved by resctrl (CAT in Intel RDT, etc). 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/37afee09ff608034da0ce149e72d33b6f4698edf.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/topology.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9fc99346ef4f..7248a7279abe 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -776,9 +776,11 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* move buffer to parent as child is being destroyed */ sd->llc_counts = tmp->llc_counts; sd->llc_max = tmp->llc_max; + sd->llc_bytes = tmp->llc_bytes; /* make sure destroy_sched_domain() does not free it */ tmp->llc_counts = NULL; tmp->llc_max = 0; + tmp->llc_bytes = 0; #endif /* * sched groups hold the flags of the child sched @@ -831,10 +833,42 @@ DEFINE_STATIC_KEY_FALSE(sched_cache_active); /* user wants cache aware scheduling [0 or 1] */ int sysctl_sched_cache_user = 1; +/* + * Get the effective LLC size in bytes that @cpu's bottom sched_domain + * can use. A CPU within a cpuset partition can only use a proportion + * of the physical LLC, scaled by the ratio of the partition's span + * weight to the hardware LLC sharing weight. @sd should be the + * topmost domain with SD_SHARE_LLC. + * + * Returns 0 if cacheinfo is not yet populated. This happens during + * early boot when build_sched_domains() runs before the generic + * cacheinfo framework has been initialized (cacheinfo_cpu_online() + * is a device_initcall cpuhp callback). In that case, + * cacheinfo_cpu_online() will later call sched_update_llc_bytes() + * to fill in the bottom domain's llc_bytes once the cache attributes + * are available. 
+ */ +static unsigned long get_effective_llc_bytes(int cpu, + struct sched_domain *sd) +{ + struct cacheinfo *ci; + unsigned int hw_weight; + + ci = get_cpu_cacheinfo_llc(cpu); + if (!ci) + return 0; + + hw_weight = cpumask_weight(&ci->shared_cpu_map); + if (!hw_weight) + return 0; + + return div_u64((u64)ci->size * sd->span_weight, hw_weight); +} + static bool alloc_sd_llc(const struct cpumask *cpu_map, struct s_data *d) { - struct sched_domain *sd; + struct sched_domain *sd, *top_llc, *parent; unsigned int *p; int i; @@ -848,8 +882,24 @@ static bool alloc_sd_llc(const struct cpumask *cpu_map, if (!p) goto err; - sd->llc_max = max_lid + 1; - sd->llc_counts = p; + top_llc = sd; + /* + * Find the topmost SD_SHARE_LLC domain. + * Not yet attached to the CPU, so per_cpu(sd_llc, i) + * can not be used. + */ + while ((parent = rcu_dereference_protected(top_llc->parent, true)) && + (parent->flags & SD_SHARE_LLC)) + top_llc = parent; + + if (top_llc->flags & SD_SHARE_LLC) { + sd->llc_max = max_lid + 1; + sd->llc_counts = p; + sd->llc_bytes = get_effective_llc_bytes(i, top_llc); + } else { + /* avoid memory leak */ + kfree(p); + } } return true; @@ -860,6 +910,7 @@ err: kfree(sd->llc_counts); sd->llc_counts = NULL; sd->llc_max = 0; + sd->llc_bytes = 0; } } @@ -919,6 +970,47 @@ void sched_cache_active_set_unlocked(void) { return sched_cache_active_set(false); } + +/* + * Update the bottom sched_domain's llc_bytes for @cpu and all its + * LLC siblings. Called from cacheinfo_cpu_online() or + * cacheinfo_cpu_pre_down() with cpu hotplug lock held. + * + * Note: get_effective_llc_bytes() returns 0 on PowerPC. + * thus cache aware scheduling is disabled on PowerPC for + * now. PowerPC does not use the generic cacheinfo framework -- + * it has its own cacheinfo with a separate struct cache hierarchy + * and does not populates the per-CPU struct cpu_cacheinfo array + * that get_cpu_cacheinfo_llc() reads. 
+ */ +void sched_update_llc_bytes(unsigned int cpu) +{ + struct sched_domain *sd, *sdp; + unsigned int i; + + sched_domains_mutex_lock(); + + sdp = rcu_dereference_sched_domain(per_cpu(sd_llc, cpu)); + if (!sdp) + goto unlock; + + /* + * ci->shared_cpu_map is built incrementally as CPUs come + * online, so the first CPU in an LLC initially sees + * hw_weight == 1 and computes an inflated llc_bytes in + * get_effective_llc_bytes(). Re-evaluating every LLC + * sibling on each online event corrects this once the full + * shared_cpu_map is known. + */ + for_each_cpu(i, sched_domain_span(sdp)) { + sd = rcu_dereference_sched_domain(cpu_rq(i)->sd); + if (sd) + sd->llc_bytes = get_effective_llc_bytes(i, sdp); + } + +unlock: + sched_domains_mutex_unlock(); +} #else static bool alloc_sd_llc(const struct cpumask *cpu_map, struct s_data *d) -- cgit v1.2.3 From 808915f982c2a52f5d148510ecfab52284de67cf Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:16 -0700 Subject: sched/cache: Avoid cache-aware scheduling for memory-heavy processes Prateek and Tingyin reported that memory-intensive workloads (such as stream) can saturate memory bandwidth and caches on the preferred LLC when sched_cache aggregates too many threads. To mitigate this, estimate a process's memory footprint by comparing its NUMA balancing fault statistics to the size of the LLC. If the footprint exceeds the LLC size, skip cache-aware scheduling. Note that footprint is only an approximation of the memory footprint, since the kernel lacks suitable metrics to estimate the real working set. If a user-provided hint is available in the future, it would be more accurate. A later patch will allow users to provide a hint to adjust this threshold. 
Suggested-by: K Prateek Nayak Suggested-by: Vern Hao Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/95cf64a385bcc12f18dcebe9d59e8d3ba8bb318f.1778703694.git.tim.c.chen@linux.intel.com --- kernel/exit.c | 29 +++++++++++++++++++++++++ kernel/sched/fair.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 88 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ede3117fa7d4..77275c26a2a1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -543,6 +543,32 @@ void mm_update_next_owner(struct mm_struct *mm) } #endif /* CONFIG_MEMCG */ +#if defined(CONFIG_SCHED_CACHE) && defined(CONFIG_NUMA_BALANCING) +/* + * Subtract the memory footprint of the current task from + * mm. + */ +static void exit_mm_sched_cache(struct mm_struct *mm) +{ + unsigned long fp, sub; + + if (!current->total_numa_faults) + return; + /* + * No lock protection due to performance considerations. + * Make sure mm->sc_stat.footprint does not become + * negative. + */ + fp = READ_ONCE(mm->sc_stat.footprint); + sub = min(fp, current->total_numa_faults); + WRITE_ONCE(mm->sc_stat.footprint, fp - sub); +} +#else +static inline void exit_mm_sched_cache(struct mm_struct *mm) +{ +} +#endif /* CONFIG_SCHED_CACHE CONFIG_NUMA_BALANCING */ + /* * Turn us into a lazy TLB process if we * aren't already.. 
@@ -554,6 +580,9 @@ static void exit_mm(void) exit_mm_release(current, mm); if (!mm) return; + + exit_mm_sched_cache(mm); + mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df21366ba1ca..a10116ffe0d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1384,6 +1384,32 @@ static int llc_id(int cpu) return per_cpu(sd_llc_id, cpu); } +static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) +{ +#ifdef CONFIG_NUMA_BALANCING + unsigned long llc, footprint; + struct sched_domain *sd; + + guard(rcu)(); + + sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); + if (!sd) + return true; + + if (static_branch_likely(&sched_numa_balancing)) { + /* + * TBD: RDT exclusive LLC ways reserved should be + * excluded. + */ + llc = sd->llc_bytes; + footprint = READ_ONCE(mm->sc_stat.footprint); + + return (llc < (footprint * PAGE_SIZE)); + } +#endif + return false; +} + static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p, int cpu) { @@ -1463,6 +1489,7 @@ void mm_init_sched(struct mm_struct *mm, mm->sc_stat.cpu = -1; mm->sc_stat.next_scan = jiffies; mm->sc_stat.nr_running_avg = 0; + mm->sc_stat.footprint = 0; /* * The update to mm->sc_stat should not be reordered * before initialization to mm's other fields, in case @@ -1585,7 +1612,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) * its preferred state. 
*/ if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || - invalid_llc_nr(mm, p, cpu_of(rq))) { + invalid_llc_nr(mm, p, cpu_of(rq)) || + exceed_llc_capacity(mm, cpu_of(rq))) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; } @@ -1716,7 +1744,8 @@ static void task_cache_work(struct callback_head *work) return; curr_cpu = task_cpu(p); - if (invalid_llc_nr(mm, p, curr_cpu)) { + if (invalid_llc_nr(mm, p, curr_cpu) || + exceed_llc_capacity(mm, curr_cpu)) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; @@ -3515,6 +3544,7 @@ static void task_numa_placement(struct task_struct *p) unsigned long total_faults; u64 runtime, period; spinlock_t *group_lock = NULL; + long __maybe_unused new_fp; struct numa_group *ng; /* @@ -3589,6 +3619,31 @@ static void task_numa_placement(struct task_struct *p) ng->total_faults += diff; group_faults += ng->faults[mem_idx]; } +#ifdef CONFIG_SCHED_CACHE + /* + * Per task p->numa_faults[mem_idx] converges, + * so the accumulation of each task's faults + * converges too - Given the number of threads, + * it cannot overflow an unsigned long. + * Racy with concurrent updates from other threads + * sharing this mm. Acceptable since footprint is a + * heuristic and occasional lost updates are tolerable. + * + * If a task exits, its corresponding footprint must + * be subtracted from the mm->sc_stat.footprint, otherwise + * the mm->sc_stat.footprint will not converge: + * the exiting thread's footprint remains unchanged/undecayed + * in mm->sc_stat.footprint. See exit_mm(). + * + * Lost updates and unsynchronized subtraction + * in exit_mm() can cause footprint + diff to + * go negative. Clamp to zero to prevent the + * unsigned footprint from wrapping. 
+ */ + new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff; + WRITE_ONCE(p->mm->sc_stat.footprint, + max(new_fp, 0L)); +#endif } if (!ng) { @@ -10338,7 +10393,8 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, return mig_unrestricted; /* skip cache aware load balance for too many threads */ - if (invalid_llc_nr(mm, p, dst_cpu)) { + if (invalid_llc_nr(mm, p, dst_cpu) || + exceed_llc_capacity(mm, dst_cpu)) { if (mm->sc_stat.cpu != -1) mm->sc_stat.cpu = -1; return mig_unrestricted; -- cgit v1.2.3 From c1e7fe5e75ed11fa85368e5a186472afd3858f3a Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:17 -0700 Subject: sched/cache: Add user control to adjust the aggressiveness of cache-aware scheduling Introduce a set of debugfs knobs to control how aggressively the cache aware scheduling does the task aggregation. (1) aggr_tolerance With sched_cache enabled, the scheduler uses a process's footprint as a proxy for its LLC footprint to determine if aggregating tasks on the preferred LLC could cause cache contention. If the footprint exceeds the LLC size, aggregation is skipped. Since the kernel cannot efficiently track per-task cache usage (resctrl is user-space only), userspace can provide a more accurate hint. Introduce /sys/kernel/debug/sched/llc_balancing/aggr_tolerance to let users control how strictly footprint limits aggregation. Values range from 0 to 100: - 0: Cache-aware scheduling is disabled. - 1: Strict; tasks with footprint larger than LLC size are skipped. - >=100: Aggressive; tasks are aggregated regardless of footprint. For example, with a 32MB L3 cache: - aggr_tolerance=1 -> tasks with footprint > 32MB are skipped. - aggr_tolerance=99 -> tasks with footprint > 784GB are skipped (784GB = (1 + (99 - 1) * 256) * 32MB). Similarly, /sys/kernel/debug/sched/llc_balancing/aggr_tolerance also controls how strictly the number of active threads is considered when doing cache aware load balance. 
The number of SMTs is also considered. High SMT counts reduce the aggregation capacity, preventing excessive task aggregation on SMT-heavy systems like Power10/Power11. Yangyu suggested introducing separate aggregation controls for the number of active threads and memory footprint checks. Since there are plans to add per-process/task group controls, fine-grained tunables are deferred to that implementation. (2) epoch_period, epoch_affinity_timeout, imb_pct, overaggr_pct are also turned into tunables. Suggested-by: K Prateek Nayak Suggested-by: Madadi Vineeth Reddy Suggested-by: Shrikanth Hegde Suggested-by: Tingyin Duan Suggested-by: Jianyong Wu Suggested-by: Yangyu Chen Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tingyin Duan Link: https://patch.msgid.link/1c62cc060ba2b33d7b1f0ed98b3390128edbae93.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/debug.c | 10 ++++++++ kernel/sched/fair.c | 68 +++++++++++++++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 5 ++++ 3 files changed, 75 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2eae67cd2ba2..fe569539e888 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -670,6 +670,16 @@ static __init int sched_init_debug(void) llc = debugfs_create_dir("llc_balancing", debugfs_sched); debugfs_create_file("enabled", 0644, llc, NULL, &sched_cache_enable_fops); + debugfs_create_u32("aggr_tolerance", 0644, llc, + &llc_aggr_tolerance); + debugfs_create_u32("epoch_period", 0644, llc, + &llc_epoch_period); + debugfs_create_u32("epoch_affinity_timeout", 0644, llc, + &llc_epoch_affinity_timeout); + debugfs_create_u32("overaggr_pct", 0644, llc, + &llc_overaggr_pct); + debugfs_create_u32("imb_pct", 0644, llc, + &llc_imb_pct); #endif debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c 
index a10116ffe0d1..76ac6a8100fc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1375,6 +1375,11 @@ static void set_next_buddy(struct sched_entity *se); */ #define EPOCH_PERIOD (HZ / 100) /* 10 ms */ #define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ +__read_mostly unsigned int llc_aggr_tolerance = 1; +__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD; +__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT; +__read_mostly unsigned int llc_imb_pct = 20; +__read_mostly unsigned int llc_overaggr_pct = 50; static int llc_id(int cpu) { @@ -1384,11 +1389,25 @@ static int llc_id(int cpu) return per_cpu(sd_llc_id, cpu); } +static inline int get_sched_cache_scale(int mul) +{ + unsigned int tol = READ_ONCE(llc_aggr_tolerance); + + if (!tol) + return 0; + + if (tol >= 100) + return INT_MAX; + + return (1 + (tol - 1) * mul); +} + static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) { #ifdef CONFIG_NUMA_BALANCING unsigned long llc, footprint; struct sched_domain *sd; + int scale; guard(rcu)(); @@ -1404,7 +1423,28 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) llc = sd->llc_bytes; footprint = READ_ONCE(mm->sc_stat.footprint); - return (llc < (footprint * PAGE_SIZE)); + /* + * Scale the LLC size by 256*llc_aggr_tolerance + * and compare it to the task's footprint. + * + * Suppose the L3 size is 32MB. If the + * llc_aggr_tolerance is 1: + * When the footprint is larger than 32MB, the + * process is regarded as exceeding the LLC + * capacity. If the llc_aggr_tolerance is 99: + * When the footprint is larger than 784GB, the + * process is regarded as exceeding the LLC + * capacity: + * 784GB = (1 + (99 - 1) * 256) * 32MB + * If the llc_aggr_tolerance is 100: + * ignore the footprint and do the aggregation + * anyway. 
+ */ + scale = get_sched_cache_scale(256); + if (scale == INT_MAX) + return false; + + return ((llc * (u64)scale) < (footprint * PAGE_SIZE)); } #endif return false; @@ -1413,11 +1453,21 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p, int cpu) { + int scale; + if (get_nr_threads(p) <= 1) return true; + /* + * Scale the number of 'cores' in a LLC by llc_aggr_tolerance + * and compare it to the task's active threads. + */ + scale = get_sched_cache_scale(1); + if (scale == INT_MAX) + return false; + return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads), - per_cpu(sd_llc_size, cpu)); + (scale * per_cpu(sd_llc_size, cpu))); } static void account_llc_enqueue(struct rq *rq, struct task_struct *p) @@ -1513,13 +1563,14 @@ static inline void __update_mm_sched(struct rq *rq, { lockdep_assert_held(&rq->cpu_epoch_lock); + unsigned int period = max(READ_ONCE(llc_epoch_period), 1U); unsigned long n, now = jiffies; long delta = now - rq->cpu_epoch_next; if (delta > 0) { - n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; + n = (delta + period - 1) / period; rq->cpu_epoch += n; - rq->cpu_epoch_next += n * EPOCH_PERIOD; + rq->cpu_epoch_next += n * period; __shr_u64(&rq->cpu_runtime, n); } @@ -1611,7 +1662,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) * If this process hasn't hit task_cache_work() for a while invalidate * its preferred state. 
*/ - if (epoch - READ_ONCE(mm->sc_stat.epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || + if ((long)(epoch - READ_ONCE(mm->sc_stat.epoch)) > llc_epoch_affinity_timeout || invalid_llc_nr(mm, p, cpu_of(rq)) || exceed_llc_capacity(mm, cpu_of(rq))) { if (mm->sc_stat.cpu != -1) @@ -1740,7 +1791,8 @@ static void task_cache_work(struct callback_head *work) /* only 1 thread is allowed to scan */ if (!try_cmpxchg(&mm->sc_stat.next_scan, &next_scan, - now + EPOCH_PERIOD)) + now + max_t(unsigned long, + READ_ONCE(llc_epoch_period), 1))) return; curr_cpu = task_cpu(p); @@ -10232,7 +10284,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ */ static bool fits_llc_capacity(unsigned long util, unsigned long max) { - u32 aggr_pct = 50; + u32 aggr_pct = llc_overaggr_pct; /* * For single core systems, raise the aggregation @@ -10252,7 +10304,7 @@ static bool fits_llc_capacity(unsigned long util, unsigned long max) */ /* Allows dst util to be bigger than src util by up to bias percent */ #define util_greater(util1, util2) \ - ((util1) * 100 > (util2) * 120) + ((util1) * 100 > (util2) * (100 + llc_imb_pct)) static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, unsigned long *cap) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f499d5dd1130..27409399137c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -4072,6 +4072,11 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct DECLARE_STATIC_KEY_FALSE(sched_cache_present); DECLARE_STATIC_KEY_FALSE(sched_cache_active); extern int sysctl_sched_cache_user; +extern unsigned int llc_aggr_tolerance; +extern unsigned int llc_epoch_period; +extern unsigned int llc_epoch_affinity_timeout; +extern unsigned int llc_imb_pct; +extern unsigned int llc_overaggr_pct; static inline bool sched_cache_enabled(void) { -- cgit v1.2.3 From d943b86dfbf4e9b76be30cf90b1b3f82ff9abbac Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:18 -0700 Subject: 
sched/cache: Fix rcu warning when accessing sd_llc domain rcu_dereference_all() should be used to access the sd_llc domain under RCU protection. This bug was reported by sashiko. Fixes: df0d98475954 ("sched/cache: Introduce infrastructure for cache-aware load balancing") Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/2dc49455e861215d8059a1c877953f0b95990038.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 76ac6a8100fc..c549ad489c6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1814,7 +1814,7 @@ static void task_cache_work(struct callback_head *work) for_each_cpu(cpu, cpus) { /* XXX sched_cluster_active */ - struct sched_domain *sd = per_cpu(sd_llc, cpu); + struct sched_domain *sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); unsigned long occ, m_occ = 0, a_occ = 0; int m_cpu = -1, i; -- cgit v1.2.3 From 9f23469401b04cfd9a5d0a8b61760a48cce35dc1 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:19 -0700 Subject: sched/cache: Fix potential NULL mm pointer access A concurrent task exit might cause a NULL pointer dereference in account_mm_sched(). Use the locally cached mm pointer instead, since the active_mm reference guarantees the structure remains allocated. Meanwhile, skip the kernel thread because it has nothing to do with cache aware scheduling. This bug was reported by sashiko and Vern. 
Fixes: df0d98475954 ("sched/cache: Introduce infrastructure for cache-aware load balancing") Reported-by: Vern Hao Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/09cf7ee3-6e27-4505-9692-4b4a4707c8b2@gmail.com/ Link: https://patch.msgid.link/066d8cfa45d4822bf4367e788c50377c66bbcc82.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c549ad489c6d..663968b46e13 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1649,7 +1649,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) if (!mm || !mm->sc_stat.pcpu_sched) return; - pcpu_sched = per_cpu_ptr(p->mm->sc_stat.pcpu_sched, cpu_of(rq)); + pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu_of(rq)); scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { __update_mm_sched(rq, pcpu_sched); @@ -1689,7 +1689,8 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) if (!sched_cache_enabled()) return; - if (!mm || !mm->sc_stat.pcpu_sched) + if (!mm || p->flags & PF_KTHREAD || + !mm->sc_stat.pcpu_sched) return; epoch = rq->cpu_epoch; -- cgit v1.2.3 From 91d07324c9305c0e4afff0cc859cac96594daa88 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:20 -0700 Subject: sched/cache: Annotate lockless accesses to mm->sc_stat.cpu mm->sc_stat.cpu is written by task_cache_work() and could be read locklessly by several functions on other CPUs. Use READ_ONCE() and WRITE_ONCE() for the lockless reads and writes of mm->sc_stat.cpu, so that compiler optimizations cannot produce inconsistent values when the field is accessed more than once. 
For example in get_pref_llc(), if the writer updated the field between two compiler-generated loads, the validation (e.g., cpu != -1) and subsequent use (e.g., llc_id(cpu)) could operate on different values, allowing a negative CPU ID to be used as an index. Leave plain write in mm_init_sched(), where the mm is not yet visible to other CPUs. This bug was reported by sashiko. Fixes: 47d8696b95f7 ("sched/cache: Assign preferred LLC ID to processes") Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/63ea494f12efcf265d7134400a06cd75d7f2c310.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 663968b46e13..087445ea6bc9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1598,13 +1598,14 @@ static unsigned long fraction_mm_sched(struct rq *rq, static int get_pref_llc(struct task_struct *p, struct mm_struct *mm) { - int mm_sched_llc = -1; + int mm_sched_llc = -1, mm_sched_cpu; if (!mm) return -1; - if (mm->sc_stat.cpu != -1) { - mm_sched_llc = llc_id(mm->sc_stat.cpu); + mm_sched_cpu = READ_ONCE(mm->sc_stat.cpu); + if (mm_sched_cpu != -1) { + mm_sched_llc = llc_id(mm_sched_cpu); #ifdef CONFIG_NUMA_BALANCING /* @@ -1619,7 +1620,7 @@ static int get_pref_llc(struct task_struct *p, struct mm_struct *mm) */ if (static_branch_likely(&sched_numa_balancing) && p->numa_preferred_nid >= 0 && - cpu_to_node(mm->sc_stat.cpu) != p->numa_preferred_nid) + cpu_to_node(mm_sched_cpu) != p->numa_preferred_nid) mm_sched_llc = -1; #endif } @@ -1665,8 +1666,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) if ((long)(epoch - READ_ONCE(mm->sc_stat.epoch)) > llc_epoch_affinity_timeout || invalid_llc_nr(mm, p, cpu_of(rq)) || exceed_llc_capacity(mm, cpu_of(rq))) { - if (mm->sc_stat.cpu != 
-1) - mm->sc_stat.cpu = -1; + if (READ_ONCE(mm->sc_stat.cpu) != -1) + WRITE_ONCE(mm->sc_stat.cpu, -1); } mm_sched_llc = get_pref_llc(p, mm); @@ -1714,7 +1715,7 @@ static void get_scan_cpumasks(cpumask_var_t cpus, struct task_struct *p) if (!static_branch_likely(&sched_numa_balancing)) goto out; - cpu = p->mm->sc_stat.cpu; + cpu = READ_ONCE(p->mm->sc_stat.cpu); if (cpu != -1) nid = cpu_to_node(cpu); curr_cpu = task_cpu(p); @@ -1799,8 +1800,8 @@ static void task_cache_work(struct callback_head *work) curr_cpu = task_cpu(p); if (invalid_llc_nr(mm, p, curr_cpu) || exceed_llc_capacity(mm, curr_cpu)) { - if (mm->sc_stat.cpu != -1) - mm->sc_stat.cpu = -1; + if (READ_ONCE(mm->sc_stat.cpu) != -1) + WRITE_ONCE(mm->sc_stat.cpu, -1); return; } @@ -1857,7 +1858,7 @@ static void task_cache_work(struct callback_head *work) m_a_cpu = m_cpu; } - if (llc_id(cpu) == llc_id(mm->sc_stat.cpu)) + if (llc_id(cpu) == llc_id(READ_ONCE(mm->sc_stat.cpu))) curr_m_a_occ = a_occ; cpumask_andnot(cpus, cpus, sched_domain_span(sd)); @@ -1875,7 +1876,7 @@ static void task_cache_work(struct callback_head *work) * 3. 2X is chosen based on test results, as it delivers * the optimal performance gain so far. 
*/ - mm->sc_stat.cpu = m_a_cpu; + WRITE_ONCE(mm->sc_stat.cpu, m_a_cpu); } update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running); @@ -10441,15 +10442,15 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, if (!mm) return mig_unrestricted; - cpu = mm->sc_stat.cpu; + cpu = READ_ONCE(mm->sc_stat.cpu); if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) return mig_unrestricted; /* skip cache aware load balance for too many threads */ if (invalid_llc_nr(mm, p, dst_cpu) || exceed_llc_capacity(mm, dst_cpu)) { - if (mm->sc_stat.cpu != -1) - mm->sc_stat.cpu = -1; + if (READ_ONCE(mm->sc_stat.cpu) != -1) + WRITE_ONCE(mm->sc_stat.cpu, -1); return mig_unrestricted; } -- cgit v1.2.3 From 03755348b8e74421f92ffed9da159175a698290b Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:21 -0700 Subject: sched/cache: Fix unpaired account_llc_enqueue/dequeue There is a race condition that, after a task is enqueued on a runqueue, task_llc(p) may change due to CPU hotplug, because the llc_id is dynamically allocated and adjusted at runtime. Therefore, checking task_llc(p) to determine whether the task is being dequeued from its preferred LLC is unreliable and can cause inconsistent values. To fix this problem, record whether p is enqueued on its preferred LLC, in order to pair with account_llc_dequeue() to maintain a consistent nr_pref_llc_running per runqueue. This bug was reported by sashiko, and the solution was once suggested by Prateek. 
Fixes: 46afe3af7ead ("sched/cache: Track LLC-preferred tasks per runqueue") Suggested-by: K Prateek Nayak Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/0c8c6a1571d66792a4d2ff0103ba3cc13e059046.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 087445ea6bc9..96c61ce366c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1472,15 +1472,32 @@ static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p, static void account_llc_enqueue(struct rq *rq, struct task_struct *p) { + int pref_llc, pref_llc_queued; struct sched_domain *sd; - int pref_llc; pref_llc = p->preferred_llc; if (pref_llc < 0) return; + pref_llc_queued = (pref_llc == task_llc(p)); rq->nr_llc_running++; - rq->nr_pref_llc_running += (pref_llc == task_llc(p)); + rq->nr_pref_llc_running += pref_llc_queued; + + /* + * Record whether p is enqueued on its preferred + * LLC, in order to pair with account_llc_dequeue() + * to maintain a consistent nr_pref_llc_running per + * runqueue. + * This is necessary because a race condition exists: + * after a task is enqueued on a runqueue, task_llc(p) + * may change due to CPU hotplug. Therefore, checking + * task_llc(p) to determine whether the task is being + * dequeued from its preferred LLC is unreliable and + * can cause inconsistent values - checking the + * p->pref_llc_queued in account_llc_dequeue() would + * be reliable. 
+ */ + p->pref_llc_queued = pref_llc_queued; sd = rcu_dereference_all(rq->sd); if (sd && (unsigned int)pref_llc < sd->llc_max) @@ -1497,7 +1514,15 @@ static void account_llc_dequeue(struct rq *rq, struct task_struct *p) return; rq->nr_llc_running--; - rq->nr_pref_llc_running -= (pref_llc == task_llc(p)); + if (p->pref_llc_queued) { + rq->nr_pref_llc_running--; + /* + * Update the status in case + * other logic might query + * this. + */ + p->pref_llc_queued = 0; + } sd = rcu_dereference_all(rq->sd); if (sd && (unsigned int)pref_llc < sd->llc_max) { -- cgit v1.2.3 From d6b9afab44e23d537fb85ecf50330baaf9ec82e9 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:22 -0700 Subject: sched/cache: Fix checking active load balance by only considering the CFS task The currently running task cur may not be a CFS task, such as an RT or Deadline task. For non-CFS tasks, the task_util(cur) utilization average is not maintained, so this might pass a stale or meaningless value to can_migrate_llc(). Check if the task is CFS before getting its task_util(). This bug was reported by sashiko. Fixes: 714059f79ff0 ("sched/cache: Handle moving single tasks to/from their preferred LLC") Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/f9161133cf040d286dca11344a112c5ef2a5253d.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96c61ce366c2..c249caea3862 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10509,7 +10509,8 @@ alb_break_llc(struct lb_env *env) /* * All tasks prefer to stay on their current CPU. * Do not pull a task from its preferred CPU if: - * 1. It is the only task running there(not too imbalance); OR + * 1. It is the only task running and does not exceed + * imbalance allowance; OR * 2. 
Migrating it away from its preferred LLC would violate * the cache-aware scheduling policy. */ @@ -10522,7 +10523,7 @@ alb_break_llc(struct lb_env *env) return true; cur = rcu_dereference_all(env->src_rq->curr); - if (cur) + if (cur && cur->sched_class == &fair_sched_class) util = task_util(cur); if (can_migrate_llc(env->src_cpu, env->dst_cpu, -- cgit v1.2.3 From 9f7c745850b4b1b7e4706ae81f04c43f204a6a8d Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:23 -0700 Subject: sched/cache: Fix race condition during sched domain rebuild sched_cache_active_set_unlocked() checks hardware support without locks: static void sched_cache_active_set(bool locked) { /* hardware does not support */ if (!static_branch_likely(&sched_cache_present)) { _sched_cache_active_set(false, locked); return; } ... If build_sched_domains() runs concurrently during CPU hotplug, it can disable sched_cache_present under sched_domains_mutex and the CPU hotplug lock. If a debugfs write thread evaluates sched_cache_present as true right before that, and then blocks or gets preempted, it might proceed to enable sched_cache_active after the hardware support has been marked as absent. Make it safer by acquiring cpus_read_lock() and sched_domains_mutex_lock() when the user changes sched_cache_active via debugfs. This bug was reported by sashiko. 
Fixes: 067a31358143 ("sched/cache: Allow the user space to turn on and off cache aware scheduling") Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/9afddf439687f04bb56b46625bd9f153eb8abad5.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/debug.c | 4 +++- kernel/sched/sched.h | 2 +- kernel/sched/topology.c | 43 ++++++++++++++++--------------------------- 3 files changed, 20 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index fe569539e888..ed3a0d65da0c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -224,7 +224,9 @@ sched_cache_enable_write(struct file *filp, const char __user *ubuf, sysctl_sched_cache_user = val; - sched_cache_active_set_unlocked(); + sched_cache_active_set(); + + *ppos += cnt; return cnt; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 27409399137c..45a3b77f46aa 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -4083,7 +4083,7 @@ static inline bool sched_cache_enabled(void) return static_branch_unlikely(&sched_cache_active); } -extern void sched_cache_active_set_unlocked(void); +extern void sched_cache_active_set(void); #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 7248a7279abe..c257134f613d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -917,30 +917,20 @@ err: return false; } -static void _sched_cache_active_set(bool enable, bool locked) -{ - if (enable) { - if (locked) - static_branch_enable_cpuslocked(&sched_cache_active); - else - static_branch_enable(&sched_cache_active); - } else { - if (locked) - static_branch_disable_cpuslocked(&sched_cache_active); - else - static_branch_disable(&sched_cache_active); - } -} - /* * Enable/disable cache aware scheduling according to * user input and the presence of hardware support. 
*/ -static void sched_cache_active_set(bool locked) +static void _sched_cache_active_set(void) { + lockdep_assert_cpus_held(); + lockdep_assert_held(&sched_domains_mutex); + /* hardware does not support */ if (!static_branch_likely(&sched_cache_present)) { - _sched_cache_active_set(false, locked); + static_branch_disable_cpuslocked(&sched_cache_active); + if (sched_debug()) + pr_info("%s: cache aware scheduling not supported on this platform\n", __func__); return; } @@ -951,24 +941,23 @@ static void sched_cache_active_set(bool locked) * for now. */ if (sysctl_sched_cache_user) { - _sched_cache_active_set(true, locked); + static_branch_enable_cpuslocked(&sched_cache_active); if (sched_debug()) pr_info("%s: enabling cache aware scheduling\n", __func__); } else { - _sched_cache_active_set(false, locked); + static_branch_disable_cpuslocked(&sched_cache_active); if (sched_debug()) pr_info("%s: disabling cache aware scheduling\n", __func__); } } -static void sched_cache_active_set_locked(void) +void sched_cache_active_set(void) { - return sched_cache_active_set(true); -} - -void sched_cache_active_set_unlocked(void) -{ - return sched_cache_active_set(false); + cpus_read_lock(); + sched_domains_mutex_lock(); + _sched_cache_active_set(); + sched_domains_mutex_unlock(); + cpus_read_unlock(); } /* @@ -3082,7 +3071,7 @@ error: else static_branch_disable_cpuslocked(&sched_cache_present); - sched_cache_active_set_locked(); + _sched_cache_active_set(); #endif __free_domain_allocs(&d, alloc_state, cpu_map); -- cgit v1.2.3 From 5beff4f087277901444787d4b7af6b3a93c34c54 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:24 -0700 Subject: sched/cache: Fix cache aware scheduling enabling for multi LLCs system If there are multiple LLCs in the system, cache aware scheduling should be enabled. 
However, there is a corner case where, if there is a single NUMA node and a single LLC per node, cache aware scheduling will be turned on in the current implementation - because at this moment, the parent domain has not yet been degenerated, and it is possible that the current domain has the same cpu span as its parent. There is no need to turn cache aware scheduling on in this scenario. Fix it by iterating the parent domains to find a domain that is a superset of the current sd_llc, so that later, after the duplicated parent domains have been degenerated, cache aware scheduling will take effect. For example, the expected behavior would be: 2 sockets, 1 LLC per socket: MC span=0-3, PKG span=0-7, has_multi_llcs=true 1 socket, 2 LLCs per socket: MC span=0-3, PKG span=0-7, has_multi_llcs=true 2 sockets, 2 LLCs per socket: MC span=0-3, PKG span=0-7, has_multi_llcs=true 1 socket, 1 LLC per socket: MC span=0-3, PKG span=0-3, has_multi_llcs=false This bug was reported by sashiko. Fixes: d59f4fd1d303 ("sched/cache: Enable cache aware scheduling for multi LLCs NUMA node") Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/6328a8a7f40925cec2a712d81ee58128a4c4444a.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/topology.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c257134f613d..4b7c64cbe854 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1008,6 +1008,37 @@ static bool alloc_sd_llc(const struct cpumask *cpu_map, } #endif +/* + * Return true if @sd belongs to an LLC group whose enclosing + * partition spans more than one LLC. @sd must be the topmost + * SD_SHARE_LLC domain. 
+ * + * Any duplicated parent domains with the same span as @sd are + * skipped: before cpu_attach_domain() degeneration these still + * exist, after degeneration the loop is a no-op. This makes the + * helper usable both during sched domain build and against an + * already-attached domain tree. + * + * Note: For systems with a single LLC per node, cache-aware + * scheduling is still enabled when multiple nodes exist. + * However, NUMA balancing decisions take precedence over + * cache-aware scheduling. Conversely, if there is only one + * LLC per partition, cache-aware scheduling should be disabled. + */ +static bool sd_in_multi_llcs(struct sched_domain *sd) +{ + struct sched_domain *sdp = sd->parent; + + /* it does not make sense to aggregate to 1 CPU */ + if (sd->span_weight == 1) + return false; + + while (sdp && sdp->span_weight == sd->span_weight) + sdp = sdp->parent; + + return !!sdp; +} + /* * Return the canonical balance CPU for this group, this is the first CPU * of this group that's also in the balance mask. @@ -3017,9 +3048,11 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att * NUMA imbalance stats for the hierarchy. */ if (sd->parent) { - if (IS_ENABLED(CONFIG_NUMA)) - adjust_numa_imbalance(sd); - has_multi_llcs = true; + if (IS_ENABLED(CONFIG_NUMA)) + adjust_numa_imbalance(sd); + + if (sd_in_multi_llcs(sd)) + has_multi_llcs = true; } } } -- cgit v1.2.3 From a7660ce1590fc1316a44cc2af53a07a21dfc25da Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:25 -0700 Subject: sched/cache: Fix has_multi_llcs iff at least one partition has multiple LLCs sched_cache_present is a global static key, but build_sched_domains() is called per partition from the "Build new domains" loop in partition_sched_domains_locked(). Each call unconditionally sets the key based solely on the has_multi_llcs local variable for that partition. 
The call for the last partition determines the final value, even when earlier partitions have multiple LLCs. If partition A (multi-LLC) is built first, the key is enabled. Then when partition B (single-LLC) is built, the key is disabled. The multi-LLC partition A is still active but the key is now off. Fix it by handling this the same way as sched_energy_present: check for multiple LLCs while iterating over all the partitions rather than checking a single partition. This bug was reported by sashiko. Fixes: d59f4fd1d303 ("sched/cache: Enable cache aware scheduling for multi LLCs NUMA node") Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/c541af2547d54509fbfd3b3a1e8072e2e5c7ff68.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/topology.c | 69 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4b7c64cbe854..e47a3f72eb72 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -951,6 +951,7 @@ static void _sched_cache_active_set(void) } } +/* used by debugfs */ void sched_cache_active_set(void) { cpus_read_lock(); @@ -1000,12 +1001,27 @@ void sched_update_llc_bytes(unsigned int cpu) unlock: sched_domains_mutex_unlock(); } + +static void sched_cache_set(bool has_multi_llcs) +{ + /* + * TBD: check before writing to it. sched domain rebuild + * is not in the critical path, leave as-is for now.
+ */ + if (has_multi_llcs) + static_branch_enable_cpuslocked(&sched_cache_present); + else + static_branch_disable_cpuslocked(&sched_cache_present); + + _sched_cache_active_set(); +} #else static bool alloc_sd_llc(const struct cpumask *cpu_map, struct s_data *d) { return false; } +static inline void sched_cache_set(bool has_multi_llcs) { } #endif /* @@ -2950,7 +2966,8 @@ void sched_domains_free_llc_id(int cpu) * to the individual CPUs */ static int -build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) +build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr, + bool *multi_llcs) { enum s_alloc alloc_state = sa_none; bool has_multi_llcs = false; @@ -3094,18 +3111,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att ret = 0; error: -#ifdef CONFIG_SCHED_CACHE - /* - * TBD: check before writing to it. sched domain rebuild - * is not in the critical path, leave as-is for now. - */ - if (!ret && has_multi_llcs) - static_branch_enable_cpuslocked(&sched_cache_present); - else - static_branch_disable_cpuslocked(&sched_cache_present); - - _sched_cache_active_set(); -#endif + *multi_llcs = has_multi_llcs; __free_domain_allocs(&d, alloc_state, cpu_map); return ret; @@ -3168,6 +3174,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) */ int __init sched_init_domains(const struct cpumask *cpu_map) { + bool multi_llcs; int err; zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL); @@ -3182,7 +3189,9 @@ int __init sched_init_domains(const struct cpumask *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN)); - err = build_sched_domains(doms_cur[0], NULL); + err = build_sched_domains(doms_cur[0], NULL, &multi_llcs); + if (!err) + sched_cache_set(multi_llcs); return err; } @@ -3255,6 +3264,7 @@ static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new struct sched_domain_attr 
*dattr_new) { bool __maybe_unused has_eas = false; + bool has_multi_llcs = false, multi_llcs; int i, j, n; int new_topology; @@ -3304,14 +3314,41 @@ match1: for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && - dattrs_equal(dattr_new, i, dattr_cur, j)) + dattrs_equal(dattr_new, i, dattr_cur, j)) { + /* + * Reused partition has to be taken care + * of here, because there could be a corner + * case that if the reused partition is skipped + * and only new partition is considered, an + * incorrect has_multi_llcs would be set. For + * example: + * If the only multi-LLC partition is reused + * and a new single-LLC partition is built, + * sched_cache_set(false) disables cache-aware + * scheduling globally despite the reused + * multi-LLC partition still being active. + */ + struct sched_domain *sd; + int cpu = cpumask_first(doms_cur[j]); + + guard(rcu)(); + sd = rcu_dereference(cpu_rq(cpu)->sd); + while (sd && sd->parent && (sd->parent->flags & SD_SHARE_LLC)) + sd = sd->parent; + if (sd && (sd->flags & SD_SHARE_LLC) && sd->parent && + sd_in_multi_llcs(sd)) + has_multi_llcs = true; goto match2; + } } /* No match - add a new doms_new */ - build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL, + &multi_llcs); + has_multi_llcs |= multi_llcs; match2: ; } + sched_cache_set(has_multi_llcs); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) /* Build perf domains: */ -- cgit v1.2.3 From c99b8593b060931c5a0a4b701689f8d6a2c00dbf Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 13 May 2026 13:39:27 -0700 Subject: sched/cache: Fix stale preferred_llc for a new task On fork without CLONE_VM, the child gets a new mm, the parent's preferred_llc value is stale for the child. Fix this by resetting the task's preferred_llc to -1. This bug was reported by sashiko. 
Fixes: 47d8696b95f7 ("sched/cache: Assign preferred LLC ID to processes") Signed-off-by: Chen Yu Co-developed-by: Tim Chen Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/0ec7309d0e24ede97656754d1505b7490403d966.1778703694.git.tim.c.chen@linux.intel.com --- kernel/sched/fair.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c249caea3862..2614315a25e0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1914,6 +1914,11 @@ void init_sched_mm(struct task_struct *p) init_task_work(work, task_cache_work); work->next = work; + /* + * Reset new task's preference to avoid + * polluting account_llc_enqueue(). + */ + p->preferred_llc = -1; } #else /* CONFIG_SCHED_CACHE */ -- cgit v1.2.3 From c2e390197ad1360db6686a8c89abaafaf83adf72 Mon Sep 17 00:00:00 2001 From: Yuri Andriaccio Date: Thu, 30 Apr 2026 23:38:25 +0200 Subject: sched/rt: Update default bandwidth for real-time tasks to ONE Set the default total bandwidth for SCHED_DEADLINE tasks and servers to ONE. FIFO/RR tasks are already throttled by fair-servers and ext-servers, and the sysctl_sched_rt_runtime parameter now only defines the total bw that is allowed to deadline entities. Signed-off-by: Yuri Andriaccio Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260430213835.62217-22-yurand2000@gmail.com --- kernel/sched/rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4ee8faf01441..e6ea728f519e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -19,9 +19,9 @@ int sysctl_sched_rt_period = 1000000; /* * part of the period that we allow rt tasks to run in us. 
- * default: 0.95s + * default: 1s */ -int sysctl_sched_rt_runtime = 950000; +int sysctl_sched_rt_runtime = 1000000; #ifdef CONFIG_SYSCTL static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ; -- cgit v1.2.3 From eecd5e117cfa63a353f4c69fdcea5d9b14af698e Mon Sep 17 00:00:00 2001 From: Yuri Andriaccio Date: Thu, 30 Apr 2026 23:38:05 +0200 Subject: sched/deadline: Fix replenishment logic for non-deferred servers Enqueue and replenish non-deferred deadline servers when their runtime is exhausted and the replenishment timer could not be started because it is too close to the wake-up instant. Signed-off-by: Yuri Andriaccio Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260430213835.62217-2-yurand2000@gmail.com --- kernel/sched/deadline.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index edca7849b165..b60e2df8ff9d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1515,8 +1515,12 @@ throttle: if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) { if (dl_server(dl_se)) { - replenish_dl_new_period(dl_se, rq); - start_dl_timer(dl_se); + if (dl_se->dl_defer) { + replenish_dl_new_period(dl_se, rq); + start_dl_timer(dl_se); + } else { + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + } } else { enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH); } -- cgit v1.2.3 From 95f44886afec7cbce0ff2a5ed8158fbe8aa6f2ec Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 14 May 2026 16:26:29 -0400 Subject: sched/cputime: Drop now-stale mul_u64_u64_div_u64() over-approximation guard Commit 77baa5bafcbe ("sched/cputime: Fix mul_u64_u64_div_u64() precision for cputime") added a clamp in cputime_adjust(): if (unlikely(stime > rtime)) stime = rtime; The justification was that mul_u64_u64_div_u64() could over-approximate on some architectures (notably arm64 and the old 32-bit fallback), so the mathematically 
impossible stime > rtime was nevertheless reachable and would underflow utime = rtime - stime. That premise no longer holds. Commit b29a62d87cc0 ("mul_u64_u64_div_u64: make it precise always") replaced the fallback implementation with an exact 128-bit long division, and the x86_64 inline asm already produced exact results. The helper now returns the mathematically correct floor(a*b/d) on every architecture, so stime <= rtime is guaranteed by stime <= stime + utime and the clamp is dead code. Remove it along with its stale comment. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260514202629.673539-1-nico@fluxnic.net --- kernel/sched/cputime.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index fbf31db0d2f3..6e85023a81ff 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -587,12 +587,6 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, } stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); - /* - * Because mul_u64_u64_div_u64() can approximate on some - * achitectures; enforce the constraint that: a*b/(b+c) <= a. - */ - if (unlikely(stime > rtime)) - stime = rtime; update: /* -- cgit v1.2.3 From ea19506013ad13685573e4674fbeddb790e27906 Mon Sep 17 00:00:00 2001 From: Yiyang Chen Date: Fri, 15 May 2026 00:05:05 +0800 Subject: sched/clock: Provide !HAVE_UNSTABLE_SCHED_CLOCK stub for sched_clock_stable() When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is disabled, sched_clock() is already assumed to provide stable semantics, but the public header doesn't provide a sched_clock_stable() stub for that case. Add a header stub that always returns true and clean up the duplicate local stub in ring_buffer.c, so callers can use sched_clock_stable() unconditionally. 
Signed-off-by: Yiyang Chen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Link: https://patch.msgid.link/56e45338858946cd9581b75c8bd45dd37dba52c5.1778773587.git.cyyzero16@gmail.com --- kernel/trace/ring_buffer.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5326924615a4..02691c3c6dd6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3769,13 +3769,6 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return skip_time_extend(event); } -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline bool sched_clock_stable(void) -{ - return true; -} -#endif - static void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) -- cgit v1.2.3 From 6d2051403d6c93832d3058a1b275c6aef2c97f44 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 18 May 2026 12:23:45 +0200 Subject: sched/fair: Update util_est after updating util_avg during dequeue util_est_update() must be called after updating util_avg during the dequeue of a task and only when the task is not delayed dequeue. Move util_est_update() in update_load_avg(). 
Fixes: b55945c500c5 ("sched: Fix pick_next_task_fair() vs try_to_wake_up() race") Closes: https://lore.kernel.org/all/20260512124653.305275-1-qyousef@layalina.io/ Reported-by: Qais Yousef Reviewed-and-tested-by: Qais Yousef Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260518102345.268452-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 188 +++++++++++++++++++++++++--------------------------- 1 file changed, 92 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 728965851842..09d3acd2d2bc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4930,13 +4930,86 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s trace_pelt_cfs_tp(cfs_rq); } +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) + +static inline void util_est_update(struct sched_entity *se) +{ + unsigned int ewma, dequeued, last_ewma_diff; + + if (!sched_feat(UTIL_EST)) + return; + + /* Get current estimate of utilization */ + ewma = READ_ONCE(se->avg.util_est); + + /* + * If the PELT values haven't changed since enqueue time, + * skip the util_est update. + */ + if (ewma & UTIL_AVG_UNCHANGED) + return; + + /* Get utilization at dequeue */ + dequeued = READ_ONCE(se->avg.util_avg); + + /* + * Reset EWMA on utilization increases, the moving average is used only + * to smooth utilization decreases. + */ + if (ewma <= dequeued) { + ewma = dequeued; + goto done; + } + + /* + * Skip update of task's estimated utilization when its members are + * already ~1% close to its last activation value. + */ + last_ewma_diff = ewma - dequeued; + if (last_ewma_diff < UTIL_EST_MARGIN) + goto done; + + /* + * To avoid underestimate of task utilization, skip updates of EWMA if + * we cannot grant that thread got all CPU time it wanted. 
+ */ + if ((dequeued + UTIL_EST_MARGIN) < READ_ONCE(se->avg.runnable_avg)) + goto done; + + /* + * Update Task's estimated utilization + * + * When *p completes an activation we can consolidate another sample + * of the task size. This is done by using this value to update the + * Exponential Weighted Moving Average (EWMA): + * + * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) + * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) + * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) + * = w * ( -last_ewma_diff ) + ewma(t-1) + * = w * (-last_ewma_diff + ewma(t-1) / w) + * + * Where 'w' is the weight of new samples, which is configured to be + * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) + */ + ewma <<= UTIL_EST_WEIGHT_SHIFT; + ewma -= last_ewma_diff; + ewma >>= UTIL_EST_WEIGHT_SHIFT; +done: + ewma |= UTIL_AVG_UNCHANGED; + WRITE_ONCE(se->avg.util_est, ewma); + + trace_sched_util_est_se_tp(se); +} + /* * Optional action to be done while updating the load average */ -#define UPDATE_TG 0x1 -#define SKIP_AGE_LOAD 0x2 -#define DO_ATTACH 0x4 -#define DO_DETACH 0x8 +#define UPDATE_TG 0x01 +#define SKIP_AGE_LOAD 0x02 +#define DO_ATTACH 0x04 +#define DO_DETACH 0x08 +#define UPDATE_UTIL_EST 0x10 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -4979,6 +5052,9 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s if (flags & UPDATE_TG) update_tg_load_avg(cfs_rq); } + + if (flags & UPDATE_UTIL_EST) + util_est_update(se); } /* @@ -5037,11 +5113,6 @@ static inline unsigned long task_util(struct task_struct *p) return READ_ONCE(p->se.avg.util_avg); } -static inline unsigned long task_runnable(struct task_struct *p) -{ - return READ_ONCE(p->se.avg.runnable_avg); -} - static inline unsigned long _task_util_est(struct task_struct *p) { return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED; @@ -5084,88 +5155,6 @@ static inline void util_est_dequeue(struct 
cfs_rq *cfs_rq, trace_sched_util_est_cfs_tp(cfs_rq); } -#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) - -static inline void util_est_update(struct cfs_rq *cfs_rq, - struct task_struct *p, - bool task_sleep) -{ - unsigned int ewma, dequeued, last_ewma_diff; - - if (!sched_feat(UTIL_EST)) - return; - - /* - * Skip update of task's estimated utilization when the task has not - * yet completed an activation, e.g. being migrated. - */ - if (!task_sleep) - return; - - /* Get current estimate of utilization */ - ewma = READ_ONCE(p->se.avg.util_est); - - /* - * If the PELT values haven't changed since enqueue time, - * skip the util_est update. - */ - if (ewma & UTIL_AVG_UNCHANGED) - return; - - /* Get utilization at dequeue */ - dequeued = task_util(p); - - /* - * Reset EWMA on utilization increases, the moving average is used only - * to smooth utilization decreases. - */ - if (ewma <= dequeued) { - ewma = dequeued; - goto done; - } - - /* - * Skip update of task's estimated utilization when its members are - * already ~1% close to its last activation value. - */ - last_ewma_diff = ewma - dequeued; - if (last_ewma_diff < UTIL_EST_MARGIN) - goto done; - - /* - * To avoid underestimate of task utilization, skip updates of EWMA if - * we cannot grant that thread got all CPU time it wanted. - */ - if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p)) - goto done; - - - /* - * Update Task's estimated utilization - * - * When *p completes an activation we can consolidate another sample - * of the task size. 
This is done by using this value to update the - * Exponential Weighted Moving Average (EWMA): - * - * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) - * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) - * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) - * = w * ( -last_ewma_diff ) + ewma(t-1) - * = w * (-last_ewma_diff + ewma(t-1) / w) - * - * Where 'w' is the weight of new samples, which is configured to be - * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) - */ - ewma <<= UTIL_EST_WEIGHT_SHIFT; - ewma -= last_ewma_diff; - ewma >>= UTIL_EST_WEIGHT_SHIFT; -done: - ewma |= UTIL_AVG_UNCHANGED; - WRITE_ONCE(p->se.avg.util_est, ewma); - - trace_sched_util_est_se_tp(&p->se); -} - static inline unsigned long get_actual_cpu_capacity(int cpu) { unsigned long capacity = arch_scale_cpu_capacity(cpu); @@ -5618,7 +5607,7 @@ static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { bool sleep = flags & DEQUEUE_SLEEP; - int action = UPDATE_TG; + int action = 0; update_curr(cfs_rq); clear_buddies(cfs_rq, se); @@ -5638,15 +5627,23 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { - update_load_avg(cfs_rq, se, 0); + if (entity_is_task(se)) + action |= UPDATE_UTIL_EST; + update_load_avg(cfs_rq, se, action); update_entity_lag(cfs_rq, se); set_delayed(se); return false; } } - if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) - action |= DO_DETACH; + action = UPDATE_TG; + if (entity_is_task(se)) { + if (task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; + + if (sleep && !(flags & DEQUEUE_DELAYED)) + action |= UPDATE_UTIL_EST; + } /* * When dequeuing a sched_entity, we must: @@ -7409,7 +7406,6 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!p->se.sched_delayed) util_est_dequeue(&rq->cfs, p); - util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); if (dequeue_entities(rq, &p->se, flags) < 0) return 
false; -- cgit v1.2.3 From 5bc6ab2d42e545f816def21cfcdb4ba35cc74bf6 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 15 May 2026 22:54:54 +0530 Subject: sched: Simplify ifdeffery around cpu_smt_mask Now that cpu_smt_mask is defined as cpumask_of(cpu) for CONFIG_SCHED_SMT=n, it is possible to get rid of the ifdeffery. Effectively, - This makes sched_smt_present always defined - cpumask_weight(cpumask_of(cpu)) == 1, so sched_smt_present_inc/dec will never enable sched_smt_present, which is expected. - Paths that were compile-time eliminated become runtime guarded using static keys. - Defines set_idle_cores, test_idle_cores, etc., which could likely let CONFIG_SCHED_SMT=n systems use the same optimizations within the LLC at wakeups. - This will expose the sched_smt_present symbol for CONFIG_SCHED_SMT=n. Likely not a concern. - There is some code bloat for CONFIG_SCHED_SMT=n. (NR_CPUS=2048) add/remove: 24/18 grow/shrink: 26/28 up/down: 6396/-3188 (3208) Total: Before=30629880, After=30633088, chg +0.01% - No code bloat for CONFIG_SCHED_SMT=y, which is expected. - Add comments around stop_core_cpuslocked on why ifdefs are not removed. - This leaves the remaining uses of CONFIG_SCHED_SMT mainly for topology building bits, which have a policy-based decision.
Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Acked-by: Tejun Heo Tested-by: K Prateek Nayak Link: https://patch.msgid.link/20260515172456.542799-3-sshegde@linux.ibm.com --- kernel/sched/core.c | 6 ------ kernel/sched/ext_idle.c | 6 ------ kernel/sched/fair.c | 35 ----------------------------------- kernel/sched/sched.h | 6 ------ kernel/sched/topology.c | 2 -- kernel/stop_machine.c | 5 +++++ kernel/workqueue.c | 4 ---- 7 files changed, 5 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b905805bbcbe..3ae5f19c1b7e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8612,18 +8612,14 @@ static void cpuset_cpu_inactive(unsigned int cpu) static inline void sched_smt_present_inc(int cpu) { -#ifdef CONFIG_SCHED_SMT if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_inc_cpuslocked(&sched_smt_present); -#endif } static inline void sched_smt_present_dec(int cpu) { -#ifdef CONFIG_SCHED_SMT if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_dec_cpuslocked(&sched_smt_present); -#endif } int sched_cpu_activate(unsigned int cpu) @@ -8711,9 +8707,7 @@ int sched_cpu_deactivate(unsigned int cpu) */ sched_smt_present_dec(cpu); -#ifdef CONFIG_SCHED_SMT sched_core_cpu_deactivate(cpu); -#endif if (!sched_smp_initialized) return 0; diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 7468560a6d80..2bcf58e99c9b 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -79,7 +79,6 @@ static bool scx_idle_test_and_clear_cpu(int cpu) int node = scx_cpu_node_if_enabled(cpu); struct cpumask *idle_cpus = idle_cpumask(node)->cpu; -#ifdef CONFIG_SCHED_SMT /* * SMT mask should be cleared whether we can claim @cpu or not. The SMT * cluster is not wholly idle either way. 
This also prevents @@ -104,7 +103,6 @@ static bool scx_idle_test_and_clear_cpu(int cpu) else if (cpumask_test_cpu(cpu, idle_smts)) __cpumask_clear_cpu(cpu, idle_smts); } -#endif return cpumask_test_and_clear_cpu(cpu, idle_cpus); } @@ -622,7 +620,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, goto out_unlock; } -#ifdef CONFIG_SCHED_SMT /* * Use @prev_cpu's sibling if it's idle. */ @@ -634,7 +631,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, goto out_unlock; } } -#endif /* * Search for any idle CPU in the same LLC domain. @@ -714,7 +710,6 @@ static void update_builtin_idle(int cpu, bool idle) assign_cpu(cpu, idle_cpus, idle); -#ifdef CONFIG_SCHED_SMT if (sched_smt_active()) { const struct cpumask *smt = cpu_smt_mask(cpu); struct cpumask *idle_smts = idle_cpumask(node)->smt; @@ -731,7 +726,6 @@ static void update_builtin_idle(int cpu, bool idle) cpumask_andnot(idle_smts, idle_smts, smt); } } -#endif } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 09d3acd2d2bc..233bd2ebbb73 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1555,7 +1555,6 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline bool is_core_idle(int cpu) { -#ifdef CONFIG_SCHED_SMT int sibling; for_each_cpu(sibling, cpu_smt_mask(cpu)) { @@ -1565,7 +1564,6 @@ static inline bool is_core_idle(int cpu) if (!idle_cpu(sibling)) return false; } -#endif return true; } @@ -2248,7 +2246,6 @@ numa_type numa_classify(unsigned int imbalance_pct, return node_fully_busy; } -#ifdef CONFIG_SCHED_SMT /* Forward declarations of select_idle_sibling helpers */ static inline bool test_idle_cores(int cpu); static inline int numa_idle_core(int idle_core, int cpu) @@ -2266,12 +2263,6 @@ static inline int numa_idle_core(int idle_core, int cpu) return idle_core; } -#else /* !CONFIG_SCHED_SMT: */ -static inline int numa_idle_core(int idle_core, int cpu) -{ - return idle_core; -} -#endif /* 
!CONFIG_SCHED_SMT */ /* * Gather all necessary information to make NUMA balancing placement @@ -7778,7 +7769,6 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p) return -1; } -#ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -7888,29 +7878,6 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; } -#else /* !CONFIG_SCHED_SMT: */ - -static inline void set_idle_cores(int cpu, int val) -{ -} - -static inline bool test_idle_cores(int cpu) -{ - return false; -} - -static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) -{ - return __select_idle_cpu(core, p); -} - -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - return -1; -} - -#endif /* !CONFIG_SCHED_SMT */ - /* * Scan the LLC domain for idle CPUs; this is dynamically regulated by * comparing the average scan cost (tracked in sd->avg_scan_cost) against the @@ -12002,9 +11969,7 @@ static int should_we_balance(struct lb_env *env) * idle has been found, then its not needed to check other * SMT siblings for idleness: */ -#ifdef CONFIG_SCHED_SMT cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu)); -#endif continue; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9f63b15d309d..e476623a0c2a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1667,7 +1667,6 @@ do { \ flags = _raw_spin_rq_lock_irqsave(rq); \ } while (0) -#ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq); static inline void update_idle_core(struct rq *rq) @@ -1676,12 +1675,7 @@ static inline void update_idle_core(struct rq *rq) __update_idle_core(rq); } -#else /* !CONFIG_SCHED_SMT: */ -static inline void update_idle_core(struct rq *rq) { } -#endif /* !CONFIG_SCHED_SMT */ - #ifdef CONFIG_FAIR_GROUP_SCHED - static inline struct task_struct *task_of(struct sched_entity *se) { 
WARN_ON_ONCE(!entity_is_task(se)); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5847b83d9d55..a1f46e3f4ede 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1310,9 +1310,7 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) cpumask_copy(mask, sched_group_span(sg)); for_each_cpu(cpu, mask) { cores++; -#ifdef CONFIG_SCHED_SMT cpumask_andnot(mask, mask, cpu_smt_mask(cpu)); -#endif } sg->cores = cores; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 3fe6b0c99f3d..773d8e9ae30c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -633,6 +633,11 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) EXPORT_SYMBOL_GPL(stop_machine); #ifdef CONFIG_SCHED_SMT +/* + * INTEL_IFS is the only user of this API. That selftest can + * only be compiled if SMP=y. On x86 it selects SCHED_SMT. + * Keep the ifdefs for now. + */ int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f747f241a5f..99ef412f02a6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -8187,11 +8187,7 @@ static bool __init cpus_dont_share(int cpu0, int cpu1) static bool __init cpus_share_smt(int cpu0, int cpu1) { -#ifdef CONFIG_SCHED_SMT return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); -#else - return false; -#endif } static bool __init cpus_share_numa(int cpu0, int cpu1) -- cgit v1.2.3 From 3dbb362f90f3a8300ed9209d3278e30f8dbfb780 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 15 May 2026 22:54:55 +0530 Subject: sched/fair: Add sched_smt_active check for fastpaths For fastpaths such as wakeup and load balance even minimal code additions can add up. is_core_idle is accessed during load balance. Other callsites of is_core_idle make sched_smt_active() check first. Make the same check in should_we_balance. 
Rest of access to cpu_smt_mask isn't in fastpath. Note: Remove the stale comment above is_core_idle. Enqueue methods of fair aren't close to it anymore. Suggested-by: K Prateek Nayak Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://patch.msgid.link/20260515172456.542799-4-sshegde@linux.ibm.com --- kernel/sched/fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 233bd2ebbb73..14bd31b17c71 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1549,10 +1549,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) se->exec_start = rq_clock_task(rq_of(cfs_rq)); } -/************************************************** - * Scheduling class queueing methods: - */ - +/* Check sched_smt_active before calling this to avoid overheads in fastpaths */ static inline bool is_core_idle(int cpu) { int sibling; @@ -11961,7 +11958,9 @@ static int should_we_balance(struct lb_env *env) * balancing cores, but remember the first idle SMT CPU for * later consideration. Find CPU on an idle core first. */ - if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { + if (sched_smt_active() && + !(env->sd->flags & SD_SHARE_CPUCAPACITY) && + !is_core_idle(cpu)) { if (idle_smt == -1) idle_smt = cpu; /* -- cgit v1.2.3 From acbdbab75ff4b1b87ab3c3d2b6ca86948f472189 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 15 May 2026 22:54:56 +0530 Subject: sched: Unify SMT active check via sched_smt_active() There is a use of sched_smt_active() and explicit use of sched_smt_present. Remove the explicit usage for better code maintenance and readability. Note that this differs slightly for update_idle_core. It used to call static_branch_unlikely earlier and now it will call static_branch_likely. 
Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://patch.msgid.link/20260515172456.542799-5-sshegde@linux.ibm.com --- kernel/sched/core_sched.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 73b6b2426911..43e0bde3038e 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -136,7 +136,7 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, struct pid *grp; int err = 0; - if (!static_branch_likely(&sched_smt_present)) + if (!sched_smt_active()) return -ENODEV; BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD != PIDTYPE_PID); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 14bd31b17c71..bcaadddf8624 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2247,7 +2247,7 @@ numa_type numa_classify(unsigned int imbalance_pct, static inline bool test_idle_cores(int cpu); static inline int numa_idle_core(int idle_core, int cpu) { - if (!static_branch_likely(&sched_smt_present) || + if (!sched_smt_active() || idle_core >= 0 || !test_idle_cores(cpu)) return idle_core; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e476623a0c2a..ffe77b2b6296 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1671,7 +1671,7 @@ extern void __update_idle_core(struct rq *rq); static inline void update_idle_core(struct rq *rq) { - if (static_branch_unlikely(&sched_smt_present)) + if (sched_smt_active()) __update_idle_core(rq); } -- cgit v1.2.3 From c9d93a73ce871ca32caf9308562501290b64b955 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 9 May 2026 20:07:25 +0200 Subject: sched/fair: Drop redundant RCU read lock in NOHZ kick path nohz_balancer_kick() is reached from sched_balance_trigger(), which is called from sched_tick(). 
sched_tick() runs with IRQs disabled, so the additional rcu_read_lock/unlock() used around sched_domain accesses in this path is redundant. Rely on the existing IRQ-disabled context (and the rcu_dereference_all() checking) instead. The same applies to set_cpu_sd_state_idle(), called from the idle entry path with IRQs disabled, and to set_cpu_sd_state_busy(), reachable via nohz_balance_exit_idle() from two contexts: nohz_balancer_kick() (IRQs disabled, as above) and sched_cpu_deactivate() (the CPUHP_AP_ACTIVE teardown, which runs under cpus_write_lock(), so it cannot race with sched-domain rebuilds). In both cases the rcu_dereference_all() validation is sufficient. No functional change intended. Suggested-by: K Prateek Nayak Signed-off-by: Andrea Righi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Reviewed-by: Vincent Guittot Link: https://patch.msgid.link/20260509180955.1840064-2-arighi@nvidia.com --- kernel/sched/fair.c | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bcaadddf8624..03f63b094ff9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12715,8 +12715,6 @@ static void nohz_balancer_kick(struct rq *rq) goto out; } - rcu_read_lock(); - sd = rcu_dereference_all(rq->sd); if (sd) { /* @@ -12724,8 +12722,8 @@ static void nohz_balancer_kick(struct rq *rq) * capacity, kick the ILB to see if there's a better CPU to run on: */ if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) { - flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; - goto unlock; + flags |= NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto out; } } @@ -12741,8 +12739,8 @@ static void nohz_balancer_kick(struct rq *rq) */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym(sd, i, cpu)) { - flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; - goto unlock; + flags |= NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto out; } } } @@ 
-12753,10 +12751,8 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU * to run the misfit task on. */ - if (check_misfit_status(rq)) { - flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; - goto unlock; - } + if (check_misfit_status(rq)) + flags |= NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; /* * For asymmetric systems, we do not want to nicely balance @@ -12765,7 +12761,7 @@ static void nohz_balancer_kick(struct rq *rq) * * Skip the LLC logic because it's not relevant in that case. */ - goto unlock; + goto out; } sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); @@ -12780,13 +12776,9 @@ static void nohz_balancer_kick(struct rq *rq) * like this LLC domain has tasks we could move. */ nr_busy = atomic_read(&sds->nr_busy_cpus); - if (nr_busy > 1) { - flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; - goto unlock; - } + if (nr_busy > 1) + flags |= NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; } -unlock: - rcu_read_unlock(); out: if (READ_ONCE(nohz.needs_update)) flags |= NOHZ_NEXT_KICK; @@ -12798,17 +12790,13 @@ out: static void set_cpu_sd_state_busy(int cpu) { struct sched_domain *sd; - - rcu_read_lock(); sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); if (!sd || !sd->nohz_idle) - goto unlock; + return; sd->nohz_idle = 0; atomic_inc(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); } void nohz_balance_exit_idle(struct rq *rq) @@ -12827,17 +12815,13 @@ void nohz_balance_exit_idle(struct rq *rq) static void set_cpu_sd_state_idle(int cpu) { struct sched_domain *sd; - - rcu_read_lock(); sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); if (!sd || sd->nohz_idle) - goto unlock; + return; sd->nohz_idle = 1; atomic_dec(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); } /* -- cgit v1.2.3 From fdfe5a8cd8731dd81840f26abfb6527edd27b0cb Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Sat, 16 May 2026 07:58:50 +0200 Subject: sched/fair: Attach sched_domain_shared to sd_asym_cpucapacity On asymmetric CPU capacity systems, 
the wakeup path uses select_idle_capacity(), which scans the span of sd_asym_cpucapacity rather than sd_llc. The has_idle_cores hint however lives on sd_llc->shared, so the wakeup-time read of has_idle_cores operates on an LLC-scoped blob while the actual scan/decision spans the asym domain; nr_busy_cpus also lives in the same shared sched_domain data, but it's never used in the asym CPU capacity scenario. Therefore, move the sched_domain_shared object to sd_asym_cpucapacity whenever the CPU has a SD_ASYM_CPUCAPACITY_FULL ancestor and that ancestor is non-overlapping (i.e., not built from SD_NUMA). In that case the scope of has_idle_cores matches the scope of the wakeup scan. Fall back to attaching the shared object to sd_llc in three cases: 1) plain symmetric systems (no SD_ASYM_CPUCAPACITY_FULL anywhere); 2) CPUs in an exclusive cpuset that carves out a symmetric capacity island: has_asym is system-wide but those CPUs have no SD_ASYM_CPUCAPACITY_FULL ancestor in their hierarchy and follow the symmetric LLC path in select_idle_sibling(); 3) exotic topologies where SD_ASYM_CPUCAPACITY_FULL lands on an SD_NUMA-built domain. init_sched_domain_shared() keys the shared blob off cpumask_first(span), which on overlapping NUMA domains would alias unrelated spans onto the same blob. Keep the shared object on the LLC there; select_idle_capacity() gracefully skips the has_idle_cores preference when sd->shared is NULL. While at it, also rename the per-CPU sd_llc_shared to sd_balance_shared, as it is no longer strictly tied to the LLC. 
Co-developed-by: Andrea Righi Signed-off-by: Andrea Righi Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Shrikanth Hegde Acked-by: Vincent Guittot Link: https://patch.msgid.link/20260516055850.1345932-1-arighi@nvidia.com --- kernel/sched/fair.c | 22 +++++++----- kernel/sched/sched.h | 2 +- kernel/sched/topology.c | 95 ++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 97 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 03f63b094ff9..2637a6fe9a87 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7773,7 +7773,7 @@ static inline void set_idle_cores(int cpu, int val) { struct sched_domain_shared *sds; - sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu)); if (sds) WRITE_ONCE(sds->has_idle_cores, val); } @@ -7782,7 +7782,7 @@ static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; - sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu)); if (sds) return READ_ONCE(sds->has_idle_cores); @@ -7791,7 +7791,7 @@ static inline bool test_idle_cores(int cpu) /* * Scans the local SMT mask to see if the entire core is idle, and records this - * information in sd_llc_shared->has_idle_cores. + * information in sd_balance_shared->has_idle_cores. * * Since SMT siblings share all cache levels, inspecting this limited remote * state should be fairly cheap. @@ -7821,7 +7821,8 @@ unlock: /* * Scan the entire LLC domain for idle cores; this dynamically switches off if * there are no idle cores left in the system; tracked through - * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. + * sd_balance_shared->has_idle_cores and enabled through update_idle_core() + * above. 
*/ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { @@ -7885,7 +7886,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; - if (sched_feat(SIS_UTIL)) { + if (sched_feat(SIS_UTIL) && sd->shared) { /* * Increment because !--nr is the condition to stop scan. * @@ -12764,7 +12765,7 @@ static void nohz_balancer_kick(struct rq *rq) goto out; } - sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu)); if (sds) { /* * If there is an imbalance between LLC domains (IOW we could @@ -12792,7 +12793,11 @@ static void set_cpu_sd_state_busy(int cpu) struct sched_domain *sd; sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); - if (!sd || !sd->nohz_idle) + /* + * sd->nohz_idle only pairs with nr_busy_cpus on sd->shared; if this + * domain has no shared object there is nothing to clear or account. + */ + if (!sd || !sd->shared || !sd->nohz_idle) return; sd->nohz_idle = 0; @@ -12817,7 +12822,8 @@ static void set_cpu_sd_state_idle(int cpu) struct sched_domain *sd; sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); - if (!sd || sd->nohz_idle) + /* See set_cpu_sd_state_busy(): nohz_idle is only used with sd->shared. 
*/ + if (!sd || !sd->shared || sd->nohz_idle) return; sd->nohz_idle = 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ffe77b2b6296..bfb4b47c021b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2164,7 +2164,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(int, sd_share_id); -DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a1f46e3f4ede..f96d50131495 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -665,7 +665,7 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(int, sd_share_id); -DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); @@ -680,20 +680,38 @@ static void update_top_cache_domain(int cpu) int id = cpu; int size = 1; + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); + /* + * The shared object is attached to sd_asym_cpucapacity only when the + * asym domain is non-overlapping (i.e., not built from SD_NUMA). + * On overlapping (NUMA) asym domains we fall back to letting the + * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL + * here. 
+ */ + if (sd && sd->shared) + sds = sd->shared; + + rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); + sd = highest_flag_domain(cpu, SD_SHARE_LLC); if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - /* If sd_llc exists, sd_llc_shared should exist too. */ - WARN_ON_ONCE(!sd->shared); - sds = sd->shared; + /* + * If sd_asym_cpucapacity didn't claim the shared object, + * sd_llc must have one linked. + */ + if (!sds) { + WARN_ON_ONCE(!sd->shared); + sds = sd->shared; + } } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; - rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_CLUSTER); if (sd) @@ -711,9 +729,6 @@ static void update_top_cache_domain(int cpu) sd = highest_flag_domain(cpu, SD_ASYM_PACKING); rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); - - sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); - rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); } /* @@ -2648,6 +2663,54 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc) } } +static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd) +{ + int sd_id = cpumask_first(sched_domain_span(sd)); + + sd->shared = *per_cpu_ptr(d->sds, sd_id); + /* + * nr_busy_cpus is consumed only by the NOHZ kick path via + * sd_balance_shared; on the asym-capacity path it is initialized but + * never read. + */ + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight); + atomic_inc(&sd->shared->ref); +} + +/* + * For asymmetric CPU capacity, attach sched_domain_shared on the innermost + * SD_ASYM_CPUCAPACITY_FULL ancestor of @cpu's base domain when that ancestor is + * not an overlapping NUMA-built domain (then LLC should claim shared). + * + * A CPU may lack any FULL ancestor (e.g., exclusive cpuset symmetric island), + * then LLC must claim shared instead. 
+ * + * Note: SD_ASYM_CPUCAPACITY_FULL is only set when all CPU capacity values + * are present in the domain span, so the asym domain we attach to cannot + * degenerate into a single-capacity group. The relevant edge cases are instead + * covered by the caveats above. + * + * Return true if this CPU's asym path claimed sd->shared, false otherwise. + */ +static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu) +{ + struct sched_domain *sd = *per_cpu_ptr(d->sd, cpu); + struct sched_domain *sd_asym; + + if (!sd) + return false; + + sd_asym = sd; + while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL)) + sd_asym = sd_asym->parent; + + if (!sd_asym || (sd_asym->flags & SD_NUMA)) + return false; + + init_sched_domain_shared(d, sd_asym); + return true; +} + /* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs @@ -2706,20 +2769,26 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } for_each_cpu(i, cpu_map) { + bool asym_claimed = false; + sd = *per_cpu_ptr(d.sd, i); if (!sd) continue; + if (has_asym) + asym_claimed = claim_asym_sched_domain_shared(&d, i); + /* First, find the topmost SD_SHARE_LLC domain */ while (sd->parent && (sd->parent->flags & SD_SHARE_LLC)) sd = sd->parent; if (sd->flags & SD_SHARE_LLC) { - int sd_id = cpumask_first(sched_domain_span(sd)); - - sd->shared = *per_cpu_ptr(d.sds, sd_id); - atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight); - atomic_inc(&sd->shared->ref); + /* + * Initialize the sd->shared for SD_SHARE_LLC unless + * the asym path above already claimed it. 
+ */ + if (!asym_claimed) + init_sched_domain_shared(&d, sd); /* * In presence of higher domains, adjust the -- cgit v1.2.3 From 25a32e400a14009601c0a727643057f5515152df Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 11 May 2026 16:25:02 +0200 Subject: sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection On systems with asymmetric CPU capacity (e.g., ACPI/CPPC reporting different per-core frequencies), the wakeup path uses select_idle_capacity() and prioritizes idle CPUs with higher capacity for better task placement. However, when those CPUs belong to SMT cores, their effective capacity can be much lower than the nominal capacity when the sibling thread is busy: SMT siblings compete for shared resources, so a "high capacity" CPU that is idle but whose sibling is busy does not deliver its full capacity. This effective capacity reduction cannot be modeled by the static capacity value alone. Introduce SMT awareness in the asym-capacity idle selection policy: when SMT is active, always prefer fully-idle SMT cores over partially-idle ones. Prioritizing fully-idle SMT cores yields better task placement because the effective capacity of partially-idle SMT cores is reduced; always preferring them when available leads to more accurate capacity usage on task wakeup. On an SMT system with asymmetric CPU capacities (NVIDIA Vera Rubin), SMT-aware idle selection has been shown to improve throughput by around 15-18% over NO_ASYM mainline and by around 60% over ASYM mainline, for CPU-bound workloads (NVBLAS) running an amount of tasks equal to the amount of SMT cores. 
Reported-by: Felix Abecassis Signed-off-by: Andrea Righi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/20260511142502.3873984-1-arighi@nvidia.com --- kernel/sched/fair.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 114 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2637a6fe9a87..8854d4d980b0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7950,6 +7950,54 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool return idle_cpu; } +/* + * Idle-capacity scan converts util_fits_cpu() outcomes into preference ranks, + * where lower values indicate a better fit - see select_idle_capacity(). + * + * A CPU that both fits the task and sits on a fully-idle SMT core is returned + * immediately and is never assigned one of these ranks. On !SMT every CPU is + * its own "core", so the early return covers all fits-and-idle cases and the + * core-tier ranks below become unreachable. + * + * Rank Val Tier Meaning + * ------------------------------ --- ------ --------------------------- + * ASYM_IDLE_UCLAMP_MISFIT -4 core Idle core; capacity fits + * util but uclamp_min misses. + * ASYM_IDLE_COMPLETE_MISFIT -3 core Idle core; capacity does + * not fit. Still beats every + * thread-tier rank: a busy + * sibling cuts effective + * capacity more than a + * misfit hurts a quiet core. + * ASYM_IDLE_THREAD_FITS -2 thread Busy SMT sibling; capacity + * fits util + uclamp. + * ASYM_IDLE_THREAD_UCLAMP_MISFIT -1 thread Busy SMT sibling; capacity + * fits but uclamp_min misses + * (native util_fits_cpu() + * return value). + * ASYM_IDLE_THREAD_MISFIT 0 thread Busy SMT sibling; capacity + * does not fit. + * + * ASYM_IDLE_CORE_BIAS (-3) is an offset, not a state. 
On an idle core, + * fits += ASYM_IDLE_CORE_BIAS rebases thread-tier ranks into the core tier: + * + * ASYM_IDLE_THREAD_UCLAMP_MISFIT (-1) + BIAS -> ASYM_IDLE_UCLAMP_MISFIT (-4) + * ASYM_IDLE_THREAD_MISFIT (0) + BIAS -> ASYM_IDLE_COMPLETE_MISFIT (-3) + * + * ASYM_IDLE_THREAD_FITS (-2) is never rebased because a fully-fitting idle-core + * candidate early-returns from select_idle_capacity(). + */ +enum asym_fits_state { + ASYM_IDLE_UCLAMP_MISFIT = -4, + ASYM_IDLE_COMPLETE_MISFIT, + ASYM_IDLE_THREAD_FITS, + ASYM_IDLE_THREAD_UCLAMP_MISFIT, + ASYM_IDLE_THREAD_MISFIT, + + /* util_fits_cpu() bias for idle core */ + ASYM_IDLE_CORE_BIAS = -3, +}; + /* * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which * the task fits. If no CPU is big enough, but there are idle ones, try to @@ -7958,8 +8006,14 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { + /* + * On !SMT systems, has_idle_core is always false and preferred_core + * is always true (CPU == core), so the SMT preference logic below + * collapses to the plain capacity scan. 
+ */ + bool has_idle_core = sched_smt_active() && test_idle_cores(target); unsigned long task_util, util_min, util_max, best_cap = 0; - int fits, best_fits = 0; + int fits, best_fits = ASYM_IDLE_THREAD_MISFIT; int cpu, best_cpu = -1; struct cpumask *cpus; @@ -7971,6 +8025,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) util_max = uclamp_eff_value(p, UCLAMP_MAX); for_each_cpu_wrap(cpu, cpus, target) { + bool preferred_core = !has_idle_core || is_core_idle(cpu); unsigned long cpu_cap = capacity_of(cpu); if (!choose_idle_cpu(cpu, p)) @@ -7978,8 +8033,14 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) fits = util_fits_cpu(task_util, util_min, util_max, cpu); - /* This CPU fits with all requirements */ - if (fits > 0) + /* + * Perfect fit: capacity satisfies util + uclamp and the CPU + * sits on a fully-idle SMT core, this is a !SMT system, or + * there is no idle core to find. + * Short-circuit the rank-based selection and return + * immediately. + */ + if (fits > 0 && preferred_core) return cpu; /* * Only the min performance hint (i.e. uclamp_min) doesn't fit. @@ -7987,9 +8048,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) */ else if (fits < 0) cpu_cap = get_actual_cpu_capacity(cpu); + /* + * fits > 0 implies we are not on a preferred core, but the util + * fits CPU capacity. Set fits to ASYM_IDLE_THREAD_FITS + * so the effective range becomes + * [ASYM_IDLE_THREAD_FITS, ASYM_IDLE_THREAD_MISFIT], where: + * ASYM_IDLE_THREAD_MISFIT - does not fit + * ASYM_IDLE_THREAD_UCLAMP_MISFIT - fits with the exception of UCLAMP_MIN + * ASYM_IDLE_THREAD_FITS - fits with the exception of preferred_core + */ + else if (fits > 0) + fits = ASYM_IDLE_THREAD_FITS; /* - * First, select CPU which fits better (-1 being better than 0). 
+ * If we are on a preferred core, translate the range of fits + * of [ASYM_IDLE_THREAD_UCLAMP_MISFIT, ASYM_IDLE_THREAD_MISFIT] to + * [ASYM_IDLE_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT]. + * This ensures that an idle core is always given priority over + * (partially) busy core. + * + * A fully fitting idle core would have returned early and hence + * fits > 0 for preferred_core need not be dealt with. + */ + if (preferred_core) + fits += ASYM_IDLE_CORE_BIAS; + + /* + * First, select CPU which fits better (lower is more preferred). * Then, select the one with best capacity at same level. */ if ((fits < best_fits) || @@ -8000,6 +8085,19 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) } } + /* + * A value in the [ASYM_IDLE_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT] + * range means the chosen CPU is in a fully idle SMT core. Values above + * ASYM_IDLE_COMPLETE_MISFIT mean we never ranked such a CPU best. + * + * The asym-capacity wakeup path returns from select_idle_sibling() + * after this function and never runs select_idle_cpu(), so the usual + * select_idle_cpu() tail that clears idle cores must live here when the + * idle-core preference did not win. + */ + if (has_idle_core && best_fits > ASYM_IDLE_COMPLETE_MISFIT) + set_idle_cores(target, false); + return best_cpu; } @@ -8008,12 +8106,22 @@ static inline bool asym_fits_cpu(unsigned long util, unsigned long util_max, int cpu) { - if (sched_asym_cpucap_active()) + if (sched_asym_cpucap_active()) { /* * Return true only if the cpu fully fits the task requirements * which include the utilization and the performance hints. + * + * When SMT is active, also require that the core has no busy + * siblings. + * + * Note: gating on is_core_idle() also makes the early-bailout + * candidates in select_idle_sibling() (target, prev, + * recent_used_cpu) idle-core-aware on ASYM+SMT, which the + * NO_ASYM path does not do. 
*/ - return (util_fits_cpu(util, util_min, util_max, cpu) > 0); + return (!sched_smt_active() || is_core_idle(cpu)) && + (util_fits_cpu(util, util_min, util_max, cpu) > 0); + } return true; } -- cgit v1.2.3 From bf6aa722198d3c06e4236e8c5a480f30a64e1513 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 9 May 2026 20:07:28 +0200 Subject: sched/fair: Reject misfit pulls onto busy SMT siblings on asym-capacity When SD_ASYM_CPUCAPACITY load balancing considers pulling a misfit task, capacity_of(dst_cpu) can overstate available compute if the SMT sibling is busy: the core does not deliver its full nominal capacity. If SMT is active and dst_cpu is not on a fully idle core, skip this destination so we do not migrate a misfit expecting a capacity upgrade we cannot actually provide. Reported-by: Felix Abecassis Signed-off-by: Andrea Righi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://patch.msgid.link/20260509180955.1840064-5-arighi@nvidia.com --- kernel/sched/fair.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8854d4d980b0..f69ee5ae2b8c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9625,6 +9625,7 @@ struct lb_env { int dst_cpu; struct rq *dst_rq; + bool dst_core_idle; struct cpumask *dst_grpmask; int new_dst_cpu; @@ -10850,10 +10851,16 @@ static bool update_sd_pick_busiest(struct lb_env *env, * We can use max_capacity here as reduction in capacity on some * CPUs in the group should either be possible to resolve * internally or be covered by avg_load imbalance (eventually). + * + * When SMT is active, only pull a misfit to dst_cpu if it is on a + * fully idle core; otherwise the effective capacity of the core is + * reduced and we may not actually provide more capacity than the + * source. 
*/ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && (sgs->group_type == group_misfit_task) && - (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || + (!env->dst_core_idle || + !capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || sds->local_stat.group_type != group_has_spare)) return false; @@ -11417,6 +11424,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd unsigned long sum_util = 0; bool sg_overloaded = 0, sg_overutilized = 0; + env->dst_core_idle = !sched_smt_active() || is_core_idle(env->dst_cpu); + do { struct sg_lb_stats *sgs = &tmp_sgs; int local_group; -- cgit v1.2.3 From 61ea17a63719bac51e1bc50eb39fc637f0fdc06e Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Sat, 9 May 2026 20:07:29 +0200 Subject: sched/fair: Add SIS_UTIL support to select_idle_capacity() Add to select_idle_capacity() the same SIS_UTIL-controlled idle-scan mechanism, already used by select_idle_cpu(): when sched_feat(SIS_UTIL) is enabled and the LLC domain has sched_domain_shared data, derive the per-attempt scan limit from sd->shared->nr_idle_scan. That bounds the walk on large LLCs: once nr_idle_scan is exhausted, return the best CPU seen so far. The early exit is gated on !has_idle_core so an active idle-core search (SMT with idle cores reported by test_idle_cores()) isn't cut short before it gets a chance to find one. 
Co-developed-by: Andrea Righi Signed-off-by: Andrea Righi Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://patch.msgid.link/20260509180955.1840064-6-arighi@nvidia.com --- kernel/sched/fair.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f69ee5ae2b8c..69ba882681c5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8016,6 +8016,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) int fits, best_fits = ASYM_IDLE_THREAD_MISFIT; int cpu, best_cpu = -1; struct cpumask *cpus; + int nr = INT_MAX; cpus = this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); @@ -8024,10 +8025,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) util_min = uclamp_eff_value(p, UCLAMP_MIN); util_max = uclamp_eff_value(p, UCLAMP_MAX); + if (sched_feat(SIS_UTIL) && sd->shared) { + /* + * Same nr_idle_scan hint as select_idle_cpu(), nr only limits + * the scan when not preferring an idle core. + */ + nr = READ_ONCE(sd->shared->nr_idle_scan) + 1; + /* overloaded domain is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; + } + for_each_cpu_wrap(cpu, cpus, target) { bool preferred_core = !has_idle_core || is_core_idle(cpu); unsigned long cpu_cap = capacity_of(cpu); + /* + * Stop when the nr_idle_scan is exhausted (mirrors + * select_idle_cpu() logic). 
+ */ + if (!has_idle_core && --nr <= 0) + return best_cpu; + if (!choose_idle_cpu(cpu, p)) continue; -- cgit v1.2.3 From 04f80f8b12a02fa2e0827c8f37eb357adca8ce44 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 14 May 2026 23:47:17 +0000 Subject: sched: Switch rq->next_class on proxy_resched_idle() K Prateek noticed we weren't setting the rq->next_class in proxy_resched_idle(), when I was debugging an issue seen with CONFIG_SCHED_PROXY_EXEC and some of Peter's new patches, and suggested this fix. So set rq->next_class when we temporarily switch the donor to idle, so we don't accidentally call wakeup_preempt_fair() with idle as the donor. Suggested-by: K Prateek Nayak Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260514234732.3170197-1-jstultz@google.com --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3ae5f19c1b7e..77f4ebe8f5c7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6653,6 +6653,7 @@ static inline void proxy_set_task_cpu(struct task_struct *p, int cpu) static inline struct task_struct *proxy_resched_idle(struct rq *rq) { put_prev_set_next_task(rq, rq->donor, rq->idle); + rq->next_class = &idle_sched_class; rq_set_donor(rq, rq->idle); set_tsk_need_resched(rq->idle); return rq->idle; -- cgit v1.2.3 From dd29c017aed628076e915fe4cdfb5392fd4c5cab Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 15 May 2026 10:37:40 -0400 Subject: sched/rt: Have RT_PUSH_IPI be default off for non PREEMPT_RT RT migration is done aggressively. When a CPU schedules out a high priority RT task for a lower priority task, it will look to see if there's any RT tasks that are waiting to run on another CPU that is of higher priority than the task this CPU is about to run. If it finds one, it will pull that task over to the CPU and allow it to run there instead. 
Normally, this pulling is done by looking at the RT overloaded mask (rto) which contains all the CPUs in the scheduler domain with RT tasks that are waiting to run due to a higher priority RT task currently running on their CPU. The CPU that is about to schedule a lower priority task will grab the rq lock of the overloaded CPU and move the RT task from that CPU's runqueue to the local one and schedule the higher priority RT task. This caused issues when a lot of CPUs would schedule a lower priority task at the same time. They would all try to grab the same runqueue lock of the CPU with the overloaded RT tasks. Only the first CPU that got in will get that task. All the others would wait until they got the runqueue lock and see there's nothing to pull and do nothing. On systems with lots of CPUs, this caused a large latency (up to 500us) which is beyond what PREEMPT_RT is to allow. The solution to that was to create an RT_PUSH_IPI logic. When any CPU wanted to pull a task, instead of grabbing the runqueue lock of the overloaded CPU, it would start by sending an IPI to the overloaded CPU, and that IPI handler would have the CPU with the waiting RT task do a push instead. Then that handler would send an IPI to the next CPU with overloaded RT tasks, and so on. Note, after the first CPU starts this process, if another CPU wanted to do a pull, it would see that the process has already begun and would only increment a counter to have the IPIs continue again. The RT_PUSH_IPI solved the latency problem with PREEMPT_RT but could cause a new issue with non PREEMPT_RT. Namely, softirqs run in a threaded context on PREEMPT_RT but they can run in an interrupt context in non-RT. If an IPI lands on a CPU that has just woken up multiple RT tasks and the current CPU is running a non RT or a low priority RT task, instead of doing a push, it would simply do a schedule on that CPU. 
But if a softirq was also executing on this CPU, the schedule would need to wait until the softirq finished. Until then, the CPU would still be considered overloaded as there are RT tasks still waiting to run on it. A live lock occurred on a workload that was doing heavy networking traffic on a large machine where the softirqs would run 500us out of 750us. And it would also be waking up RT tasks, causing the RT pull logic to be constantly executed. When a softirq triggered on a CPU with RT tasks queued but not running yet, and the other CPUs would see this CPU as being overloaded, they would send an IPI over to it. The CPU would notice that the waiting RT tasks are of higher priority than the currently running task and simply schedule that CPU instead. But because the softirq was executing, before it could schedule, it would receive another IPI to do the same. The amount of IPIs would slow down the currently running softirq so much that before it could return back to task context, it would execute another softirq never allowing the CPU to schedule. This live locked that CPU. As RT_PUSH_IPI was created to help PREEMPT_RT, make it default off if PREEMPT_RT is not enabled. 
Fixes: b6366f048e0c ("sched/rt: Use IPI to trigger RT task push migration instead of pulling") Closes: https://lore.kernel.org/all/20260506235716.2530720-1-tj@kernel.org/ Reported-by: Tejun Heo Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260515103740.25ccbed8@gandalf.local.home --- kernel/sched/features.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 84c4fe3abd74..8f0dee8fc475 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -110,8 +110,16 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) * rq lock and possibly create a large contention, sending an * IPI to that CPU and let that CPU push the RT task to where * it should go may be a better scenario. + * + * This is best for PREEMPT_RT, but for non-RT it can cause issues + * when preemption is disabled for long periods of time. Have + * it only default enabled for PREEMPT_RT. */ +# ifdef CONFIG_PREEMPT_RT SCHED_FEAT(RT_PUSH_IPI, true) +# else +SCHED_FEAT(RT_PUSH_IPI, false) +# endif #endif SCHED_FEAT(RT_RUNTIME_SHARE, false) -- cgit v1.2.3 From 9e005ed21152d4a4bb0ceea71045ff8a642a6feb Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 19 May 2026 05:14:23 +0000 Subject: sched/topology: Allow multiple domains to claim sched_domain_shared Recent optimizations of sd->shared assignment moved to allocating a single instance of per-CPU sched_domain_shared objects per s_data. Recent optimizations to select_idle_capacity() moved the sd->shared assignments to "sd_asym" domain when ASYM_CPUCAPACITY is detected but cache-aware scheduling mandates the presence of "sd_llc_shared" to compute and cache per-LLC statistics. Use an "alloc_flags" union in sched_domain_shared to claim a sched_domain_shared object per sched_domain. 
Allocation starts searching for an available / matching sched_domain_shared instance from the first CPU of sched_domain_span(sd) (sd can be sd_llc, or sd_asym). If the shared object is claimed by another domain, the instance corresponding to next CPU in the domain span is explored until a matching / available instance is found. In case of a single CPU in sched_domain_span(), the domain will be degenerated and a temporary overlap of ->shared objects across different domains is acceptable. "alloc_flags" forms a union with "nr_idle_scan" and the stale flags are left as is when the sd->shared is published. The expectation is for the first load balancing instance to correct the value just like the current behavior, except the initial value is no longer 0. Originally-by: Peter Zijlstra Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Tested-by: Andrea Righi --- kernel/sched/topology.c | 63 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dbfd9657f897..df2ceb54c970 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -623,6 +623,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) } while (sg != first); } +static void free_sched_domain_shared(struct sched_domain_shared *sds) +{ + if (sds && atomic_dec_and_test(&sds->ref)) + kfree(sds); +} + static void destroy_sched_domain(struct sched_domain *sd) { /* @@ -631,9 +637,7 @@ static void destroy_sched_domain(struct sched_domain *sd) * dropping group/capacity references, freeing where none remain. 
*/ free_sched_groups(sd->groups, 1); - - if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) - kfree(sd->shared); + free_sched_domain_shared(sd->shared); #ifdef CONFIG_SCHED_CACHE /* only the bottom sd has llc_counts array */ @@ -755,7 +759,14 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Pick reference to parent->shared. */ if (parent->shared) { - WARN_ON_ONCE(tmp->shared); + /* + * It is safe to free a sd->shared that + * has not been published yet. If a + * sd->shared was published, the refcount + * will end up being non-zero and it will + * not be freed here. + */ + free_sched_domain_shared(tmp->shared); tmp->shared = parent->shared; parent->shared = NULL; } @@ -2916,11 +2927,45 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc) } } -static void init_sched_domain_shared(struct s_data *d, struct sched_domain *sd) +static void +init_sched_domain_shared(struct s_data *d, struct sched_domain *sd, int flags) { - int sd_id = cpumask_first(sched_domain_span(sd)); + struct sched_domain_shared *sds = NULL; + int cpu; + + /* + * Multiple domains can try to claim a shared object like + * SD_ASYM_CPUCAPACITY and SD_SHARE_LLC which can alias to + * same cpumask_first(sched_domain_span(sd)) CPU and can + * cause "nr_idle_scan" to be populated incorrectly during + * load balancing. + * + * Find the first CPU in sched_domain_span(sd) with an + * unclaimed domain (!alloc_flags) or where the alloc_flag + * matches the requested flag (SD_* flag) + * + * If the domain only has single CPU, allow temporary overlap + * in allocation since the domains will be degenerated later. + */ + for_each_cpu(cpu, sched_domain_span(sd)) { + sds = *per_cpu_ptr(d->sds, cpu); + + if (!sds->alloc_flags || + sd->span_weight == 1 || + sds->alloc_flags == flags) { + sds->alloc_flags = flags; + sd->shared = sds; + break; + } + } + + /* + * Use the sd_shared corresponding to the last + * CPU in the span if none are available.
+ */ + if (WARN_ON_ONCE(!sd->shared)) + sd->shared = sds; - sd->shared = *per_cpu_ptr(d->sds, sd_id); /* * nr_busy_cpus is consumed only by the NOHZ kick path via * sd_balance_shared; on the asym-capacity path it is initialized but @@ -2960,7 +3005,7 @@ static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu) if (!sd_asym || (sd_asym->flags & SD_NUMA)) return false; - init_sched_domain_shared(d, sd_asym); + init_sched_domain_shared(d, sd_asym, SD_ASYM_CPUCAPACITY); return true; } @@ -3115,7 +3160,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = sd->parent; if (sd->flags & SD_SHARE_LLC) { - init_sched_domain_shared(&d, sd); + init_sched_domain_shared(&d, sd, SD_SHARE_LLC); /* * In presence of higher domains, adjust the -- cgit v1.2.3 From 25139c11693afed894db46d1a44e2b6e015b804d Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 22 May 2026 11:25:23 +0200 Subject: sched/fair: Fix RCU usage in NOHZ exit path on CPU offline Commit c9d93a73ce87 ("sched/fair: Drop redundant RCU read lock in NOHZ kick path") removed the rcu_read_lock()/unlock() pair from set_cpu_sd_state_busy() and set_cpu_sd_state_idle() on the assumption that all callers run in a safe context for rcu_dereference_all(): IRQs disabled or cpus_write_lock() held. That assumption is wrong for the CPU hotplug teardown path. When CPUs are taken offline, set_cpu_sd_state_busy() is invoked via: cpuhp/N kthread cpuhp_thread_fun() cpuhp_invoke_callback() sched_cpu_deactivate() nohz_balance_exit_idle() set_cpu_sd_state_busy() rcu_dereference_all(per_cpu(sd_llc, cpu)) The cpuhp kthread holds cpu_hotplug_lock (percpu-rwsem) but runs with preemption and IRQs enabled. As a result, lockdep correctly reports a suspicious RCU usage on CPU offline, e.g.: # echo 0 > /sys/devices/system/cpu/cpu1/online ============================= WARNING: suspicious RCU usage ----------------------------- kernel/sched/fair.c:12793 suspicious rcu_dereference_check() usage! ... 
2 locks held by cpuhp/1/20: #0: (cpu_hotplug_lock){++++}-{0:0}, at: cpuhp_thread_fun+0x42/0x1ae #1: (cpuhp_state-down){+.+.}-{0:0}, at: cpuhp_thread_fun+0x72/0x1ae Call Trace: lockdep_rcu_suspicious nohz_balance_exit_idle sched_cpu_deactivate cpuhp_invoke_callback cpuhp_thread_fun smpboot_thread_fn Fix this by adding RCU read lock coverage to the one caller that lacks it: nohz_balance_exit_idle() in the CPU hotplug teardown. The other callers (nohz_balancer_kick() and nohz_balance_enter_idle()) genuinely run with IRQs disabled, so they remain unchanged. Fixes: c9d93a73ce87 ("sched/fair: Drop redundant RCU read lock in NOHZ kick path") Closes: https://lore.kernel.org/all/38fe0a1d-1a48-435a-910a-c278024d9ac9@samsung.com/ Reported-by: Marek Szyprowski Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Andrea Righi Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260522092523.2046095-1-arighi@nvidia.com --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7fb3f5f2d48c..b3a416b1c251 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8681,7 +8681,8 @@ int sched_cpu_deactivate(unsigned int cpu) * Remove CPU from nohz.idle_cpus_mask to prevent participating in * load balancing when not active */ - nohz_balance_exit_idle(rq); + scoped_guard (rcu) + nohz_balance_exit_idle(rq); set_cpu_active(cpu, false); -- cgit v1.2.3 From 333f6f0e11acc20d036f94f94709874f76d0b430 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 May 2026 13:31:05 +0200 Subject: sched/debug: Use char * instead of char (*)[] Some of the fancy AI robots are getting 'upset'. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260511120627.065013766@infradead.org --- kernel/sched/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ed3a0d65da0c..af13a896858c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -136,7 +136,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, if (cnt > 63) cnt = 63; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -263,7 +263,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, if (cnt > 15) cnt = 15; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; -- cgit v1.2.3 From 77557002234546a4fb46ebf517d6cb1d515535a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 May 2026 13:31:06 +0200 Subject: sched: Use {READ,WRITE}_ONCE() for preempt_dynamic_mode Robots figured out you can read and write this concurrently and got 'upset'. Gemini even noted sched_dynamic_show() can generate 'confusing' output if it observed different values during the printing. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260511120627.176946327@infradead.org --- kernel/sched/core.c | 15 ++++++++------- kernel/sched/debug.c | 5 +++-- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b3a416b1c251..83202f090f0c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7979,7 +7979,7 @@ static void __sched_dynamic_update(int mode) break; } - preempt_dynamic_mode = mode; + WRITE_ONCE(preempt_dynamic_mode, mode); } void sched_dynamic_update(int mode) @@ -8020,12 +8020,13 @@ static void __init preempt_dynamic_init(void) } } -# define PREEMPT_MODEL_ACCESSOR(mode) \ - bool preempt_model_##mode(void) \ - { \ - WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ - return preempt_dynamic_mode == preempt_dynamic_##mode; \ - } \ +# define PREEMPT_MODEL_ACCESSOR(mode) \ + bool preempt_model_##mode(void) \ + { \ + int mode = READ_ONCE(preempt_dynamic_mode); \ + WARN_ON_ONCE(mode == preempt_dynamic_undefined); \ + return mode == preempt_dynamic_##mode; \ + } \ EXPORT_SYMBOL_GPL(preempt_model_##mode) PREEMPT_MODEL_ACCESSOR(none); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index af13a896858c..4cdf2f9c5d9e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -281,6 +281,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { int i = (IS_ENABLED(CONFIG_PREEMPT_RT) || IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY)) * 2; + int mode = READ_ONCE(preempt_dynamic_mode); int j; /* Count entries in NULL terminated preempt_modes */ @@ -289,10 +290,10 @@ static int sched_dynamic_show(struct seq_file *m, void *v) j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); for (; i < j; i++) { - if (preempt_dynamic_mode == i) + if (mode == i) seq_puts(m, "("); seq_puts(m, preempt_modes[i]); - if (preempt_dynamic_mode == i) + if (mode == i) 
seq_puts(m, ")"); seq_puts(m, " "); -- cgit v1.2.3 From e05777c44e53df8ab41d930510a384d65a34aafa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 May 2026 13:31:07 +0200 Subject: sched/debug: Collapse subsequent CONFIG_SCHED_CLASS_EXT sections Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260511120627.281160085@infradead.org --- kernel/sched/debug.c | 92 +++++++++++++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4cdf2f9c5d9e..4809e1d23081 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -488,6 +488,8 @@ static const struct file_operations fair_server_runtime_fops = { .release = single_release, }; +static struct dentry *debugfs_sched; + #ifdef CONFIG_SCHED_CLASS_EXT static ssize_t sched_ext_server_runtime_write(struct file *filp, const char __user *ubuf, @@ -520,75 +522,92 @@ static const struct file_operations ext_server_runtime_fops = { .llseek = seq_lseek, .release = single_release, }; -#endif /* CONFIG_SCHED_CLASS_EXT */ static ssize_t -sched_fair_server_period_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +sched_ext_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { long cpu = (long) ((struct seq_file *) filp->private_data)->private; struct rq *rq = cpu_rq(cpu); return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD, - &rq->fair_server); + &rq->ext_server); } -static int sched_fair_server_period_show(struct seq_file *m, void *v) +static int sched_ext_server_period_show(struct seq_file *m, void *v) { unsigned long cpu = (unsigned long) m->private; struct rq *rq = cpu_rq(cpu); - return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server); + return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server); } -static int sched_fair_server_period_open(struct inode *inode, struct file *filp) 
+static int sched_ext_server_period_open(struct inode *inode, struct file *filp) { - return single_open(filp, sched_fair_server_period_show, inode->i_private); + return single_open(filp, sched_ext_server_period_show, inode->i_private); } -static const struct file_operations fair_server_period_fops = { - .open = sched_fair_server_period_open, - .write = sched_fair_server_period_write, +static const struct file_operations ext_server_period_fops = { + .open = sched_ext_server_period_open, + .write = sched_ext_server_period_write, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; -#ifdef CONFIG_SCHED_CLASS_EXT +static void debugfs_ext_server_init(void) +{ + struct dentry *d_ext; + unsigned long cpu; + + d_ext = debugfs_create_dir("ext_server", debugfs_sched); + if (!d_ext) + return; + + for_each_possible_cpu(cpu) { + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%lu", cpu); + d_cpu = debugfs_create_dir(buf, d_ext); + + debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops); + debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops); + } +} +#endif /* CONFIG_SCHED_CLASS_EXT */ + static ssize_t -sched_ext_server_period_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +sched_fair_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { long cpu = (long) ((struct seq_file *) filp->private_data)->private; struct rq *rq = cpu_rq(cpu); return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD, - &rq->ext_server); + &rq->fair_server); } -static int sched_ext_server_period_show(struct seq_file *m, void *v) +static int sched_fair_server_period_show(struct seq_file *m, void *v) { unsigned long cpu = (unsigned long) m->private; struct rq *rq = cpu_rq(cpu); - return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server); + return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server); } -static int 
sched_ext_server_period_open(struct inode *inode, struct file *filp) +static int sched_fair_server_period_open(struct inode *inode, struct file *filp) { - return single_open(filp, sched_ext_server_period_show, inode->i_private); + return single_open(filp, sched_fair_server_period_show, inode->i_private); } -static const struct file_operations ext_server_period_fops = { - .open = sched_ext_server_period_open, - .write = sched_ext_server_period_write, +static const struct file_operations fair_server_period_fops = { + .open = sched_fair_server_period_open, + .write = sched_fair_server_period_write, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; -#endif /* CONFIG_SCHED_CLASS_EXT */ - -static struct dentry *debugfs_sched; static void debugfs_fair_server_init(void) { @@ -611,29 +630,6 @@ static void debugfs_fair_server_init(void) } } -#ifdef CONFIG_SCHED_CLASS_EXT -static void debugfs_ext_server_init(void) -{ - struct dentry *d_ext; - unsigned long cpu; - - d_ext = debugfs_create_dir("ext_server", debugfs_sched); - if (!d_ext) - return; - - for_each_possible_cpu(cpu) { - struct dentry *d_cpu; - char buf[32]; - - snprintf(buf, sizeof(buf), "cpu%lu", cpu); - d_cpu = debugfs_create_dir(buf, d_ext); - - debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops); - debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops); - } -} -#endif /* CONFIG_SCHED_CLASS_EXT */ - static __init int sched_init_debug(void) { struct dentry __maybe_unused *numa, *llc; -- cgit v1.2.3 From b3a2dfa8b42e5b97dd144aa59374f4e045725cac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 May 2026 13:31:12 +0200 Subject: sched/fair: Add newidle balance to pick_task_fair() With commit 50653216e4ff ("sched: Add support to pick functions to take rf") removing the balance callback, the pick_task() callback is in charge of newidle balancing. This means pick_task_fair() should do so too. 
This hasn't been a problem in practice because pick_next_task_fair() is used. However, since we'll be removing that one shortly, make sure pick_next_task() is up to scratch. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://patch.msgid.link/20260511120627.944705718@infradead.org --- kernel/sched/fair.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e858ca6bcd0..5f48af700fd4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9863,16 +9863,18 @@ preempt: } static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) + __must_hold(__rq_lockp(rq)) { struct sched_entity *se; struct cfs_rq *cfs_rq; struct task_struct *p; bool throttled; + int new_tasks; again: cfs_rq = &rq->cfs; if (!cfs_rq->nr_queued) - return NULL; + goto idle; throttled = false; @@ -9893,6 +9895,14 @@ again: if (unlikely(throttled)) task_throttle_setup_work(p); return p; + +idle: + new_tasks = sched_balance_newidle(rq, rf); + if (new_tasks < 0) + return RETRY_TASK; + if (new_tasks > 0) + goto again; + return NULL; } static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); @@ -9904,12 +9914,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf { struct sched_entity *se; struct task_struct *p; - int new_tasks; -again: p = pick_task_fair(rq, rf); + if (unlikely(p == RETRY_TASK)) + return p; if (!p) - goto idle; + return p; se = &p->se; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -9959,29 +9969,11 @@ simple: #endif /* CONFIG_FAIR_GROUP_SCHED */ put_prev_set_next_task(rq, prev, p); return p; - -idle: - if (rf) { - new_tasks = sched_balance_newidle(rq, rf); - - /* - * Because sched_balance_newidle() releases (and re-acquires) - * rq->lock, it is possible for any higher priority task to - * appear. In that case we must re-start the pick_next_entity() - * loop.
- */ - if (new_tasks < 0) - return RETRY_TASK; - - if (new_tasks > 0) - goto again; - } - - return NULL; } static struct task_struct * fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) + __must_hold(__rq_lockp(dl_se->rq)) { return pick_task_fair(dl_se->rq, rf); } -- cgit v1.2.3 From 5ad278dd20bdf59714443894d7b3044471af97d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 May 2026 13:31:13 +0200 Subject: sched: Remove sched_class::pick_next_task() The reason for pick_next_task_fair() is the put/set optimization that avoids touching the common ancestors. However, it is possible to implement this in the put_prev_task() and set_next_task() calls as used in put_prev_set_next_task(). Notably, put_prev_set_next_task() is the only site that: - calls put_prev_task() with a .next argument; - calls set_next_task() with .first = true. This means that put_prev_task() can determine the common hierarchy and stop there, and then set_next_task() can terminate where put_prev_task stopped. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://patch.msgid.link/20260511120628.057634261@infradead.org --- kernel/sched/core.c | 27 ++++------ kernel/sched/fair.c | 139 +++++++++++++++++---------------------------------- kernel/sched/sched.h | 14 +----- 3 files changed, 57 insertions(+), 123 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83202f090f0c..3c8bfd697e2c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6030,16 +6030,15 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_queued)) { - p = pick_next_task_fair(rq, prev, rf); + p = pick_task_fair(rq, rf); if (unlikely(p == RETRY_TASK)) goto restart; /* Assume the next prioritized class is idle_sched_class */ - if (!p) { + if (!p) p = pick_task_idle(rq, rf); - put_prev_set_next_task(rq, prev, p); - } + put_prev_set_next_task(rq, prev, p); return p; } @@ -6047,20 +6046,12 @@ restart: prev_balance(rq, prev, rf); for_each_active_class(class) { - if (class->pick_next_task) { - p = class->pick_next_task(rq, prev, rf); - if (unlikely(p == RETRY_TASK)) - goto restart; - if (p) - return p; - } else { - p = class->pick_task(rq, rf); - if (unlikely(p == RETRY_TASK)) - goto restart; - if (p) { - put_prev_set_next_task(rq, prev, p); - return p; - } + p = class->pick_task(rq, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; + if (p) { + put_prev_set_next_task(rq, prev, p); + return p; } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5f48af700fd4..62a2dcb0d03e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9862,7 +9862,7 @@ preempt: resched_curr_lazy(rq); } -static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) +struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) __must_hold(__rq_lockp(rq)) { struct sched_entity *se; @@ 
-9905,72 +9905,6 @@ idle: return NULL; } -static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); - -struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - __must_hold(__rq_lockp(rq)) -{ - struct sched_entity *se; - struct task_struct *p; - - p = pick_task_fair(rq, rf); - if (unlikely(p == RETRY_TASK)) - return p; - if (!p) - return p; - se = &p->se; - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (prev->sched_class != &fair_sched_class) - goto simple; - - __put_prev_set_next_dl_server(rq, prev, p); - - /* - * Because of the set_next_buddy() in dequeue_task_fair() it is rather - * likely that a next task is from the same cgroup as the current. - * - * Therefore attempt to avoid putting and setting the entire cgroup - * hierarchy, only change the part that actually changes. - * - * Since we haven't yet done put_prev_entity and if the selected task - * is a different task than we started out with, try and touch the - * least amount of cfs_rqs. 
- */ - if (prev != p) { - struct sched_entity *pse = &prev->se; - struct cfs_rq *cfs_rq; - - while (!(cfs_rq = is_same_group(se, pse))) { - int se_depth = se->depth; - int pse_depth = pse->depth; - - if (se_depth <= pse_depth) { - put_prev_entity(cfs_rq_of(pse), pse); - pse = parent_entity(pse); - } - if (se_depth >= pse_depth) { - set_next_entity(cfs_rq_of(se), se, true); - se = parent_entity(se); - } - } - - put_prev_entity(cfs_rq, pse); - set_next_entity(cfs_rq, se, true); - - __set_next_task_fair(rq, p, true); - } - - return p; - -simple: -#endif /* CONFIG_FAIR_GROUP_SCHED */ - put_prev_set_next_task(rq, prev, p); - return p; -} - static struct task_struct * fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) __must_hold(__rq_lockp(dl_se->rq)) @@ -9994,10 +9928,33 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; + struct sched_entity *nse = NULL; - for_each_sched_entity(se) { +#ifdef CONFIG_FAIR_GROUP_SCHED + if (next && next->sched_class == &fair_sched_class) + nse = &next->se; +#endif + + while (se) { cfs_rq = cfs_rq_of(se); - put_prev_entity(cfs_rq, se); + if (!nse || cfs_rq->curr) + put_prev_entity(cfs_rq, se); +#ifdef CONFIG_FAIR_GROUP_SCHED + if (nse) { + if (is_same_group(se, nse)) + break; + + int d = nse->depth - se->depth; + if (d >= 0) { + /* nse has equal or greater depth, ascend */ + nse = parent_entity(nse); + /* if nse is the deeper, do not ascend se */ + if (d > 0) + continue; + } + } +#endif + se = parent_entity(se); } } @@ -15021,10 +14978,30 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) } } -static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) +/* + * Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. 
+ */ +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { struct sched_entity *se = &p->se; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) && + first && cfs_rq->curr) + break; + + set_next_entity(cfs_rq, se, first); + /* ensure bandwidth has been allocated on our new cfs_rq */ + account_cfs_rq_runtime(cfs_rq, 0); + } + + se = &p->se; + if (task_on_rq_queued(p)) { /* * Move the next running task to the front of the list, so our @@ -15044,27 +15021,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs sched_fair_update_stop_tick(rq, p); } -/* - * Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. - */ -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) -{ - struct sched_entity *se = &p->se; - - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - set_next_entity(cfs_rq, se, first); - /* ensure bandwidth has been allocated on our new cfs_rq */ - account_cfs_rq_runtime(cfs_rq, 0); - } - - __set_next_task_fair(rq, p, first); -} - void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; @@ -15376,7 +15332,6 @@ DEFINE_SCHED_CLASS(fair) = { .wakeup_preempt = wakeup_preempt_fair, .pick_task = pick_task_fair, - .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8eb8f83db6b0..6b48bb3074fe 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2589,17 +2589,6 @@ struct sched_class { * schedule/pick_next_task: rq->lock */ struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); - /* - * Optional! 
When implemented pick_next_task() should be equivalent to: - * - * next = pick_task(); - * if (next) { - * put_prev_task(prev); - * set_next_task_first(next); - * } - */ - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf); /* * sched_change: @@ -2823,8 +2812,7 @@ static inline bool sched_fair_runnable(struct rq *rq) return rq->cfs.nr_queued > 0; } -extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf); +extern struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf); extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf); #define SCA_CHECK 0x01 -- cgit v1.2.3 From 1eae219ea0e80eed83c129e8cae0f007843f1893 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 29 May 2026 13:27:12 +0530 Subject: sched/topology: Provide arch_llc_mask for cache aware scheduling Venkat reported a kernel panic at boot on next-20260522. Git bisect pointed to b5ea300a17e3 ("sched/cache: Make LLC id continuous"). The stack trace points to llc_mask being null. NIP [c000000000e58504] _find_first_bit+0x44/0x130 LR [c000000000e58500] _find_first_bit+0x40/0x130 Call Trace: build_sched_domains+0xad8/0xe50 sched_init_smp+0xa8/0x164 kernel_init_freeable+0x250/0x370 ret_from_kernel_user_thread+0x14/0x1c On powerpc, cpu_coregroup_mask is available only when the underlying hardware supports coregroup. In shared LPARs, QEMU guests, power9, etc., coregroup isn't supported. In such cases llc_mask was being referenced when it was null, leading to a panic. On powerpc, the LLC is at the SMT core level. So the assumption that the coregroup (MC) domain points to the LLC is wrong. Provide a way for an arch to say where its LLC is if it is not at the MC domain.
Fixes: b5ea300a17e3 ("sched/cache: Make LLC id continuous") Closes: https://lore.kernel.org/all/51154de7-3700-4cb4-82f2-1b3a8fa427f7@linux.ibm.com/ Reported-by: Venkat Rao Bagalkote Co-developed-by: Chen, Yu C Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chen Yu Tested-by: Venkat Rao Bagalkote Tested-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/20260529075712.1181039-1-sshegde@linux.ibm.com --- kernel/sched/topology.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index df2ceb54c970..622e2e01974c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2063,12 +2063,21 @@ const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu return cpu_coregroup_mask(cpu); } -#define llc_mask(cpu) cpu_coregroup_mask(cpu) +/* + * Majority of architectures have LLC at MC domain level with exception + * such as powerpc. 
Provide a way for arch to specify where its LLC is + * if it falls in exception category + */ +# ifndef arch_llc_mask +#define arch_llc_mask(cpu) cpu_coregroup_mask(cpu) +# endif #else -#define llc_mask(cpu) cpumask_of(cpu) +#define arch_llc_mask(cpu) cpumask_of(cpu) #endif +#define llc_mask(cpu) arch_llc_mask(cpu) + const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) { return cpu_node_mask(cpu); -- cgit v1.2.3 From 4043f549841619a01999bf5d4e0b7931ef87f6cc Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 26 May 2026 12:05:02 +0200 Subject: sched/deadline: Reject debugfs dl_server writes for offline CPUs Writing runtime or period via the per-CPU dl_server debugfs files (/sys/kernel/debug/sched/{fair,ext}_server/cpu*/{runtime,period}) on an offline CPU can trigger two distinct kernel issues: 1) Divide-by-zero in dl_server_apply_params(): Oops: divide error: 0000 [#1] SMP NOPTI RIP: 0010:dl_server_apply_params+0x239/0x3a0 Call Trace: sched_server_write_common.isra.0+0x21a/0x3c0 full_proxy_write+0x78/0xd0 vfs_write+0xe7/0x6e0 Both __dl_sub() and __dl_add() divide by cpus internally, which can be 0 once the CPU has been removed from any active root-domain span (this has been latent since the debugfs interface was introduced). 2) WARN_ON_ONCE in dl_server_start(): WARNING: kernel/sched/deadline.c:1805 at dl_server_start+0x232/0x270 Commit ee6e44dfe6e5 ("sched/deadline: Stop dl_server before CPU goes offline") added this check to catch enqueueing the server on an offline rq. There's no meaningful semantics for re-configuring the per-CPU dl_server bandwidth while the CPU is offline, so simply reject the write with -EBUSY so userspace gets a clear error. 
Closes: https://lore.kernel.org/all/20260526092228.3B6891F00A3A@smtp.kernel.org/ Fixes: d741f297bcea ("sched/fair: Fair server interface") Reported-by: Sashiko Signed-off-by: Andrea Righi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Tested-by: abaci-kreproducer Link: https://patch.msgid.link/20260526100502.575774-1-arighi@nvidia.com --- kernel/sched/debug.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4809e1d23081..5e09cf9fae3e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -416,6 +416,9 @@ static ssize_t sched_server_write_common(struct file *filp, const char __user *u return -EINVAL; } + if (!cpu_online(cpu_of(rq))) + return -EBUSY; + update_rq_clock(rq); dl_server_stop(dl_se); retval = dl_server_apply_params(dl_se, runtime, period, 0); -- cgit v1.2.3 From e7b63427fdb4977621d69085a97272c8856644fe Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 26 May 2026 18:42:48 +0200 Subject: sched_ext: Auto-register/unregister dl_server reservations Commit cd959a3562050d ("sched_ext: Add a DL server for sched_ext tasks") introduced an ext_server deadline server to protect sched_ext tasks from fair/RT starvation, mirroring the existing fair_server. Currently, both servers reserve their 50ms/1000ms bandwidth at boot, regardless of whether a BPF scheduler is loaded. Unused bandwidth is still reclaimed at runtime by other classes, but the static reservation prevents the RT class from implicitly using that headroom when one of the two classes is guaranteed to be empty. A sysadmin can work around this by writing /sys/kernel/debug/sched/{fair,ext}_server/cpu*/runtime, but that requires manual action and not all systems expose debugfs. 
A better approach is to make server bandwidth reservations dynamic: only the scheduling policy that is currently active should register its reservation, while the inactive one should not artificially hold capacity (keeping both reservations only when the BPF scheduler is running in partial mode): +---------------------------------------------+-------------+------------+ | BPF scheduler state | fair server | ext server | +---------------------------------------------+-------------+------------+ | not loaded (default boot) | reserved | none | | loaded full mode (!SCX_OPS_SWITCH_PARTIAL) | none | reserved | | loaded partial mode (SCX_OPS_SWITCH_PARTIAL)| reserved | reserved | +---------------------------------------------+-------------+------------+ To achieve this, introduce an "attached/detached" state for each deadline server, so the kernel can decide whether a server's bandwidth should be accounted in global bandwidth tracking. At boot, the system starts with only the fair server contributing to bandwidth accounting. When a BPF scheduler is enabled, the ext server is attached and may replace or complement the fair server depending on whether full or partial mode is used. When sched_ext is disabled, the system restores the previous deadline bandwidth values and behavior. The transition logic ensures that switching between scheduling modes is consistent and reversible, without losing runtime configuration or requiring manual intervention. 
Signed-off-by: Andrea Righi Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://patch.msgid.link/20260526164420.638711-2-arighi@nvidia.com --- kernel/sched/deadline.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/ext.c | 71 +++++++++++++++++ kernel/sched/sched.h | 4 + 3 files changed, 272 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b60e2df8ff9d..f9e62ed08d77 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1797,7 +1797,8 @@ void dl_server_start(struct sched_dl_entity *dl_se) struct rq *rq = dl_se->rq; dl_se->dl_defer_idle = 0; - if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime) + if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime || + !dl_se->dl_bw_attached) return; /* @@ -1872,6 +1873,13 @@ void sched_init_dl_servers(void) dl_se->dl_server = 1; dl_se->dl_defer = 1; setup_new_dl_entity(dl_se); + + /* + * No BPF scheduler is loaded at boot, so the ext_server has no + * tasks to protect. Detach its bandwidth reservation, it will + * be attached when a BPF scheduler is loaded. + */ + dl_server_detach_bw(dl_se); #endif } } @@ -1882,6 +1890,9 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) int cpu = cpu_of(rq); struct dl_bw *dl_b; + if (!dl_se->dl_bw_attached) + return; + dl_b = dl_bw_of(cpu_of(rq)); guard(raw_spinlock)(&dl_b->lock); @@ -1893,7 +1904,8 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) { - u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); + u64 old_bw = (init || !dl_se->dl_bw_attached) ? 
0 : + to_ratio(dl_se->dl_period, dl_se->dl_runtime); u64 new_bw = to_ratio(period, runtime); struct rq *rq = dl_se->rq; int cpu = cpu_of(rq); @@ -1913,7 +1925,8 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio if (init) { __add_rq_bw(new_bw, &rq->dl); __dl_add(dl_b, new_bw, cpus); - } else { + dl_se->dl_bw_attached = 1; + } else if (dl_se->dl_bw_attached) { __dl_sub(dl_b, dl_se->dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); @@ -1933,6 +1946,181 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio return 0; } +/* + * Add @dl_se's bw to the root-domain accounting. + * + * Return -EBUSY if attaching would overflow root domain capacity. + */ +static int __dl_server_attach_bw_locked(struct sched_dl_entity *dl_se, + struct dl_bw *dl_b, int cpus) +{ + struct rq *rq = dl_se->rq; + unsigned long cap; + + /* + * Always update @rq->dl.this_bw, but only update @dl_b->total_bw + * (and run the overflow check it gates) while this CPU is active. + * + * This mirrors dl_server_add_bw() during root-domain rebuilds, which + * only publishes bandwidth from active CPUs into @dl_b. + */ + if (cpu_active(cpu_of(rq))) { + cap = dl_bw_capacity(cpu_of(rq)); + if (__dl_overflow(dl_b, cap, 0, dl_se->dl_bw)) + return -EBUSY; + __dl_add(dl_b, dl_se->dl_bw, cpus); + } + __add_rq_bw(dl_se->dl_bw, &rq->dl); + dl_se->dl_bw_attached = 1; + + return 0; +} + +/* + * Drain @dl_se and remove its bw from the root-domain accounting. + */ +static void __dl_server_detach_bw_locked(struct sched_dl_entity *dl_se, + struct dl_bw *dl_b, int cpus) +{ + struct rq *rq = dl_se->rq; + + /* + * If the server is still active (on_rq), dequeue it via + * dl_server_stop(); task_non_contending() will either subtract + * @dl_bw from running_bw immediately (0-lag passed) or set + * dl_non_contending and arm the inactive_timer. 
+ */ + if (dl_se->dl_server_active) + dl_server_stop(dl_se); + + /* + * Drop @dl_se's contribution from this rq's bandwidth accounting, + * mirroring the __add_rq_bw() done at attach time. + */ + dl_rq_change_utilization(rq, dl_se, 0); + + /* + * Update @dl_b only while this CPU is active, matching + * dl_server_add_bw() during root-domain rebuilds. + * + * If this CPU is inactive, its bandwidth is not currently accounted in + * @dl_b->total_bw: either attach skipped adding it, or a rebuild + * already dropped it while re-publishing active CPUs only. + * + * In that case there is nothing to subtract from @dl_b. Just clear + * @dl_se->dl_bw_attached; if the CPU becomes active again, the next + * rebuild will re-publish its bandwidth. + */ + if (cpu_active(cpu_of(rq))) + __dl_sub(dl_b, dl_se->dl_bw, cpus); + dl_se->dl_bw_attached = 0; +} + +/* + * Attach @dl_se's bandwidth to the root domain's total_bw accounting. + * + * Use to dynamically register a dl_server's bandwidth reservation while + * preserving its configured @dl_runtime / @dl_period. No-op if @dl_se is + * already attached. + * + * Returns -EBUSY if attaching would overflow the root domain capacity. + */ +int dl_server_attach_bw(struct sched_dl_entity *dl_se) +{ + struct rq *rq = dl_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + int cpus, ret; + + if (dl_se->dl_bw_attached) + return 0; + + scoped_guard (raw_spinlock, &dl_bw_of(cpu)->lock) { + dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); + ret = __dl_server_attach_bw_locked(dl_se, dl_b, cpus); + } + if (ret) + return ret; + + /* + * The natural 0->nr_running transition that triggers dl_server_start() + * may have happened while @dl_se was still detached (e.g., between + * scx_bypass(false) and the scx_enable() re-balance loop), so kick a + * start here. + * + * dl_server_start() bails out cleanly if there's nothing to schedule or + * it's already active. 
Skip if @cpu is offline; the server will be + * started naturally on the first enqueue once @cpu comes back. + */ + if (cpu_online(cpu)) + dl_server_start(dl_se); + + return 0; +} + +/* + * Detach @dl_se's bandwidth from the root domain's total_bw accounting. + * + * Use to dynamically unregister a dl_server's bandwidth reservation while + * preserving its configured @dl_runtime / @dl_period. No-op if @dl_se is + * not currently attached. + */ +void dl_server_detach_bw(struct sched_dl_entity *dl_se) +{ + int cpu = cpu_of(dl_se->rq); + struct dl_bw *dl_b; + int cpus; + + if (!dl_se->dl_bw_attached) + return; + + dl_b = dl_bw_of(cpu); + guard(raw_spinlock)(&dl_b->lock); + cpus = dl_bw_cpus(cpu); + __dl_server_detach_bw_locked(dl_se, dl_b, cpus); +} + +/* + * Atomically detach @detach_se and attach @attach_se on the same rq, holding + * @dl_b->lock across both operations so a concurrent sched_setattr() cannot + * steal the bandwidth freed by the detach before the attach can claim it. + * + * Both entities must live on the same rq (same root domain). Returns the + * result of the attach: -EBUSY if attaching @attach_se would overflow root + * domain capacity (in which case both servers end up detached). 
+ */ +int dl_server_swap_bw(struct sched_dl_entity *detach_se, + struct sched_dl_entity *attach_se) +{ + struct rq *rq = detach_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + int cpus, ret; + + WARN_ON_ONCE(attach_se->rq != rq); + + scoped_guard (raw_spinlock, &dl_bw_of(cpu)->lock) { + dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); + + if (detach_se->dl_bw_attached) + __dl_server_detach_bw_locked(detach_se, dl_b, cpus); + + if (attach_se->dl_bw_attached) + ret = 0; + else + ret = __dl_server_attach_bw_locked(attach_se, dl_b, cpus); + } + if (ret) + return ret; + + if (cpu_online(cpu)) + dl_server_start(attach_se); + + return 0; +} + /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). @@ -3233,12 +3421,12 @@ static void dl_server_add_bw(struct root_domain *rd, int cpu) struct sched_dl_entity *dl_se; dl_se = &cpu_rq(cpu)->fair_server; - if (dl_server(dl_se) && cpu_active(cpu)) + if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu)) __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu)); #ifdef CONFIG_SCHED_CLASS_EXT dl_se = &cpu_rq(cpu)->ext_server; - if (dl_server(dl_se) && cpu_active(cpu)) + if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu)) __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu)); #endif } @@ -3247,11 +3435,13 @@ static u64 dl_server_read_bw(int cpu) { u64 dl_bw = 0; - if (cpu_rq(cpu)->fair_server.dl_server) + if (cpu_rq(cpu)->fair_server.dl_server && + cpu_rq(cpu)->fair_server.dl_bw_attached) dl_bw += cpu_rq(cpu)->fair_server.dl_bw; #ifdef CONFIG_SCHED_CLASS_EXT - if (cpu_rq(cpu)->ext_server.dl_server) + if (cpu_rq(cpu)->ext_server.dl_server && + cpu_rq(cpu)->ext_server.dl_bw_attached) dl_bw += cpu_rq(cpu)->ext_server.dl_bw; #endif diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 345aa11b84b2..f412c4bb21c3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5814,6 +5814,7 @@ static void scx_root_disable(struct scx_sched 
*sch) struct scx_exit_info *ei = sch->exit_info; struct scx_task_iter sti; struct task_struct *p; + bool was_switched_all; int cpu; /* guarantee forward progress and wait for descendants to be disabled */ @@ -5840,6 +5841,8 @@ static void scx_root_disable(struct scx_sched *sch) */ mutex_lock(&scx_enable_mutex); + was_switched_all = scx_switched_all(); + static_branch_disable(&__scx_switched_all); WRITE_ONCE(scx_switching_all, false); @@ -5889,10 +5892,34 @@ static void scx_root_disable(struct scx_sched *sch) /* * Invalidate all the rq clocks to prevent getting outdated * rq clocks from a previous scx scheduler. + * + * Also re-balance the dl_server bandwidth reservations: detach + * ext_server (no more sched_ext tasks) and reinstate fair_server if it + * was previously detached because we were running in full mode. + * + * Unlike the enable path, this runs on a recovery path that cannot + * fail, so we use dl_server_swap_bw() to atomically free ext_server's + * bandwidth and reclaim it for fair_server under the same dl_b lock. + * + * The swap can still fail with -EBUSY if someone bumped ext_server's + * runtime via debugfs between enable and disable; in that narrow case + * both servers end up detached and we just WARN. */ for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); + scx_rq_clock_invalidate(rq); + + scoped_guard(rq_lock_irqsave, rq) { + update_rq_clock(rq); + if (was_switched_all) { + if (WARN_ON_ONCE(dl_server_swap_bw(&rq->ext_server, + &rq->fair_server))) + pr_warn("failed to re-attach fair_server on CPU %d\n", cpu); + } else { + dl_server_detach_bw(&rq->ext_server); + } + } } /* no task is on scx, turn off all the switches and flush in-progress calls */ @@ -6810,6 +6837,31 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (ret) goto err_disable; + /* + * Attach the ext_server bandwidth reservation before anything is + * committed so that we can fail the enable if the root domain cannot + * accommodate it. 
The matching fair_server detach is deferred to the + * tail of this function, after the switch is fully committed and can no + * longer fail. + * + * On failure, err_disable funnels into scx_root_disable() which + * detaches ext_server, so partially-attached state is cleaned up + * automatically. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + scoped_guard(rq_lock_irqsave, rq) { + update_rq_clock(rq); + ret = dl_server_attach_bw(&rq->ext_server); + } + if (ret) { + pr_warn("sched_ext: failed to attach ext_server on CPU %d (%d)\n", + cpu, ret); + goto err_disable; + } + } + /* * Once __scx_enabled is set, %current can be switched to SCX anytime. * This can lead to stalls as some BPF schedulers (e.g. userspace @@ -6926,6 +6978,25 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) static_branch_enable(&__scx_switched_all); + /* + * Detach the fair_server bandwidth reservation now that the switch + * is fully committed. In full mode (!SCX_OPS_SWITCH_PARTIAL) no + * task will ever run in the fair class, so give that bandwidth + * back to the RT class. The matching ext_server attach already + * happened earlier; this only releases bandwidth and cannot fail. + * + * In partial mode keep fair_server attached. + */ + if (scx_switched_all()) { + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + guard(rq_lock_irqsave)(rq); + update_rq_clock(rq); + dl_server_detach_bw(&rq->fair_server); + } + } + pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", sch->ops.name, scx_switched_all() ? 
"" : " (partial)"); kobject_uevent(&sch->kobj, KOBJ_ADD); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6b48bb3074fe..332ecf8930b4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -421,6 +421,10 @@ extern void ext_server_init(struct rq *rq); extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); extern int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init); +extern int dl_server_attach_bw(struct sched_dl_entity *dl_se); +extern void dl_server_detach_bw(struct sched_dl_entity *dl_se); +extern int dl_server_swap_bw(struct sched_dl_entity *detach_se, + struct sched_dl_entity *attach_se); static inline bool dl_server_active(struct sched_dl_entity *dl_se) { -- cgit v1.2.3 From 3b7be8e7fa698359616c3276e005f08c3b6070e4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 26 May 2026 12:43:29 -0700 Subject: sched/fair: Use rq_clock() in update_tg_load_avg() rate-limit update_tg_load_avg() is called once per leaf cfs_rq from the __update_blocked_fair() walk that runs inside the NOHZ idle-balance softirq, and again from update_load_avg() with UPDATE_TG. Its first operation after the trivial early-outs is unconditionally: now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC) return; Jakub ran into a system where nohz_idle_balance() was taking 75% of a CPU (which is handling network traffic and doing many irq_exit_cpu calls), with 35% of that CPU spent in update_load_avg, and 17% of the CPU in sched_clock_cpu(), reading the TSC. In a quick synthetic test, it looks like this patch reduces the CPU use of sched_balance_update_blocked_averages by about 20%. Switch the rate-limit to read rq_clock(rq_of(cfs_rq)) instead. 
This eliminates the rdtsc, and uses a fairly fresh timestamp, because all callers of update_tg_load_avg() and clear_tg_load_avg() hold rq->lock and have called update_rq_clock(rq) within microseconds: caller pre-state __update_blocked_fair encloser did update_rq_clock(rq) update_load_avg's three UPDATE_TG sites under rq->lock after enqueue/dequeue/update_curr attach_/detach_entity_cfs_rq preceded by update_load_avg(...) clear_tg_load_avg via offline path rq_clock_start_loop_update(rq) upfront so rq->clock is fresh at every call. Since cfs_rqs are per-CPU per-task_group, cfs_rq->last_update_tg_load_avg is always compared against the same rq's clock; no cross-rq drift. Signed-off-by: Rik van Riel Assisted-by: Claude (Anthropic) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://patch.msgid.link/20260527110250.6a91718d@fangorn --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 62a2dcb0d03e..b5819c4899f1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4962,7 +4962,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) * For migration heavy workloads, access to tg->load_avg can be * unbound. Limit the update rate to at most once per ms. 
*/ - now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + now = rq_clock(rq_of(cfs_rq)); if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC) return; @@ -4985,7 +4985,7 @@ static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq) if (cfs_rq->tg == &root_task_group) return; - now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + now = rq_clock(rq_of(cfs_rq)); delta = 0 - cfs_rq->tg_load_avg_contrib; atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = 0; -- cgit v1.2.3 From bdaf235913e1f31453c6e0e109d797269f9f0a37 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 30 Apr 2026 21:50:47 +0000 Subject: locking: mutex: Fix proxy-exec potentially deactivating tasks marked TASK_RUNNING Vineeth came up with a test driver that could trigger workqueue stalls. After fixing one issue this test found, Vineeth reported the test was still failing. Greatly simplified, a task that tries to take a mutex already owned by another task that is sleeping can hit an edge case in mutex_lock_common(). If the task fails to get the lock, calls into schedule, but gets a spurious wakeup, it will find that it is the first waiter, and go into the mutex_optimistic_spin() logic. Though before calling mutex_optimistic_spin(), we clear the task's blocked_on state, since mutex_optimistic_spin() may call schedule() if need_resched() is set. After mutex_optimistic_spin() fails, we set blocked_on again, restart the main mutex loop, try to take the lock and call into schedule_preempt_disabled(). From there, with proxy-execution, we'll see the task is blocked_on, follow the chain, see the owner is sleeping and dequeue the waiting task from the runqueue. This all sounds fine and reasonable. But what I had missed is that in mutex_optimistic_spin(), not only do we call schedule() but we set TASK_RUNNING right before doing so. This is ok for that invocation of schedule().
But when we come back we re-set the blocked_on we had just cleared, but we do not re-set the task state to TASK_INTERRUPTIBLE/UNINTERRUPTIBLE. This means we have a task that is blocked_on & TASK_RUNNING, so when the proxy execution code dequeues the task, we are in trouble since future wakeups will be shortcut by the ttwu_state_match() check. Thus, to avoid this, after mutex_optimistic_spin(), set the task state back when we set blocked_on. Many, many thanks again to Vineeth for his very useful testing driver that uncovered this long-hidden bug that I hadn't tripped in all my testing! Very impressed with the problems he's uncovered! Reported-by: Vineeth Pillai Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: Vineeth Pillai Link: https://patch.msgid.link/20260430215103.2978955-3-jstultz@google.com --- kernel/locking/mutex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 09534628dc01..a93d4c6bee1a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -763,6 +763,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas raw_spin_lock_irqsave(&lock->wait_lock, flags); raw_spin_lock(&current->blocked_lock); __set_task_blocked_on(current, lock); + set_current_state(state); if (opt_acquired) break; -- cgit v1.2.3 From 7a3a6bfbd62a2ba3e0ef1e92d6b71abb66890825 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 12 May 2026 02:56:11 +0000 Subject: sched: Rework prev_balance() to avoid stale prev references Historically, the prev value from __schedule() was the rq->curr. This prev value is passed down through numerous functions, and used in the class scheduler implementations. The fact that prev was on_cpu until the end of __schedule(), meant it was stable across the rq lock drops that the class->balance() implementations often do.
However, with proxy-exec, the prev passed to functions called by __schedule() is rq->donor, which may not be the same as rq->curr and may not be on_cpu. This makes the prev value potentially unstable across rq lock drops. A recently found issue with proxy-exec is that when we begin doing return migration from try_to_wake_up(), it's possible we may be waking up the rq->donor. When we do this, we call proxy_resched_idle() to put_prev_set_next(), setting the rq->donor to rq->idle, allowing the rq->donor to be return migrated and allowed to run. This, however, runs into trouble, as on another cpu we might be in the middle of calling __schedule(). Conceptually the rq lock is held for the majority of the time, but in calling prev_balance() it's possible the class->balance() handler call may briefly drop the rq lock. This opens a window for try_to_wake_up() to wake and return migrate the rq->donor before the class logic reacquires the rq lock. Unfortunately, prev_balance() takes a prev argument, for which we pass rq->donor. However, this prev value can now become stale and incorrect across an rq lock drop. So, to correct this, rework the prev_balance() call so that it does not take a "prev" argument.
Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-2-jstultz@google.com --- kernel/sched/core.c | 33 ++++++++++++++++----------------- kernel/sched/deadline.c | 8 +++++++- kernel/sched/idle.c | 2 +- kernel/sched/rt.c | 8 +++++++- kernel/sched/sched.h | 2 +- kernel/sched/stop_task.c | 2 +- 6 files changed, 33 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c8bfd697e2c..a9c9b89260cd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5986,10 +5986,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) schedstat_inc(this_rq()->sched_count); } -static void prev_balance(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) +static void prev_balance(struct rq *rq, struct rq_flags *rf) { - const struct sched_class *start_class = prev->sched_class; + const struct sched_class *start_class = rq->donor->sched_class; const struct sched_class *class; /* @@ -6001,7 +6000,7 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, * a runnable task of @class priority or higher. */ for_active_class_range(class, start_class, &idle_sched_class) { - if (class->balance && class->balance(rq, prev, rf)) + if (class->balance && class->balance(rq, rf)) break; } } @@ -6010,7 +6009,7 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, * Pick up the highest-prio task: */ static inline struct task_struct * -__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +__pick_next_task(struct rq *rq, struct rq_flags *rf) __must_hold(__rq_lockp(rq)) { const struct sched_class *class; @@ -6027,7 +6026,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * higher scheduling class, because otherwise those lose the * opportunity to pull in more work from other CPUs. 
*/ - if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && + if (likely(!sched_class_above(rq->donor->sched_class, &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_queued)) { p = pick_task_fair(rq, rf); @@ -6038,19 +6037,19 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (!p) p = pick_task_idle(rq, rf); - put_prev_set_next_task(rq, prev, p); + put_prev_set_next_task(rq, rq->donor, p); return p; } restart: - prev_balance(rq, prev, rf); + prev_balance(rq, rf); for_each_active_class(class) { p = class->pick_task(rq, rf); if (unlikely(p == RETRY_TASK)) goto restart; if (p) { - put_prev_set_next_task(rq, prev, p); + put_prev_set_next_task(rq, rq->donor, p); return p; } } @@ -6102,7 +6101,7 @@ extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_f static void queue_core_balance(struct rq *rq); static struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +pick_next_task(struct rq *rq, struct rq_flags *rf) __must_hold(__rq_lockp(rq)) { struct task_struct *next, *p, *max; @@ -6115,7 +6114,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) bool need_sync; if (!sched_core_enabled(rq)) - return __pick_next_task(rq, prev, rf); + return __pick_next_task(rq, rf); cpu = cpu_of(rq); @@ -6128,7 +6127,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) */ rq->core_pick = NULL; rq->core_dl_server = NULL; - return __pick_next_task(rq, prev, rf); + return __pick_next_task(rq, rf); } /* @@ -6152,7 +6151,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) goto out_set_next; } - prev_balance(rq, prev, rf); + prev_balance(rq, rf); smt_mask = cpu_smt_mask(cpu); need_sync = !!rq->core->core_cookie; @@ -6334,7 +6333,7 @@ restart_multi: } out_set_next: - put_prev_set_next_task(rq, prev, next); + put_prev_set_next_task(rq, rq->donor, next); if (rq->core->core_forceidle_count && 
next == rq->idle) queue_core_balance(rq); @@ -6557,10 +6556,10 @@ static inline void sched_core_cpu_deactivate(unsigned int cpu) {} static inline void sched_core_cpu_dying(unsigned int cpu) {} static struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +pick_next_task(struct rq *rq, struct rq_flags *rf) __must_hold(__rq_lockp(rq)) { - return __pick_next_task(rq, prev, rf); + return __pick_next_task(rq, rf); } #endif /* !CONFIG_SCHED_CORE */ @@ -7108,7 +7107,7 @@ static void __sched notrace __schedule(int sched_mode) pick_again: assert_balance_callbacks_empty(rq); - next = pick_next_task(rq, rq->donor, &rf); + next = pick_next_task(rq, &rf); rq->next_class = next->sched_class; if (sched_proxy_exec()) { struct task_struct *prev_donor = rq->donor; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f9e62ed08d77..6ef5a808e13e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2698,8 +2698,14 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_curr(rq); } -static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +static int balance_dl(struct rq *rq, struct rq_flags *rf) { + /* + * Note, rq->donor may change during rq lock drops, + * so don't re-use prev across lock drops + */ + struct task_struct *p = rq->donor; + if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { /* * This is OK, because current is on_cpu, which avoids it being diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index a83be0c834dd..ff39120d723a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -462,7 +462,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int flags) } static int -balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +balance_idle(struct rq *rq, struct rq_flags *rf) { return WARN_ON_ONCE(1); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e6ea728f519e..e474c31d8fe6 100644 --- a/kernel/sched/rt.c 
+++ b/kernel/sched/rt.c @@ -1596,8 +1596,14 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) resched_curr(rq); } -static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +static int balance_rt(struct rq *rq, struct rq_flags *rf) { + /* + * Note, rq->donor may change during rq lock drops, + * so don't re-use p across lock drops + */ + struct task_struct *p = rq->donor; + if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { /* * This is OK, because current is on_cpu, which avoids it being diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 332ecf8930b4..ef715f2acbaa 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2587,7 +2587,7 @@ struct sched_class { /* * schedule/pick_next_task/prev_balance: rq->lock */ - int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + int (*balance)(struct rq *rq, struct rq_flags *rf); /* * schedule/pick_next_task: rq->lock diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index f95798baddeb..c909ca0d8c87 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -16,7 +16,7 @@ select_task_rq_stop(struct task_struct *p, int cpu, int flags) } static int -balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +balance_stop(struct rq *rq, struct rq_flags *rf) { return sched_stop_runnable(rq); } -- cgit v1.2.3 From 96a6988fb595ab1d77f60b33ea392b2e15b68605 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 12 May 2026 02:56:12 +0000 Subject: sched: deadline: Add some helper variables to cleanup deadline logic As part of an improvement to handling pushable deadline tasks, Peter suggested this cleanup[1], to use helper values for dl_entity and dl_rq in the enqueue_task_dl() and put_prev_task_dl() functions. There should be no functional change from this patch. To make sure this cleanup change doesn't obscure later logic changes, I've split it into its own patch. 
[1]: https://lore.kernel.org/lkml/20260304095123.GP606826@noisy.programming.kicks-ass.net/ Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-3-jstultz@google.com --- kernel/sched/deadline.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6ef5a808e13e..0b7ac4c12797 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2484,7 +2484,10 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { - if (is_dl_boosted(&p->dl)) { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; + + if (is_dl_boosted(dl_se)) { /* * Because of delays in the detection of the overrun of a * thread's runtime, it might be the case that a thread @@ -2497,14 +2500,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * * In this case, the boost overrides the throttle. */ - if (p->dl.dl_throttled) { + if (dl_se->dl_throttled) { /* * The replenish timer needs to be canceled. No * problem if it fires concurrently: boosted threads * are ignored in dl_task_timer(). */ - cancel_replenish_timer(&p->dl); - p->dl.dl_throttled = 0; + cancel_replenish_timer(dl_se); + dl_se->dl_throttled = 0; } } else if (!dl_prio(p->normal_prio)) { /* @@ -2516,7 +2519,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * being boosted again with no means to replenish the runtime and clear * the throttle. 
*/ - p->dl.dl_throttled = 0; + dl_se->dl_throttled = 0; if (!(flags & ENQUEUE_REPLENISH)) printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n", task_pid_nr(p)); @@ -2525,20 +2528,20 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) } check_schedstat_required(); - update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl); + update_stats_wait_start_dl(dl_rq, dl_se); if (p->on_rq == TASK_ON_RQ_MIGRATING) flags |= ENQUEUE_MIGRATING; - enqueue_dl_entity(&p->dl, flags); + enqueue_dl_entity(dl_se, flags); - if (dl_server(&p->dl)) + if (dl_server(dl_se)) return; if (task_is_blocked(p)) return; - if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && !dl_se->dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -2835,7 +2838,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s struct sched_dl_entity *dl_se = &p->dl; struct dl_rq *dl_rq = &rq->dl; - if (on_dl_rq(&p->dl)) + if (on_dl_rq(dl_se)) update_stats_wait_start_dl(dl_rq, dl_se); update_curr_dl(rq); @@ -2845,7 +2848,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s if (task_is_blocked(p)) return; - if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + if (on_dl_rq(dl_se) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } -- cgit v1.2.3 From cd8e62c85861bcfbbefedce11a6f8eb00c774312 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 12 May 2026 02:56:13 +0000 Subject: sched: deadline: Add dl_rq->curr pointer to address issues with Proxy Exec The DL scheduler keeps the current task in the rbtree, since the deadline value isn't usually changed while the task is runnable. This results in set_next_task() and put_prev_task() being simpler, but unfortunately this causes complexity elsewhere. Specifically when update_curr_dl() updates the deadline, it has to dequeue and then enqueue the task.
From put_prev_task_dl(), we first call update_curr_dl(), and then call enqueue_pushable_dl_task(). However, with Proxy Exec this goes awry. Since when a mutex is released, we might wake the waiting rq->donor. This will cause put_prev_task() to be called on the donor to take it off the cpu for return migration. At that point, from put_prev_task_dl() the update_curr_dl() logic will dequeue & enqueue the task, and the enqueue function will call enqueue_pushable_dl_task() (since the task_current() check won't prevent it). Then back up the callstack in put_prev_task_dl() we'll end up calling enqueue_pushable_dl_task() again, tripping the !RB_EMPTY_NODE(&p->pushable_dl_tasks) warning. So to avoid this, use Peter's suggested[1] approach, and add a dl_rq->curr pointer that is set/cleared from set_next_task()/ put_prev_task(), which effectively tracks the rq->donor. We can then use this to avoid adding the active donor to the pushable list from enqueue_task_dl(). [1]: https://lore.kernel.org/lkml/20260304095123.GP606826@noisy.programming.kicks-ass.net/ Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-4-jstultz@google.com --- kernel/sched/deadline.c | 13 +++++++++++++ kernel/sched/sched.h | 1 + 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0b7ac4c12797..4754dbe4232d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2541,6 +2541,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (task_is_blocked(p)) return; + if (dl_rq->curr == dl_se) + return; + if (!task_current(rq, p) && !dl_se->dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -2763,6 +2766,10 @@ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) } #endif /* !CONFIG_SCHED_HRTICK */ +/* + * DL keeps current in tree, because ->deadline is not 
typically changed while + * a task is runnable. + */ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { struct sched_dl_entity *dl_se = &p->dl; @@ -2775,6 +2782,9 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); + WARN_ON_ONCE(dl_rq->curr); + dl_rq->curr = dl_se; + if (!first) return; @@ -2845,6 +2855,9 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); + WARN_ON_ONCE(dl_rq->curr != dl_se); + dl_rq->curr = NULL; + if (task_is_blocked(p)) return; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef715f2acbaa..b3aff26dbb13 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -893,6 +893,7 @@ struct dl_rq { bool overloaded; + struct sched_dl_entity *curr; /* * Tasks on this rq that can be pushed away. They are kept in * an rb-tree, ordered by tasks' deadlines, with caching -- cgit v1.2.3 From f0c1ecde6447079505f1d4557d30401136218ae0 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 12 May 2026 02:56:14 +0000 Subject: sched: Rework block_task so it can be directly called Pull most of the logic out of try_to_block_task() and put it into block_task() directly, so that we can call block_task() and not have to worry about the failing cases in try_to_block_task() Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-5-jstultz@google.com --- kernel/sched/core.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a9c9b89260cd..2f1e85d09b94 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2236,8 +2236,29 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, 
flags); } -static void block_task(struct rq *rq, struct task_struct *p, int flags) +static void block_task(struct rq *rq, struct task_struct *p, unsigned long task_state) { + int flags = DEQUEUE_NOCLOCK; + + p->sched_contributes_to_load = + (task_state & TASK_UNINTERRUPTIBLE) && + !(task_state & TASK_NOLOAD) && + !(task_state & TASK_FROZEN); + + if (unlikely(is_special_task_state(task_state))) + flags |= DEQUEUE_SPECIAL; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) __block_task(rq, p); } @@ -6587,7 +6608,6 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, unsigned long *task_state_p, bool should_block) { unsigned long task_state = *task_state_p; - int flags = DEQUEUE_NOCLOCK; if (signal_pending_state(task_state, p)) { WRITE_ONCE(p->__state, TASK_RUNNING); @@ -6607,26 +6627,7 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, if (!should_block) return false; - p->sched_contributes_to_load = - (task_state & TASK_UNINTERRUPTIBLE) && - !(task_state & TASK_NOLOAD) && - !(task_state & TASK_FROZEN); - - if (unlikely(is_special_task_state(task_state))) - flags |= DEQUEUE_SPECIAL; - - /* - * __schedule() ttwu() - * prev_state = prev->state; if (p->on_rq && ...) - * if (prev_state) goto out; - * p->on_rq = 0; smp_acquire__after_ctrl_dep(); - * p->state = TASK_WAKING - * - * Where __schedule() and ttwu() have matching control dependencies. - * - * After this, schedule() must not care about p->state any more. 
- */ - block_task(rq, p, flags); + block_task(rq, p, task_state); return true; } -- cgit v1.2.3 From f13beb010e4ab0735c9e46802cbcc820a8bd6467 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 12 May 2026 02:56:15 +0000 Subject: sched: Have try_to_wake_up() handle return-migration for PROXY_WAKING case This patch adds logic so try_to_wake_up() will notice if we are waking a task where blocked_on == PROXY_WAKING, and if necessary dequeue the task so the wakeup will naturally return-migrate the donor task back to a cpu it can run on. This helps performance as we do the dequeue and wakeup under the locks normally taken in the try_to_wake_up() and avoids having to do proxy_force_return() from __schedule(), which has to re-take similar locks and then force a pick again loop. This was split out from the larger proxy patch, and significantly reworked. Credits for the original patch go to: Peter Zijlstra (Intel) Juri Lelli Valentin Schneider Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-6-jstultz@google.com --- kernel/sched/core.c | 195 ++++++++++++++++++++++++++-------------------------- 1 file changed, 96 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2f1e85d09b94..3f71dd9c1063 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3735,6 +3735,53 @@ void update_rq_avg_idle(struct rq *rq) rq->idle_stamp = 0; } +#ifdef CONFIG_SCHED_PROXY_EXEC +static void zap_balance_callbacks(struct rq *rq); + +static inline void proxy_reset_donor(struct rq *rq) +{ + WARN_ON_ONCE(rq->donor == rq->curr); + + put_prev_set_next_task(rq, rq->donor, rq->curr); + rq_set_donor(rq, rq->curr); + zap_balance_callbacks(rq); + resched_curr(rq); +} + +/* + * Checks to see if task p has been proxy-migrated to another rq + * and needs to be returned. 
If so, we deactivate the task here + * so that it can be properly woken up on the p->wake_cpu + * (or whichever cpu select_task_rq() picks at the bottom of + * try_to_wake_up() + */ +static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p) +{ + if (!task_is_blocked(p)) + return false; + + scoped_guard(raw_spinlock, &p->blocked_lock) { + /* Task is waking up; clear any blocked_on relationship */ + __clear_task_blocked_on(p, NULL); + + /* If already current, don't need to return migrate */ + if (task_current(rq, p)) + return false; + + /* If we're return migrating the rq->donor, switch it out for idle */ + if (task_current_donor(rq, p)) + proxy_reset_donor(rq); + } + block_task(rq, p, TASK_WAKING); + return true; +} +#else /* !CONFIG_SCHED_PROXY_EXEC */ +static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p) +{ + return false; +} +#endif /* CONFIG_SCHED_PROXY_EXEC */ + static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) @@ -3799,28 +3846,26 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, */ static int ttwu_runnable(struct task_struct *p, int wake_flags) { - struct rq_flags rf; - struct rq *rq; - int ret = 0; + ACQUIRE(__task_rq_lock, guard)(p); + struct rq *rq = guard.rq; - rq = __task_rq_lock(p, &rf); - if (task_on_rq_queued(p)) { - update_rq_clock(rq); - if (p->se.sched_delayed) - enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); - if (!task_on_cpu(rq, p)) { - /* - * When on_rq && !on_cpu the task is preempted, see if - * it should preempt the task that is current now. 
- */ - wakeup_preempt(rq, p, wake_flags); - } - ttwu_do_wakeup(p); - ret = 1; - } - __task_rq_unlock(rq, p, &rf); + if (!task_on_rq_queued(p)) + return 0; - return ret; + update_rq_clock(rq); + if (p->se.sched_delayed) + enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); + if (proxy_needs_return(rq, p)) + return 0; + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. + */ + wakeup_preempt(rq, p, wake_flags); + } + ttwu_do_wakeup(p); + return 1; } void sched_ttwu_pending(void *arg) @@ -4207,6 +4252,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * it disabling IRQs (this allows not taking ->pi_lock). */ WARN_ON_ONCE(p->se.sched_delayed); + /* If p is current, we know we can run here, so clear blocked_on */ + clear_task_blocked_on(p, NULL); if (!ttwu_state_match(p, state, &success)) goto out; @@ -4223,6 +4270,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { smp_mb__after_spinlock(); + if (!ttwu_state_match(p, state, &success)) break; @@ -4287,6 +4335,14 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ WRITE_ONCE(p->__state, TASK_WAKING); + /* + * We never clear the blocked_on relation on proxy_deactivate. + * If we don't clear it here, we have TASK_RUNNING + p->blocked_on + * when waking up. Since this is a fully blocked, off CPU task + * waking up, it should be safe to clear the blocked_on relation. 
+ */ + if (task_is_blocked(p)) + clear_task_blocked_on(p, NULL); /* * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, considering queueing p on the remote CPUs wake_list @@ -4331,6 +4387,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); + } else if (cpu != p->wake_cpu) { + /* + * If we were proxy-migrated to cpu, then + * select_task_rq() picks cpu instead of wake_cpu + * to return to, we won't call set_task_cpu(), + * leaving a stale wake_cpu pointing to where we + * proxy-migrated from. So just fixup wake_cpu here + * if its not correct + */ + p->wake_cpu = cpu; } ttwu_queue(p, cpu, wake_flags); @@ -6612,7 +6678,7 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, if (signal_pending_state(task_state, p)) { WRITE_ONCE(p->__state, TASK_RUNNING); *task_state_p = TASK_RUNNING; - set_task_blocked_on_waking(p, NULL); + clear_task_blocked_on(p, NULL); return false; } @@ -6656,13 +6722,11 @@ static inline struct task_struct *proxy_resched_idle(struct rq *rq) return rq->idle; } -static bool proxy_deactivate(struct rq *rq, struct task_struct *donor) +static void proxy_deactivate(struct rq *rq, struct task_struct *donor) { unsigned long state = READ_ONCE(donor->__state); - /* Don't deactivate if the state has been changed to TASK_RUNNING */ - if (state == TASK_RUNNING) - return false; + WARN_ON_ONCE(state == TASK_RUNNING); /* * Because we got donor from pick_next_task(), it is *crucial* * that we call proxy_resched_idle() before we deactivate it. @@ -6673,7 +6737,7 @@ static bool proxy_deactivate(struct rq *rq, struct task_struct *donor) * need to be changed from next *before* we deactivate. 
*/ proxy_resched_idle(rq); - return try_to_block_task(rq, donor, &state, true); + block_task(rq, donor, state); } static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf) @@ -6747,71 +6811,6 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf, proxy_reacquire_rq_lock(rq, rf); } -static void proxy_force_return(struct rq *rq, struct rq_flags *rf, - struct task_struct *p) - __must_hold(__rq_lockp(rq)) -{ - struct rq *task_rq, *target_rq = NULL; - int cpu, wake_flag = WF_TTWU; - - lockdep_assert_rq_held(rq); - WARN_ON(p == rq->curr); - - if (p == rq->donor) - proxy_resched_idle(rq); - - proxy_release_rq_lock(rq, rf); - /* - * We drop the rq lock, and re-grab task_rq_lock to get - * the pi_lock (needed for select_task_rq) as well. - */ - scoped_guard (task_rq_lock, p) { - task_rq = scope.rq; - - /* - * Since we let go of the rq lock, the task may have been - * woken or migrated to another rq before we got the - * task_rq_lock. So re-check we're on the same RQ. If - * not, the task has already been migrated and that CPU - * will handle any futher migrations. - */ - if (task_rq != rq) - break; - - /* - * Similarly, if we've been dequeued, someone else will - * wake us - */ - if (!task_on_rq_queued(p)) - break; - - /* - * Since we should only be calling here from __schedule() - * -> find_proxy_task(), no one else should have - * assigned current out from under us. But check and warn - * if we see this, then bail. 
- */ - if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) { - WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n", - __func__, cpu_of(task_rq), - p->comm, p->pid, p->on_cpu); - break; - } - - update_rq_clock(task_rq); - deactivate_task(task_rq, p, DEQUEUE_NOCLOCK); - cpu = select_task_rq(p, p->wake_cpu, &wake_flag); - set_task_cpu(p, cpu); - target_rq = cpu_rq(cpu); - clear_task_blocked_on(p, NULL); - } - - if (target_rq) - attach_one_task(target_rq, p); - - proxy_reacquire_rq_lock(rq, rf); -} - /* * Find runnable lock owner to proxy for mutex blocked donor * @@ -6847,7 +6846,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) clear_task_blocked_on(p, PROXY_WAKING); return p; } - goto force_return; + goto deactivate; } /* @@ -6882,7 +6881,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) __clear_task_blocked_on(p, NULL); return p; } - goto force_return; + goto deactivate; } if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { @@ -6961,12 +6960,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) return owner; deactivate: - if (proxy_deactivate(rq, donor)) - return NULL; - /* If deactivate fails, force return */ - p = donor; -force_return: - proxy_force_return(rq, rf, p); + proxy_deactivate(rq, p); return NULL; migrate_task: proxy_migrate_task(rq, rf, p, owner_cpu); @@ -7113,6 +7107,9 @@ pick_again: if (sched_proxy_exec()) { struct task_struct *prev_donor = rq->donor; + if (!prev_state && prev->blocked_on) + clear_task_blocked_on(prev, NULL); + rq_set_donor(rq, next); if (unlikely(next->blocked_on)) { next = find_proxy_task(rq, next, &rf); -- cgit v1.2.3 From 4c2a20413d7fb3fc3dd7adf233a4f82bb203fb58 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 12 May 2026 02:56:16 +0000 Subject: sched: Add is_blocked task flag Add a new is_blocked flag to the task struct. 
This flag is set by try_to_block_task() and cleared by ttwu_do_wakeup() and tracks if the task is blocked. Traditionally this would mirror !p->on_rq, however due to things like DELAY_DEQUEUE and PROXY_EXEC, this can diverge, so it's useful to manage separately. Additionally with this, we might be able to get rid of the p->se.sched_delayed (ab)use in the core code (eventually). Taken whole cloth from Peter's email: https://lore.kernel.org/lkml/20260501132143.GC1026330@noisy.programming.kicks-ass.net/ With a few additional p->is_blocked = 0 in a few cases where we return current if blocked_on gets zeroed or there is no owner. This may hint that these current special cases might be dropped eventually. This change also helps resolve wait-queue stalls seen with proxy-execution. See previous patch attempts for details: https://lore.kernel.org/lkml/20260430215103.2978955-2-jstultz@google.com/ Reported-by: Vineeth Pillai Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-7-jstultz@google.com --- kernel/sched/core.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3f71dd9c1063..c7552869d5c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -624,6 +624,12 @@ int task_llc(const struct task_struct *p) * [ The astute reader will observe that it is possible for two tasks on one * CPU to have ->on_cpu = 1 at the same time. ] * + * p->is_blocked <- { 0, 1 }: + * + * is set by try_to_block_task() and cleared by ttwu_do_wakeup() and tracks + * if the task is blocked. Traditionally this would mirror !p->on_rq, however + * due to things like DELAY_DEQUEUE and PROXY_EXEC, this can diverge.
+ * * task_cpu(p): is changed by set_task_cpu(), the rules are: * * - Don't call set_task_cpu() on a blocked task: @@ -3719,6 +3725,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) */ static inline void ttwu_do_wakeup(struct task_struct *p) { + p->is_blocked = 0; WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); } @@ -4252,6 +4259,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * it disabling IRQs (this allows not taking ->pi_lock). */ WARN_ON_ONCE(p->se.sched_delayed); + WARN_ON_ONCE(p->is_blocked); /* If p is current, we know we can run here, so clear blocked_on */ clear_task_blocked_on(p, NULL); if (!ttwu_state_match(p, state, &success)) @@ -4563,6 +4571,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) /* A delayed task cannot be in clone(). */ WARN_ON_ONCE(p->se.sched_delayed); + WARN_ON_ONCE(p->is_blocked); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -6676,6 +6685,7 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, unsigned long task_state = *task_state_p; if (signal_pending_state(task_state, p)) { + p->is_blocked = 0; WRITE_ONCE(p->__state, TASK_RUNNING); *task_state_p = TASK_RUNNING; clear_task_blocked_on(p, NULL); @@ -6683,6 +6693,8 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, return false; } + p->is_blocked = 1; + /* * We check should_block after signal_pending because we * will want to wake the task in that case. But if @@ -6843,6 +6855,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) /* if its PROXY_WAKING, do return migration or run if current */ if (mutex == PROXY_WAKING) { if (task_current(rq, p)) { + p->is_blocked = 0; clear_task_blocked_on(p, PROXY_WAKING); return p; } @@ -6878,6 +6891,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) * just run on this rq), or return-migrate the task. 
*/ if (task_current(rq, p)) { + p->is_blocked = 0; __clear_task_blocked_on(p, NULL); return p; } @@ -7111,7 +7125,7 @@ pick_again: clear_task_blocked_on(prev, NULL); rq_set_donor(rq, next); - if (unlikely(next->blocked_on)) { + if (unlikely(next->is_blocked && next->blocked_on)) { next = find_proxy_task(rq, next, &rf); if (!next) { zap_balance_callbacks(rq); -- cgit v1.2.3 From 1628b25248d0742b2ce9c7cfa59cd183e35f37e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 May 2026 02:56:17 +0000 Subject: sched: Add blocked_donor link to task for smarter mutex handoffs Add a link to the task this task is proxying for, and use it so the mutex owner can do an intelligent hand-off of the mutex to the task that the owner is running on behalf of. [jstultz: This patch was split out from larger proxy patch] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Juri Lelli Signed-off-by: Valentin Schneider Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260512025635.2840817-8-jstultz@google.com --- kernel/fork.c | 1 + kernel/locking/mutex.c | 60 ++++++++++++++++++++++++++++++++++++++++++++------ kernel/sched/core.c | 14 +++++++++++- 3 files changed, 67 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a679b2448234..6fcca1db0af3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2224,6 +2224,7 @@ __latent_entropy struct task_struct *copy_process( lockdep_init_task(p); p->blocked_on = NULL; /* not blocked yet */ + p->blocked_donor = NULL; /* nobody is boosting p yet */ #ifdef CONFIG_BCACHE p->sequential_io = 0; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a93d4c6bee1a..28677165785f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -981,15 +981,22 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) __releases(lock) { -
struct task_struct *next = NULL; + struct task_struct *donor, *next = NULL; struct mutex_waiter *waiter; - DEFINE_WAKE_Q(wake_q); unsigned long owner; unsigned long flags; mutex_release(&lock->dep_map, ip); __release(lock); + /* + * Ensures the proxy donor stack is stable across unlock and handoff. + * Specifically, it avoids the case where current->blocked_donor is + * NULL when it is inspected while doing the unlock, but a preemption + * before taking the wake_lock would make it set and a hand-off is + * missed. + */ + guard(preempt)(); /* * Release the lock before (potentially) taking the spinlock such that * other contenders can get on with things ASAP. @@ -1002,6 +1009,12 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne MUTEX_WARN_ON(__owner_task(owner) != current); MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP); + if (sched_proxy_exec() && current->blocked_donor) { + /* force handoff if we have a blocked_donor */ + owner = MUTEX_FLAG_HANDOFF; + break; + } + if (owner & MUTEX_FLAG_HANDOFF) break; @@ -1014,20 +1027,53 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne } raw_spin_lock_irqsave(&lock->wait_lock, flags); + raw_spin_lock(&current->blocked_lock); debug_mutex_unlock(lock); + + if (sched_proxy_exec()) { + /* + * If we have a task boosting current, and that task was boosting + * current through this lock, hand the lock to that task, as that + * is the highest waiter, as selected by the scheduling function. + */ + donor = current->blocked_donor; + if (donor) { + struct mutex *next_lock; + + raw_spin_lock_nested(&donor->blocked_lock, SINGLE_DEPTH_NESTING); + next_lock = __get_task_blocked_on(donor); + if (next_lock == lock) { + next = get_task_struct(donor); + __set_task_blocked_on_waking(donor, next_lock); + current->blocked_donor = NULL; + } + raw_spin_unlock(&donor->blocked_lock); + } + } + + /* + * Failing that, pick first on the wait list.
+ */ waiter = lock->first_waiter; - if (waiter) { - next = waiter->task; + if (!next && waiter) { + next = get_task_struct(waiter->task); + raw_spin_lock_nested(&next->blocked_lock, SINGLE_DEPTH_NESTING); debug_mutex_wake_waiter(lock, waiter); - set_task_blocked_on_waking(next, lock); - wake_q_add(&wake_q, next); + __set_task_blocked_on_waking(next, lock); + raw_spin_unlock(&next->blocked_lock); + } if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); + raw_spin_unlock(&current->blocked_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + if (next) { + wake_up_process(next); + put_task_struct(next); + } } #ifndef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c7552869d5c4..4c6ceff3855e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6827,7 +6827,17 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf, * Find runnable lock owner to proxy for mutex blocked donor * * Follow the blocked-on relation: - * task->blocked_on -> mutex->owner -> task... + * + * ,-> task + * | | blocked-on + * | v + * blocked_donor | mutex + * | | owner + * | v + * `-- task + * + * and set the blocked_donor relation, this latter is used by the mutex + * code to find which (blocked) task to hand-off to. * * Lock order: * @@ -6969,6 +6979,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) * rq, therefore holding @rq->lock is sufficient to * guarantee its existence, as per ttwu_remote().
*/ + owner->blocked_donor = p; } WARN_ON_ONCE(owner && !owner->on_rq); return owner; @@ -7125,6 +7136,7 @@ pick_again: clear_task_blocked_on(prev, NULL); rq_set_donor(rq, next); + next->blocked_donor = NULL; if (unlikely(next->is_blocked && next->blocked_on)) { next = find_proxy_task(rq, next, &rf); if (!next) { -- cgit v1.2.3 From abc40cca0efdf5ba28b7bc37f1db445a8cc840bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2026 11:28:46 +0200 Subject: sched/proxy: Optimize try_to_wake_up() The reason for the clause in try_to_wake_up() is, per its comment, that find_proxy_task()'s proxy_deactivate() is not always called with a cleared p->blocked_on. However, that seems silly and easily cured. Make sure to always call proxy_deactivate() with a cleared p->blocked_on such that we might remove this clause from the common wake-up path. Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Stultz Link: https://patch.msgid.link/20260526113322.244729903%40infradead.org --- kernel/sched/core.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4c6ceff3855e..a06d5a57adc8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4343,14 +4343,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ WRITE_ONCE(p->__state, TASK_WAKING); - /* - * We never clear the blocked_on relation on proxy_deactivate. - * If we don't clear it here, we have TASK_RUNNING + p->blocked_on - * when waking up. Since this is a fully blocked, off CPU task - * waking up, it should be safe to clear the blocked_on relation. 
- */ - if (task_is_blocked(p)) - clear_task_blocked_on(p, NULL); /* * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, considering queueing p on the remote CPUs wake_list @@ -6739,6 +6731,7 @@ static void proxy_deactivate(struct rq *rq, struct task_struct *donor) unsigned long state = READ_ONCE(donor->__state); WARN_ON_ONCE(state == TASK_RUNNING); + WARN_ON_ONCE(donor->blocked_on); /* * Because we got donor from pick_next_task(), it is *crucial* * that we call proxy_resched_idle() before we deactivate it. @@ -6864,9 +6857,9 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) for (p = donor; (mutex = p->blocked_on); p = owner) { /* if its PROXY_WAKING, do return migration or run if current */ if (mutex == PROXY_WAKING) { + clear_task_blocked_on(p, PROXY_WAKING); if (task_current(rq, p)) { p->is_blocked = 0; - clear_task_blocked_on(p, PROXY_WAKING); return p; } goto deactivate; @@ -6900,9 +6893,9 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) * and return p (if it is current and safe to * just run on this rq), or return-migrate the task. */ + __clear_task_blocked_on(p, NULL); if (task_current(rq, p)) { p->is_blocked = 0; - __clear_task_blocked_on(p, NULL); return p; } goto deactivate; @@ -6912,6 +6905,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) /* XXX Don't handle blocked owners/delayed dequeue yet */ if (curr_in_chain) return proxy_resched_idle(rq); + __clear_task_blocked_on(p, NULL); goto deactivate; } -- cgit v1.2.3 From 708024b575b4ea58c5956e7c09f2d2f48facd478 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2026 11:32:34 +0200 Subject: sched: Be more strict about p->is_blocked Upon entry to try_to_block_task(), p->is_blocked should be false. After all, the prior wakeup would have made it so per ttwu_do_wakeup(). Ensure this is the case, rather than clearing it in the path that doesn't set it. 
Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Stultz Link: https://patch.msgid.link/20260526113322.364017314%40infradead.org --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a06d5a57adc8..8b7eb126c254 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6676,8 +6676,9 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, { unsigned long task_state = *task_state_p; + WARN_ON_ONCE(p->is_blocked); + if (signal_pending_state(task_state, p)) { - p->is_blocked = 0; WRITE_ONCE(p->__state, TASK_RUNNING); *task_state_p = TASK_RUNNING; clear_task_blocked_on(p, NULL); -- cgit v1.2.3 From 7918cf3693614c9f96bc9e43daff6fc72c01b81a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 May 2026 09:58:02 +0200 Subject: sched/proxy: Only return migrate when needed Current code will 'unconditionally' return migrate on PROXY_WAKING, even if the task is (still) on the original CPU. Check task_cpu(p) against p->waking_cpu, which per proxy_set_task_cpu() preserves the original CPU the task was on. If they do not mis-match, there is no need to go through the more expensive wakeup path. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260527082916.GP3126523%40noisy.programming.kicks-ass.net --- kernel/sched/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8b7eb126c254..b007b65d9c88 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3767,6 +3767,21 @@ static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p) if (!task_is_blocked(p)) return false; + /* + * Typically per __set_task_cpu(), task_cpu(p) == p->wake_cpu. 
+ * + * However, proxy_set_task_cpu() is such that it preserves the + * original cpu in p->wake_cpu while migrating p for proxy reasons + * (possibly outside of the allowed p->cpus_ptr). + * + * Furthermore, migration_cpu_stop() / __migrate_swap_task(), will + * only set p->wake_cpu when !p->on_rq, and since here p->on_rq, this + * will not apply. But if it did, this check is the safe way around + * and would migrate. + */ + if (task_cpu(p) == p->wake_cpu) + return false; + scoped_guard(raw_spinlock, &p->blocked_lock) { /* Task is waking up; clear any blocked_on relationship */ __clear_task_blocked_on(p, NULL); -- cgit v1.2.3 From be365ce2bc20b8970bed350f82c3b760256b6945 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2026 11:42:29 +0200 Subject: sched/proxy: Switch proxy to use p->is_blocked Rather than gate the proxy paths with p->blocked_on, use p->is_blocked. This opens up the state: '->is_blocked && !->blocked_on' for future use. Notably, only proxy and delayed tasks can be ->on_rq && ->is_blocked, and it is guaranteed that sched_class::pick_task() will never return a delayed task. Therefore any task returned from pick_next_task() that has ->is_blocked set, must be a proxy task. 
Suggested-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260526113322.477954312%40infradead.org --- kernel/sched/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b007b65d9c88..9b710313dfb3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3764,7 +3764,7 @@ static inline void proxy_reset_donor(struct rq *rq) */ static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p) { - if (!task_is_blocked(p)) + if (!p->is_blocked) return false; /* @@ -6866,14 +6866,14 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) bool curr_in_chain = false; int this_cpu = cpu_of(rq); struct task_struct *p; - struct mutex *mutex; int owner_cpu; /* Follow blocked_on chain. */ - for (p = donor; (mutex = p->blocked_on); p = owner) { + for (p = donor; p->is_blocked; p = owner) { /* if its PROXY_WAKING, do return migration or run if current */ - if (mutex == PROXY_WAKING) { - clear_task_blocked_on(p, PROXY_WAKING); + struct mutex *mutex = p->blocked_on; + if (!mutex || mutex == PROXY_WAKING) { + clear_task_blocked_on(p, mutex); if (task_current(rq, p)) { p->is_blocked = 0; return p; @@ -7147,7 +7147,7 @@ pick_again: rq_set_donor(rq, next); next->blocked_donor = NULL; - if (unlikely(next->is_blocked && next->blocked_on)) { + if (unlikely(next->is_blocked)) { next = find_proxy_task(rq, next, &rf); if (!next) { zap_balance_callbacks(rq); -- cgit v1.2.3 From ec9d4f1c424134bbf30965075df78d02a5d021dc Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 26 May 2026 11:43:02 +0200 Subject: sched/proxy: Remove PROXY_WAKING Now that the proxy path uses ->is_blocked, use the '->is_blocked && !->blocked_on' state instead of PROXY_WAKING. Notably, this is where a blocked_on relation is broken but the donor task might still need a return migration. 
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260526113322.596522894%40infradead.org --- kernel/locking/mutex.c | 4 ++-- kernel/locking/ww_mutex.h | 4 ++-- kernel/sched/core.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 28677165785f..89d01f788973 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1044,7 +1044,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne next_lock = __get_task_blocked_on(donor); if (next_lock == lock) { next = get_task_struct(donor); - __set_task_blocked_on_waking(donor, next_lock); + __clear_task_blocked_on(next, lock); current->blocked_donor = NULL; } raw_spin_unlock(&donor->blocked_lock); @@ -1060,7 +1060,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne raw_spin_lock_nested(&next->blocked_lock, SINGLE_DEPTH_NESTING); debug_mutex_wake_waiter(lock, waiter); - __set_task_blocked_on_waking(next, lock); + __clear_task_blocked_on(next, lock); raw_spin_unlock(&next->blocked_lock); } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 6c12452097e1..d62b49b53ec3 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -324,7 +324,7 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, * blocked_on to PROXY_WAKING. Otherwise we can see * circular blocked_on relationships that can't resolve. */ - set_task_blocked_on_waking(waiter->task, lock); + clear_task_blocked_on(waiter->task, lock); wake_q_add(wake_q, waiter->task); } @@ -383,7 +383,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * are waking the mutex owner, who may be currently * blocked on a different mutex. 
*/ - set_task_blocked_on_waking(owner, NULL); + clear_task_blocked_on(owner, NULL); wake_q_add(wake_q, owner); } return true; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9b710313dfb3..cec2c164fab1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6872,7 +6872,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) for (p = donor; p->is_blocked; p = owner) { /* if its PROXY_WAKING, do return migration or run if current */ struct mutex *mutex = p->blocked_on; - if (!mutex || mutex == PROXY_WAKING) { + if (!mutex) { clear_task_blocked_on(p, mutex); if (task_current(rq, p)) { p->is_blocked = 0; -- cgit v1.2.3 From c0404dd88d124714351f7a961d3313ee0f2f036b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2026 11:22:30 +0200 Subject: sched/proxy: Remove superfluous clear_task_blocked_on() Per the discussion here: https://lore.kernel.org/all/20260403112810.GG3738786@noisy.programming.kicks-ass.net/ The reason for this condition is that the signal condition in try_to_block_task() would set_task_blocked_on_waking(). However, it no longer does that; in fact, that path does clear_task_blocked_on(). Further, per the discussions here: https://lore.kernel.org/r/dc61cf77-e541-441d-a708-c40e19aa0db2%40amd.com https://lore.kernel.org/r//9dd1d24d-45d3-4ee2-8e67-8305b34bfb6d%40amd.com there are a few other edge cases that needed this. But they're all variants of PROXY_WAKING leaking out. And since PROXY_WAKING is now gone, this is no longer needed either.
Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Stultz Link: https://patch.msgid.link/20260526113322.120970670%40infradead.org --- kernel/sched/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cec2c164fab1..d5795188d5b6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7142,9 +7142,6 @@ pick_again: if (sched_proxy_exec()) { struct task_struct *prev_donor = rq->donor; - if (!prev_state && prev->blocked_on) - clear_task_blocked_on(prev, NULL); - rq_set_donor(rq, next); next->blocked_donor = NULL; if (unlikely(next->is_blocked)) { -- cgit v1.2.3 From 56e50ff567810db208cc37d9e17b8df044a9158c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 May 2026 12:00:59 +0200 Subject: sched: Simplify ttwu_runnable() Note that both proxy and delayed tasks have ->is_blocked set. Use this one condition to guard both paths. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260526113322.714832584%40infradead.org --- kernel/sched/core.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5795188d5b6..5a317f62a516 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3764,9 +3764,6 @@ static inline void proxy_reset_donor(struct rq *rq) */ static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p) { - if (!p->is_blocked) - return false; - /* * Typically per __set_task_cpu(), task_cpu(p) == p->wake_cpu. 
* @@ -3875,10 +3872,12 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) return 0; update_rq_clock(rq); - if (p->se.sched_delayed) - enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); - if (proxy_needs_return(rq, p)) - return 0; + if (p->is_blocked) { + if (p->se.sched_delayed) + enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); + if (proxy_needs_return(rq, p)) + return 0; + } if (!task_on_cpu(rq, p)) { /* * When on_rq && !on_cpu the task is preempted, see if -- cgit v1.2.3 From 63c1a12bc0e09af7dee919c4fb4a300a719d5125 Mon Sep 17 00:00:00 2001 From: "Guanyou.Chen" Date: Fri, 22 May 2026 21:09:59 +0800 Subject: sched: restore timer_slack_ns when resetting RT policy on fork Commit ed4fb6d7ef68 ("hrtimer: Use and report correct timerslack values for realtime tasks") sets timer_slack_ns to 0 for RT tasks in __setscheduler_params(). However, when an RT task with SCHED_RESET_ON_FORK creates child threads, the children inherit timer_slack_ns=0 from the parent. sched_fork() resets the child's policy to SCHED_NORMAL but does not restore timer_slack_ns, leaving the child permanently running with zero slack. Fix this by restoring timer_slack_ns from default_timer_slack_ns in sched_fork() when resetting from RT/DL to NORMAL policy, matching the existing behavior in __setscheduler_params(). Note: this fix alone requires a correct default_timer_slack_ns to be effective. See the following patch for that fix. 
Fixes: ed4fb6d7ef68 ("hrtimer: Use and report correct timerslack values for realtime tasks") Reported-by: Qiaoting.Lin Signed-off-by: Guanyou.Chen Signed-off-by: Chunhui.Li Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260522131000.1664983-2-chenguanyou@xiaomi.com --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5a317f62a516..2cfe8932d5db 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4826,6 +4826,7 @@ int sched_fork(u64 clone_flags, struct task_struct *p) p->policy = SCHED_NORMAL; p->static_prio = NICE_TO_PRIO(0); p->rt_priority = 0; + p->timer_slack_ns = p->default_timer_slack_ns; } else if (PRIO_TO_NICE(p->static_prio) < 0) p->static_prio = NICE_TO_PRIO(0); -- cgit v1.2.3 From dfcfc97b6df0ea8e1b7d3b590022782abbec3389 Mon Sep 17 00:00:00 2001 From: Zecheng Li Date: Fri, 22 May 2026 10:15:48 -0400 Subject: sched/fair: Co-locate cfs_rq and sched_entity in cfs_tg_state Improve data locality and reduce pointer chasing by allocating struct cfs_rq and struct sched_entity together for non-root task groups. This is achieved by introducing a new combined struct cfs_tg_state that holds both objects in a single allocation. This patch: - Introduces struct cfs_tg_state that embeds cfs_rq, sched_entity, and sched_statistics together in a single structure. - Updates __schedstats_from_se() in stats.h to use cfs_tg_state for accessing sched_statistics from a group sched_entity. - Modifies alloc_fair_sched_group() and free_fair_sched_group() to allocate and free the new struct as a single unit. - Modifies the per-CPU pointers in task_group->se and task_group->cfs_rq to point to the members in the new combined structure. 
Signed-off-by: Zecheng Li Signed-off-by: Zecheng Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Reviewed-by: Josh Don Link: https://patch.msgid.link/20260522141623.600235-2-zli94@ncsu.edu --- kernel/sched/fair.c | 18 ++++++------------ kernel/sched/sched.h | 12 ++++++++++++ kernel/sched/stats.h | 9 +-------- 3 files changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 78162707a9d5..e7d7d47ef7b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -15083,8 +15083,6 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); } kfree(tg->cfs_rq); @@ -15093,6 +15091,7 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { + struct cfs_tg_state *state; struct sched_entity *se; struct cfs_rq *cfs_rq; int i; @@ -15109,16 +15108,13 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); for_each_possible_cpu(i) { - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!cfs_rq) + state = kzalloc_node(sizeof(*state), + GFP_KERNEL, cpu_to_node(i)); + if (!state) goto err; - se = kzalloc_node(sizeof(struct sched_entity_stats), - GFP_KERNEL, cpu_to_node(i)); - if (!se) - goto err_free_rq; - + cfs_rq = &state->cfs_rq; + se = &state->se; init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); @@ -15126,8 +15122,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; -err_free_rq: - kfree(cfs_rq); err: return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3aff26dbb13..585aba9f63b4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2294,6 +2294,18 @@ static inline struct 
task_group *task_group(struct task_struct *p) return p->sched_task_group; } +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Defined here to be available before stats.h is included, since + * stats.h has dependencies on things defined later in this file. + */ +struct cfs_tg_state { + struct cfs_rq cfs_rq; + struct sched_entity se; + struct sched_statistics stats; +} __no_randomize_layout; +#endif + /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index a612cf253c87..ebe0a7765f98 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -89,19 +89,12 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt #endif /* CONFIG_SCHEDSTATS */ -#ifdef CONFIG_FAIR_GROUP_SCHED -struct sched_entity_stats { - struct sched_entity se; - struct sched_statistics stats; -} __no_randomize_layout; -#endif - static inline struct sched_statistics * __schedstats_from_se(struct sched_entity *se) { #ifdef CONFIG_FAIR_GROUP_SCHED if (!entity_is_task(se)) - return &container_of(se, struct sched_entity_stats, se)->stats; + return &container_of(se, struct cfs_tg_state, se)->stats; #endif return &task_of(se)->stats; } -- cgit v1.2.3 From 89e1f67186baca353b68115bb98bd0bfed9f80c8 Mon Sep 17 00:00:00 2001 From: Zecheng Li Date: Fri, 22 May 2026 10:15:49 -0400 Subject: sched/fair: Remove task_group->se pointer array Now that struct sched_entity is co-located with struct cfs_rq for non-root task groups, the task_group->se pointer array is redundant. The associated sched_entity can be loaded directly from the cfs_rq. This patch performs the access conversion with the helpers: - is_root_task_group(tg): checks if a task group is the root task group. It compares the task group's address with the global root_task_group variable. - tg_se(tg, cpu): retrieves the cfs_rq and returns the address of the co-located se. 
This function checks if tg is the root task group to ensure it behaves the same as the previous tg->se[cpu]. Replaces all accesses that use the tg->se[cpu] pointer array with calls to the new tg_se(tg, cpu) accessor. - cfs_rq_se(cfs_rq): simplifies access paths like cfs_rq->tg->se[...] to use the co-located sched_entity. This function also checks if tg is the root task group to ensure the same behavior. Since tg_se is not in very hot code paths, and the branch is a register comparison with an immediate value (`&root_task_group`), the performance impact is expected to be negligible. Signed-off-by: Zecheng Li Signed-off-by: Zecheng Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Reviewed-by: Josh Don Link: https://patch.msgid.link/20260522141623.600235-3-zli94@ncsu.edu --- kernel/sched/core.c | 7 ++----- kernel/sched/debug.c | 2 +- kernel/sched/fair.c | 25 +++++++++---------------- kernel/sched/sched.h | 31 ++++++++++++++++++++++++++----- 4 files changed, 38 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2cfe8932d5db..39cea012c230 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8923,7 +8923,7 @@ void __init sched_init(void) wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED - ptr += 2 * nr_cpu_ids * sizeof(void **); + ptr += nr_cpu_ids * sizeof(void **); #endif #ifdef CONFIG_RT_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); @@ -8932,9 +8932,6 @@ void __init sched_init(void) ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.se = (struct sched_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -10016,7 +10013,7 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) int i; for_each_possible_cpu(i) { - stats = __schedstats_from_se(tg->se[i]); + stats = __schedstats_from_se(tg_se(tg, i)); ws += schedstat_val(stats->wait_sum); } diff
--git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5e09cf9fae3e..40584b27ea0c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -808,7 +808,7 @@ void dirty_sched_domain_sysctl(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { - struct sched_entity *se = tg->se[cpu]; + struct sched_entity *se = tg_se(tg, cpu); #define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) #define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e7d7d47ef7b2..447b0ac426d1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6876,7 +6876,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; + struct sched_entity *se = cfs_rq_se(cfs_rq); /* * It's possible we are called with runtime_remaining < 0 due to things @@ -11102,7 +11102,6 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq, *pos; bool decayed = false; - int cpu = cpu_of(rq); /* * Iterates the task_group tree in a bottom up fashion, see @@ -11122,7 +11121,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) } /* Propagate pending load changes to the parent, if any: */ - se = cfs_rq->tg->se[cpu]; + se = cfs_rq_se(cfs_rq); if (se && !skip_blocked_update(se)) update_load_avg(cfs_rq_of(se), se, UPDATE_TG); @@ -11148,8 +11147,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) */ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) { - struct rq *rq = rq_of(cfs_rq); - struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; + struct sched_entity *se = cfs_rq_se(cfs_rq); unsigned long now = jiffies; unsigned long load; @@ -15086,7 +15084,6 @@ void free_fair_sched_group(struct task_group *tg) } kfree(tg->cfs_rq); - kfree(tg->se); } int 
alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) @@ -15099,9 +15096,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->cfs_rq = kzalloc_objs(cfs_rq, nr_cpu_ids); if (!tg->cfs_rq) goto err; - tg->se = kzalloc_objs(se, nr_cpu_ids); - if (!tg->se) - goto err; tg->shares = NICE_0_LOAD; @@ -15116,7 +15110,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) cfs_rq = &state->cfs_rq; se = &state->se; init_cfs_rq(cfs_rq); - init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + init_tg_cfs_entry(tg, cfs_rq, se, i, tg_se(parent, i)); init_entity_runnable_average(se); } @@ -15135,7 +15129,7 @@ void online_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { rq = cpu_rq(i); - se = tg->se[i]; + se = tg_se(tg, i); rq_lock_irq(rq, &rf); update_rq_clock(rq); attach_entity_cfs_rq(se); @@ -15152,7 +15146,7 @@ void unregister_fair_sched_group(struct task_group *tg) for_each_possible_cpu(cpu) { struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - struct sched_entity *se = tg->se[cpu]; + struct sched_entity *se = tg_se(tg, cpu); struct rq *rq = cpu_rq(cpu); if (se) { @@ -15189,7 +15183,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, init_cfs_rq_runtime(cfs_rq); tg->cfs_rq[cpu] = cfs_rq; - tg->se[cpu] = se; /* se could be NULL for root_task_group */ if (!se) @@ -15220,7 +15213,7 @@ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) /* * We can't change the weight of the root cgroup. 
*/ - if (!tg->se[0]) + if (is_root_task_group(tg)) return -EINVAL; shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); @@ -15231,7 +15224,7 @@ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) tg->shares = shares; for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - struct sched_entity *se = tg->se[i]; + struct sched_entity *se = tg_se(tg, i); struct rq_flags rf; /* Propagate contribution to hierarchy */ @@ -15282,7 +15275,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - struct sched_entity *se = tg->se[i]; + struct sched_entity *se = tg_se(tg, i); struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 585aba9f63b4..823ba40cf098 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -484,8 +484,6 @@ struct task_group { #endif #ifdef CONFIG_FAIR_GROUP_SCHED - /* schedulable entities of this group on each CPU */ - struct sched_entity **se; /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; @@ -934,7 +932,8 @@ struct dl_rq { }; #ifdef CONFIG_FAIR_GROUP_SCHED - +/* Check whether a task group is root tg */ +#define is_root_task_group(tg) ((tg) == &root_task_group) /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) @@ -2304,6 +2303,28 @@ struct cfs_tg_state { struct sched_entity se; struct sched_statistics stats; } __no_randomize_layout; + +static inline struct sched_entity *tg_se(struct task_group *tg, int cpu) +{ + struct cfs_tg_state *state; + + if (is_root_task_group(tg)) + return NULL; + + state = container_of(tg->cfs_rq[cpu], struct cfs_tg_state, cfs_rq); + return &state->se; +} + +static inline struct sched_entity *cfs_rq_se(struct cfs_rq *cfs_rq) +{ + struct cfs_tg_state *state; + + if (is_root_task_group(cfs_rq->tg)) + return 
NULL; + + state = container_of(cfs_rq, struct cfs_tg_state, cfs_rq); + return &state->se; +} #endif /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ @@ -2316,8 +2337,8 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; - p->se.parent = tg->se[cpu]; - p->se.depth = tg->se[cpu] ? tg->se[cpu]->depth + 1 : 0; + p->se.parent = tg_se(tg, cpu); + p->se.depth = p->se.parent ? p->se.parent->depth + 1 : 0; #endif #ifdef CONFIG_RT_GROUP_SCHED -- cgit v1.2.3 From b8fea7af0e40feb6d9cbbd60b66ff0ec265e868f Mon Sep 17 00:00:00 2001 From: Zecheng Li Date: Fri, 22 May 2026 10:15:50 -0400 Subject: sched/fair: Allocate cfs_tg_state with percpu allocator To remove the cfs_rq pointer array in task_group, allocate the combined cfs_rq and sched_entity using the per-cpu allocator. This patch implements the following: - Changes task_group->cfs_rq from 'struct cfs_rq **' to 'struct cfs_rq __percpu *'. - Updates memory allocation in alloc_fair_sched_group() and free_fair_sched_group() to use alloc_percpu() and free_percpu() respectively. - Uses the inline accessor tg_cfs_rq(tg, cpu) with per_cpu_ptr() to retrieve the pointer to cfs_rq for the given task group and CPU. - Replaces direct accesses tg->cfs_rq[cpu] with calls to the new tg_cfs_rq(tg, cpu) helper. - Handles the root_task_group: since struct rq is already a per-cpu variable (runqueues), its embedded cfs_rq (rq->cfs) is also per-cpu. Therefore, we assign root_task_group.cfs_rq = &runqueues.cfs. - Cleanup the code in initializing the root task group. This change places each CPU's cfs_rq and sched_entity in its local per-cpu memory area to remove the per-task_group pointer arrays. 
Signed-off-by: Zecheng Li Signed-off-by: Zecheng Li Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Reviewed-by: Josh Don Link: https://patch.msgid.link/20260522141623.600235-4-zli94@ncsu.edu --- kernel/sched/core.c | 35 +++++++++++++--------------------- kernel/sched/fair.c | 54 +++++++++++++++++++++------------------------------- kernel/sched/sched.h | 14 ++++++++++---- 3 files changed, 45 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39cea012c230..dd031410ab1a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8907,7 +8907,7 @@ static struct kmem_cache *task_group_cache __ro_after_init; void __init sched_init(void) { - unsigned long ptr = 0; + unsigned long __maybe_unused ptr = 0; int i; /* Make sure the linker didn't screw up */ @@ -8923,33 +8923,24 @@ void __init sched_init(void) wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED - ptr += nr_cpu_ids * sizeof(void **); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - ptr += 2 * nr_cpu_ids * sizeof(void **); -#endif - if (ptr) { - ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); + root_task_group.cfs_rq = &runqueues.cfs; -#ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_EXT_GROUP_SCHED - scx_tg_init(&root_task_group); + scx_tg_init(&root_task_group); #endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); + ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void 
**); - root_task_group.rt_rq = (struct rt_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ - } init_defrootdomain(); @@ -9864,7 +9855,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, } for_each_online_cpu(i) { - struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, i); struct rq *rq = cfs_rq->rq; guard(rq_lock_irq)(rq); @@ -10032,7 +10023,7 @@ static u64 throttled_time_self(struct task_group *tg) u64 total = 0; for_each_possible_cpu(i) { - total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time); + total += READ_ONCE(tg_cfs_rq(tg, i)->throttled_clock_self_time); } return total; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 447b0ac426d1..1d4ed883e630 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -334,7 +334,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) * to a tree or when we reach the top of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + tg_cfs_rq(cfs_rq->tg->parent, cpu)->on_list) { /* * If parent is already on the list, we add the child * just before. Thanks to circular linked property of @@ -342,7 +342,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) * of the list that starts by parent. 
*/ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + &(tg_cfs_rq(cfs_rq->tg->parent, cpu)->leaf_cfs_rq_list)); /* * The branch is now connected to its tree so we can * reset tmp_alone_branch to the beginning of the @@ -5037,7 +5037,7 @@ static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); clear_tg_load_avg(cfs_rq); } @@ -6594,7 +6594,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) { - return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); + return throttled_hierarchy(tg_cfs_rq(task_group(p), dst_cpu)); } static inline bool task_is_throttled(struct task_struct *p) @@ -6740,7 +6740,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); struct task_struct *p, *tmp; if (--cfs_rq->throttle_count) @@ -6811,7 +6811,7 @@ static void record_throttle_clock(struct cfs_rq *cfs_rq) static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); if (cfs_rq->throttle_count++) return 0; @@ -7285,8 +7285,8 @@ static void sync_throttle(struct task_group *tg, int cpu) if (!tg->parent) return; - cfs_rq = tg->cfs_rq[cpu]; - pcfs_rq = tg->parent->cfs_rq[cpu]; + cfs_rq = tg_cfs_rq(tg, cpu); + pcfs_rq = tg_cfs_rq(tg->parent, cpu); cfs_rq->throttle_count = pcfs_rq->throttle_count; cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); @@ -7478,7 +7478,7 @@ static void __maybe_unused update_runtime_enabled(struct rq 
*rq) rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); raw_spin_lock(&cfs_b->lock); cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; @@ -7507,7 +7507,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); if (!cfs_rq->runtime_enabled) continue; @@ -10382,7 +10382,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ struct cfs_rq *dst_cfs_rq; #ifdef CONFIG_FAIR_GROUP_SCHED - dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu]; + dst_cfs_rq = tg_cfs_rq(task_group(p), dest_cpu); #else dst_cfs_rq = &cpu_rq(dest_cpu)->cfs; #endif @@ -14812,7 +14812,7 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) struct cfs_rq *cfs_rq; #ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq = task_group(p)->cfs_rq[cpu]; + cfs_rq = tg_cfs_rq(task_group(p), cpu); #else cfs_rq = &cpu_rq(cpu)->cfs; #endif @@ -15076,39 +15076,31 @@ static void task_change_group_fair(struct task_struct *p) void free_fair_sched_group(struct task_group *tg) { - int i; - - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - } - - kfree(tg->cfs_rq); + free_percpu(tg->cfs_rq); } int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_tg_state *state; + struct cfs_tg_state __percpu *state; struct sched_entity *se; struct cfs_rq *cfs_rq; int i; - tg->cfs_rq = kzalloc_objs(cfs_rq, nr_cpu_ids); - if (!tg->cfs_rq) + state = alloc_percpu_gfp(struct cfs_tg_state, GFP_KERNEL); + if (!state) goto err; + tg->cfs_rq = &state->cfs_rq; tg->shares = NICE_0_LOAD; init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); for_each_possible_cpu(i) { - state = 
kzalloc_node(sizeof(*state), - GFP_KERNEL, cpu_to_node(i)); - if (!state) + cfs_rq = tg_cfs_rq(tg, i); + if (!cfs_rq) goto err; - cfs_rq = &state->cfs_rq; - se = &state->se; + se = tg_se(tg, i); init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, tg_se(parent, i)); init_entity_runnable_average(se); @@ -15145,7 +15137,7 @@ void unregister_fair_sched_group(struct task_group *tg) destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(cpu) { - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu); struct sched_entity *se = tg_se(tg, cpu); struct rq *rq = cpu_rq(cpu); @@ -15182,8 +15174,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, cfs_rq->rq = rq; init_cfs_rq_runtime(cfs_rq); - tg->cfs_rq[cpu] = cfs_rq; - /* se could be NULL for root_task_group */ if (!se) return; @@ -15276,7 +15266,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); struct sched_entity *se = tg_se(tg, i); - struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *grp_cfs_rq = tg_cfs_rq(tg, i); bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; struct rq_flags rf; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 823ba40cf098..c7c2dea65edd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -485,7 +485,7 @@ struct task_group { #ifdef CONFIG_FAIR_GROUP_SCHED /* runqueue "owned" by this group on each CPU */ - struct cfs_rq **cfs_rq; + struct cfs_rq __percpu *cfs_rq; unsigned long shares; /* * load_avg can be heavily contended at clock tick time, so put @@ -2304,6 +2304,12 @@ struct cfs_tg_state { struct sched_statistics stats; } __no_randomize_layout; +/* Access a specific CPU's cfs_rq from a task group */ +static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu) +{ + return per_cpu_ptr(tg->cfs_rq, cpu); +} + static inline struct sched_entity *tg_se(struct task_group *tg, int cpu) { struct cfs_tg_state 
*state; @@ -2311,7 +2317,7 @@ static inline struct sched_entity *tg_se(struct task_group *tg, int cpu) if (is_root_task_group(tg)) return NULL; - state = container_of(tg->cfs_rq[cpu], struct cfs_tg_state, cfs_rq); + state = container_of(tg_cfs_rq(tg, cpu), struct cfs_tg_state, cfs_rq); return &state->se; } @@ -2335,8 +2341,8 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED - set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); - p->se.cfs_rq = tg->cfs_rq[cpu]; + set_task_rq_fair(&p->se, p->se.cfs_rq, tg_cfs_rq(tg, cpu)); + p->se.cfs_rq = tg_cfs_rq(tg, cpu); p->se.parent = tg_se(tg, cpu); p->se.depth = p->se.parent ? p->se.parent->depth + 1 : 0; #endif -- cgit v1.2.3 From 1abbecd1d2d2fdd96e52f541f07ee2b163631bee Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 2 Jun 2026 05:00:01 +0000 Subject: sched/fair: Convert cfs bandwidth throttling to use guards Routine conversion of rcu_read_lock(), spin_lock*, and rq_lock usage within the cfs bandwidth controller to use class guards. Only notable changes are: - Checking for "cfs_rq->runtime_remaining <= 0" instead of the inverse to spot a throttle and break early. This also saves the need for extra indentation in the unthrottle case. - Reordering of list_del_rcu() against throttled_clock indicator update in unthrottle_cfs_rq(). Both are done with "cfs_b->lock" held after the "cfs_rq->throttled" is cleared which make the reordering safe against concurrent list modifications. No functional changes intended. 
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Tested-by: Aaron Lu Link: https://patch.msgid.link/20260602050005.11160-2-kprateek.nayak@amd.com --- kernel/sched/fair.c | 193 ++++++++++++++++++++++++---------------------------- 1 file changed, 90 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1d4ed883e630..261e5cedc717 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5035,13 +5035,13 @@ static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) */ rq_clock_start_loop_update(rq); - rcu_read_lock(); + guard(rcu)(); + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); clear_tg_load_avg(cfs_rq); } - rcu_read_unlock(); rq_clock_stop_loop_update(rq); } @@ -6540,13 +6540,10 @@ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - int ret; - raw_spin_lock(&cfs_b->lock); - ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); - raw_spin_unlock(&cfs_b->lock); + guard(raw_spinlock)(&cfs_b->lock); - return ret; + return __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); } static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) @@ -6835,33 +6832,32 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - int dequeue = 1; - raw_spin_lock(&cfs_b->lock); - /* This will start the period timer if necessary */ - if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { + scoped_guard(raw_spinlock, &cfs_b->lock) { /* - * We have raced with bandwidth becoming available, and if we - * actually throttled the timer might not unthrottle us for an - * entire period. 
We additionally needed to make sure that any - * subsequent check_cfs_rq_runtime calls agree not to throttle - * us, as we may commit to do cfs put_prev+pick_next, so we ask - * for 1ns of runtime rather than just check cfs_b. + * Check if We have raced with bandwidth becoming available. If + * we actually throttled the timer might not unthrottle us for + * an entire period. We additionally needed to make sure that + * any subsequent check_cfs_rq_runtime calls agree not to + * throttle us, as we may commit to do cfs put_prev+pick_next, + * so we ask for 1ns of runtime rather than just check cfs_b. + * + * This will start the period timer if necessary. + */ + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) + return false; + + /* + * No bandwidth available; Add ourselves on the list to be + * unthrottled later. */ - dequeue = 0; - } else { list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); } - raw_spin_unlock(&cfs_b->lock); - - if (!dequeue) - return false; /* Throttle no longer required. 
*/ /* freeze hierarchy runnable averages while throttled */ - rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); - rcu_read_unlock(); + scoped_guard(rcu) + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); /* * Note: distribution will already see us throttled via the @@ -6894,13 +6890,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq); - raw_spin_lock(&cfs_b->lock); - if (cfs_rq->throttled_clock) { + scoped_guard(raw_spinlock, &cfs_b->lock) { + list_del_rcu(&cfs_rq->throttled_list); + + if (!cfs_rq->throttled_clock) + break; + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; cfs_rq->throttled_clock = 0; } - list_del_rcu(&cfs_rq->throttled_list); - raw_spin_unlock(&cfs_b->lock); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); @@ -6929,9 +6927,8 @@ static void __cfsb_csd_unthrottle(void *arg) { struct cfs_rq *cursor, *tmp; struct rq *rq = arg; - struct rq_flags rf; - rq_lock(rq, &rf); + guard(rq_lock)(rq); /* * Iterating over the list can trigger several call to @@ -6948,7 +6945,7 @@ static void __cfsb_csd_unthrottle(void *arg) * race with group being freed in the window between removing it * from the list and advancing to the next entry in the list. 
*/ - rcu_read_lock(); + guard(rcu)(); list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, throttled_csd_list) { @@ -6958,10 +6955,7 @@ static void __cfsb_csd_unthrottle(void *arg) unthrottle_cfs_rq(cursor); } - rcu_read_unlock(); - rq_clock_stop_loop_update(rq); - rq_unlock(rq, &rf); } static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) @@ -7001,11 +6995,11 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) u64 runtime, remaining = 1; bool throttled = false; struct cfs_rq *cfs_rq, *tmp; - struct rq_flags rf; struct rq *rq; LIST_HEAD(local_unthrottle); - rcu_read_lock(); + guard(rcu)(); + list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { rq = rq_of(cfs_rq); @@ -7015,65 +7009,63 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) break; } - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); + if (!cfs_rq_throttled(cfs_rq)) - goto next; + continue; /* Already queued for async unthrottle */ if (!list_empty(&cfs_rq->throttled_csd_list)) - goto next; + continue; /* By the above checks, this should never be true */ WARN_ON_ONCE(cfs_rq->runtime_remaining > 0); - raw_spin_lock(&cfs_b->lock); - runtime = -cfs_rq->runtime_remaining + 1; - if (runtime > cfs_b->runtime) - runtime = cfs_b->runtime; - cfs_b->runtime -= runtime; - remaining = cfs_b->runtime; - raw_spin_unlock(&cfs_b->lock); + scoped_guard(raw_spinlock, &cfs_b->lock) { + runtime = -cfs_rq->runtime_remaining + 1; + if (runtime > cfs_b->runtime) + runtime = cfs_b->runtime; + cfs_b->runtime -= runtime; + remaining = cfs_b->runtime; + } cfs_rq->runtime_remaining += runtime; - /* we check whether we're throttled above */ - if (cfs_rq->runtime_remaining > 0) { - if (cpu_of(rq) != this_cpu) { - unthrottle_cfs_rq_async(cfs_rq); - } else { - /* - * We currently only expect to be unthrottling - * a single cfs_rq locally. 
- */ - WARN_ON_ONCE(!list_empty(&local_unthrottle)); - list_add_tail(&cfs_rq->throttled_csd_list, - &local_unthrottle); - } - } else { + /* + * Ran out of bandwidth during distribution! + * Indicate throttled entities and break early. + */ + if (cfs_rq->runtime_remaining <= 0) { throttled = true; + break; } -next: - rq_unlock_irqrestore(rq, &rf); + /* we check whether we're throttled above */ + if (cpu_of(rq) != this_cpu) { + unthrottle_cfs_rq_async(cfs_rq); + continue; + } + + /* + * We currently only expect to be unthrottling + * a single cfs_rq locally. + */ + WARN_ON_ONCE(!list_empty(&local_unthrottle)); + list_add_tail(&cfs_rq->throttled_csd_list, &local_unthrottle); } list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle, throttled_csd_list) { struct rq *rq = rq_of(cfs_rq); - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); list_del_init(&cfs_rq->throttled_csd_list); - if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); - - rq_unlock_irqrestore(rq, &rf); } WARN_ON_ONCE(!list_empty(&local_unthrottle)); - rcu_read_unlock(); - return throttled; } @@ -7196,7 +7188,8 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (slack_runtime <= 0) return; - raw_spin_lock(&cfs_b->lock); + guard(raw_spinlock)(&cfs_b->lock); + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; @@ -7205,7 +7198,6 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) !list_empty(&cfs_b->throttled_cfs_rq)) start_cfs_slack_bandwidth(cfs_b); } - raw_spin_unlock(&cfs_b->lock); /* even if it's not valid for return we don't want to try again */ cfs_rq->runtime_remaining -= slack_runtime; @@ -7228,25 +7220,21 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) */ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { - u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - unsigned long flags; - /* confirm we're still not at a refresh boundary */ - raw_spin_lock_irqsave(&cfs_b->lock, flags); - 
cfs_b->slack_started = false; + scoped_guard(raw_spinlock_irqsave, &cfs_b->lock) { + u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); - return; - } + cfs_b->slack_started = false; - if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) - runtime = cfs_b->runtime; + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) + return; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) + runtime = cfs_b->runtime; - if (!runtime) - return; + if (!runtime) + return; + } distribute_cfs_runtime(cfs_b); } @@ -7335,18 +7323,18 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); - unsigned long flags; int overrun; int idle = 0; int count = 0; - raw_spin_lock_irqsave(&cfs_b->lock, flags); + CLASS(raw_spinlock_irqsave, cfsb_guard)(&cfs_b->lock); + for (;;) { overrun = hrtimer_forward_now(timer, cfs_b->period); if (!overrun) break; - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); + idle = do_sched_cfs_period_timer(cfs_b, overrun, cfsb_guard.flags); if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); @@ -7379,11 +7367,13 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) count = 0; } } - if (idle) + + if (idle) { cfs_b->period_active = 0; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); + return HRTIMER_NORESTART; + } - return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; + return HRTIMER_RESTART; } void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) @@ -7450,14 +7440,12 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) */ for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - unsigned long flags; if (list_empty(&rq->cfsb_csd_list)) continue; - local_irq_save(flags); - __cfsb_csd_unthrottle(rq); - local_irq_restore(flags); + scoped_guard(irqsave) + __cfsb_csd_unthrottle(rq); } } @@ -7475,16 +7463,15 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq) lockdep_assert_rq_held(rq); - rcu_read_lock(); + guard(rcu)(); + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); - raw_spin_lock(&cfs_b->lock); - cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; - raw_spin_unlock(&cfs_b->lock); + scoped_guard(raw_spinlock, &cfs_b->lock) + cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; } - rcu_read_unlock(); } /* cpu offline callback */ @@ -7505,7 +7492,8 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) */ rq_clock_start_loop_update(rq); - rcu_read_lock(); + guard(rcu)(); + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); @@ -7528,7 +7516,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) cfs_rq->runtime_remaining = 1; unthrottle_cfs_rq(cfs_rq); } - rcu_read_unlock(); rq_clock_stop_loop_update(rq); } -- cgit v1.2.3 From 253edcf5436c916f2fbf7b880443c7f1ed76101d Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 2 Jun 2026 05:00:02 +0000 Subject: sched/fair: Use throttled_csd_list for local unthrottle When distribute_cfs_runtime() encounters a local cfs_rq, it adds it to a local list and unthrottles it at the end, when it is done unthrottling other cfs_rq(s) on cfs_b->throttled_cfs_rq until the bandwidth runs out. 
Instead of using a local list, reuse the local CPU's rq->cfsb_csd_list and the __cfsb_csd_unthrottle() path for unthrottle. If this is the first cfs_rq to be queued on the "cfsb_csd_list", it avoids the need for remote CPUs to interrupt this local CPU if they themselves are performing async unthrottle. If this is not the first cfs_rq on the list, there is an async unthrottle operation pending on this local CPU and the unthrottle can be batched together. No functional changes intended. Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Benjamin Segall Tested-by: Aaron Lu Link: https://patch.msgid.link/20260602050005.11160-3-kprateek.nayak@amd.com --- kernel/sched/fair.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 261e5cedc717..26a8bbb9e1e2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6991,12 +6991,11 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { + bool throttled = false, unthrottle_local = false; int this_cpu = smp_processor_id(); u64 runtime, remaining = 1; - bool throttled = false; - struct cfs_rq *cfs_rq, *tmp; + struct cfs_rq *cfs_rq; struct rq *rq; - LIST_HEAD(local_unthrottle); guard(rcu)(); @@ -7047,24 +7046,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) } /* - * We currently only expect to be unthrottling - * a single cfs_rq locally. + * Allow a parallel async unthrottle to unthrottle + * this cfs_rq too via __cfsb_csd_unthrottle(). + * If we are first, do it ourselves at the end and + * save on an IPI from remote CPUs. 
*/ - WARN_ON_ONCE(!list_empty(&local_unthrottle)); - list_add_tail(&cfs_rq->throttled_csd_list, &local_unthrottle); + unthrottle_local = list_empty(&rq->cfsb_csd_list); + list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); } - list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle, - throttled_csd_list) { - struct rq *rq = rq_of(cfs_rq); - - guard(rq_lock_irqsave)(rq); - - list_del_init(&cfs_rq->throttled_csd_list); - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); + if (unthrottle_local) { + /* + * Protect against an IPI that is also trying to flush + * the unthrottled cfs_rq(s) from this CPU's csd_list. + */ + scoped_guard(irqsave) + __cfsb_csd_unthrottle(cpu_rq(this_cpu)); } - WARN_ON_ONCE(!list_empty(&local_unthrottle)); return throttled; } -- cgit v1.2.3 From 28ad5427682bccf06074366f347a6083d6730c1e Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 2 Jun 2026 05:25:29 +0000 Subject: sched/fair: Call update_curr() before unthrottling the hierarchy Subsequent commits will allow update_curr() to throttle the hierarchy when the runtime accounting exceeds allocated quota. Call update_curr() before the unthrottle event, and in tg_unthrottle_up() to catch up on any remaining runtime and stabilize the "runtime_remaining" and "throttle_count" for that cfs_rq. Doing an update_curr() early ensures the cfs_rq is not throttled right back up again when the unthrottle is in progress. Since all callers of unthrottle_cfs_rq(), except two, already update the rq_clock and call rq_clock_start_loop_update(), move the update_rq_clock() from unthrottle_cfs_rq() to the callers that don't update the rq_clock. 
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Benjamin Segall Tested-by: Aaron Lu Link: https://patch.msgid.link/20260602052531.11450-1-kprateek.nayak@amd.com --- kernel/sched/core.c | 5 ++++- kernel/sched/fair.c | 21 +++++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd031410ab1a..e745c58671ed 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9859,11 +9859,14 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, struct rq *rq = cfs_rq->rq; guard(rq_lock_irq)(rq); + cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 1; - if (cfs_rq->throttled) + if (cfs_rq->throttled) { + update_rq_clock(rq); unthrottle_cfs_rq(cfs_rq); + } } if (runtime_was_enabled && !runtime_enabled) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26a8bbb9e1e2..f91d85cd121b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6740,6 +6740,15 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); struct task_struct *p, *tmp; + /* + * If cfs_rq->curr is set, the cfs_rq might not have caught up + * since the last clock update. Do it now before we begin + * queueing task onto it to save the need for unnecessarily + * unthrottle the hierarchy for this cfs_rq to be throttled + * right back again. + */ + update_curr(cfs_rq); + if (--cfs_rq->throttle_count) return 0; @@ -6882,14 +6891,16 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) * We can't unthrottle this cfs_rq without any runtime remaining because * any enqueue in tg_unthrottle_up() will immediately trigger a throttle, * which is not supposed to happen on unthrottle path. + * + * Catch up on the remaining runtime since last clock update before + * checking runtime remaining. 
*/ + update_curr(cfs_rq); if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) return; cfs_rq->throttled = 0; - update_rq_clock(rq); - scoped_guard(raw_spinlock, &cfs_b->lock) { list_del_rcu(&cfs_rq->throttled_list); @@ -6964,6 +6975,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) bool first; if (rq == this_rq()) { + update_rq_clock(rq); unthrottle_cfs_rq(cfs_rq); return; } @@ -7017,6 +7029,11 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) if (!list_empty(&cfs_rq->throttled_csd_list)) continue; + if (cfs_rq->curr) { + update_rq_clock(rq); + update_curr(cfs_rq); + } + /* By the above checks, this should never be true */ WARN_ON_ONCE(cfs_rq->runtime_remaining > 0); -- cgit v1.2.3 From 102a28344a60e637934ffca62d50ff8319b11165 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 2 Jun 2026 05:25:30 +0000 Subject: sched/fair: Move the throttled tasks to a local list in tg_unthrottle_up() An update_curr() during the enqueue of a throttled task will start throttling the hierarchy from a subsequent commit. This can lead to tg_throttle_down() seeing a non-empty throttled_limbo_list for the cfs_rq that is re-enqueueing the tasks from its throttled_limbo_list one by one. For example: R | A / \ *B C | rq->curr *B is throttled with tasks on the limbo list. When the tasks are unthrottled via tg_unthrottle_up() and the entity of group B is placed onto A, update_curr() is called to catch up the vruntime and it may throttle group A causing the subsequent tg_throttle_down() to see the pending tasks on B's limbo list. tg_unthrottle_up() /* --cfs_rq->throttle_count == 0 */ list_for_each_entry_safe(p, cfs_rq->throttled_limbo_list) enqueue_task_fair() enqueue_entity(se /* B->se */) update_curr(cfs_rq /* A->gcfs_rq */) account_cfs_rq_runtime(cfs_rq) throttle_cfs_rq(cfs_rq /* A->gcfs_rq */ ) tg_throttle_down() /* Reaches B->cfs_rq with throttle_count == 0 */ !!! !list_empty(&cfs_rq->throttled_limbo_list)) !!! 
Move the tasks from throttled_limbo_list onto a local list before starting the unthrottle to prevent the splat described above. If the hierarchy is throttled again in the middle of an unthrottle, put the pending tasks back onto the limbo list to prevent running them unnecessarily. Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Benjamin Segall Tested-by: Aaron Lu Link: https://patch.msgid.link/20260602052531.11450-2-kprateek.nayak@amd.com --- kernel/sched/fair.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f91d85cd121b..3f3f09a021db 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6739,6 +6739,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) struct rq *rq = data; struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); struct task_struct *p, *tmp; + LIST_HEAD(throttled_tasks); /* * If cfs_rq->curr is set, the cfs_rq might not have caught up @@ -6769,13 +6770,31 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttled_clock_self_time += delta; } + /* + * Move the tasks to a local list since an update_curr() during + * enqueue_task_fair() can throttle a higher cfs_rq, and it can + * see the "throttled_limbo_list" being non-empty in + * tg_throttle_down() if throttle_count turned 0 above. + */ + list_splice_init(&cfs_rq->throttled_limbo_list, &throttled_tasks); + /* Re-enqueue the tasks that have been throttled at this level. */ - list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { + list_for_each_entry_safe(p, tmp, &throttled_tasks, throttle_node) { + /* + * Back to being throttled! Break out and put the remaining + * tasks back onto the limbo list to prevent running them + * unnecessarily. 
+ */ + if (cfs_rq->throttle_count) + break; + list_del_init(&p->throttle_node); p->throttled = false; - enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); + enqueue_task_fair(rq, p, ENQUEUE_WAKEUP); } + list_splice(&throttled_tasks, &cfs_rq->throttled_limbo_list); + /* Add cfs_rq with load or one or more already running entities to the list */ if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); -- cgit v1.2.3 From f666241e6bd5d9a494beca982e1953208dce531c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2026 07:10:05 +0000 Subject: sched/fair: Unify cfs_rq throttling via account_cfs_rq_runtime() assign_cfs_rq_runtime() during update_curr() sets the resched indicator and relies on check_cfs_rq_runtime() during pick_next_task() / put_prev_entity() to throttle the hierarchy once the current task is preempted / blocks. Per-task throttle, on the other hand, uses throttle_cfs_rq() to simply propagate the throttle signals, and then relies on task work to individually throttle the runnable tasks on their way out to userspace. Remove check_cfs_rq_runtime() and unify throttling into account_cfs_rq_runtime() which only sets the cfs_rq->throttled, cfs_rq->throttle_count indicators via throttle_cfs_rq() and optionally adds the task work to the current task (donor) if it is on the throttled hierarchy. throttle_cfs_rq() requests sched_cfs_bandwidth_slice() worth of bandwidth for the current hierarchy, which enables it to continue running uninterrupted when selected. For the rest, it requests a bare minimum of "1" to ensure some bandwidth is available and pass the "runtime_remaining > 0" checks once selected. For SCHED_PROXY_EXEC, a mutex holder cannot exit to userspace without dropping it first and the mutex_unlock() ensures proxy is stopped before the mutex handoff which preserves the current semantics for running a throttled task until it exits to userspace even if it acts as a donor. [ prateek: rebased on tip, comments, commit message. 
] Reviewed-By: Benjamin Segall Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Tested-by: Aaron Lu Link: https://patch.msgid.link/20260602071005.11942-1-kprateek.nayak@amd.com --- kernel/sched/fair.c | 101 ++++++++++++++++++++++++---------------------------- 1 file changed, 46 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3f3f09a021db..f4ed841f766f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -525,7 +525,7 @@ static int se_is_idle(struct sched_entity *se) #endif /* !CONFIG_FAIR_GROUP_SCHED */ static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); +bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -6388,8 +6388,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect) return se; } -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); - static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* @@ -6399,9 +6397,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) if (prev->on_rq) update_curr(cfs_rq); - /* throttle cfs_rqs exceeding runtime */ - check_cfs_rq_runtime(cfs_rq); - if (prev->on_rq) { update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. 
*/ @@ -6536,41 +6531,32 @@ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, return cfs_rq->runtime_remaining > 0; } -/* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - guard(raw_spinlock)(&cfs_b->lock); +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq); - return __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); -} - -static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) +static bool __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; if (likely(cfs_rq->runtime_remaining > 0)) - return; + return false; if (cfs_rq->throttled) - return; + return true; /* - * if we're unable to extend our runtime we resched so that the active - * hierarchy can be throttled + * throttle_cfs_rq() will try to extend the runtime first + * before throttling the hierarchy. 
*/ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_curr(rq_of(cfs_rq)); + return throttle_cfs_rq(cfs_rq); } static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) +bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) - return; + return false; - __account_cfs_rq_runtime(cfs_rq, delta_exec); + return __account_cfs_rq_runtime(cfs_rq, delta_exec); } static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) @@ -6858,10 +6844,24 @@ static int tg_throttle_down(struct task_group *tg, void *data) static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { - struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *curr = cfs_rq->curr; + struct rq *rq = rq_of(cfs_rq); scoped_guard(raw_spinlock, &cfs_b->lock) { + u64 target_runtime = 1; + + /* + * If cfs_rq->curr is still runnable, we are here from an + * update_curr(). Request sysctl_sched_cfs_bandwidth_slice + * worth of bandwidth to continue running. + * + * If the curr is not runnable, just request enough bandwidth + * to be runnable next time the pick selects this cfs_rq. + */ + if (curr && curr->on_rq) + target_runtime = sched_cfs_bandwidth_slice(); + /* * Check if We have raced with bandwidth becoming available. If * we actually throttled the timer might not unthrottle us for @@ -6872,7 +6872,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) * * This will start the period timer if necessary. */ - if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, target_runtime)) return false; /* @@ -6893,6 +6893,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) */ cfs_rq->throttled = 1; WARN_ON_ONCE(cfs_rq->throttled_clock); + + /* + * If current hierarchy was throttled, add throttle work to the + * current donor. 
In case of proxy-execution, the execution + * context cannot exit to the userspace while holding a mutex + * and the rule of throttle deferral to only throttle the + * throttled context at exit to userspace is still preserved. + */ + if (curr && curr->on_rq) + task_throttle_setup_work(rq->donor); + return true; } @@ -7283,7 +7294,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - /* an active group must be handled by the update_curr()->put() path */ + /* an active group must be handled by the update_curr() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -7293,8 +7304,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) /* update runtime allocation */ account_cfs_rq_runtime(cfs_rq, 0); - if (cfs_rq->runtime_remaining <= 0) - throttle_cfs_rq(cfs_rq); } static void sync_throttle(struct task_group *tg, int cpu) @@ -7324,25 +7333,6 @@ static void sync_throttle(struct task_group *tg, int cpu) cfs_rq->pelt_clock_throttled = 1; } -/* conditionally throttle active cfs_rq's from put_prev_entity() */ -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - if (!cfs_bandwidth_used()) - return false; - - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) - return false; - - /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. 
- */ - if (cfs_rq_throttled(cfs_rq)) - return true; - - return throttle_cfs_rq(cfs_rq); -} - static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = @@ -7596,8 +7586,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) #else /* !CONFIG_CFS_BANDWIDTH: */ -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } +static bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -9934,8 +9923,6 @@ again: if (cfs_rq->curr && cfs_rq->curr->on_rq) update_curr(cfs_rq); - throttled |= check_cfs_rq_runtime(cfs_rq); - se = pick_next_entity(rq, cfs_rq, true); if (!se) goto again; @@ -14853,8 +14840,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} */ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { - struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; + struct cfs_rq *cfs_rq; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -15036,6 +15023,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { struct sched_entity *se = &p->se; + bool throttled = false; for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -15046,9 +15034,12 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) set_next_entity(cfs_rq, se, first); /* ensure bandwidth has been allocated on our new cfs_rq */ - account_cfs_rq_runtime(cfs_rq, 0); + throttled |= account_cfs_rq_runtime(cfs_rq, 0); } + if (throttled) + task_throttle_setup_work(p); + se = &p->se; if (task_on_rq_queued(p)) { -- 
cgit v1.2.3 From 29922fdfc2a4008d66418bedd0ebf5038fc54efa Mon Sep 17 00:00:00 2001 From: Hongyan Xia Date: Fri, 5 Jun 2026 09:43:39 +0000 Subject: sched/fair: Fix cpu_util runnable_avg arithmetic If we take runnable_avg in max(runnable_avg, util_avg) in cpu_util(), we should then add or subtract task runnable_avg, but the arithmetic below is still with task util_avg. This mixes runnable_avg with util_avg which is incorrect. Fix by always doing arithmetic with runnable_avg and only take max(runnable_avg, util_avg) at the last step. Fixes: 7d0583cf9ec7 ("sched/fair, cpufreq: Introduce 'runnable boosting'") Signed-off-by: Hongyan Xia Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://patch.msgid.link/20260605094318.37931-1-hongyan.xia@transsion.com --- kernel/sched/fair.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f4ed841f766f..1b23e73f48b0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8968,25 +8968,32 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) static unsigned long cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) { + bool add_task = p && task_cpu(p) != cpu && dst_cpu == cpu; + bool sub_task = p && task_cpu(p) == cpu && dst_cpu != cpu; struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); unsigned long runnable; - if (boost) { - runnable = READ_ONCE(cfs_rq->avg.runnable_avg); - util = max(util, runnable); - } - /* * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its * contribution. If @p migrates from another CPU to @cpu add its * contribution. In all the other cases @cpu is not impacted by the * migration so its util_avg is already correct. 
*/ - if (p && task_cpu(p) == cpu && dst_cpu != cpu) - lsub_positive(&util, task_util(p)); - else if (p && task_cpu(p) != cpu && dst_cpu == cpu) + if (add_task) util += task_util(p); + else if (sub_task) + lsub_positive(&util, task_util(p)); + + if (boost) { + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + if (add_task) + runnable += READ_ONCE(p->se.avg.runnable_avg); + else if (sub_task) + lsub_positive(&runnable, + READ_ONCE(p->se.avg.runnable_avg)); + util = max(util, runnable); + } if (sched_feat(UTIL_EST)) { unsigned long util_est; -- cgit v1.2.3 From 76124a050ddbc8b252172205ab04f10a83c03a4d Mon Sep 17 00:00:00 2001 From: Liang Luo Date: Mon, 8 Jun 2026 15:18:42 +0800 Subject: sched/core: Combine separate 'else' and 'if' statements The kernel coding style recommends using 'else if' instead of placing 'if' on a separate line after 'else'. This change makes the code consistent with the rest of the kernel codebase. Signed-off-by: Liang Luo Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260608071842.325159-1-luoliang@kylinos.cn --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e745c58671ed..2f4530eb543f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3816,8 +3816,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; - else - if (p->in_iowait) { + else if (p->in_iowait) { delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } -- cgit v1.2.3 From 9ebe5c3c29f6217412ff256134516d4dff0e5624 Mon Sep 17 00:00:00 2001 From: Liang Luo Date: Mon, 8 Jun 2026 15:55:00 +0800 Subject: sched/deadline: Use task_on_rq_migrating() helper Replace the open-coded "p->on_rq == TASK_ON_RQ_MIGRATING" comparisons in enqueue_task_dl() and dequeue_task_dl() with the existing task_on_rq_migrating() helper, consistent with 
the rest of the scheduler code. No functional change. Signed-off-by: Liang Luo Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/20260608075500.387271-1-luoliang@kylinos.cn --- kernel/sched/deadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 4754dbe4232d..5ccb06effea0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2530,7 +2530,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) check_schedstat_required(); update_stats_wait_start_dl(dl_rq, dl_se); - if (p->on_rq == TASK_ON_RQ_MIGRATING) + if (task_on_rq_migrating(p)) flags |= ENQUEUE_MIGRATING; enqueue_dl_entity(dl_se, flags); @@ -2552,7 +2552,7 @@ static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); - if (p->on_rq == TASK_ON_RQ_MIGRATING) + if (task_on_rq_migrating(p)) flags |= DEQUEUE_MIGRATING; dequeue_dl_entity(&p->dl, flags); -- cgit v1.2.3 From c095741713d1bc317b53e2da2b222e7448b6021f Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 3 Jun 2026 17:51:08 +0800 Subject: sched/fair: Fix newidle vs core-sched While testing Prateek's throttle series, I noticed a panic issue when coresched is enabled and bisected to this patch. 
I fed the panic log and this patch to an agent and its analysis looks correct to me (cpu56 and cpu57 are siblings in a VM): cpu57 (holds core-wide lock) pick_next_task() [core scheduling] for_each_cpu_wrap(i, smt_mask, 57): i=57: pick_task(rq_57) pick_task_fair(rq_57) -> picks task A rq_57->core_pick = task A // task_rq(A) == rq_57 i=56: pick_task(rq_56) pick_task_fair(rq_56) cfs_rq->nr_queued == 0 goto idle sched_balance_newidle(rq_56) raw_spin_rq_unlock(rq_56) // core-wide lock released newidle_balance() pulls task A: rq_57 -> rq_56 // task_rq(A) == rq_56 now raw_spin_rq_lock(rq_56) // core-wide lock re-acquired return > 0 goto again pick_task_fair(rq_56) -> picks task A rq_56->core_pick = task A // first loop done // rq_57->core_pick is still task A (set before lock release) // but task_rq(A) == rq_56 now next = rq_57->core_pick // = task A put_prev_set_next_task(rq_57, prev, task A) __set_next_task_fair(rq_57, task A) hrtick_start_fair(rq_57, task A) WARN_ON_ONCE(task_rq(task A) != rq_57) // task_rq(A) == rq_56 In other words: by allowing pick_task_fair() to do newidle_balance and not return RETRY_TASK, it can end up selecting the same task on two CPUs. Restore the previous state by never doing newidle when core scheduling is enabled. Tested-by: Sven Schnelle Signed-off-by: "Aaron Lu" Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260603095108.GA1684319@bytedance.com --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1b23e73f48b0..d78467ec6ee1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9942,6 +9942,9 @@ again: return p; idle: + if (sched_core_enabled(rq)) + return NULL; + new_tasks = sched_balance_newidle(rq, rf); if (new_tasks < 0) return RETRY_TASK; -- cgit v1.2.3