From 45e09225085f70b856b7b9f26a18ea767a7e1563 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 12 Nov 2025 16:08:23 +0100
Subject: sched/fair: Avoid rq->lock bouncing in sched_balance_newidle()

While poking at this code recently I noted we do a pointless
unlock+lock cycle in sched_balance_newidle(). We drop the rq->lock (so
we can balance) but then instantly grab the same rq->lock again in
sched_balance_update_blocked_averages().

Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Ingo Molnar
Link: https://patch.msgid.link/20251127154725.532469061@infradead.org
---
 kernel/sched/fair.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index aa033e45dce4..708ad01ac231 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9905,15 +9905,11 @@ static unsigned long task_h_load(struct task_struct *p)
 }
 #endif /* !CONFIG_FAIR_GROUP_SCHED */
 
-static void sched_balance_update_blocked_averages(int cpu)
+static void __sched_balance_update_blocked_averages(struct rq *rq)
 {
 	bool decayed = false, done = true;
-	struct rq *rq = cpu_rq(cpu);
-	struct rq_flags rf;
 
-	rq_lock_irqsave(rq, &rf);
 	update_blocked_load_tick(rq);
-	update_rq_clock(rq);
 
 	decayed |= __update_blocked_others(rq, &done);
 	decayed |= __update_blocked_fair(rq, &done);
@@ -9921,7 +9917,15 @@ static void sched_balance_update_blocked_averages(int cpu)
 	update_blocked_load_status(rq, !done);
 	if (decayed)
 		cpufreq_update_util(rq, 0);
-	rq_unlock_irqrestore(rq, &rf);
+}
+
+static void sched_balance_update_blocked_averages(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	guard(rq_lock_irqsave)(rq);
+	update_rq_clock(rq);
+	__sched_balance_update_blocked_averages(rq);
 }
 
 /********** Helpers for sched_balance_find_src_group ************************/
@@ -12868,12 +12872,17 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 	}
 	rcu_read_unlock();
 
+	/*
+	 * Include sched_balance_update_blocked_averages() in the cost
+	 * calculation because it can be quite costly -- this ensures we skip
+	 * it when avg_idle gets to be very low.
+	 */
+	t0 = sched_clock_cpu(this_cpu);
+	__sched_balance_update_blocked_averages(this_rq);
+
 	rq_modified_clear(this_rq);
 	raw_spin_rq_unlock(this_rq);
 
-	t0 = sched_clock_cpu(this_cpu);
-	sched_balance_update_blocked_averages(this_cpu);
-
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		u64 domain_cost;
-- 
cgit v1.2.3