From a06247c6804f1a7c86a2e5398a4c1f1db1471848 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 11 Jan 2022 15:23:09 -0800 Subject: psi: Fix uaf issue when psi trigger is destroyed while being polled With write operation on psi files replacing old trigger with a new one, the lifetime of its waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed and pending poll() will stumble on trigger->event_wait which was destroyed. Fix this by disallowing to redefine an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with EBUSY error. Also bypass a check for psi_disabled in the psi_trigger_destroy as the flag can be flipped after the trigger is created, leading to a memory leak. Fixes: 0e94682b73bf ("psi: introduce psi monitor") Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com Suggested-by: Linus Torvalds Analyzed-by: Eric Biggers Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Eric Biggers Acked-by: Johannes Weiner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220111232309.1786347-1-surenb@google.com --- kernel/sched/psi.c | 66 ++++++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 37 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a679613a7cb7..c137c4d6983e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1162,7 +1162,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); mutex_lock(&group->trigger_lock); @@ -1191,15 +1190,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct task_struct *task_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return; + group = t->group; /* * Wakeup waiters to stop polling. Can happen if cgroup is deleted * from under a polling process. @@ -1235,9 +1238,9 @@ static void psi_trigger_destroy(struct kref *ref) mutex_unlock(&group->trigger_lock); /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_task RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_task + * Wait for psi_schedule_poll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * poll_task. */ synchronize_rcu(); /* @@ -1254,18 +1257,6 @@ static void psi_trigger_destroy(struct kref *ref) kfree(t); } -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1275,24 +1266,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock(); poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - kref_put(&t->refcount, psi_trigger_destroy); - return ret; } @@ -1316,14 +1298,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, buf[buf_size - 1] = '\0'; - new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock); return nbytes; @@ -1358,7 +1350,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); } -- cgit v1.2.3 From 98b0d890220d45418cfbc5157b3382e6da5a12ab Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Jan 2022 14:46:56 +0100 Subject: sched/pelt: Relax the sync of util_sum with util_avg Rick reported performance regressions in bugzilla because of cpu frequency being lower than before: https://bugzilla.kernel.org/show_bug.cgi?id=215045 He bisected the problem to: commit 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent") This commit forces util_sum to be synced with the new util_avg after removing the contribution of a task and before the next periodic sync. By doing so util_sum is rounded to its lower bound and might lost up to LOAD_AVG_MAX-1 of accumulated contribution which has not yet been reflected in util_avg. Instead of always setting util_sum to the low bound of util_avg, which can significantly lower the utilization of root cfs_rq after propagating the change down into the hierarchy, we revert the change of util_sum and propagate the difference. In addition, we also check that cfs's util_sum always stays above the lower bound for a given util_avg as it has been observed that sched_entity's util_sum is sometimes above cfs one. Fixes: 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent") Reported-by: Rick Yiu Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-2-vincent.guittot@linaro.org --- kernel/sched/fair.c | 16 +++++++++++++--- kernel/sched/pelt.h | 4 +++- 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 095b0aa378df..d8f068d72b9d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3381,7 +3381,6 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } - /* * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to * propagate its contribution. The key to this propagation is the invariant @@ -3449,7 +3448,6 @@ void set_task_rq_fair(struct sched_entity *se, * XXX: only do this for the part of runnable > running ? * */ - static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { @@ -3681,7 +3679,19 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_util; sub_positive(&sa->util_avg, r); - sa->util_sum = sa->util_avg * divider; + sub_positive(&sa->util_sum, r * divider); + /* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * Check that util_sum is still above its lower bound for the new + * util_avg. Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ + sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); r = removed_runnable; sub_positive(&sa->runnable_avg, r); diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index e06071bf3472..c336f5f481bc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -37,9 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif +#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024) + static inline u32 get_pelt_divider(struct sched_avg *avg) { - return LOAD_AVG_MAX - 1024 + avg->period_contrib; + return PELT_MIN_DIVIDER + avg->period_contrib; } static inline void cfs_se_util_change(struct sched_avg *avg) -- cgit v1.2.3 From 7ceb77103001544a43e11d7f3a8a69a2c1f422cf Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Jan 2022 14:46:57 +0100 Subject: sched/pelt: Continue to relax the sync of util_sum with util_avg Rick reported performance regressions in bugzilla because of cpu frequency being lower than before: https://bugzilla.kernel.org/show_bug.cgi?id=215045 He bisected the problem to: commit 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent") This commit forces util_sum to be synced with the new util_avg after removing the contribution of a task and before the next periodic sync. By doing so util_sum is rounded to its lower bound and might lost up to LOAD_AVG_MAX-1 of accumulated contribution which has not yet been reflected in util_avg. update_tg_cfs_util() is not the only place where we round util_sum and lost some accumulated contributions that are not already reflected in util_avg. Modify update_tg_cfs_util() and detach_entity_load_avg() to not sync util_sum with the new util_avg. Instead of always setting util_sum to the low bound of util_avg, which can significantly lower the utilization, we propagate the difference. In addition, we also check that cfs's util_sum always stays above the lower bound for a given util_avg as it has been observed that sched_entity's util_sum is sometimes above cfs one. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-3-vincent.guittot@linaro.org --- kernel/sched/fair.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d8f068d72b9d..ad2809cc57c7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3451,11 +3451,11 @@ void set_task_rq_fair(struct sched_entity *se, static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3464,13 +3464,20 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * divider; + new_sum = se->avg.util_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.util_sum; + se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + add_positive(&cfs_rq->avg.util_avg, delta_avg); + add_positive(&cfs_rq->avg.util_sum, delta_sum); + + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); } static inline void @@ -3790,7 +3797,11 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; -- cgit v1.2.3 From 95246d1ec80b8d19d882cd8eb7ad094e63b41bb8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Jan 2022 14:46:58 +0100 Subject: sched/pelt: Relax the sync of runnable_sum with runnable_avg Similarly to util_avg and util_sum, don't sync runnable_sum with the low bound of runnable_avg but only ensure that runnable_sum stays in the correct range. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-4-vincent.guittot@linaro.org --- kernel/sched/fair.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ad2809cc57c7..0e87e1916504 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3483,11 +3483,11 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3498,11 +3498,16 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * divider; + new_sum = se->avg.runnable_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.runnable_sum; + se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + add_positive(&cfs_rq->avg.runnable_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); } static inline void @@ -3702,7 +3707,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_runnable; sub_positive(&sa->runnable_avg, r); - sa->runnable_sum = sa->runnable_avg * divider; + sub_positive(&sa->runnable_sum, r * divider); + /* See sa->util_sum above */ + sa->runnable_sum = max_t(u32, sa->runnable_sum, + sa->runnable_avg * PELT_MIN_DIVIDER); /* * removed_runnable is the unweighted version of removed_load so we @@ -3789,12 +3797,6 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = get_pelt_divider(&cfs_rq->avg); - dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); @@ -3803,7 +3805,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); -- cgit v1.2.3 From 2d02fa8cc21a93da35cfba462bf8ab87bf2db651 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Jan 2022 14:46:59 +0100 Subject: sched/pelt: Relax the sync of load_sum with load_avg Similarly to util_avg and util_sum, don't sync load_sum with the low bound of load_avg but only ensure that load_sum stays in the correct range. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-5-vincent.guittot@linaro.org --- kernel/sched/fair.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0e87e1916504..f4f02c2cff87 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3028,9 +3028,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u32 divider = get_pelt_divider(&se->avg); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } #else static inline void @@ -3513,9 +3515,10 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf static inline void update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; unsigned long load_avg; u64 load_sum = 0; + s64 delta_sum; u32 divider; if (!runnable_sum) @@ -3542,7 +3545,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * assuming all tasks are equally runnable. */ if (scale_load_down(gcfs_rq->load.weight)) { - load_sum = div_s64(gcfs_rq->avg.load_sum, + load_sum = div_u64(gcfs_rq->avg.load_sum, scale_load_down(gcfs_rq->load.weight)); } @@ -3559,19 +3562,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); - load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, divider); - - se->avg.load_sum = runnable_sum; + load_sum = se_weight(se) * runnable_sum; + load_avg = div_u64(load_sum, divider); - delta = load_avg - se->avg.load_avg; - if (!delta) + delta_avg = load_avg - se->avg.load_avg; + if (!delta_avg) return; - se->avg.load_avg = load_avg; + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; - add_positive(&cfs_rq->avg.load_avg, delta); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + se->avg.load_sum = runnable_sum; + se->avg.load_avg = load_avg; + add_positive(&cfs_rq->avg.load_avg, delta_avg); + add_positive(&cfs_rq->avg.load_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3687,7 +3693,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_load; sub_positive(&sa->load_avg, r); - sa->load_sum = sa->load_avg * divider; + sub_positive(&sa->load_sum, r * divider); + /* See sa->util_sum below */ + sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); r = removed_util; sub_positive(&sa->util_avg, r); -- cgit v1.2.3 From b171501f258063f5c56dd2c5fdf310802d8d7dc1 Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Tue, 11 Jan 2022 17:55:59 +0800 Subject: sched/core: Accounting forceidle time for all tasks except idle task There are two types of forced idle time: forced idle time from cookie'd task and forced idle time form uncookie'd task. The forced idle time from uncookie'd task is actually caused by the cookie'd task in runqueue indirectly, and it's more accurate to measure the capacity loss with the sum of both. Assuming cpu x and cpu y are a pair of SMT siblings, consider the following scenarios: 1.There's a cookie'd task running on cpu x, and there're 4 uncookie'd tasks running on cpu y. For cpu x, there will be 80% forced idle time (from uncookie'd task); for cpu y, there will be 20% forced idle time (from cookie'd task). 2.There's a uncookie'd task running on cpu x, and there're 4 cookie'd tasks running on cpu y. For cpu x, there will be 80% forced idle time (from cookie'd task); for cpu y, there will be 20% forced idle time (from uncookie'd task). The scenario1 can recurrent by stress-ng(scenario2 can recurrent similary): (cookie'd)taskset -c x stress-ng -c 1 -l 100 (uncookie'd)taskset -c y stress-ng -c 4 -l 100 In the above two scenarios, the total capacity loss is 1 cpu, but in scenario1, the cookie'd forced idle time tells us 20% cpu capacity loss, in scenario2, the cookie'd forced idle time tells us 80% cpu capacity loss, which are not accurate. It'll be more accurate to measure with cookie'd forced idle time and uncookie'd forced idle time. Signed-off-by: Cruz Zhao Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josh Don Link: https://lore.kernel.org/r/1641894961-9241-2-git-send-email-CruzZhao@linux.alibaba.com --- kernel/sched/core.c | 3 +-- kernel/sched/core_sched.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83872f95a1ea..0d2ab2a2f9fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5822,8 +5822,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } if (schedstat_enabled() && rq->core->core_forceidle_count) { - if (cookie) - rq->core->core_forceidle_start = rq_clock(rq->core); + rq->core->core_forceidle_start = rq_clock(rq->core); rq->core->core_forceidle_occupation = occ; } diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 1fb45672ec85..c8746a9a7ada 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -277,7 +277,7 @@ void __sched_core_account_forceidle(struct rq *rq) rq_i = cpu_rq(i); p = rq_i->core_pick ?: rq_i->curr; - if (!p->core_cookie) + if (p == rq_i->idle) continue; __schedstat_add(p->stats.core_forceidle_sum, delta); -- cgit v1.2.3 From a315da5e686b02b20c1713dda818e8fb691526bb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Dec 2021 21:59:00 -0800 Subject: sched/fair: Fix all kernel-doc warnings Quieten all kernel-doc warnings in kernel/sched/fair.c: kernel/sched/fair.c:3663: warning: No description found for return value of 'update_cfs_rq_load_avg' kernel/sched/fair.c:8601: warning: No description found for return value of 'asym_smt_can_pull_tasks' kernel/sched/fair.c:8673: warning: Function parameter or member 'sds' not described in 'update_sg_lb_stats' kernel/sched/fair.c:9483: warning: contents before sections Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ricardo Neri Acked-by: Vincent Guittot Link: https://lore.kernel.org/r/20211218055900.2704-1-rdunlap@infradead.org --- kernel/sched/fair.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f4f02c2cff87..5146163bfabb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3668,7 +3668,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed load. + * Return: true if the load decayed or we removed load. * * Since both these conditions indicate a changed cfs_rq->avg.load we should * call update_tg_load_avg() when this function returns true. @@ -8573,6 +8573,8 @@ group_type group_classify(unsigned int imbalance_pct, * * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings * of @dst_cpu are idle and @sg has lower priority. + * + * Return: true if @dst_cpu can pull tasks, false otherwise. */ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, @@ -8648,6 +8650,7 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. + * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. * @sg_status: Holds flag indicating the status of the sched_group @@ -9455,12 +9458,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. + * @env: The load balancing environment. * * Also calculates the amount of runnable load which should be moved * to restore balance. * - * @env: The load balancing environment. - * * Return: - The busiest group if imbalance exists. */ static struct sched_group *find_busiest_group(struct lb_env *env) -- cgit v1.2.3 From 7e406d1ff39b8ee574036418a5043c86723170cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Dec 2021 01:04:57 +0100 Subject: sched: Avoid double preemption in __cond_resched_*lock*() For PREEMPT/DYNAMIC_PREEMPT the *_unlock() will already trigger a preemption, no point in then calling preempt_schedule_common() *again*. Use _cond_resched() instead, since this is a NOP for the preemptible configs while it provide a preemption point for the others. Reported-by: xuhaifeng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YcGnvDEYBwOiV0cR@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d2ab2a2f9fe..56b428c8ea96 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8218,9 +8218,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; spin_lock(lock); @@ -8238,9 +8236,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { read_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; read_lock(lock); @@ -8258,9 +8254,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { write_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; write_lock(lock); -- cgit v1.2.3 From 809232619f5b15e31fb3563985e705454f32621f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 17 Jan 2022 15:30:10 -0500 Subject: sched/membarrier: Fix membarrier-rseq fence command missing from query bitmask The membarrier command MEMBARRIER_CMD_QUERY allows querying the available membarrier commands. When the membarrier-rseq fence commands were added, a new MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK was introduced with the intent to expose them with the MEMBARRIER_CMD_QUERY command, the but it was never added to MEMBARRIER_CMD_BITMASK. The membarrier-rseq fence commands are therefore not wired up with the query command. Rename MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK to MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK (the bitmask is not a command per-se), and change the erroneous MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK (which does not actually exist) to MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ. Wire up MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK in MEMBARRIER_CMD_BITMASK. Fixing this allows discovering availability of the membarrier-rseq fence feature. Fixes: 2a36ab717e8f ("rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ") Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Cc: # 5.10+ Link: https://lkml.kernel.org/r/20220117203010.30129-1-mathieu.desnoyers@efficios.com --- kernel/sched/membarrier.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index b5add64d9698..3d2825408e3a 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -147,11 +147,11 @@ #endif #ifdef CONFIG_RSEQ -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \ +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \ - | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK) + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ) #else -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 +#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 #endif #define MEMBARRIER_CMD_BITMASK \ @@ -159,7 +159,8 @@ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ - | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) + | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK) static void ipi_mb(void *info) { -- cgit v1.2.3 From 44585f7bc0cb01095bc2ad4258049c02bbad21ef Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 29 Jan 2022 13:41:20 -0800 Subject: psi: fix "defined but not used" warnings when CONFIG_PROC_FS=n When CONFIG_PROC_FS is disabled psi code generates the following warnings: kernel/sched/psi.c:1364:30: warning: 'psi_cpu_proc_ops' defined but not used [-Wunused-const-variable=] 1364 | static const struct proc_ops psi_cpu_proc_ops = { | ^~~~~~~~~~~~~~~~ kernel/sched/psi.c:1355:30: warning: 'psi_memory_proc_ops' defined but not used [-Wunused-const-variable=] 1355 | static const struct proc_ops psi_memory_proc_ops = { | ^~~~~~~~~~~~~~~~~~~ kernel/sched/psi.c:1346:30: warning: 'psi_io_proc_ops' defined but not used [-Wunused-const-variable=] 1346 | static const struct proc_ops psi_io_proc_ops = { | ^~~~~~~~~~~~~~~ Make definitions of these structures and related functions conditional on CONFIG_PROC_FS config. Link: https://lkml.kernel.org/r/20220119223940.787748-3-surenb@google.com Fixes: 0e94682b73bf ("psi: introduce psi monitor") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 79 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 38 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index c137c4d6983e..e14358178849 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1082,44 +1082,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -static int psi_io_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_IO); -} - -static int psi_memory_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_MEM); -} - -static int psi_cpu_show(struct seq_file *m, void *v) -{ - return psi_show(m, &psi_system, PSI_CPU); -} - -static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) -{ - if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return single_open(file, psi_show, NULL); -} - -static int psi_io_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_io_show); -} - -static int psi_memory_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_memory_show); -} - -static int psi_cpu_open(struct inode *inode, struct file *file) -{ - return psi_open(file, psi_cpu_show); -} - struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res) { @@ -1278,6 +1240,45 @@ __poll_t psi_trigger_poll(void **trigger_ptr, return ret; } +#ifdef CONFIG_PROC_FS +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *)) +{ + if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + return single_open(file, psi_show, NULL); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_io_show); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_memory_show); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_cpu_show); +} + static ssize_t psi_write(struct file *file, const char __user *user_buf, size_t nbytes, enum psi_res res) { @@ -1392,3 +1393,5 @@ static int __init psi_proc_init(void) return 0; } module_init(psi_proc_init); + +#endif /* CONFIG_PROC_FS */ -- cgit v1.2.3 From 13765de8148f71fa795e0a6607de37c49ea5915a Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Thu, 3 Feb 2022 08:18:46 -0800 Subject: sched/fair: Fix fault in reweight_entity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Syzbot found a GPF in reweight_entity. This has been bisected to commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") There is a race between sched_post_fork() and setpriority(PRIO_PGRP) within a thread group that causes a null-ptr-deref in reweight_entity() in CFS. The scenario is that the main process spawns number of new threads, which then call setpriority(PRIO_PGRP, 0, -20), wait, and exit. For each of the new threads the copy_process() gets invoked, which adds the new task_struct and calls sched_post_fork() for it. In the above scenario there is a possibility that setpriority(PRIO_PGRP) and set_one_prio() will be called for a thread in the group that is just being created by copy_process(), and for which the sched_post_fork() has not been executed yet. This will trigger a null pointer dereference in reweight_entity(), as it will try to access the run queue pointer, which hasn't been set. Before the mentioned change the cfs_rq pointer for the task has been set in sched_fork(), which is called much earlier in copy_process(), before the new task is added to the thread_group. Now it is done in the sched_post_fork(), which is called after that. To fix the issue the remove the update_load param from the update_load param() function and call reweight_task() only if the task flag doesn't have the TASK_NEW flag set. Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: syzbot+af7a719bc92395ee41b3@syzkaller.appspotmail.com Signed-off-by: Tadeusz Struk Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20220203161846.1160750-1-tadeusz.struk@linaro.org --- kernel/sched/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 848eaa0efe0e..fcf0c180617c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1214,8 +1214,9 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p, bool update_load) +static void set_load_weight(struct task_struct *p) { + bool update_load = !(READ_ONCE(p->__state) & TASK_NEW); int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; @@ -4406,7 +4407,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = p->static_prio; - set_load_weight(p, false); + set_load_weight(p); /* * We don't need the reset flag anymore after the fork. It has @@ -6921,7 +6922,7 @@ void set_user_nice(struct task_struct *p, long nice) put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); + set_load_weight(p); old_prio = p->prio; p->prio = effective_prio(p); @@ -7212,7 +7213,7 @@ static void __setscheduler_params(struct task_struct *p, */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p, true); + set_load_weight(p); } /* @@ -9445,7 +9446,7 @@ void __init sched_init(void) #endif } - set_load_weight(&init_task, false); + set_load_weight(&init_task); /* * The boot idle thread does lazy MMU switching as well: -- cgit v1.2.3 From b1e8206582f9d680cff7d04828708c8b6ab32957 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Feb 2022 10:16:57 +0100 Subject: sched: Fix yet more sched_fork() races Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") fixed a fork race vs cgroup, it opened up a race vs syscalls by not placing the task on the runqueue before it gets exposed through the pidhash. Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is trying to fix a single instance of this, instead fix the whole class of issues, effectively reverting this commit. Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tadeusz Struk Tested-by: Zhang Qiao Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fcf0c180617c..9745613d531c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1214,9 +1214,8 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p) +static void set_load_weight(struct task_struct *p, bool update_load) { - bool update_load = !(READ_ONCE(p->__state) & TASK_NEW); int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; @@ -4407,7 +4406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = p->static_prio; - set_load_weight(p); + set_load_weight(p, false); /* * We don't need the reset flag anymore after the fork. It has @@ -4425,6 +4424,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); + #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -4440,18 +4440,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; -#ifdef CONFIG_CGROUP_SCHED - struct task_group *tg; -#endif + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. + */ raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED - tg = container_of(kargs->cset->subsys[cpu_cgrp_id], - struct task_group, css); - p->sched_task_group = autogroup_task_group(p, tg); + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } #endif rseq_migrate(p); /* @@ -4462,7 +4467,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} +void sched_post_fork(struct task_struct *p) +{ uclamp_post_fork(p); } @@ -6922,7 +6930,7 @@ void set_user_nice(struct task_struct *p, long nice) put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); + set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); @@ -7213,7 +7221,7 @@ static void __setscheduler_params(struct task_struct *p, */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p); + set_load_weight(p, true); } /* @@ -9446,7 +9454,7 @@ void __init sched_init(void) #endif } - set_load_weight(&init_task); + set_load_weight(&init_task, false); /* * The boot idle thread does lazy MMU switching as well: -- cgit v1.2.3