From d9c0ffcabd6aae7ff1e34e8078354c13bb9f1183 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jun 2018 18:29:41 +0200 Subject: sched/nohz: Skip remote tick on idle task entirely Some people have reported that the warning in sched_tick_remote() occasionally triggers, especially in favour of some RCU-Torture pressure: WARNING: CPU: 11 PID: 906 at kernel/sched/core.c:3138 sched_tick_remote+0xb6/0xc0 Modules linked in: CPU: 11 PID: 906 Comm: kworker/u32:3 Not tainted 4.18.0-rc2+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 Workqueue: events_unbound sched_tick_remote RIP: 0010:sched_tick_remote+0xb6/0xc0 Code: e8 0f 06 b8 00 c6 03 00 fb eb 9d 8b 43 04 85 c0 75 8d 48 8b 83 e0 0a 00 00 48 85 c0 75 81 eb 88 48 89 df e8 bc fe ff ff eb aa <0f> 0b eb +c5 66 0f 1f 44 00 00 bf 17 00 00 00 e8 b6 2e fe ff 0f b6 Call Trace: process_one_work+0x1df/0x3b0 worker_thread+0x44/0x3d0 kthread+0xf3/0x130 ? set_worker_desc+0xb0/0xb0 ? kthread_create_worker_on_cpu+0x70/0x70 ret_from_fork+0x35/0x40 This happens when the remote tick applies on an idle task. Usually the idle_cpu() check avoids that, but it is performed before we lock the runqueue and it is therefore racy. It was intended to be that way in order to prevent from useless runqueue locks since idle task tick callback is a no-op. Now if the racy check slips out of our hands and we end up remotely ticking an idle task, the empty task_tick_idle() is harmless. Still it won't pass the WARN_ON_ONCE() test that ensures rq_clock_task() is not too far from curr->se.exec_start because update_curr_idle() doesn't update the exec_start value like other scheduler policies. Hence the reported false positive. So let's have another check, while the rq is locked, to make sure we don't remote tick on an idle task. The lockless idle_cpu() still applies to avoid unecessary rq lock contention. Reported-by: Jacek Tomaka Reported-by: Paul E. McKenney Reported-by: Anna-Maria Gleixner Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1530203381-31234-1-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78d8facba456..22fce36426c0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3113,7 +3113,9 @@ static void sched_tick_remote(struct work_struct *work) struct tick_work *twork = container_of(dwork, struct tick_work, work); int cpu = twork->cpu; struct rq *rq = cpu_rq(cpu); + struct task_struct *curr; struct rq_flags rf; + u64 delta; /* * Handle the tick only if it appears the remote CPU is running in full @@ -3122,24 +3124,28 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { - struct task_struct *curr; - u64 delta; + if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) + goto out_requeue; - rq_lock_irq(rq, &rf); - update_rq_clock(rq); - curr = rq->curr; - delta = rq_clock_task(rq) - curr->se.exec_start; + rq_lock_irq(rq, &rf); + curr = rq->curr; + if (is_idle_task(curr)) + goto out_unlock; - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); - curr->sched_class->task_tick(rq, curr, 0); - rq_unlock_irq(rq, &rf); - } + update_rq_clock(rq); + delta = rq_clock_task(rq) - curr->se.exec_start; + + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + curr->sched_class->task_tick(rq, curr, 0); + +out_unlock: + rq_unlock_irq(rq, &rf); +out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough -- cgit v1.2.3 From 296b2ffe7fa9ed756c41415c6b1512bc4ad687b1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Jun 2018 15:53:22 +0200 Subject: sched/rt: Fix call to cpufreq_update_util() With commit: 8f111bc357aa ("cpufreq/schedutil: Rewrite CPUFREQ_RT support") the schedutil governor uses rq->rt.rt_nr_running to detect whether an RT task is currently running on the CPU and to set frequency to max if necessary. cpufreq_update_util() is called in enqueue/dequeue_top_rt_rq() but rq->rt.rt_nr_running has not been updated yet when dequeue_top_rt_rq() is called so schedutil still considers that an RT task is running when the last task is dequeued. The update of rq->rt.rt_nr_running happens later in dequeue_rt_stack(). In fact, we can take advantage of the sequence that the dequeue then re-enqueue rt entities when a rt task is enqueued or dequeued; As a result enqueue_top_rt_rq() is always called when a task is enqueued or dequeued and also when groups are throttled or unthrottled. The only place that not use enqueue_top_rt_rq() is when root rt_rq is throttled. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efault@gmx.de Cc: juri.lelli@redhat.com Cc: patrick.bellasi@arm.com Cc: viresh.kumar@linaro.org Fixes: 8f111bc357aa ('cpufreq/schedutil: Rewrite CPUFREQ_RT support') Link: http://lkml.kernel.org/r/1530021202-21695-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/rt.c | 16 ++++++++++------ kernel/sched/sched.h | 5 +++++ 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3cde46483f0a..c907fde01eaa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - if (rq->rt.rt_nr_running) + if (rt_rq_is_runnable(&rq->rt)) return sg_cpu->max; /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 47556b0c9a95..572567078b60 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; - if (!rt_se) + if (!rt_se) { dequeue_top_rt_rq(rt_rq); + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_rt_rq(rt_rq), 0); + } else if (on_rt_rq(rt_se)) dequeue_rt_entity(rt_se, 0); } @@ -1001,8 +1004,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) sub_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 0; - /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, 0); } static void @@ -1014,11 +1015,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) if (rt_rq->rt_queued) return; - if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) + + if (rt_rq_throttled(rt_rq)) return; - add_nr_running(rq, rt_rq->rt_nr_running); - rt_rq->rt_queued = 1; + if (rt_rq->rt_nr_running) { + add_nr_running(rq, rt_rq->rt_nr_running); + rt_rq->rt_queued = 1; + } /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq, 0); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6601baf2361c..27ddec334601 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -609,6 +609,11 @@ struct rt_rq { #endif }; +static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq) +{ + return rt_rq->rt_queued && rt_rq->rt_nr_running; +} + /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ -- cgit v1.2.3 From 512ac999d2755d2b7109e996a76b6fb8b888631d Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jun 2018 18:18:33 +0800 Subject: sched/fair: Fix bandwidth timer clock drift condition I noticed that cgroup task groups constantly get throttled even if they have low CPU usage, this causes some jitters on the response time to some of our business containers when enabling CPU quotas. It's very simple to reproduce: mkdir /sys/fs/cgroup/cpu/test cd /sys/fs/cgroup/cpu/test echo 100000 > cpu.cfs_quota_us echo $$ > tasks then repeat: cat cpu.stat | grep nr_throttled # nr_throttled will increase steadily After some analysis, we found that cfs_rq::runtime_remaining will be cleared by expire_cfs_rq_runtime() due to two equal but stale "cfs_{b|q}->runtime_expires" after period timer is re-armed. The current condition to judge clock drift in expire_cfs_rq_runtime() is wrong, the two runtime_expires are actually the same when clock drift happens, so this condtion can never hit. The orginal design was correctly done by this commit: a9cf55b28610 ("sched: Expire invalid runtime") ... but was changed to be the current implementation due to its locking bug. This patch introduces another way, it adds a new field in both structures cfs_rq and cfs_bandwidth to record the expiration update sequence, and uses them to figure out if clock drift happens (true if they are equal). Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period") Link: http://lkml.kernel.org/r/20180620101834.24455-1-xlpang@linux.alibaba.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 ++++++++------ kernel/sched/sched.h | 6 ++++-- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1866e64792a7..791707c56886 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4590,6 +4590,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); + cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4612,6 +4613,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); u64 amount = 0, min_amount, expires; + int expires_seq; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4628,6 +4630,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } + expires_seq = cfs_b->expires_seq; expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -4637,8 +4640,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) * spread between our sched_clock and the one on which runtime was * issued. */ - if ((s64)(expires - cfs_rq->runtime_expires) > 0) + if (cfs_rq->expires_seq != expires_seq) { + cfs_rq->expires_seq = expires_seq; cfs_rq->runtime_expires = expires; + } return cfs_rq->runtime_remaining > 0; } @@ -4664,12 +4669,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. It is valid to compare - * cfs_b->runtime_expires without any locks since we only care about - * exact equality, so a partial write will still work. + * whether the global deadline(cfs_b->expires_seq) has advanced. */ - - if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { + if (cfs_rq->expires_seq == cfs_b->expires_seq) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 27ddec334601..c7742dcc136c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -334,9 +334,10 @@ struct cfs_bandwidth { u64 runtime; s64 hierarchical_quota; u64 runtime_expires; + int expires_seq; - int idle; - int period_active; + short idle; + short period_active; struct hrtimer period_timer; struct hrtimer slack_timer; struct list_head throttled_cfs_rq; @@ -551,6 +552,7 @@ struct cfs_rq { #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; + int expires_seq; u64 runtime_expires; s64 runtime_remaining; -- cgit v1.2.3 From f1d1be8aee6c461652aea8f58bedebaa73d7f4d3 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jun 2018 18:18:34 +0800 Subject: sched/fair: Advance global expiration when period timer is restarted When period gets restarted after some idle time, start_cfs_bandwidth() doesn't update the expiration information, expire_cfs_rq_runtime() will see cfs_rq->runtime_expires smaller than rq clock and go to the clock drift logic, wasting needless CPU cycles on the scheduler hot path. Update the global expiration in start_cfs_bandwidth() to avoid frequent expire_cfs_rq_runtime() calls once a new period begins. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180620101834.24455-2-xlpang@linux.alibaba.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 791707c56886..840b92ee6f89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5204,13 +5204,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + u64 overrun; + lockdep_assert_held(&cfs_b->lock); - if (!cfs_b->period_active) { - cfs_b->period_active = 1; - hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); - hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); - } + if (cfs_b->period_active) + return; + + cfs_b->period_active = 1; + overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); + cfs_b->expires_seq++; + hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -- cgit v1.2.3 From 3482d98bbc730758b63a5d1cf41d05ea17481412 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 14 Jun 2018 12:33:00 +0200 Subject: sched/util_est: Fix util_est_dequeue() for throttled cfs_rq When a cfs_rq is throttled, parent cfs_rq->nr_running is decreased and everything happens at cfs_rq level. Currently util_est stays unchanged in such case and it keeps accounting the utilization of throttled tasks. This can somewhat make sense as we don't dequeue tasks but only throttled cfs_rq. If a task of another group is enqueued/dequeued and root cfs_rq becomes idle during the dequeue, util_est will be cleared whereas it was accounting util_est of throttled tasks before. So the behavior of util_est is not always the same regarding throttled tasks and depends of side activity. Furthermore, util_est will not be updated when the cfs_rq is unthrottled as everything happens at cfs_rq level. Main results is that util_est will stay null whereas we now have running tasks. We have to wait for the next dequeue/enqueue of the previously throttled tasks to get an up to date util_est. Remove the assumption that cfs_rq's estimated utilization of a CPU is 0 if there is no running task so the util_est of a task remains until the latter is dequeued even if its cfs_rq has been throttled. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Patrick Bellasi Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 7f65ea42eb00 ("sched/fair: Add util_est on top of PELT") Link: http://lkml.kernel.org/r/1528972380-16268-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 840b92ee6f89..2f0a0be4d344 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) if (!sched_feat(UTIL_EST)) return; - /* - * Update root cfs_rq's estimated utilization - * - * If *p is the last task then the root cfs_rq's estimated utilization - * of a CPU is 0 by definition. - */ - ue.enqueued = 0; - if (cfs_rq->nr_running) { - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, - (_task_util_est(p) | UTIL_AVG_UNCHANGED)); - } + /* Update root cfs_rq's estimated utilization */ + ue.enqueued = cfs_rq->avg.util_est.enqueued; + ue.enqueued -= min_t(unsigned int, ue.enqueued, + (_task_util_est(p) | UTIL_AVG_UNCHANGED)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); /* -- cgit v1.2.3 From 1cef1150ef40ec52f507436a14230cbc2623299c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Jun 2018 11:45:49 +0200 Subject: kthread, sched/core: Fix kthread_parkme() (again...) Gaurav reports that commit: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue") isn't working for him. Because of the following race: > controller Thread CPUHP Thread > takedown_cpu > kthread_park > kthread_parkme > Set KTHREAD_SHOULD_PARK > smpboot_thread_fn > set Task interruptible > > > wake_up_process > if (!(p->state & state)) > goto out; > > Kthread_parkme > SET TASK_PARKED > schedule > raw_spin_lock(&rq->lock) > ttwu_remote > waiting for __task_rq_lock > context_switch > > finish_lock_switch > > > > Case TASK_PARKED > kthread_park_complete > > > SET Running Furthermore, Oleg noticed that the whole scheduler TASK_PARKED handling is buggered because the TASK_DEAD thing is done with preemption disabled, the current code can still complete early on preemption :/ So basically revert that earlier fix and go with a variant of the alternative mentioned in the commit. Promote TASK_PARKED to special state to avoid the store-store issue on task->state leading to the WARN in kthread_unpark() -> __kthread_bind(). But in addition, add wait_task_inactive() to kthread_park() to ensure the task really is PARKED when we return from kthread_park(). This avoids the whole kthread still gets migrated nonsense -- although it would be really good to get this done differently. Reported-by: Gaurav Kohli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue") Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 22fce36426c0..fe365c9a08e9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,7 +7,6 @@ */ #include "sched.h" -#include #include #include @@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } - if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { - switch (prev_state) { - case TASK_DEAD: - if (prev->sched_class->task_dead) - prev->sched_class->task_dead(prev); + if (unlikely(prev_state == TASK_DEAD)) { + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - - /* Task is done with its stack. */ - put_task_stack(prev); + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); - put_task_struct(prev); - break; + /* Task is done with its stack. */ + put_task_stack(prev); - case TASK_PARKED: - kthread_park_complete(prev); - break; - } + put_task_struct(prev); } tick_nohz_task_switch(); -- cgit v1.2.3 From e117cb52bdb4d376b711bee34af6434c9e314b3b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 11 Jul 2018 09:29:48 +0200 Subject: sched/deadline: Fix switched_from_dl() warning Mark noticed that syzkaller is able to reliably trigger the following warning: dl_rq->running_bw > dl_rq->this_bw WARNING: CPU: 1 PID: 153 at kernel/sched/deadline.c:124 switched_from_dl+0x454/0x608 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 153 Comm: syz-executor253 Not tainted 4.18.0-rc3+ #29 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x458 show_stack+0x20/0x30 dump_stack+0x180/0x250 panic+0x2dc/0x4ec __warn_printk+0x0/0x150 report_bug+0x228/0x2d8 bug_handler+0xa0/0x1a0 brk_handler+0x2f0/0x568 do_debug_exception+0x1bc/0x5d0 el1_dbg+0x18/0x78 switched_from_dl+0x454/0x608 __sched_setscheduler+0x8cc/0x2018 sys_sched_setattr+0x340/0x758 el0_svc_naked+0x30/0x34 syzkaller reproducer runs a bunch of threads that constantly switch between DEADLINE and NORMAL classes while interacting through futexes. The splat above is caused by the fact that if a DEADLINE task is setattr back to NORMAL while in non_contending state (blocked on a futex - inactive timer armed), its contribution to running_bw is not removed before sub_rq_bw() gets called (!task_on_rq_queued() branch) and the latter sees running_bw > this_bw. Fix it by removing a task contribution from running_bw if the task is not queued and in non_contending state while switched to a different class. Reported-by: Mark Rutland Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Reviewed-by: Luca Abeni Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20180711072948.27061-1-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fbfc3f1d368a..10c7b51c0d1f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2290,8 +2290,17 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && p->dl.dl_runtime) task_non_contending(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + /* + * Inactive timer is armed. However, p is leaving DEADLINE and + * might migrate away from this rq while continuing to run on + * some other class. We need to remove its contribution from + * this rq running_bw now, or sub_rq_bw (below) will complain. + */ + if (p->dl.dl_non_contending) + sub_running_bw(&p->dl, &rq->dl); sub_rq_bw(&p->dl, &rq->dl); + } /* * We cannot use inactive_task_timer() to invoke sub_running_bw() -- cgit v1.2.3 From 6cd0c583b04b2bd9415e07b51b63ab799949dd66 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 23 Jul 2018 12:19:07 +0800 Subject: sched/topology: Check variable group before dereferencing it The 'group' variable in sched_domain_debug_one() is not checked when firstly used in cpumask_test_cpu(cpu, sched_group_span(group)), but it might be NULL (it is checked later in the following while loop) and may cause NULL pointer dereference. We need to check it before using to avoid NULL dereference. Signed-off-by: Yi Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Biao Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1532319547-33335-1-git-send-email-wang.yi59@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 05a831427bc7..56a0fed30c0a 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -47,7 +47,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); } - if (!cpumask_test_cpu(cpu, sched_group_span(group))) { + if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } -- cgit v1.2.3 From 840d719604b0925ca23dde95f1767e4528668369 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 20 Jul 2018 11:16:30 +0200 Subject: sched/deadline: Update rq_clock of later_rq when pushing a task Daniel Casini got this warn while running a DL task here at RetisLab: [ 461.137582] ------------[ cut here ]------------ [ 461.137583] rq->clock_update_flags < RQCF_ACT_SKIP [ 461.137599] WARNING: CPU: 4 PID: 2354 at kernel/sched/sched.h:967 assert_clock_updated.isra.32.part.33+0x17/0x20 [a ton of modules] [ 461.137646] CPU: 4 PID: 2354 Comm: label_image Not tainted 4.18.0-rc4+ #3 [ 461.137647] Hardware name: ASUS All Series/Z87-K, BIOS 0801 09/02/2013 [ 461.137649] RIP: 0010:assert_clock_updated.isra.32.part.33+0x17/0x20 [ 461.137649] Code: ff 48 89 83 08 09 00 00 eb c6 66 0f 1f 84 00 00 00 00 00 55 48 c7 c7 98 7a 6c a5 c6 05 bc 0d 54 01 01 48 89 e5 e8 a9 84 fb ff <0f> 0b 5d c3 0f 1f 44 00 00 0f 1f 44 00 00 83 7e 60 01 74 0a 48 3b [ 461.137673] RSP: 0018:ffffa77e08cafc68 EFLAGS: 00010082 [ 461.137674] RAX: 0000000000000000 RBX: ffff8b3fc1702d80 RCX: 0000000000000006 [ 461.137674] RDX: 0000000000000007 RSI: 0000000000000096 RDI: ffff8b3fded164b0 [ 461.137675] RBP: ffffa77e08cafc68 R08: 0000000000000026 R09: 0000000000000339 [ 461.137676] R10: ffff8b3fd060d410 R11: 0000000000000026 R12: ffffffffa4e14e20 [ 461.137677] R13: ffff8b3fdec22940 R14: ffff8b3fc1702da0 R15: ffff8b3fdec22940 [ 461.137678] FS: 00007efe43ee5700(0000) GS:ffff8b3fded00000(0000) knlGS:0000000000000000 [ 461.137679] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 461.137680] CR2: 00007efe30000010 CR3: 0000000301744003 CR4: 00000000001606e0 [ 461.137680] Call Trace: [ 461.137684] push_dl_task.part.46+0x3bc/0x460 [ 461.137686] task_woken_dl+0x60/0x80 [ 461.137689] ttwu_do_wakeup+0x4f/0x150 [ 461.137690] ttwu_do_activate+0x77/0x80 [ 461.137692] try_to_wake_up+0x1d6/0x4c0 [ 461.137693] wake_up_q+0x32/0x70 [ 461.137696] do_futex+0x7e7/0xb50 [ 461.137698] __x64_sys_futex+0x8b/0x180 [ 461.137701] do_syscall_64+0x5a/0x110 [ 461.137703] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 461.137705] RIP: 0033:0x7efe4918ca26 [ 461.137705] Code: 00 00 00 74 17 49 8b 48 20 44 8b 59 10 41 83 e3 30 41 83 fb 20 74 1e be 85 00 00 00 41 ba 01 00 00 00 41 b9 01 00 00 04 0f 05 <48> 3d 01 f0 ff ff 73 1f 31 c0 c3 be 8c 00 00 00 49 89 c8 4d 31 d2 [ 461.137738] RSP: 002b:00007efe43ee4928 EFLAGS: 00000283 ORIG_RAX: 00000000000000ca [ 461.137739] RAX: ffffffffffffffda RBX: 0000000005094df0 RCX: 00007efe4918ca26 [ 461.137740] RDX: 0000000000000001 RSI: 0000000000000085 RDI: 0000000005094e24 [ 461.137741] RBP: 00007efe43ee49c0 R08: 0000000005094e20 R09: 0000000004000001 [ 461.137741] R10: 0000000000000001 R11: 0000000000000283 R12: 0000000000000000 [ 461.137742] R13: 0000000005094df8 R14: 0000000000000001 R15: 0000000000448a10 [ 461.137743] ---[ end trace 187df4cad2bf7649 ]--- This warning happened in the push_dl_task(), because __add_running_bw()->cpufreq_update_util() is getting the rq_clock of the later_rq before its update, which takes place at activate_task(). The fix then is to update the rq_clock before calling add_running_bw(). To avoid double rq_clock_update() call, we set ENQUEUE_NOCLOCK flag to activate_task(). Reported-by: Daniel Casini Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Clark Williams Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Fixes: e0367b12674b sched/deadline: Move CPU frequency selection triggering points Link: http://lkml.kernel.org/r/ca31d073a4788acf0684a8b255f14fea775ccf20.1532077269.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 10c7b51c0d1f..b5fbdde6afa9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2090,8 +2090,14 @@ retry: sub_rq_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); add_rq_bw(&next_task->dl, &later_rq->dl); + + /* + * Update the later_rq clock here, because the clock is used + * by the cpufreq_update_util() inside __add_running_bw(). + */ + update_rq_clock(later_rq); add_running_bw(&next_task->dl, &later_rq->dl); - activate_task(later_rq, next_task, 0); + activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); ret = 1; resched_curr(later_rq); -- cgit v1.2.3 From f3d133ee0a17d5694c6f21873eec9863e11fa423 Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Wed, 18 Jul 2018 08:46:55 +0800 Subject: sched/rt: Restore rt_runtime after disabling RT_RUNTIME_SHARE NO_RT_RUNTIME_SHARE feature is used to prevent a CPU borrow enough runtime with a spin-rt-task. However, if RT_RUNTIME_SHARE feature is enabled and rt_rq has borrowd enough rt_runtime at the beginning, rt_runtime can't be restored to its initial bandwidth rt_runtime after we disable RT_RUNTIME_SHARE. E.g. on my PC with 4 cores, procedure to reproduce: 1) Make sure RT_RUNTIME_SHARE is enabled cat /sys/kernel/debug/sched_features GENTLE_FAIR_SLEEPERS START_DEBIT NO_NEXT_BUDDY LAST_BUDDY CACHE_HOT_BUDDY WAKEUP_PREEMPTION NO_HRTICK NO_DOUBLE_TICK LB_BIAS NONTASK_CAPACITY TTWU_QUEUE NO_SIS_AVG_CPU SIS_PROP NO_WARN_DOUBLE_CLOCK RT_PUSH_IPI RT_RUNTIME_SHARE NO_LB_MIN ATTACH_AGE_LOAD WA_IDLE WA_WEIGHT WA_BIAS 2) Start a spin-rt-task ./loop_rr & 3) set affinity to the last cpu taskset -p 8 $pid_of_loop_rr 4) Observe that last cpu have borrowed enough runtime. cat /proc/sched_debug | grep rt_runtime .rt_runtime : 950.000000 .rt_runtime : 900.000000 .rt_runtime : 950.000000 .rt_runtime : 1000.000000 5) Disable RT_RUNTIME_SHARE echo NO_RT_RUNTIME_SHARE > /sys/kernel/debug/sched_features 6) Observe that rt_runtime can not been restored cat /proc/sched_debug | grep rt_runtime .rt_runtime : 950.000000 .rt_runtime : 900.000000 .rt_runtime : 950.000000 .rt_runtime : 1000.000000 This patch help to restore rt_runtime after we disable RT_RUNTIME_SHARE. Signed-off-by: Hailong Liu Signed-off-by: Jiang Biao Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1531874815-39357-1-git-send-email-liu.hailong6@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/sched') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 572567078b60..eaaec8364f96 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -836,6 +836,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) * can be time-consuming. Try to avoid it when possible. */ raw_spin_lock(&rt_rq->rt_runtime_lock); + if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF) + rt_rq->rt_runtime = rt_b->rt_runtime; skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; raw_spin_unlock(&rt_rq->rt_runtime_lock); if (skip) -- cgit v1.2.3