From de9b8f5dcbd94bfb1d249907a635f1fb1968e19c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2015 23:09:29 +0200 Subject: sched: Fix crash trying to dequeue/enqueue the idle thread Sasha reports that his virtual machine tries to schedule the idle thread since commit 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed() calling context"). Hit trace shows this happening from idle_thread_get()->init_idle(), which is the _second_ init_idle() invocation on that task_struct, the first being done through idle_init()->fork_idle(). (this code is insane...) Because we call init_idle() twice in a row, its ->sched_class == &idle_sched_class and ->on_rq = TASK_ON_RQ_QUEUED. This means do_set_cpus_allowed() think we're queued and will call dequeue_task(), which is implemented with BUG() for the idle class, seeing how dequeueing the idle task is a daft thing. Aside of the whole insanity of calling init_idle() _twice_, change the code to call set_cpus_allowed_common() instead as this is 'obviously' before the idle task gets ran etc.. Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed() calling context") Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97d276ff1edb..f0d043ec0182 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4927,7 +4927,15 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - do_set_cpus_allowed(idle, cpumask_of(cpu)); +#ifdef CONFIG_SMP + /* + * Its possible that init_idle() gets called multiple times on a task, + * in that case do_set_cpus_allowed() will not do the right thing. + * + * And since this is boot we can forgo the serialization. + */ + set_cpus_allowed_common(idle, cpumask_of(cpu)); +#endif /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the cpu isn't yet set to this cpu so the @@ -4944,7 +4952,7 @@ void init_idle(struct task_struct *idle, int cpu) rq->curr = rq->idle = idle; idle->on_rq = TASK_ON_RQ_QUEUED; -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP idle->on_cpu = 1; #endif raw_spin_unlock(&rq->lock); @@ -4959,7 +4967,7 @@ void init_idle(struct task_struct *idle, int cpu) idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif } -- cgit v1.2.3 From 00cc1633816de8c95f337608a1ea64e228faf771 Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Fri, 18 Sep 2015 11:27:45 +0200 Subject: sched: access local runqueue directly in single_task_running Commit 2ee507c47293 ("sched: Add function single_task_running to let a task check if it is the only task running on a cpu") referenced the current runqueue with the smp_processor_id. When CONFIG_DEBUG_PREEMPT is enabled, that is only allowed if preemption is disabled or the currrent task is bound to the local cpu (e.g. kernel worker). With commit f78195129963 ("kvm: add halt_poll_ns module parameter") KVM calls single_task_running. If CONFIG_DEBUG_PREEMPT is enabled that generates a lot of kernel messages. To avoid adding preemption in that cases, as it would limit the usefulness, we change single_task_running to access directly the cpu local runqueue. Cc: Tim Chen Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Cc: Fixes: 2ee507c472939db4b146d545352b8a7c79ef47f8 Signed-off-by: Dominik Dingel Signed-off-by: Paolo Bonzini --- kernel/sched/core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3595403921bd..4064f794ab8c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2666,13 +2666,20 @@ unsigned long nr_running(void) /* * Check if only the current task is running on the cpu. + * + * Caution: this function does not check that the caller has disabled + * preemption, thus the result might have a time-of-check-to-time-of-use + * race. The caller is responsible to use it correctly, for example: + * + * - from a non-preemptable section (of course) + * + * - from a thread that is bound to a single CPU + * + * - in a loop with very short iterations (e.g. a polling loop) */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->nr_running == 1) - return true; - else - return false; + return raw_rq()->nr_running == 1; } EXPORT_SYMBOL(single_task_running); -- cgit v1.2.3 From ac5be6b47e8bd25b62bed2c82cda7398999f59e9 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 22 Sep 2015 14:58:49 -0700 Subject: userfaultfd: revert "userfaultfd: waitqueue: add nr wake parameter to __wake_up_locked_key" This reverts commit 51360155eccb907ff8635bd10fc7de876408c2e0 and adapts fs/userfaultfd.c to use the old version of that function. It didn't look robust to call __wake_up_common with "nr == 1" when we absolutely require wakeall semantics, but we've full control of what we insert in the two waitqueue heads of the blocked userfaults. No exclusive waitqueue risks to be inserted into those two waitqueue heads so we can as well stick to "nr == 1" of the old code and we can rely purely on the fact no waitqueue inserted in one of the two waitqueue heads we must enforce as wakeall, has wait->flags WQ_FLAG_EXCLUSIVE set. Signed-off-by: Andrea Arcangeli Cc: Dr. David Alan Gilbert Cc: Michael Ellerman Cc: Shuah Khan Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/wait.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 272d9322bc5d..052e02672d12 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, - void *key) +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { - __wake_up_common(q, mode, nr, 0, key); + __wake_up_common(q, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, 1, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); -- cgit v1.2.3 From 95913d97914f44db2b81271c2e2ebd4d2ac2df83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Sep 2015 14:45:09 +0200 Subject: sched/core: Fix TASK_DEAD race in finish_task_switch() So the problem this patch is trying to address is as follows: CPU0 CPU1 context_switch(A, B) ttwu(A) LOCK A->pi_lock A->on_cpu == 0 finish_task_switch(A) prev_state = A->state <-. WMB | A->on_cpu = 0; | UNLOCK rq0->lock | | context_switch(C, A) `-- A->state = TASK_DEAD prev_state == TASK_DEAD put_task_struct(A) context_switch(A, C) finish_task_switch(A) A->state == TASK_DEAD put_task_struct(A) The argument being that the WMB will allow the load of A->state on CPU0 to cross over and observe CPU1's store of A->state, which will then result in a double-drop and use-after-free. Now the comment states (and this was true once upon a long time ago) that we need to observe A->state while holding rq->lock because that will order us against the wakeup; however the wakeup will not in fact acquire (that) rq->lock; it takes A->pi_lock these days. We can obviously fix this by upgrading the WMB to an MB, but that is expensive, so we'd rather avoid that. The alternative this patch takes is: smp_store_release(&A->on_cpu, 0), which avoids the MB on some archs, but not important ones like ARM. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: # v3.1+ Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: manfred@colorfullife.com Cc: will.deacon@arm.com Fixes: e4a52bcb9a18 ("sched: Remove rq->lock from the first half of ttwu()") Link: http://lkml.kernel.org/r/20150929124509.GG3816@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++----- kernel/sched/sched.h | 5 +++-- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 615953141951..10a8faa1b0d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2517,11 +2517,11 @@ static struct rq *finish_task_switch(struct task_struct *prev) * If a task dies, then it sets TASK_DEAD in tsk->state and calls * schedule one last time. The schedule call will never return, and * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul + * + * We must observe prev->state before clearing prev->on_cpu (in + * finish_lock_switch), otherwise a concurrent wakeup can get prev + * running on another CPU and we could rave with its RUNNING -> DEAD + * transition, resulting in a double drop. */ prev_state = prev->state; vtime_task_switch(prev); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 68cda117574c..6d2a119c7ad9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1078,9 +1078,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. + * + * Pairs with the control dependency and rmb in try_to_wake_up(). */ - smp_wmb(); - prev->on_cpu = 0; + smp_store_release(&prev->on_cpu, 0); #endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ -- cgit v1.2.3