From 8f9fbf092cd0ae31722b42c9abb427a87d55c18a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Oct 2014 21:51:08 +0200 Subject: sched: Fix the PREEMPT_ACTIVE check in __trace_sched_switch_state() task_preempt_count() has nothing to do with the actual preempt counter, thread_info->saved_preempt_count is only valid right after switch_to(). __trace_sched_switch_state() can use preempt_count(), prev is still the current task when trace_sched_switch() is called. Signed-off-by: Oleg Nesterov [ Added BUG_ON(). ] Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Mel Gorman Cc: Oleg Nesterov Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20141007195108.GB28002@redhat.com Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 0a68d5ae584e..30fedaf3e56a 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -97,16 +97,19 @@ static inline long __trace_sched_switch_state(struct task_struct *p) long state = p->state; #ifdef CONFIG_PREEMPT +#ifdef CONFIG_SCHED_DEBUG + BUG_ON(p != current); +#endif /* CONFIG_SCHED_DEBUG */ /* * For all intents and purposes a preempted task is a running task. */ - if (task_preempt_count(p) & PREEMPT_ACTIVE) + if (preempt_count() & PREEMPT_ACTIVE) state = TASK_RUNNING | TASK_STATE_MAX; -#endif +#endif /* CONFIG_PREEMPT */ return state; } -#endif +#endif /* CREATE_TRACE_POINTS */ /* * Tracepoint for task switches, performed by the scheduler: -- cgit v1.2.3 From e2336f6e51edda875a49770b616ed5b02a74665b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 8 Oct 2014 20:33:48 +0200 Subject: sched: Kill task_preempt_count() task_preempt_count() is pointless if preemption counter is per-cpu, currently this is x86 only. It is only valid if the task is not running, and even in this case the only info it can provide is the state of PREEMPT_ACTIVE bit. Change its single caller to check p->on_rq instead, this should be the same if p->state != TASK_RUNNING, and kill this helper. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Steven Rostedt Cc: Kirill Tkhai Cc: Alexander Graf Cc: Andrew Morton Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Linus Torvalds Cc: linux-arch@vger.kernel.org Link: http://lkml.kernel.org/r/20141008183348.GC17495@redhat.com Signed-off-by: Ingo Molnar --- include/asm-generic/preempt.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index 1cd3f5d767a8..eb6f9e6c3075 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -23,9 +23,6 @@ static __always_inline void preempt_count_set(int pc) /* * must be macros to avoid header recursion hell */ -#define task_preempt_count(p) \ - (task_thread_info(p)->preempt_count & ~PREEMPT_NEED_RESCHED) - #define init_task_preempt_count(p) do { \ task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ } while (0) -- cgit v1.2.3 From 7f51412a415d87ea8598d14722fb31e4f5701257 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Sep 2014 10:22:40 +0100 Subject: sched/deadline: Fix bandwidth check/update when migrating tasks between exclusive cpusets Exclusive cpusets are the only way users can restrict SCHED_DEADLINE tasks affinity (performing what is commonly called clustered scheduling). Unfortunately, such thing is currently broken for two reasons: - No check is performed when the user tries to attach a task to an exlusive cpuset (recall that exclusive cpusets have an associated maximum allowed bandwidth). - Bandwidths of source and destination cpusets are not correctly updated after a task is migrated between them. This patch fixes both things at once, as they are opposite faces of the same coin. The check is performed in cpuset_can_attach(), as there aren't any points of failure after that function. The updated is split in two halves. We first reserve bandwidth in the destination cpuset, after we pass the check in cpuset_can_attach(). And we then release bandwidth from the source cpuset when the task's affinity is actually changed. Even if there can be time windows when sched_setattr() may erroneously fail in the source cpuset, we are fine with it, as we can't perfom an atomic update of both cpusets at once. Reported-by: Daniel Wagner Reported-by: Vincent Legout Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Dario Faggioli Cc: Michael Trimarchi Cc: Fabio Checconi Cc: michael@amarulasolutions.com Cc: luca.abeni@unitn.it Cc: Li Zefan Cc: Linus Torvalds Cc: cgroups@vger.kernel.org Link: http://lkml.kernel.org/r/1411118561-26323-3-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5e344bbe63ec..1d1fa081d44f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2052,6 +2052,8 @@ static inline void tsk_restore_flags(struct task_struct *task, task->flags |= orig_flags & flags; } +extern int task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed); #ifdef CONFIG_SMP extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); -- cgit v1.2.3 From f82f80426f7afcf55953924e71555984a4bd6ce6 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 7 Oct 2014 09:52:11 +0100 Subject: sched/deadline: Ensure that updates to exclusive cpusets don't break AC How we deal with updates to exclusive cpusets is currently broken. As an example, suppose we have an exclusive cpuset composed of two cpus: A[cpu0,cpu1]. We can assign SCHED_DEADLINE task to it up to the allowed bandwidth. If we want now to modify cpusetA's cpumask, we have to check that removing a cpu's amount of bandwidth doesn't break AC guarantees. This thing isn't checked in the current code. This patch fixes the problem above, denying an update if the new cpumask won't have enough bandwidth for SCHED_DEADLINE tasks that are currently active. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Li Zefan Cc: cgroups@vger.kernel.org Link: http://lkml.kernel.org/r/5433E6AF.5080105@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1d1fa081d44f..320a9779f1b4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2052,6 +2052,8 @@ static inline void tsk_restore_flags(struct task_struct *task, task->flags |= orig_flags & flags; } +extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial); extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); #ifdef CONFIG_SMP -- cgit v1.2.3 From 61ada528dea028331e99e8ceaed87c683ad25de2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Sep 2014 10:18:47 +0200 Subject: sched/wait: Provide infrastructure to deal with nested blocking There are a few places that call blocking primitives from wait loops, provide infrastructure to support this without the typical task_struct::state collision. We record the wakeup in wait_queue_t::flags which leaves task_struct::state free to be used by others. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: tglx@linutronix.de Cc: ilya.dryomov@inktank.com Cc: umgwanakikbuti@gmail.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140924082242.051202318@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index e4a8eb9312ea..fc0e99395fbb 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -13,9 +13,12 @@ typedef struct __wait_queue wait_queue_t; typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key); int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); +/* __wait_queue::flags */ +#define WQ_FLAG_EXCLUSIVE 0x01 +#define WQ_FLAG_WOKEN 0x02 + struct __wait_queue { unsigned int flags; -#define WQ_FLAG_EXCLUSIVE 0x01 void *private; wait_queue_func_t func; struct list_head task_list; @@ -830,6 +833,8 @@ void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int sta long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key); +long wait_woken(wait_queue_t *wait, unsigned mode, long timeout); +int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); -- cgit v1.2.3 From e22b886a8a43b147e1994a9f970f678fc0df2033 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Sep 2014 10:18:48 +0200 Subject: sched/wait: Add might_sleep() checks Add more might_sleep() checks, suppose someone put a wait_event() like thing in a wait loop.. Can't put might_sleep() in ___wait_event() because there's the locked primitives which call ___wait_event() with locks held. Signed-off-by: Peter Zijlstra (Intel) Cc: tglx@linutronix.de Cc: ilya.dryomov@inktank.com Cc: umgwanakikbuti@gmail.com Cc: Oleg Nesterov Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140924082242.119255706@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index fc0e99395fbb..0421775e0b9f 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -261,6 +261,7 @@ __out: __ret; \ */ #define wait_event(wq, condition) \ do { \ + might_sleep(); \ if (condition) \ break; \ __wait_event(wq, condition); \ @@ -293,6 +294,7 @@ do { \ #define wait_event_timeout(wq, condition, timeout) \ ({ \ long __ret = timeout; \ + might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_timeout(wq, condition, timeout); \ __ret; \ @@ -318,6 +320,7 @@ do { \ */ #define wait_event_cmd(wq, condition, cmd1, cmd2) \ do { \ + might_sleep(); \ if (condition) \ break; \ __wait_event_cmd(wq, condition, cmd1, cmd2); \ @@ -345,6 +348,7 @@ do { \ #define wait_event_interruptible(wq, condition) \ ({ \ int __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible(wq, condition); \ __ret; \ @@ -378,6 +382,7 @@ do { \ #define wait_event_interruptible_timeout(wq, condition, timeout) \ ({ \ long __ret = timeout; \ + might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_interruptible_timeout(wq, \ condition, timeout); \ @@ -428,6 +433,7 @@ do { \ #define wait_event_hrtimeout(wq, condition, timeout) \ ({ \ int __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_UNINTERRUPTIBLE); \ @@ -453,6 +459,7 @@ do { \ #define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ ({ \ long __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_INTERRUPTIBLE); \ @@ -466,6 +473,7 @@ do { \ #define wait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible_exclusive(wq, condition);\ __ret; \ @@ -640,6 +648,7 @@ do { \ #define wait_event_killable(wq, condition) \ ({ \ int __ret = 0; \ + might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable(wq, condition); \ __ret; \ @@ -891,6 +900,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *); static inline int wait_on_bit(void *word, int bit, unsigned mode) { + might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, @@ -915,6 +925,7 @@ wait_on_bit(void *word, int bit, unsigned mode) static inline int wait_on_bit_io(void *word, int bit, unsigned mode) { + might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, @@ -941,6 +952,7 @@ wait_on_bit_io(void *word, int bit, unsigned mode) static inline int wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) { + might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, action, mode); @@ -968,6 +980,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode static inline int wait_on_bit_lock(void *word, int bit, unsigned mode) { + might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); @@ -991,6 +1004,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode) static inline int wait_on_bit_lock_io(void *word, int bit, unsigned mode) { + might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); @@ -1016,6 +1030,7 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode) static inline int wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) { + might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, action, mode); @@ -1034,6 +1049,7 @@ wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned static inline int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) { + might_sleep(); if (atomic_read(val) == 0) return 0; return out_of_line_wait_on_atomic_t(val, action, mode); -- cgit v1.2.3 From 1029a2b52c09e479fd7b07275812ad97868c0fb0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Sep 2014 10:18:49 +0200 Subject: sched, exit: Deal with nested sleeps do_wait() is a big wait loop, but we set TASK_RUNNING too late; we end up calling potential sleeps before we reset it. Not strictly a bug since we're guaranteed to exit the loop and not call schedule(); put in annotations to quiet might_sleep(). WARNING: CPU: 0 PID: 1 at ../kernel/sched/core.c:7123 __might_sleep+0x7e/0x90() do not call blocking ops when !TASK_RUNNING; state=1 set at [] do_wait+0x88/0x270 Call Trace: [] dump_stack+0x4e/0x7a [] warn_slowpath_common+0x8c/0xc0 [] warn_slowpath_fmt+0x4c/0x50 [] __might_sleep+0x7e/0x90 [] might_fault+0x55/0xb0 [] wait_consider_task+0x90b/0xc10 [] do_wait+0x104/0x270 [] SyS_wait4+0x77/0x100 [] system_call_fastpath+0x16/0x1b Signed-off-by: Peter Zijlstra (Intel) Cc: tglx@linutronix.de Cc: umgwanakikbuti@gmail.com Cc: ilya.dryomov@inktank.com Cc: Alex Elder Cc: Andrew Morton Cc: Axel Lin Cc: Daniel Borkmann Cc: Dave Jones Cc: Guillaume Morin Cc: Ionut Alexa Cc: Jason Baron Cc: Linus Torvalds Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Rik van Riel Cc: Rusty Russell Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140924082242.186408915@infradead.org Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3d770f5564b8..5068a0d9fecd 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -175,10 +175,12 @@ extern int _cond_resched(void); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) +# define sched_annotate_sleep() __set_current_state(TASK_RUNNING) #else static inline void __might_sleep(const char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) +# define sched_annotate_sleep() do { } while (0) #endif #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) -- cgit v1.2.3 From 26cabd31259ba43f68026ce3f62b78094124333f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Sep 2014 10:18:54 +0200 Subject: sched, net: Clean up sk_wait_event() vs. might_sleep() WARNING: CPU: 1 PID: 1744 at kernel/sched/core.c:7104 __might_sleep+0x58/0x90() do not call blocking ops when !TASK_RUNNING; state=1 set at [] prepare_to_wait+0x50 /0xa0 [] __might_sleep+0x58/0x90 [] lock_sock_nested+0x31/0xb0 [] sk_stream_wait_memory+0x18a/0x2d0 Which is a false positive because sk_wait_event() will already have TASK_RUNNING at that point if it would've gone through schedule_timeout(). So annotate with sched_annotate_sleep(); which goes away on !DEBUG builds. Reported-by: Ilya Dryomov Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20140924082242.524407432@infradead.org Cc: David S. Miller Cc: Linus Torvalds Cc: netdev@vger.kernel.org Cc: tglx@linutronix.de Cc: ilya.dryomov@inktank.com Cc: umgwanakikbuti@gmail.com Cc: oleg@redhat.com Signed-off-by: Ingo Molnar --- include/net/sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 7db3db112baa..e6f235ebf6c9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -897,6 +897,7 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) if (!__rc) { \ *(__timeo) = schedule_timeout(*(__timeo)); \ } \ + sched_annotate_sleep(); \ lock_sock(__sk); \ __rc = __condition; \ __rc; \ -- cgit v1.2.3 From 8eb23b9f35aae413140d3fda766a98092c21e9b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Sep 2014 10:18:55 +0200 Subject: sched: Debug nested sleeps Validate we call might_sleep() with TASK_RUNNING, which catches places where we nest blocking primitives, eg. mutex usage in a wait loop. Since all blocking is arranged through task_struct::state, nesting this will cause the inner primitive to set TASK_RUNNING and the outer will thus not block. Another observed problem is calling a blocking function from schedule()->sched_submit_work()->blk_schedule_flush_plug() which will then destroy the task state for the actual __schedule() call that comes after it. Signed-off-by: Peter Zijlstra (Intel) Cc: tglx@linutronix.de Cc: ilya.dryomov@inktank.com Cc: umgwanakikbuti@gmail.com Cc: oleg@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140924082242.591637616@infradead.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 320a9779f1b4..4648e07f7d6f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -243,6 +243,43 @@ extern char ___assert_task_state[1 - 2*!!( ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ (task->flags & PF_FROZEN) == 0) +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + +#define __set_task_state(tsk, state_value) \ + do { \ + (tsk)->task_state_change = _THIS_IP_; \ + (tsk)->state = (state_value); \ + } while (0) +#define set_task_state(tsk, state_value) \ + do { \ + (tsk)->task_state_change = _THIS_IP_; \ + set_mb((tsk)->state, (state_value)); \ + } while (0) + +/* + * set_current_state() includes a barrier so that the write of current->state + * is correctly serialised wrt the caller's subsequent test of whether to + * actually sleep: + * + * set_current_state(TASK_UNINTERRUPTIBLE); + * if (do_i_need_to_sleep()) + * schedule(); + * + * If the caller does not need such serialisation then use __set_current_state() + */ +#define __set_current_state(state_value) \ + do { \ + current->task_state_change = _THIS_IP_; \ + current->state = (state_value); \ + } while (0) +#define set_current_state(state_value) \ + do { \ + current->task_state_change = _THIS_IP_; \ + set_mb(current->state, (state_value)); \ + } while (0) + +#else + #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) #define set_task_state(tsk, state_value) \ @@ -259,11 +296,13 @@ extern char ___assert_task_state[1 - 2*!!( * * If the caller does not need such serialisation then use __set_current_state() */ -#define __set_current_state(state_value) \ +#define __set_current_state(state_value) \ do { current->state = (state_value); } while (0) -#define set_current_state(state_value) \ +#define set_current_state(state_value) \ set_mb(current->state, (state_value)) +#endif + /* Task command name length */ #define TASK_COMM_LEN 16 @@ -1661,6 +1700,9 @@ struct task_struct { unsigned int sequential_io; unsigned int sequential_io_avg; #endif +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + unsigned long task_state_change; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ -- cgit v1.2.3 From 3427445afd26bd2395f29241319283a93f362cd0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Sep 2014 10:18:56 +0200 Subject: sched: Exclude cond_resched() from nested sleep test cond_resched() is a preemption point, not strictly a blocking primitive, so exclude it from the ->state test. In particular, preemption preserves task_struct::state. Signed-off-by: Peter Zijlstra (Intel) Cc: tglx@linutronix.de Cc: ilya.dryomov@inktank.com Cc: umgwanakikbuti@gmail.com Cc: oleg@redhat.com Cc: Alex Elder Cc: Andrew Morton Cc: Axel Lin Cc: Daniel Borkmann Cc: Dave Jones Cc: Jason Baron Cc: Linus Torvalds Cc: Rusty Russell Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140924082242.656559952@infradead.org Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 3 +++ include/linux/sched.h | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5068a0d9fecd..446d76a87ba1 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -162,6 +162,7 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + void ___might_sleep(const char *file, int line, int preempt_offset); void __might_sleep(const char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep @@ -177,6 +178,8 @@ extern int _cond_resched(void); do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) # define sched_annotate_sleep() __set_current_state(TASK_RUNNING) #else + static inline void ___might_sleep(const char *file, int line, + int preempt_offset) { } static inline void __might_sleep(const char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4648e07f7d6f..4400ddc2fe73 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2806,7 +2806,7 @@ static inline int signal_pending_state(long state, struct task_struct *p) extern int _cond_resched(void); #define cond_resched() ({ \ - __might_sleep(__FILE__, __LINE__, 0); \ + ___might_sleep(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) @@ -2819,14 +2819,14 @@ extern int __cond_resched_lock(spinlock_t *lock); #endif #define cond_resched_lock(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ __cond_resched_lock(lock); \ }) extern int __cond_resched_softirq(void); #define cond_resched_softirq() ({ \ - __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ + ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ __cond_resched_softirq(); \ }) -- cgit v1.2.3 From 36df04bc5273a046f53b5e359febc1225f85aa7b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Oct 2014 12:21:57 +0100 Subject: sched/wait: Reimplement wait_event_freezable() Provide better implementations of wait_event_freezable() APIs. The problem is with freezer_do_not_count(), it hides the thread from the freezer, even though this thread might not actually freeze/sleep at all. Cc: oleg@redhat.com Cc: Rafael Wysocki Signed-off-by: Peter Zijlstra (Intel) Cc: Len Brown Cc: Linus Torvalds Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: linux-pm@vger.kernel.org Link: http://lkml.kernel.org/n/tip-d86fz1jmso9wjxa8jfpinp8o@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/freezer.h | 38 --------------------------------- include/linux/wait.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 7fd81b8c4897..e203665c0faa 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -265,35 +265,6 @@ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, __retval; \ }) -#define wait_event_freezable(wq, condition) \ -({ \ - int __retval; \ - freezer_do_not_count(); \ - __retval = wait_event_interruptible(wq, (condition)); \ - freezer_count(); \ - __retval; \ -}) - -#define wait_event_freezable_timeout(wq, condition, timeout) \ -({ \ - long __retval = timeout; \ - freezer_do_not_count(); \ - __retval = wait_event_interruptible_timeout(wq, (condition), \ - __retval); \ - freezer_count(); \ - __retval; \ -}) - -#define wait_event_freezable_exclusive(wq, condition) \ -({ \ - int __retval; \ - freezer_do_not_count(); \ - __retval = wait_event_interruptible_exclusive(wq, condition); \ - freezer_count(); \ - __retval; \ -}) - - #else /* !CONFIG_FREEZER */ static inline bool frozen(struct task_struct *p) { return false; } static inline bool freezing(struct task_struct *p) { return false; } @@ -331,15 +302,6 @@ static inline void set_freezable(void) {} #define freezable_schedule_hrtimeout_range(expires, delta, mode) \ schedule_hrtimeout_range(expires, delta, mode) -#define wait_event_freezable(wq, condition) \ - wait_event_interruptible(wq, condition) - -#define wait_event_freezable_timeout(wq, condition, timeout) \ - wait_event_interruptible_timeout(wq, condition, timeout) - -#define wait_event_freezable_exclusive(wq, condition) \ - wait_event_interruptible_exclusive(wq, condition) - #define wait_event_freezekillable(wq, condition) \ wait_event_killable(wq, condition) diff --git a/include/linux/wait.h b/include/linux/wait.h index 0421775e0b9f..2232ed16635a 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -267,6 +267,31 @@ do { \ __wait_event(wq, condition); \ } while (0) +#define __wait_event_freezable(wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ + schedule(); try_to_freeze()) + +/** + * wait_event - sleep (or freeze) until a condition gets true + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute + * to system load) until the @condition evaluates to true. The + * @condition is checked each time the waitqueue @wq is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + */ +#define wait_event_freezable(wq, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_freezable(wq, condition); \ + __ret; \ +}) + #define __wait_event_timeout(wq, condition, timeout) \ ___wait_event(wq, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ @@ -300,6 +325,24 @@ do { \ __ret; \ }) +#define __wait_event_freezable_timeout(wq, condition, timeout) \ + ___wait_event(wq, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret); try_to_freeze()) + +/* + * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid + * increasing load and is freezable. + */ +#define wait_event_freezable_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_freezable_timeout(wq, condition, timeout); \ + __ret; \ +}) + #define __wait_event_cmd(wq, condition, cmd1, cmd2) \ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) @@ -480,6 +523,20 @@ do { \ }) +#define __wait_event_freezable_exclusive(wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ + schedule(); try_to_freeze()) + +#define wait_event_freezable_exclusive(wq, condition) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_freezable_exclusive(wq, condition);\ + __ret; \ +}) + + #define __wait_event_interruptible_locked(wq, condition, exclusive, irq) \ ({ \ int __ret = 0; \ -- cgit v1.2.3 From 5d4d56582467f3c08dfedd0d995ce2092f384ecc Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Wed, 29 Oct 2014 14:48:13 +0100 Subject: sched/wait: Remove wait_event_freezekillable() There is no user.. make it go away. Signed-off-by: Peter Zijlstra (Intel) Cc: oleg@redhat.com Cc: Rafael Wysocki Cc: Len Brown Cc: Linus Torvalds Cc: Pavel Machek Cc: linux-pm@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/freezer.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index e203665c0faa..6b7fd9cf5ea2 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -246,15 +246,6 @@ static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, * defined in */ -#define wait_event_freezekillable(wq, condition) \ -({ \ - int __retval; \ - freezer_do_not_count(); \ - __retval = wait_event_killable(wq, (condition)); \ - freezer_count(); \ - __retval; \ -}) - /* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ #define wait_event_freezekillable_unsafe(wq, condition) \ ({ \ @@ -302,9 +293,6 @@ static inline void set_freezable(void) {} #define freezable_schedule_hrtimeout_range(expires, delta, mode) \ schedule_hrtimeout_range(expires, delta, mode) -#define wait_event_freezekillable(wq, condition) \ - wait_event_killable(wq, condition) - #define wait_event_freezekillable_unsafe(wq, condition) \ wait_event_killable(wq, condition) -- cgit v1.2.3 From 44dba3d5d6a10685fb15bd1954e62016334825e0 Mon Sep 17 00:00:00 2001 From: Iulia Manda Date: Fri, 31 Oct 2014 02:13:31 +0200 Subject: sched: Refactor task_struct to use numa_faults instead of numa_* pointers This patch simplifies task_struct by removing the four numa_* pointers in the same array and replacing them with the array pointer. By doing this, on x86_64, the size of task_struct is reduced by 3 ulong pointers (24 bytes on x86_64). A new parameter is added to the task_faults_idx function so that it can return an index to the correct offset, corresponding with the old precalculated pointers. All of the code in sched/ that depended on task_faults_idx and numa_* was changed in order to match the new logic. Signed-off-by: Iulia Manda Signed-off-by: Peter Zijlstra (Intel) Cc: mgorman@suse.de Cc: dave@stgolabs.net Cc: riel@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20141031001331.GA30662@winterfell Signed-off-by: Ingo Molnar --- include/linux/sched.h | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4400ddc2fe73..bd7c14ba86c4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1597,27 +1597,22 @@ struct task_struct { struct numa_group *numa_group; /* - * Exponential decaying average of faults on a per-node basis. - * Scheduling placement decisions are made based on the these counts. - * The values remain static for the duration of a PTE scan + * numa_faults is an array split into four regions: + * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer + * in this precise order. + * + * faults_memory: Exponential decaying average of faults on a per-node + * basis. Scheduling placement decisions are made based on these + * counts. The values remain static for the duration of a PTE scan. + * faults_cpu: Track the nodes the process was running on when a NUMA + * hinting fault was incurred. + * faults_memory_buffer and faults_cpu_buffer: Record faults per node + * during the current scan window. When the scan completes, the counts + * in faults_memory and faults_cpu decay and these values are copied. */ - unsigned long *numa_faults_memory; + unsigned long *numa_faults; unsigned long total_numa_faults; - /* - * numa_faults_buffer records faults per node during the current - * scan window. When the scan completes, the counts in - * numa_faults_memory decay and these values are copied. - */ - unsigned long *numa_faults_buffer_memory; - - /* - * Track the nodes the process was running on when a NUMA hinting - * fault was incurred. - */ - unsigned long *numa_faults_cpu; - unsigned long *numa_faults_buffer_cpu; - /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local. The task scan period is adapted -- cgit v1.2.3 From f622b429dadf83c3cc2d70f57f407ad85684eb36 Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Tue, 4 Nov 2014 16:51:22 +0800 Subject: sched: Update comments about CLONE_NEWUTS and CLONE_NEWIPC Remove question mark: s/New utsname group?/New utsname namespace Unified style for IPC: s/New ipcs/New ipc namespace Signed-off-by: Chen Hanxiao Acked-by: Serge E. Hallyn Signed-off-by: Peter Zijlstra (Intel) Cc: Jiri Kosina Cc: Linus Torvalds Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1415091082-15093-1-git-send-email-chenhanxiao@cn.fujitsu.com Signed-off-by: Ingo Molnar --- include/uapi/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index b932be9f5c5b..cc89ddefa926 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -23,8 +23,8 @@ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) and is now available for re-use. */ -#define CLONE_NEWUTS 0x04000000 /* New utsname group? */ -#define CLONE_NEWIPC 0x08000000 /* New ipcs */ +#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ #define CLONE_NEWPID 0x20000000 /* New pid namespace */ #define CLONE_NEWNET 0x40000000 /* New network namespace */ -- cgit v1.2.3 From d8b163c4c657478ef33c082cff78d03a4ca07bb2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 11 Nov 2014 12:46:29 +0300 Subject: sched/numa: Init numa balancing fields of init_task We do not initialize init_task.numa_preferred_nid, but this value is inherited by userspace "init" process: rest_init()->kernel_thread(kernel_init)->do_fork(CLONE_VM); __sched_fork() { if (clone_flags & CLONE_VM) p->numa_preferred_nid = current->numa_preferred_nid; else p->numa_preferred_nid = -1; } kernel_init() becomes userspace "init" process. So, we propagate garbage nid to userspace, and it may be used during numa balancing. Currently, we do not have reports about this brings a problem, but it seem we should set it for sure. Even if init_task.numa_preferred_nid is zero, we may meet a weird configuration without nid#0. On sparc64, where processors are numbered physically, I saw a machine without cpu#1, while cpu#2 existed. Possible, something similar may be with numa nodes. So, let's initialize it and be sure we're safe. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Eric Paris Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Sergey Dyasly Link: http://lkml.kernel.org/r/1415699189.15631.6.camel@tkhai Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 77fc43f8fb72..5f30ac8c82bc 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -166,6 +166,15 @@ extern struct task_group root_task_group; # define INIT_RT_MUTEXES(tsk) #endif +#ifdef CONFIG_NUMA_BALANCING +# define INIT_NUMA_BALANCING(tsk) \ + .numa_preferred_nid = -1, \ + .numa_group = NULL, \ + .numa_faults = NULL, +#else +# define INIT_NUMA_BALANCING(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -237,6 +246,7 @@ extern struct task_group root_task_group; INIT_CPUSET_SEQ(tsk) \ INIT_RT_MUTEXES(tsk) \ INIT_VTIME(tsk) \ + INIT_NUMA_BALANCING(tsk) \ } -- cgit v1.2.3