From 3289bdb429884c0279bf9ab72dff7b934f19dfc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Apr 2015 13:19:42 +0200 Subject: sched: Move the loadavg code to a more obvious location I could not find the loadavg code.. turns out it was hidden in a file called proc.c. It further got mingled up with the cruft per rq load indexes (which we really want to get rid of). Move the per rq load indexes into the fair.c load-balance code (that's the only thing that uses them) and rename proc.c to loadavg.c so we can find it again. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Paul Gortmaker Cc: Thomas Gleixner [ Did minor cleanups to the code. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..85cf253bc366 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -173,7 +173,12 @@ extern unsigned long nr_iowait_cpu(int cpu); extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void update_cpu_load_nohz(void); +#else +static inline void update_cpu_load_nohz(void) { } +#endif extern unsigned long get_parent_ip(unsigned long addr); -- cgit v1.2.3 From b76808e6808e34e7e78131d2b8cb0535622b8e9f Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 Apr 2015 21:19:57 -0700 Subject: signals, sched: Change all uses of JOBCTL_* from 'int' to 'long' c56fb6564dcd ("Fix a misaligned load inside ptrace_attach()") makes jobctl an "unsigned long". It makes sense to have the masks applied to it match that type. This is currently just a cosmetic change, but it will prevent the mask from being unexpectedly truncated if we ever end up with masks with more bits. One instance of "signr" is an int, but I left this alone because the mask ensures that it will never overflow. Signed-off-by: Palmer Dabbelt Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Metcalf Cc: Andrew Morton Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: richard@nod.at Cc: vdavydov@parallels.com Link: http://lkml.kernel.org/r/1430453997-32459-4-git-send-email-palmer@dabbelt.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 85cf253bc366..4f066cb625ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2082,22 +2082,22 @@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ -#define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) -#define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) -#define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) -#define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) -#define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) -#define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) -#define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) +#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) +#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) +#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT) +#define JOBCTL_TRAP_STOP (1UL << JOBCTL_TRAP_STOP_BIT) +#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT) +#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) +#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) extern bool task_set_jobctl_pending(struct task_struct *task, - unsigned int mask); + unsigned long mask); extern void task_clear_jobctl_trapping(struct task_struct *task); extern void task_clear_jobctl_pending(struct task_struct *task, - unsigned int mask); + unsigned long mask); static inline void rcu_copy_process(struct task_struct *p) { -- cgit v1.2.3 From 7e60598785f30cf3dc9e476cc0fc3feeb37a0c63 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 Apr 2015 21:19:56 -0700 Subject: sched/wait: Change wait_on_bit*() to take an unsigned long *, not a void * The implementations of wait_on_bit*() will only work with long-aligned memory on systems that don't support misaligned loads and stores. This patch changes the function prototypes to ensure that the compiler will enforce alignment. Running make defconfig make KFLAGS="-Werror" seems to indicate that, as of c56fb6564dcd ("Fix a misaligned load inside ptrace_attach()"), there are now no users of non-long-aligned calls to wait_on_bit*(). I additionally tried a few "make randconfig" attempts, none of which failed to compile for this reason. Signed-off-by: Palmer Dabbelt Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Metcalf Cc: Andrew Morton Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: richard@nod.at Cc: vdavydov@parallels.com Link: http://lkml.kernel.org/r/1430453997-32459-3-git-send-email-palmer@dabbelt.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2db83349865b..d69ac4ecc88b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -969,7 +969,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *); * on that signal. */ static inline int -wait_on_bit(void *word, int bit, unsigned mode) +wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit(bit, word)) @@ -994,7 +994,7 @@ wait_on_bit(void *word, int bit, unsigned mode) * on that signal. */ static inline int -wait_on_bit_io(void *word, int bit, unsigned mode) +wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit(bit, word)) @@ -1020,7 +1020,8 @@ wait_on_bit_io(void *word, int bit, unsigned mode) * received a signal and the mode permitted wakeup on that signal. */ static inline int -wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout) +wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, + unsigned long timeout) { might_sleep(); if (!test_bit(bit, word)) @@ -1047,7 +1048,8 @@ wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout) * on that signal. */ static inline int -wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) +wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, + unsigned mode) { might_sleep(); if (!test_bit(bit, word)) @@ -1075,7 +1077,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock(void *word, int bit, unsigned mode) +wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) @@ -1099,7 +1101,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode) * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock_io(void *word, int bit, unsigned mode) +wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) @@ -1125,7 +1127,8 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode) * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) +wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, + unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) -- cgit v1.2.3 From e7cc4173115347bcdaa5de2824dd46ef2c58425f Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 Apr 2015 21:19:55 -0700 Subject: signals, ptrace, sched: Fix a misaligned load inside ptrace_attach() The misaligned load exception arises when running ptrace_attach() on the RISC-V (which hasn't been upstreamed yet). The problem is that wait_on_bit() takes a void* but then proceeds to call test_bit(), which takes a long*. This allows an int-aligned pointer to be passed to test_bit(), which promptly fails. 
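[ Editor's illustrative sketch, not part of the patch. The call below is a simplified, hypothetical rendering of the ptrace_attach() path; the exact mode argument and surrounding code are omitted: ]

	/* task_struct.jobctl was 'unsigned int', so it may be only int-aligned */
	wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_UNINTERRUPTIBLE);
	/*
	 * wait_on_bit() accepted a void * and internally called
	 * test_bit(bit, word), which dereferences the word as an
	 * unsigned long *.  A long-sized load from an int-aligned
	 * address traps on strict-alignment ports such as RISC-V.
	 */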
This will manifest on any other asm-generic port where unaligned loads trap, where sizeof(long) > sizeof(int), and where task_struct.jobctl ends up not being long-aligned. This patch changes task_struct.jobctl to be a long, which ensures it has the correct alignment. Signed-off-by: Palmer Dabbelt Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Metcalf Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: richard@nod.at Cc: vdavydov@parallels.com Link: http://lkml.kernel.org/r/1430453997-32459-2-git-send-email-palmer@dabbelt.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4f066cb625ad..fb650a2f4a73 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1374,7 +1374,7 @@ struct task_struct { int exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ - unsigned int jobctl; /* JOBCTL_*, siglock protected */ + unsigned long jobctl; /* JOBCTL_*, siglock protected */ /* Used for emulating ABI behavior of previous Linux versions */ unsigned int personality; -- cgit v1.2.3 From 316c1608d15c736439d4065ed12f306db554b3da Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:20 -0700 Subject: sched, timer: Convert usages of ACCESS_ONCE() in the scheduler to READ_ONCE()/WRITE_ONCE() ACCESS_ONCE doesn't work reliably on non-scalar types. This patch removes the rest of the existing usages of ACCESS_ONCE() in the scheduler, and use the new READ_ONCE() and WRITE_ONCE() APIs as appropriate. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Acked-by: Waiman Long Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1430251224-5764-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index fb650a2f4a73..d70910355b20 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3085,13 +3085,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm) static inline unsigned long task_rlimit(const struct task_struct *tsk, unsigned int limit) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); + return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *tsk, unsigned int limit) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); + return READ_ONCE(tsk->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) -- cgit v1.2.3 From 1018016c706f7ff9f56fde3a649789c47085a293 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:22 -0700 Subject: sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability While running a database workload, we found a scalability issue with itimers. Much of the problem was caused by the thread_group_cputimer spinlock. 
Each time we account for group system/user time, we need to obtain a thread_group_cputimer's spinlock to update the timers. On larger systems (such as a 16 socket machine), this caused more than 30% of total time spent trying to obtain this kernel lock to update these group timer stats. This patch converts the timers to 64-bit atomic variables and use atomic add to update them without a lock. With this patch, the percent of total time spent updating thread group cputimer timers was reduced from 30% down to less than 1%. Note: On 32-bit systems using the generic 64-bit atomics, this causes sample_group_cputimer() to take locks 3 times instead of just 1 time. However, we tested this patch on a 32-bit system ARM system using the generic atomics and did not find the overhead to be much of an issue. An explanation for why this isn't an issue is that 32-bit systems usually have small numbers of CPUs, and cacheline contention from extra spinlocks called periodically is not really apparent on smaller systems. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Cc: Waiman Long Link: http://lkml.kernel.org/r/1430251224-5764-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 7 ++++--- include/linux/sched.h | 10 +++------- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 696d22312b31..7b9d8b59e7bf 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -50,9 +50,10 @@ extern struct fs_struct init_fs; .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ .cputimer = { \ - .cputime = INIT_CPUTIME, \ - .running = 0, \ - .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ + .utime = ATOMIC64_INIT(0), \ + .stime = ATOMIC64_INIT(0), \ + .sum_exec_runtime = ATOMIC64_INIT(0), \ + .running = 0 \ }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index d70910355b20..a45874c3fab6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -598,9 +598,10 @@ struct task_cputime { * used for thread group CPU timer calculations. */ struct thread_group_cputimer { - struct task_cputime cputime; + atomic64_t utime; + atomic64_t stime; + atomic64_t sum_exec_runtime; int running; - raw_spinlock_t lock; }; #include @@ -2967,11 +2968,6 @@ static __always_inline bool need_resched(void) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); -static inline void thread_group_cputime_init(struct signal_struct *sig) -{ - raw_spin_lock_init(&sig->cputimer.lock); -} - /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. 
-- cgit v1.2.3 From 971e8a985482c76487edb5a49811e99b96e846e1 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:23 -0700 Subject: sched, timer: Provide an atomic 'struct task_cputime' data structure This patch adds an atomic variant of the 'struct task_cputime' data structure, which can be used to store and update task_cputime statistics without needing to do locking. Suggested-by: Ingo Molnar Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Cc: Waiman Long Link: http://lkml.kernel.org/r/1430251224-5764-5-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a45874c3fab6..6eb78cd45da7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -572,6 +572,23 @@ struct task_cputime { .sum_exec_runtime = 0, \ } +/* + * This is the atomic variant of task_cputime, which can be used for + * storing and updating task_cputime statistics without locking. + */ +struct task_cputime_atomic { + atomic64_t utime; + atomic64_t stime; + atomic64_t sum_exec_runtime; +}; + +#define INIT_CPUTIME_ATOMIC \ + (struct task_cputime_atomic) { \ + .utime = ATOMIC64_INIT(0), \ + .stime = ATOMIC64_INIT(0), \ + .sum_exec_runtime = ATOMIC64_INIT(0), \ + } + #ifdef CONFIG_PREEMPT_COUNT #define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) #else -- cgit v1.2.3 From 7110744516276e906f9197e2857d026eb2343393 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:24 -0700 Subject: sched, timer: Use the atomic task_cputime in thread_group_cputimer Recent optimizations were made to thread_group_cputimer to improve its scalability by keeping track of cputime stats without a lock. However, the values were open coded to the structure, causing them to be at a different abstraction level from the regular task_cputime structure. Furthermore, any subsequent similar optimizations would not be able to share the new code, since they are specific to thread_group_cputimer. This patch adds the new task_cputime_atomic data structure (introduced in the previous patch in the series) to thread_group_cputimer for keeping track of the cputime atomically, which also helps generalize the code. Suggested-by: Ingo Molnar Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Cc: Waiman Long Link: http://lkml.kernel.org/r/1430251224-5764-6-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 6 ++---- include/linux/sched.h | 4 +--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 7b9d8b59e7bf..bb9b075f0eb0 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -50,10 +50,8 @@ extern struct fs_struct init_fs; .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ .cputimer = { \ - .utime = ATOMIC64_INIT(0), \ - .stime = ATOMIC64_INIT(0), \ - .sum_exec_runtime = ATOMIC64_INIT(0), \ - .running = 0 \ + .cputime_atomic = INIT_CPUTIME_ATOMIC, \ + .running = 0, \ }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 6eb78cd45da7..4adc536a3b03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -615,9 +615,7 @@ struct task_cputime_atomic { * used for thread group CPU timer calculations. */ struct thread_group_cputimer { - atomic64_t utime; - atomic64_t stime; - atomic64_t sum_exec_runtime; + struct task_cputime_atomic cputime_atomic; int running; }; -- cgit v1.2.3 From 7675104990ed255b9315a82ae827ff312a2a88a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2015 08:27:50 -0700 Subject: sched: Implement lockless wake-queues This is useful for locking primitives that can effect multiple wakeups per operation and want to avoid lock internal lock contention by delaying the wakeups until we've released the lock internal locks. Alternatively it can be used to avoid issuing multiple wakeups, and thus save a few cycles, in packet processing. Queue all target tasks and wakeup once you've processed all packets. That way you avoid waking the target task multiple times if there were multiple packets for the same task. Properties of a wake_q are: - Lockless, as queue head must reside on the stack. - Being a queue, maintains wakeup order passed by the callers. This can be important for otherwise, in scenarios where highly contended locks could affect any reliance on lock fairness. - A queued task cannot be added again until it is woken up. This patch adds the needed infrastructure into the scheduler code and uses the new wake_list to delay the futex wakeups until after we've released the hash bucket locks. Signed-off-by: Peter Zijlstra (Intel) [tweaks, adjustments, comments, etc.] Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Borislav Petkov Cc: Chris Mason Cc: Davidlohr Bueso Cc: George Spelvin Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Manfred Spraul Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1430494072-30283-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/sched.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4adc536a3b03..254d88e80f65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -920,6 +920,50 @@ enum cpu_idle_type { #define SCHED_CAPACITY_SHIFT 10 #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) +/* + * Wake-queues are lists of tasks with a pending wakeup, whose + * callers have already marked the task as woken internally, + * and can thus carry on. A common use case is being able to + * do the wakeups once the corresponding user lock as been + * released. + * + * We hold reference to each task in the list across the wakeup, + * thus guaranteeing that the memory is still valid by the time + * the actual wakeups are performed in wake_up_q(). + * + * One per task suffices, because there's never a need for a task to be + * in two wake queues simultaneously; it is forbidden to abandon a task + * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is + * already in a wake queue, the wakeup will happen soon and the second + * waker can just skip it. + * + * The WAKE_Q macro declares and initializes the list head. + * wake_up_q() does NOT reinitialize the list; it's expected to be + * called near the end of a function, where the fact that the queue is + * not used again will be easy to see by inspection. + * + * Note that this can cause spurious wakeups. schedule() callers + * must ensure the call is done inside a loop, confirming that the + * wakeup condition has in fact occurred. + */ +struct wake_q_node { + struct wake_q_node *next; +}; + +struct wake_q_head { + struct wake_q_node *first; + struct wake_q_node **lastp; +}; + +#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) + +#define WAKE_Q(name) \ + struct wake_q_head name = { WAKE_Q_TAIL, &name.first } + +extern void wake_q_add(struct wake_q_head *head, + struct task_struct *task); +extern void wake_up_q(struct wake_q_head *head); + /* * sched-domains (multiprocessor balancing) declarations: */ @@ -1532,6 +1576,8 @@ struct task_struct { /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; + struct wake_q_node wake_q; + #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ struct rb_root pi_waiters; -- cgit v1.2.3 From 920ce39f6c204d4ce4d8acebe7522f0dfa95f662 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 8 May 2015 14:31:50 -0700 Subject: sched, timer: Fix documentation for 'struct thread_group_cputimer' Fix the docbook build bug reported by Fengguang Wu. 
Reported-by: Fengguang Wu Signed-off-by: Jason Low Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hp.com Cc: aswin@hp.com Cc: bp@alien8.de Cc: dave@stgolabs.net Cc: fweisbec@gmail.com Cc: mgorman@suse.de Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Cc: rostedt@goodmis.org Cc: scott.norton@hp.com Cc: torvalds@linux-foundation.org Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/1431120710.5136.12.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 254d88e80f65..0eceeec5a01a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -606,10 +606,9 @@ struct task_cputime_atomic { /** * struct thread_group_cputimer - thread group interval timer counts - * @cputime: thread group interval timers. + * @cputime_atomic: atomic thread group interval timers. * @running: non-zero when there are timers running and * @cputime receives updates. - * @lock: lock for fields in this struct. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. -- cgit v1.2.3 From 8c8a457a60050d5922676f81913d87e4af6fd97b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 14 May 2015 14:31:01 +0300 Subject: sched: Remove redundant #ifdef Two adjacent members in task_struct were guarded by the same #define, so we can merge the two blocks. Signed-off-by: Nikolay Borisov Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431603061-29408-1-git-send-email-kernel@kyup.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0eceeec5a01a..5f8defa155cf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1398,8 +1398,6 @@ struct task_struct { int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ -#ifdef CONFIG_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU -- cgit v1.2.3 From 92cf211874e954027b8e91cc9a15485a50b58d6b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:46 +0200 Subject: sched/preempt: Merge preempt_mask.h into preempt.h preempt_mask.h defines all the preempt_count semantics and related symbols: preempt, softirq, hardirq, nmi, preempt active, need resched, etc... preempt.h defines the accessors and mutators of preempt_count. But there is a messy dependency game around those two header files: * preempt_mask.h includes preempt.h in order to access preempt_count() * preempt_mask.h defines all preempt_count semantic and symbols except PREEMPT_NEED_RESCHED that is needed by asm/preempt.h Thus we need to define it from preempt.h, right before including asm/preempt.h, instead of defining it to preempt_mask.h with the other preempt_count symbols. Therefore the preempt_count semantics happen to be spread out. * We plan to introduce preempt_active_[enter,exit]() to consolidate preempt_schedule*() code. But we'll need to access both preempt_count mutators (preempt_count_add()) and preempt_count symbols (PREEMPT_ACTIVE, PREEMPT_OFFSET). 
The usual place to define preempt operations is in preempt.h but then we'll need symbols in preempt_mask.h which already includes preempt.h. So we end up with a ressource circle dependency. Lets merge preempt_mask.h into preempt.h to solve these dependency issues. This way we gather semantic symbols and operation definition of preempt_count in a single file. This is a dumb copy-paste merge. Further merge re-arrangments are performed in a subsequent patch to ease review. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/bottom_half.h | 1 - include/linux/hardirq.h | 2 +- include/linux/preempt.h | 111 ++++++++++++++++++++++++++++++++++++++++ include/linux/preempt_mask.h | 117 ------------------------------------------- include/linux/sched.h | 2 +- 5 files changed, 113 insertions(+), 120 deletions(-) delete mode 100644 include/linux/preempt_mask.h (limited to 'include/linux') diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 86c12c93e3cf..8fdcb783197d 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -2,7 +2,6 @@ #define _LINUX_BH_H #include -#include #ifdef CONFIG_TRACE_IRQFLAGS extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f4af03404b97..dfd59d6bc6f0 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -1,7 +1,7 @@ #ifndef LINUX_HARDIRQ_H #define LINUX_HARDIRQ_H -#include +#include #include #include #include diff --git a/include/linux/preempt.h b/include/linux/preempt.h index de83b4eb1642..8cc0338a5e9a 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -17,6 +17,117 @@ #include +/* + * We put the hardirq and softirq counter into the preemption + * counter. The bitmask has the following meaning: + * + * - bits 0-7 are the preemption count (max preemption depth: 256) + * - bits 8-15 are the softirq count (max # of softirqs: 256) + * + * The hardirq count could in theory be the same as the number of + * interrupts in the system, but we run all interrupt handlers with + * interrupts disabled, so we cannot have nesting interrupts. Though + * there are a few palaeontologic drivers which reenable interrupts in + * the handler, so we need more than one bit here. 
+ * + * PREEMPT_MASK: 0x000000ff + * SOFTIRQ_MASK: 0x0000ff00 + * HARDIRQ_MASK: 0x000f0000 + * NMI_MASK: 0x00100000 + * PREEMPT_ACTIVE: 0x00200000 + */ +#define PREEMPT_BITS 8 +#define SOFTIRQ_BITS 8 +#define HARDIRQ_BITS 4 +#define NMI_BITS 1 + +#define PREEMPT_SHIFT 0 +#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) +#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) +#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) + +#define __IRQ_MASK(x) ((1UL << (x))-1) + +#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) +#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) +#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) +#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) + +#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) +#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) +#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) +#define NMI_OFFSET (1UL << NMI_SHIFT) + +#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) + +#define PREEMPT_ACTIVE_BITS 1 +#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) +#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) + +#define hardirq_count() (preempt_count() & HARDIRQ_MASK) +#define softirq_count() (preempt_count() & SOFTIRQ_MASK) +#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ + | NMI_MASK)) + +/* + * Are we doing bottom half or hardware interrupt processing? + * Are we in a softirq context? Interrupt context? + * in_softirq - Are we currently processing softirq or have bh disabled? + * in_serving_softirq - Are we currently processing softirq? + */ +#define in_irq() (hardirq_count()) +#define in_softirq() (softirq_count()) +#define in_interrupt() (irq_count()) +#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) + +/* + * Are we in NMI context? + */ +#define in_nmi() (preempt_count() & NMI_MASK) + +#if defined(CONFIG_PREEMPT_COUNT) +# define PREEMPT_CHECK_OFFSET 1 +#else +# define PREEMPT_CHECK_OFFSET 0 +#endif + +/* + * The preempt_count offset needed for things like: + * + * spin_lock_bh() + * + * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and + * softirqs, such that unlock sequences of: + * + * spin_unlock(); + * local_bh_enable(); + * + * Work as expected. + */ +#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) + +/* + * Are we running in atomic context? WARNING: this macro cannot + * always detect atomic context; in particular, it cannot know about + * held spinlocks in non-preemptible kernels. Thus it should not be + * used in the general case to determine whether sleeping is possible. + * Do not use in_atomic() in driver code. 
+ */ +#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) + +/* + * Check whether we were atomic before we did preempt_disable(): + * (used by the scheduler, *after* releasing the kernel lock) + */ +#define in_atomic_preempt_off() \ + ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) + +#ifdef CONFIG_PREEMPT_COUNT +# define preemptible() (preempt_count() == 0 && !irqs_disabled()) +#else +# define preemptible() 0 +#endif + #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h deleted file mode 100644 index dbeec4d4a3be..000000000000 --- a/include/linux/preempt_mask.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef LINUX_PREEMPT_MASK_H -#define LINUX_PREEMPT_MASK_H - -#include - -/* - * We put the hardirq and softirq counter into the preemption - * counter. The bitmask has the following meaning: - * - * - bits 0-7 are the preemption count (max preemption depth: 256) - * - bits 8-15 are the softirq count (max # of softirqs: 256) - * - * The hardirq count could in theory be the same as the number of - * interrupts in the system, but we run all interrupt handlers with - * interrupts disabled, so we cannot have nesting interrupts. Though - * there are a few palaeontologic drivers which reenable interrupts in - * the handler, so we need more than one bit here. - * - * PREEMPT_MASK: 0x000000ff - * SOFTIRQ_MASK: 0x0000ff00 - * HARDIRQ_MASK: 0x000f0000 - * NMI_MASK: 0x00100000 - * PREEMPT_ACTIVE: 0x00200000 - */ -#define PREEMPT_BITS 8 -#define SOFTIRQ_BITS 8 -#define HARDIRQ_BITS 4 -#define NMI_BITS 1 - -#define PREEMPT_SHIFT 0 -#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) -#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) -#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) - -#define __IRQ_MASK(x) ((1UL << (x))-1) - -#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) -#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) -#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) -#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) - -#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) -#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) -#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) -#define NMI_OFFSET (1UL << NMI_SHIFT) - -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) - -#define PREEMPT_ACTIVE_BITS 1 -#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) -#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) - -#define hardirq_count() (preempt_count() & HARDIRQ_MASK) -#define softirq_count() (preempt_count() & SOFTIRQ_MASK) -#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ - | NMI_MASK)) - -/* - * Are we doing bottom half or hardware interrupt processing? - * Are we in a softirq context? Interrupt context? - * in_softirq - Are we currently processing softirq or have bh disabled? - * in_serving_softirq - Are we currently processing softirq? - */ -#define in_irq() (hardirq_count()) -#define in_softirq() (softirq_count()) -#define in_interrupt() (irq_count()) -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) - -/* - * Are we in NMI context? 
- */ -#define in_nmi() (preempt_count() & NMI_MASK) - -#if defined(CONFIG_PREEMPT_COUNT) -# define PREEMPT_CHECK_OFFSET 1 -#else -# define PREEMPT_CHECK_OFFSET 0 -#endif - -/* - * The preempt_count offset needed for things like: - * - * spin_lock_bh() - * - * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and - * softirqs, such that unlock sequences of: - * - * spin_unlock(); - * local_bh_enable(); - * - * Work as expected. - */ -#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) - -/* - * Are we running in atomic context? WARNING: this macro cannot - * always detect atomic context; in particular, it cannot know about - * held spinlocks in non-preemptible kernels. Thus it should not be - * used in the general case to determine whether sleeping is possible. - * Do not use in_atomic() in driver code. - */ -#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) - -/* - * Check whether we were atomic before we did preempt_disable(): - * (used by the scheduler, *after* releasing the kernel lock) - */ -#define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) - -#ifdef CONFIG_PREEMPT_COUNT -# define preemptible() (preempt_count() == 0 && !irqs_disabled()) -#else -# define preemptible() 0 -#endif - -#endif /* LINUX_PREEMPT_MASK_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5f8defa155cf..c53a1784d7a9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -25,7 +25,7 @@ struct sched_param { #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 2e10e71ce88e3eaccfd09a045ae6ecebe657ba09 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:47 +0200 Subject: sched/preempt: Rearrange a few symbols after headers merge Adjust a few comments, and further integrate a few definitions after the dumb headers copy. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 8cc0338a5e9a..37974cd4f092 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -9,14 +9,6 @@ #include #include -/* - * We use the MSB mostly because its available; see for - * the other bits -- can't include that header due to inclusion hell. - */ -#define PREEMPT_NEED_RESCHED 0x80000000 - -#include - /* * We put the hardirq and softirq counter into the preemption * counter. The bitmask has the following meaning: @@ -30,11 +22,12 @@ * there are a few palaeontologic drivers which reenable interrupts in * the handler, so we need more than one bit here. 
* - * PREEMPT_MASK: 0x000000ff - * SOFTIRQ_MASK: 0x0000ff00 - * HARDIRQ_MASK: 0x000f0000 - * NMI_MASK: 0x00100000 - * PREEMPT_ACTIVE: 0x00200000 + * PREEMPT_MASK: 0x000000ff + * SOFTIRQ_MASK: 0x0000ff00 + * HARDIRQ_MASK: 0x000f0000 + * NMI_MASK: 0x00100000 + * PREEMPT_ACTIVE: 0x00200000 + * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 @@ -64,6 +57,12 @@ #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) #define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) +/* We use the MSB mostly because its available */ +#define PREEMPT_NEED_RESCHED 0x80000000 + +/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ +#include + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ @@ -122,12 +121,6 @@ #define in_atomic_preempt_off() \ ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) -#ifdef CONFIG_PREEMPT_COUNT -# define preemptible() (preempt_count() == 0 && !irqs_disabled()) -#else -# define preemptible() 0 -#endif - #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); @@ -160,6 +153,8 @@ do { \ #define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#define preemptible() (preempt_count() == 0 && !irqs_disabled()) + #ifdef CONFIG_PREEMPT #define preempt_enable() \ do { \ @@ -232,6 +227,7 @@ do { \ #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() +#define preemptible() 0 #endif /* CONFIG_PREEMPT_COUNT */ -- cgit v1.2.3 From 90b62b5129d5cb50f62f40e684de7a1961e57197 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:48 +0200 Subject: sched/preempt: Rename PREEMPT_CHECK_OFFSET to PREEMPT_DISABLE_OFFSET "CHECK" suggests it's only used as a comparison mask. But now it's used further as a config-conditional preempt disabler offset. Lets disambiguate this name. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 37974cd4f092..4689ef210a13 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -85,9 +85,9 @@ #define in_nmi() (preempt_count() & NMI_MASK) #if defined(CONFIG_PREEMPT_COUNT) -# define PREEMPT_CHECK_OFFSET 1 +# define PREEMPT_DISABLE_OFFSET 1 #else -# define PREEMPT_CHECK_OFFSET 0 +# define PREEMPT_DISABLE_OFFSET 0 #endif /* @@ -103,7 +103,7 @@ * * Work as expected. */ -#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) +#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_DISABLE_OFFSET) /* * Are we running in atomic context? 
WARNING: this macro cannot @@ -119,7 +119,7 @@ * (used by the scheduler, *after* releasing the kernel lock) */ #define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) + ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); -- cgit v1.2.3 From b30f0e3ffedfa52b1d67a302ae5860c49998e5e2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:49 +0200 Subject: sched/preempt: Optimize preemption operations on __schedule() callers __schedule() disables preemption and some of its callers (the preempt_schedule*() family) also set PREEMPT_ACTIVE. So we have two preempt_count() modifications that could be performed at once. Lets remove the preemption disablement from __schedule() and pull this responsibility to its callers in order to optimize preempt_count() operations in a single place. Suggested-by: Linus Torvalds Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 4689ef210a13..45da394f2779 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -137,6 +137,18 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) +#define preempt_active_enter() \ +do { \ + preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ + barrier(); \ +} while (0) + +#define preempt_active_exit() \ +do { \ + barrier(); \ + preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ +} while (0) + #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ -- cgit v1.2.3 From e017cf21ae82e0b36f026b22083a8ae67926f465 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:50 +0200 Subject: sched/preempt: Fix out of date comment Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 45da394f2779..4057696c641c 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -116,7 +116,7 @@ /* * Check whether we were atomic before we did preempt_disable(): - * (used by the scheduler, *after* releasing the kernel lock) + * (used by the scheduler) */ #define in_atomic_preempt_off() \ ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET) -- cgit v1.2.3 From 3e51f3c4004c9b01f66da03214a3e206f5ed627b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:51 +0200 Subject: sched/preempt: Remove PREEMPT_ACTIVE unmasking off in_atomic() Now that PREEMPT_ACTIVE implies PREEMPT_DISABLE_OFFSET, ignoring PREEMPT_ACTIVE from in_atomic() check isn't useful anymore. 
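[ Editor's note: the caller-side changes of 'Optimize preemption operations on __schedule() callers' above live in kernel/sched/core.c and are not shown in this include/linux-limited series; a simplified sketch of how a preempt_schedule*()-style caller uses the new helpers: ]

	preempt_active_enter();		/* PREEMPT_ACTIVE + disable offset in one preempt_count_add() */
	__schedule();			/* no longer disables preemption by itself */
	preempt_active_exit();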
Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-7-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 4057696c641c..a1a00e14c14f 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -112,7 +112,7 @@ * used in the general case to determine whether sleeping is possible. * Do not use in_atomic() in driver code. */ -#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) +#define in_atomic() (preempt_count() != 0) /* * Check whether we were atomic before we did preempt_disable(): -- cgit v1.2.3 From 8bcbde5480f9777f8b74d71493722c663e22c21b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:06 +0200 Subject: sched/preempt, mm/fault: Count pagefault_disable() levels in pagefault_disabled Until now, pagefault_disable()/pagefault_enabled() used the preempt count to track whether in an environment with pagefaults disabled (can be queried via in_atomic()). This patch introduces a separate counter in task_struct to count the level of pagefault_disable() calls. We'll keep manipulating the preempt count to retain compatibility to existing pagefault handlers. It is now possible to verify whether in a pagefault_disable() envionment by calling pagefault_disabled(). In contrast to in_atomic() it will not be influenced by preempt_enable()/preempt_disable(). This patch is based on a patch from Ingo Molnar. Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-2-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + include/linux/uaccess.h | 36 +++++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c53a1784d7a9..dd07ac03f82a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1788,6 +1788,7 @@ struct task_struct { #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; #endif + int pagefault_disabled; }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ecd3319dac33..23290cc93a24 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,20 +2,36 @@ #define __LINUX_UACCESS_H__ #include +#include #include +static __always_inline void pagefault_disabled_inc(void) +{ + current->pagefault_disabled++; +} + +static __always_inline void pagefault_disabled_dec(void) +{ + current->pagefault_disabled--; + WARN_ON(current->pagefault_disabled < 0); +} + /* - * These routines enable/disable the pagefault handler in that - * it will not take any locks and go straight to the fixup table. 
+ * These routines enable/disable the pagefault handler. If disabled, it will + * not take any locks and go straight to the fixup table. + * + * We increase the preempt and the pagefault count, to be able to distinguish + * whether we run in simple atomic context or in a real pagefault_disable() + * context. + * + * For now, after pagefault_disabled() has been called, we run in atomic + * context. User access methods will not sleep. * - * They have great resemblance to the preempt_disable/enable calls - * and in fact they are identical; this is because currently there is - * no other way to make the pagefault handlers do this. So we do - * disable preemption but we don't necessarily care about that. */ static inline void pagefault_disable(void) { preempt_count_inc(); + pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault * can hit. @@ -25,18 +41,24 @@ static inline void pagefault_disable(void) static inline void pagefault_enable(void) { -#ifndef CONFIG_PREEMPT /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); + pagefault_disabled_dec(); +#ifndef CONFIG_PREEMPT preempt_count_dec(); #else preempt_enable(); #endif } +/* + * Is the pagefault handler disabled? If so, user access methods will not sleep. + */ +#define pagefault_disabled() (current->pagefault_disabled != 0) + #ifndef ARCH_HAS_NOCACHE_UACCESS static inline unsigned long __copy_from_user_inatomic_nocache(void *to, -- cgit v1.2.3 From 9ec23531fd48031d1b6ca5366f5f967d17a8bc28 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:07 +0200 Subject: sched/preempt, mm/fault: Trigger might_sleep() in might_fault() with disabled pagefaults Commit 662bbcb2747c ("mm, sched: Allow uaccess in atomic with pagefault_disable()") removed might_sleep() checks for all user access code (that uses might_fault()). The reason was to disable wrong "sleep in atomic" warnings in the following scenario: pagefault_disable() rc = copy_to_user(...) pagefault_enable() Which is valid, as pagefault_disable() increments the preempt counter and therefore disables the pagefault handler. copy_to_user() will not sleep and return an error code if a page is not available. However, as all might_sleep() checks are removed, CONFIG_DEBUG_ATOMIC_SLEEP would no longer detect the following scenario: spin_lock(&lock); rc = copy_to_user(...) spin_unlock(&lock) If the kernel is compiled with preemption turned on, preempt_disable() will make in_atomic() detect disabled preemption. The fault handler would correctly never sleep on user access. However, with preemption turned off, preempt_disable() is usually a NOP (with !CONFIG_PREEMPT_COUNT), therefore in_atomic() will not be able to detect disabled preemption nor disabled pagefaults. The fault handler could sleep. We really want to enable CONFIG_DEBUG_ATOMIC_SLEEP checks for user access functions again, otherwise we can end up with horrible deadlocks. Root of all evil is that pagefault_disable() acts almost as preempt_disable(), depending on preemption being turned on/off. As we now have pagefault_disabled(), we can use it to distinguish whether user acces functions might sleep. Convert might_fault() into a makro that calls __might_fault(), to allow proper file + line messages in case of a might_sleep() warning. 
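[ Editor's note: the two patterns from the changelog above, annotated; an illustrative sketch only, 'lock', 'ubuf' and 'data' are placeholders: ]

	pagefault_disable();
	rc = copy_to_user(ubuf, &data, sizeof(data));	/* OK: fails instead of sleeping */
	pagefault_enable();

	spin_lock(&lock);
	rc = copy_to_user(ubuf, &data, sizeof(data));	/* bug: may sleep under a spinlock; caught
							   again by might_fault() -> might_sleep()
							   with this series */
	spin_unlock(&lock);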
Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-3-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3a5b48e52a9e..060dd7b61c6d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -244,7 +244,8 @@ static inline u32 reciprocal_scale(u32 val, u32 ep_ro) #if defined(CONFIG_MMU) && \ (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)) -void might_fault(void); +#define might_fault() __might_fault(__FILE__, __LINE__) +void __might_fault(const char *file, int line); #else static inline void might_fault(void) { } #endif -- cgit v1.2.3 From 2cb7c9cb426660b5ed58b643d9e7dd5d50ba901f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:09 +0200 Subject: sched/preempt, mm/kmap: Explicitly disable/enable preemption in kmap_atomic_* The existing code relies on pagefault_disable() implicitly disabling preemption, so that no schedule will happen between kmap_atomic() and kunmap_atomic(). Let's make this explicit, to prepare for pagefault_disable() not touching preemption anymore. 
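[ Editor's note: illustrative effect of the change above on a kmap_atomic() section; 'page', 'offset', 'src' and 'len' are placeholders: ]

	void *vaddr = kmap_atomic(page);	/* now explicitly: preempt_disable() + pagefault_disable() */
	memcpy(vaddr + offset, src, len);	/* no scheduling, no faulting in here */
	kunmap_atomic(vaddr);			/* pagefault_enable() + preempt_enable() */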
Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-5-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/highmem.h | 2 ++ include/linux/io-mapping.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 9286a46b7d69..6aefcd0031a6 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -65,6 +65,7 @@ static inline void kunmap(struct page *page) static inline void *kmap_atomic(struct page *page) { + preempt_disable(); pagefault_disable(); return page_address(page); } @@ -73,6 +74,7 @@ static inline void *kmap_atomic(struct page *page) static inline void __kunmap_atomic(void *addr) { pagefault_enable(); + preempt_enable(); } #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 657fab4efab3..c27dde7215b5 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -141,6 +141,7 @@ static inline void __iomem * io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) { + preempt_disable(); pagefault_disable(); return ((char __force __iomem *) mapping) + offset; } @@ -149,6 +150,7 @@ static inline void io_mapping_unmap_atomic(void __iomem *vaddr) { pagefault_enable(); + preempt_enable(); } /* Non-atomic map/unmap */ -- cgit v1.2.3 From 70ffdb9393a7264a069265edded729078dcf0425 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:11 +0200 Subject: mm/fault, arch: Use pagefault_disable() to check for disabled pagefaults in the handler Introduce faulthandler_disabled() and use it to check for irq context and disabled pagefaults (via pagefault_disable()) in the pagefault handlers. Please note that we keep the in_atomic() checks in place - to detect whether in irq context (in which case preemption is always properly disabled). In contrast, preempt_disable() should never be used to disable pagefaults. With !CONFIG_PREEMPT_COUNT, preempt_disable() doesn't modify the preempt counter, and therefore the result of in_atomic() differs. We validate that condition by using might_fault() checks when calling might_sleep(). Therefore, add a comment to faulthandler_disabled(), describing why this is needed. faulthandler_disabled() and pagefault_disable() are defined in linux/uaccess.h, so let's properly add that include to all relevant files. This patch is based on a patch from Thomas Gleixner. 
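[ Editor's note: sketch of the entry check an architecture's fault handler performs with the new helper; arch code is outside this include/linux-limited series and the snippet is simplified: ]

	/* in an arch do_page_fault() */
	if (faulthandler_disabled() || !current->mm) {
		/*
		 * Pagefaults disabled or no user context: we must not sleep
		 * or take mmap_sem, so resolve via the exception fixup path.
		 */
		...
	}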
Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-7-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/uaccess.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 23290cc93a24..90786d2d74e5 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -59,6 +59,18 @@ static inline void pagefault_enable(void) */ #define pagefault_disabled() (current->pagefault_disabled != 0) +/* + * The pagefault handler is in general disabled by pagefault_disable() or + * when in irq context (via in_atomic()). + * + * This function should only be used by the fault handlers. Other users should + * stick to pagefault_disabled(). + * Please NEVER use preempt_disable() to disable the fault handler. With + * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled. + * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT. + */ +#define faulthandler_disabled() (pagefault_disabled() || in_atomic()) + #ifndef ARCH_HAS_NOCACHE_UACCESS static inline unsigned long __copy_from_user_inatomic_nocache(void *to, -- cgit v1.2.3 From 8222dbe21e79338de92d5e1956cd1e3994cc9f93 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:20 +0200 Subject: sched/preempt, mm/fault: Decouple preemption from the page fault logic As the fault handlers now all rely on the pagefault_disabled() checks, and the implicit preempt_disable() calls by pagefault_disable() have been made explicit, we can rely completely on the pagefault_disabled counter. So let's no longer touch the preempt count when disabling/enabling pagefaults. After a call to pagefault_disable(), pagefault_disabled() will return true, but in_atomic() won't.
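[ Editor's illustration, not part of the patch: what the decoupled semantics look like to a caller. peek_user_word() is a made-up futex-style helper; pagefault_disable()/pagefault_enable() and __copy_from_user_inatomic() are the real interfaces. ]

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/uaccess.h>

static int peek_user_word(u32 __user *uaddr, u32 *val)
{
	unsigned long left;

	pagefault_disable();	/* only increments current->pagefault_disabled now */
	left = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
	pagefault_enable();

	/*
	 * Inside the section above pagefault_disabled() was true, so the
	 * copy could not sleep; the preempt count was left untouched, so
	 * in_atomic() need not have been true and the task stayed
	 * preemptible (unless preemption was disabled for other reasons).
	 */
	return left ? -EFAULT : 0;
}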
Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-16-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/uaccess.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 90786d2d74e5..ae572c138607 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -1,7 +1,6 @@ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ -#include <linux/preempt.h> #include <linux/sched.h> #include <asm/uaccess.h> @@ -20,17 +19,11 @@ static __always_inline void pagefault_disabled_dec(void) * These routines enable/disable the pagefault handler. If disabled, it will * not take any locks and go straight to the fixup table. * - * We increase the preempt and the pagefault count, to be able to distinguish - * whether we run in simple atomic context or in a real pagefault_disable() - * context. - * - * For now, after pagefault_disabled() has been called, we run in atomic - * context. User access methods will not sleep. - * + * User access methods will not sleep when called from a pagefault_disabled() + * environment. */ static inline void pagefault_disable(void) { - preempt_count_inc(); pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault @@ -47,11 +40,6 @@ static inline void pagefault_enable(void) */ barrier(); pagefault_disabled_dec(); -#ifndef CONFIG_PREEMPT - preempt_count_dec(); -#else - preempt_enable(); -#endif } /* -- cgit v1.2.3 From 80ed87c8a9ca0cad7ca66cf3bbdfb17559a66dcf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2015 14:23:45 +0200 Subject: sched/wait: Introduce TASK_NOLOAD and TASK_IDLE Currently people use TASK_INTERRUPTIBLE to idle kthreads and wait for 'work' because TASK_UNINTERRUPTIBLE contributes to the loadavg. Having all idle kthreads contribute to the loadavg is somewhat silly. Now mostly this works OK, because kthreads have all their signals masked. However, there are a few sites where this is causing problems and TASK_UNINTERRUPTIBLE should be used, except for that loadavg issue. This patch adds TASK_NOLOAD which, when combined with TASK_UNINTERRUPTIBLE, avoids the loadavg accounting. As most of the imagined usage sites are loops where a thread wants to idle while waiting for work, a TASK_IDLE helper is introduced.
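[ Editor's illustration, not part of the patch: the intended TASK_IDLE wait loop in a kthread. idle_worker_fn(), have_work() and do_work() are placeholders; TASK_IDLE and the set_current_state()/schedule() pattern are the real interface. ]

#include <linux/kthread.h>
#include <linux/sched.h>

/* placeholders for a driver's own work-availability check and processing */
static bool have_work(void *data);
static void do_work(void *data);

static int idle_worker_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* uninterruptible sleep that does not count towards the loadavg */
		set_current_state(TASK_IDLE);
		if (!have_work(data) && !kthread_should_stop())
			schedule();
		__set_current_state(TASK_RUNNING);

		while (have_work(data))
			do_work(data);
	}
	return 0;
}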
Signed-off-by: Peter Zijlstra (Intel) Cc: Julian Anastasov Cc: Linus Torvalds Cc: NeilBrown Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index dd07ac03f82a..7de815c6fa78 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -218,9 +218,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); #define TASK_WAKEKILL 128 #define TASK_WAKING 256 #define TASK_PARKED 512 -#define TASK_STATE_MAX 1024 +#define TASK_NOLOAD 1024 +#define TASK_STATE_MAX 2048 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP" +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; @@ -230,6 +231,8 @@ extern char ___assert_task_state[1 - 2*!!( #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) +#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) + /* Convenience macros for the sake of wake_up */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) @@ -245,7 +248,8 @@ extern char ___assert_task_state[1 - 2*!!( ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ - (task->flags & PF_FROZEN) == 0) + (task->flags & PF_FROZEN) == 0 && \ + (task->state & TASK_NOLOAD) == 0) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -- cgit v1.2.3 From 06931e62246844c73fba24d7aeb4a5dc897a2739 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 May 2015 15:11:28 +0200 Subject: sched/topology: Rename topology_thread_cpumask() to topology_sibling_cpumask() Rename topology_thread_cpumask() to topology_sibling_cpumask() for more consistency with scheduler code. Signed-off-by: Bartosz Golaszewski Reviewed-by: Thomas Gleixner Acked-by: Russell King Acked-by: Catalin Marinas Cc: Benoit Cousson Cc: Fenghua Yu Cc: Guenter Roeck Cc: Jean Delvare Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Oleg Drokin Cc: Peter Zijlstra Cc: Rafael J. 
Wysocki Cc: Russell King Cc: Viresh Kumar Link: http://lkml.kernel.org/r/1432645896-12588-2-git-send-email-bgolaszewski@baylibre.com Signed-off-by: Ingo Molnar --- include/linux/topology.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 909b6e43b694..73ddad1e0fa3 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -191,8 +191,8 @@ static inline int cpu_to_mem(int cpu) #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif -#ifndef topology_thread_cpumask -#define topology_thread_cpumask(cpu) cpumask_of(cpu) +#ifndef topology_sibling_cpumask +#define topology_sibling_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) @@ -201,7 +201,7 @@ static inline int cpu_to_mem(int cpu) #ifdef CONFIG_SCHED_SMT static inline const struct cpumask *cpu_smt_mask(int cpu) { - return topology_thread_cpumask(cpu); + return topology_sibling_cpumask(cpu); } #endif -- cgit v1.2.3 From 4eaca0a887eaee04fc7a3866d0f5b51b34030dfa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Jun 2015 17:39:08 +0200 Subject: preempt: Use preempt_schedule_context() as the official tracing preemption point preempt_schedule_context() is a tracing-safe preemption point but it's only used when CONFIG_CONTEXT_TRACKING=y. Other configs have tracing recursion issues since commit: b30f0e3ffedf ("sched/preempt: Optimize preemption operations on __schedule() callers") introduced function based preempt_count_*() ops. Let's make it available on all configs and give it a more appropriate name for its new position. Reported-by: Fengguang Wu Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433432349-1021-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index a1a00e14c14f..7686dd63bc35 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -204,15 +204,11 @@ do { \ #ifdef CONFIG_PREEMPT -#ifndef CONFIG_CONTEXT_TRACKING -#define __preempt_schedule_context() __preempt_schedule() -#endif - #define preempt_enable_notrace() \ do { \ barrier(); \ if (unlikely(__preempt_count_dec_and_test())) \ - __preempt_schedule_context(); \ + __preempt_schedule_notrace(); \ } while (0) #else #define preempt_enable_notrace() \ -- cgit v1.2.3 From 9a92e3dc6ad02208a014d0d8404ebbd697e3d5ef Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Jun 2015 17:39:09 +0200 Subject: preempt: Reorganize the notrace definitions a bit preempt.h has two separate "#ifdef CONFIG_PREEMPT" sections: one to define preempt_enable() and another to define preempt_enable_notrace(). Let's gather both. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Fengguang Wu Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433432349-1021-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 7686dd63bc35..0f1534acaf60 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -175,48 +175,46 @@ do { \ __preempt_schedule(); \ } while (0) +#define preempt_enable_notrace() \ +do { \ + barrier(); \ + if (unlikely(__preempt_count_dec_and_test())) \ + __preempt_schedule_notrace(); \ +} while (0) + #define preempt_check_resched() \ do { \ if (should_resched()) \ __preempt_schedule(); \ } while (0) -#else +#else /* !CONFIG_PREEMPT */ #define preempt_enable() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) -#define preempt_check_resched() do { } while (0) -#endif - -#define preempt_disable_notrace() \ -do { \ - __preempt_count_inc(); \ - barrier(); \ -} while (0) -#define preempt_enable_no_resched_notrace() \ +#define preempt_enable_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) -#ifdef CONFIG_PREEMPT +#define preempt_check_resched() do { } while (0) +#endif /* CONFIG_PREEMPT */ -#define preempt_enable_notrace() \ +#define preempt_disable_notrace() \ do { \ + __preempt_count_inc(); \ barrier(); \ - if (unlikely(__preempt_count_dec_and_test())) \ - __preempt_schedule_notrace(); \ } while (0) -#else -#define preempt_enable_notrace() \ + +#define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) -#endif #else /* !CONFIG_PREEMPT_COUNT */ -- cgit v1.2.3 From b17718d02f54b90978d0e0146368b512b11c3e84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2015 17:30:23 +0200 Subject: sched/stop_machine: Fix deadlock between multiple stop_two_cpus() Jiri reported a machine stuck in multi_cpu_stop() with migrate_swap_stop() as function and with the following src,dst cpu pairs: {11, 4} {13, 11} { 4, 13}

                 4       11      13

        cpuM: queue(4 ,13)
                *Ma
        cpuN: queue(13,11)
                        *N       Na
                *M               Mb
        cpuO: queue(11, 4)
                *O       Oa
                        *Nb
                *Ob

Where *X denotes the cpu running the queueing of cpu-X and X[ab] denotes the first/second queued work. You'll observe the top of the workqueue for each cpu: 4,11,13 to be work from cpus: M, O, N resp. IOW. deadlock. Do away with the queueing trickery and introduce lg_double_lock() to lock both CPUs and fully serialize the stop_two_cpus() callers instead of the partial (and buggy) serialization we have now. Reported-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Borislav Petkov Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150605153023.GH19282@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/lglock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 0081f000e34b..c92ebd100d9b 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -52,10 +52,15 @@ struct lglock { static struct lglock name = { .lock = &name ## _lock } void lg_lock_init(struct lglock *lg, char *name); + void lg_local_lock(struct lglock *lg); void lg_local_unlock(struct lglock *lg); void lg_local_lock_cpu(struct lglock *lg, int cpu); void lg_local_unlock_cpu(struct lglock *lg, int cpu); + +void lg_double_lock(struct lglock *lg, int cpu1, int cpu2); +void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2); + void lg_global_lock(struct lglock *lg); void lg_global_unlock(struct lglock *lg); -- cgit v1.2.3
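[ Editor's illustration, not part of the patch: the kind of serialization lg_double_lock()/lg_double_unlock() provide. This loosely mirrors what kernel/stop_machine.c's stop_two_cpus() does after this change; the lock name two_cpu_stop_lock and the queue_stop_work() helper are placeholders. ]

#include <linux/lglock.h>
#include <linux/stop_machine.h>

DEFINE_STATIC_LGLOCK(two_cpu_stop_lock);

/* placeholder for the real per-CPU stopper enqueue */
static void queue_stop_work(int cpu, struct cpu_stop_work *work);

static void queue_two_stop_works(int cpu1, struct cpu_stop_work *w1,
				 int cpu2, struct cpu_stop_work *w2)
{
	/*
	 * Hold both CPUs' locks before queueing either work, so that two
	 * callers can never interleave their queueing; that interleaving
	 * is exactly the circular "who is on top of whose queue" deadlock
	 * described in the changelog above.
	 */
	lg_double_lock(&two_cpu_stop_lock, cpu1, cpu2);
	queue_stop_work(cpu1, w1);
	queue_stop_work(cpu2, w2);
	lg_double_unlock(&two_cpu_stop_lock, cpu1, cpu2);
}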