From 3289bdb429884c0279bf9ab72dff7b934f19dfc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Apr 2015 13:19:42 +0200 Subject: sched: Move the loadavg code to a more obvious location I could not find the loadavg code.. turns out it was hidden in a file called proc.c. It further got mingled up with the cruft per rq load indexes (which we really want to get rid of). Move the per rq load indexes into the fair.c load-balance code (that's the only thing that uses them) and rename proc.c to loadavg.c so we can find it again. Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Paul Gortmaker Cc: Thomas Gleixner [ Did minor cleanups to the code. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e6122734..85cf253bc366 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -173,7 +173,12 @@ extern unsigned long nr_iowait_cpu(int cpu); extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void update_cpu_load_nohz(void); +#else +static inline void update_cpu_load_nohz(void) { } +#endif extern unsigned long get_parent_ip(unsigned long addr); -- cgit v1.2.3 From b76808e6808e34e7e78131d2b8cb0535622b8e9f Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 Apr 2015 21:19:57 -0700 Subject: signals, sched: Change all uses of JOBCTL_* from 'int' to 'long' c56fb6564dcd ("Fix a misaligned load inside ptrace_attach()") makes jobctl an "unsigned long". It makes sense to have the masks applied to it match that type. This is currently just a cosmetic change, but it will prevent the mask from being unexpectedly truncated if we ever end up with masks with more bits. One instance of "signr" is an int, but I left this alone because the mask ensures that it will never overflow. Signed-off-by: Palmer Dabbelt Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Metcalf Cc: Andrew Morton Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: richard@nod.at Cc: vdavydov@parallels.com Link: http://lkml.kernel.org/r/1430453997-32459-4-git-send-email-palmer@dabbelt.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 85cf253bc366..4f066cb625ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2082,22 +2082,22 @@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ -#define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) -#define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) -#define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) -#define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) -#define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) -#define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) -#define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) +#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) +#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) +#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT) +#define JOBCTL_TRAP_STOP (1UL << JOBCTL_TRAP_STOP_BIT) +#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT) +#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) +#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) extern bool task_set_jobctl_pending(struct task_struct *task, - unsigned int mask); + unsigned long mask); extern void task_clear_jobctl_trapping(struct task_struct *task); extern void task_clear_jobctl_pending(struct task_struct *task, - unsigned int mask); + unsigned long mask); static inline void rcu_copy_process(struct task_struct *p) { -- cgit v1.2.3 From 7e60598785f30cf3dc9e476cc0fc3feeb37a0c63 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 Apr 2015 21:19:56 -0700 Subject: sched/wait: Change wait_on_bit*() to take an unsigned long *, not a void * The implementations of wait_on_bit*() will only work with long-aligned memory on systems that don't support misaligned loads and stores. This patch changes the function prototypes to ensure that the compiler will enforce alignment. Running make defconfig make KFLAGS="-Werror" seems to indicate that, as of c56fb6564dcd ("Fix a misaligned load inside ptrace_attach()"), there are now no users of non-long-aligned calls to wait_on_bit*(). I additionally tried a few "make randconfig" attempts, none of which failed to compile for this reason. Signed-off-by: Palmer Dabbelt Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Metcalf Cc: Andrew Morton Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: richard@nod.at Cc: vdavydov@parallels.com Link: http://lkml.kernel.org/r/1430453997-32459-3-git-send-email-palmer@dabbelt.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2db83349865b..d69ac4ecc88b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -969,7 +969,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *); * on that signal. */ static inline int -wait_on_bit(void *word, int bit, unsigned mode) +wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit(bit, word)) @@ -994,7 +994,7 @@ wait_on_bit(void *word, int bit, unsigned mode) * on that signal. */ static inline int -wait_on_bit_io(void *word, int bit, unsigned mode) +wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit(bit, word)) @@ -1020,7 +1020,8 @@ wait_on_bit_io(void *word, int bit, unsigned mode) * received a signal and the mode permitted wakeup on that signal. */ static inline int -wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout) +wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, + unsigned long timeout) { might_sleep(); if (!test_bit(bit, word)) @@ -1047,7 +1048,8 @@ wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout) * on that signal. */ static inline int -wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) +wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, + unsigned mode) { might_sleep(); if (!test_bit(bit, word)) @@ -1075,7 +1077,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock(void *word, int bit, unsigned mode) +wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) @@ -1099,7 +1101,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode) * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock_io(void *word, int bit, unsigned mode) +wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) @@ -1125,7 +1127,8 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode) * the @mode allows that signal to wake the process. */ static inline int -wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode) +wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, + unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) -- cgit v1.2.3 From e7cc4173115347bcdaa5de2824dd46ef2c58425f Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 30 Apr 2015 21:19:55 -0700 Subject: signals, ptrace, sched: Fix a misaligned load inside ptrace_attach() The misaligned load exception arises when running ptrace_attach() on the RISC-V (which hasn't been upstreamed yet). The problem is that wait_on_bit() takes a void* but then proceeds to call test_bit(), which takes a long*. This allows an int-aligned pointer to be passed to test_bit(), which promptly fails. 
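[ Editor's illustrative sketch, not part of the patch. The call below is a simplified, hypothetical rendering of the ptrace_attach() path; the exact mode argument and surrounding code are omitted: ]

	/* task_struct.jobctl was 'unsigned int', so it may be only int-aligned */
	wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_UNINTERRUPTIBLE);
	/*
	 * wait_on_bit() accepted a void * and internally called
	 * test_bit(bit, word), which dereferences the word as an
	 * unsigned long *.  A long-sized load from an int-aligned
	 * address traps on strict-alignment ports such as RISC-V.
	 */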
This will manifest on any other asm-generic port where unaligned loads trap, where sizeof(long) > sizeof(int), and where task_struct.jobctl ends up not being long-aligned. This patch changes task_struct.jobctl to be a long, which ensures it has the correct alignment. Signed-off-by: Palmer Dabbelt Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chris Metcalf Cc: Andrew Morton Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: bobby.prani@gmail.com Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: richard@nod.at Cc: vdavydov@parallels.com Link: http://lkml.kernel.org/r/1430453997-32459-2-git-send-email-palmer@dabbelt.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4f066cb625ad..fb650a2f4a73 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1374,7 +1374,7 @@ struct task_struct { int exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ - unsigned int jobctl; /* JOBCTL_*, siglock protected */ + unsigned long jobctl; /* JOBCTL_*, siglock protected */ /* Used for emulating ABI behavior of previous Linux versions */ unsigned int personality; -- cgit v1.2.3 From 316c1608d15c736439d4065ed12f306db554b3da Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:20 -0700 Subject: sched, timer: Convert usages of ACCESS_ONCE() in the scheduler to READ_ONCE()/WRITE_ONCE() ACCESS_ONCE doesn't work reliably on non-scalar types. This patch removes the rest of the existing usages of ACCESS_ONCE() in the scheduler, and use the new READ_ONCE() and WRITE_ONCE() APIs as appropriate. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Acked-by: Waiman Long Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1430251224-5764-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index fb650a2f4a73..d70910355b20 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3085,13 +3085,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm) static inline unsigned long task_rlimit(const struct task_struct *tsk, unsigned int limit) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); + return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *tsk, unsigned int limit) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); + return READ_ONCE(tsk->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) -- cgit v1.2.3 From 1018016c706f7ff9f56fde3a649789c47085a293 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:22 -0700 Subject: sched, timer: Replace spinlocks with atomics in thread_group_cputimer(), to improve scalability While running a database workload, we found a scalability issue with itimers. Much of the problem was caused by the thread_group_cputimer spinlock. 
Each time we account for group system/user time, we need to obtain a thread_group_cputimer's spinlock to update the timers. On larger systems (such as a 16 socket machine), this caused more than 30% of total time spent trying to obtain this kernel lock to update these group timer stats. This patch converts the timers to 64-bit atomic variables and use atomic add to update them without a lock. With this patch, the percent of total time spent updating thread group cputimer timers was reduced from 30% down to less than 1%. Note: On 32-bit systems using the generic 64-bit atomics, this causes sample_group_cputimer() to take locks 3 times instead of just 1 time. However, we tested this patch on a 32-bit system ARM system using the generic atomics and did not find the overhead to be much of an issue. An explanation for why this isn't an issue is that 32-bit systems usually have small numbers of CPUs, and cacheline contention from extra spinlocks called periodically is not really apparent on smaller systems. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Cc: Waiman Long Link: http://lkml.kernel.org/r/1430251224-5764-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 7 ++++--- include/linux/sched.h | 10 +++------- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 696d22312b31..7b9d8b59e7bf 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -50,9 +50,10 @@ extern struct fs_struct init_fs; .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ .cputimer = { \ - .cputime = INIT_CPUTIME, \ - .running = 0, \ - .lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock), \ + .utime = ATOMIC64_INIT(0), \ + .stime = ATOMIC64_INIT(0), \ + .sum_exec_runtime = ATOMIC64_INIT(0), \ + .running = 0 \ }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index d70910355b20..a45874c3fab6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -598,9 +598,10 @@ struct task_cputime { * used for thread group CPU timer calculations. */ struct thread_group_cputimer { - struct task_cputime cputime; + atomic64_t utime; + atomic64_t stime; + atomic64_t sum_exec_runtime; int running; - raw_spinlock_t lock; }; #include @@ -2967,11 +2968,6 @@ static __always_inline bool need_resched(void) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); -static inline void thread_group_cputime_init(struct signal_struct *sig) -{ - raw_spin_lock_init(&sig->cputimer.lock); -} - /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. 
-- cgit v1.2.3 From 971e8a985482c76487edb5a49811e99b96e846e1 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:23 -0700 Subject: sched, timer: Provide an atomic 'struct task_cputime' data structure This patch adds an atomic variant of the 'struct task_cputime' data structure, which can be used to store and update task_cputime statistics without needing to do locking. Suggested-by: Ingo Molnar Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Cc: Waiman Long Link: http://lkml.kernel.org/r/1430251224-5764-5-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a45874c3fab6..6eb78cd45da7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -572,6 +572,23 @@ struct task_cputime { .sum_exec_runtime = 0, \ } +/* + * This is the atomic variant of task_cputime, which can be used for + * storing and updating task_cputime statistics without locking. + */ +struct task_cputime_atomic { + atomic64_t utime; + atomic64_t stime; + atomic64_t sum_exec_runtime; +}; + +#define INIT_CPUTIME_ATOMIC \ + (struct task_cputime_atomic) { \ + .utime = ATOMIC64_INIT(0), \ + .stime = ATOMIC64_INIT(0), \ + .sum_exec_runtime = ATOMIC64_INIT(0), \ + } + #ifdef CONFIG_PREEMPT_COUNT #define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) #else -- cgit v1.2.3 From 7110744516276e906f9197e2857d026eb2343393 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Apr 2015 13:00:24 -0700 Subject: sched, timer: Use the atomic task_cputime in thread_group_cputimer Recent optimizations were made to thread_group_cputimer to improve its scalability by keeping track of cputime stats without a lock. However, the values were open coded to the structure, causing them to be at a different abstraction level from the regular task_cputime structure. Furthermore, any subsequent similar optimizations would not be able to share the new code, since they are specific to thread_group_cputimer. This patch adds the new task_cputime_atomic data structure (introduced in the previous patch in the series) to thread_group_cputimer for keeping track of the cputime atomically, which also helps generalize the code. Suggested-by: Ingo Molnar Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Andrew Morton Cc: Aswin Chandramouleeswaran Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Scott J Norton Cc: Steven Rostedt Cc: Waiman Long Link: http://lkml.kernel.org/r/1430251224-5764-6-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 6 ++---- include/linux/sched.h | 4 +--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 7b9d8b59e7bf..bb9b075f0eb0 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -50,10 +50,8 @@ extern struct fs_struct init_fs; .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ .cputimer = { \ - .utime = ATOMIC64_INIT(0), \ - .stime = ATOMIC64_INIT(0), \ - .sum_exec_runtime = ATOMIC64_INIT(0), \ - .running = 0 \ + .cputime_atomic = INIT_CPUTIME_ATOMIC, \ + .running = 0, \ }, \ .cred_guard_mutex = \ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 6eb78cd45da7..4adc536a3b03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -615,9 +615,7 @@ struct task_cputime_atomic { * used for thread group CPU timer calculations. */ struct thread_group_cputimer { - atomic64_t utime; - atomic64_t stime; - atomic64_t sum_exec_runtime; + struct task_cputime_atomic cputime_atomic; int running; }; -- cgit v1.2.3 From 7675104990ed255b9315a82ae827ff312a2a88a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2015 08:27:50 -0700 Subject: sched: Implement lockless wake-queues This is useful for locking primitives that can effect multiple wakeups per operation and want to avoid lock internal lock contention by delaying the wakeups until we've released the lock internal locks. Alternatively it can be used to avoid issuing multiple wakeups, and thus save a few cycles, in packet processing. Queue all target tasks and wakeup once you've processed all packets. That way you avoid waking the target task multiple times if there were multiple packets for the same task. Properties of a wake_q are: - Lockless, as queue head must reside on the stack. - Being a queue, maintains wakeup order passed by the callers. This can be important for otherwise, in scenarios where highly contended locks could affect any reliance on lock fairness. - A queued task cannot be added again until it is woken up. This patch adds the needed infrastructure into the scheduler code and uses the new wake_list to delay the futex wakeups until after we've released the hash bucket locks. Signed-off-by: Peter Zijlstra (Intel) [tweaks, adjustments, comments, etc.] Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Borislav Petkov Cc: Chris Mason Cc: Davidlohr Bueso Cc: George Spelvin Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Manfred Spraul Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1430494072-30283-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/sched.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4adc536a3b03..254d88e80f65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -920,6 +920,50 @@ enum cpu_idle_type { #define SCHED_CAPACITY_SHIFT 10 #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) +/* + * Wake-queues are lists of tasks with a pending wakeup, whose + * callers have already marked the task as woken internally, + * and can thus carry on. A common use case is being able to + * do the wakeups once the corresponding user lock as been + * released. + * + * We hold reference to each task in the list across the wakeup, + * thus guaranteeing that the memory is still valid by the time + * the actual wakeups are performed in wake_up_q(). + * + * One per task suffices, because there's never a need for a task to be + * in two wake queues simultaneously; it is forbidden to abandon a task + * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is + * already in a wake queue, the wakeup will happen soon and the second + * waker can just skip it. + * + * The WAKE_Q macro declares and initializes the list head. + * wake_up_q() does NOT reinitialize the list; it's expected to be + * called near the end of a function, where the fact that the queue is + * not used again will be easy to see by inspection. + * + * Note that this can cause spurious wakeups. schedule() callers + * must ensure the call is done inside a loop, confirming that the + * wakeup condition has in fact occurred. + */ +struct wake_q_node { + struct wake_q_node *next; +}; + +struct wake_q_head { + struct wake_q_node *first; + struct wake_q_node **lastp; +}; + +#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) + +#define WAKE_Q(name) \ + struct wake_q_head name = { WAKE_Q_TAIL, &name.first } + +extern void wake_q_add(struct wake_q_head *head, + struct task_struct *task); +extern void wake_up_q(struct wake_q_head *head); + /* * sched-domains (multiprocessor balancing) declarations: */ @@ -1532,6 +1576,8 @@ struct task_struct { /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; + struct wake_q_node wake_q; + #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ struct rb_root pi_waiters; -- cgit v1.2.3 From 920ce39f6c204d4ce4d8acebe7522f0dfa95f662 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 8 May 2015 14:31:50 -0700 Subject: sched, timer: Fix documentation for 'struct thread_group_cputimer' Fix the docbook build bug reported by Fengguang Wu. 
Reported-by: Fengguang Wu Signed-off-by: Jason Low Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hp.com Cc: aswin@hp.com Cc: bp@alien8.de Cc: dave@stgolabs.net Cc: fweisbec@gmail.com Cc: mgorman@suse.de Cc: oleg@redhat.com Cc: paulmck@linux.vnet.ibm.com Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Cc: rostedt@goodmis.org Cc: scott.norton@hp.com Cc: torvalds@linux-foundation.org Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/1431120710.5136.12.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 254d88e80f65..0eceeec5a01a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -606,10 +606,9 @@ struct task_cputime_atomic { /** * struct thread_group_cputimer - thread group interval timer counts - * @cputime: thread group interval timers. + * @cputime_atomic: atomic thread group interval timers. * @running: non-zero when there are timers running and * @cputime receives updates. - * @lock: lock for fields in this struct. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. -- cgit v1.2.3 From 8c8a457a60050d5922676f81913d87e4af6fd97b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 14 May 2015 14:31:01 +0300 Subject: sched: Remove redundant #ifdef Two adjacent members in task_struct were guarded by the same #define, so we can merge the two blocks. Signed-off-by: Nikolay Borisov Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431603061-29408-1-git-send-email-kernel@kyup.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0eceeec5a01a..5f8defa155cf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1398,8 +1398,6 @@ struct task_struct { int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ -#ifdef CONFIG_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU -- cgit v1.2.3 From 92cf211874e954027b8e91cc9a15485a50b58d6b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:46 +0200 Subject: sched/preempt: Merge preempt_mask.h into preempt.h preempt_mask.h defines all the preempt_count semantics and related symbols: preempt, softirq, hardirq, nmi, preempt active, need resched, etc... preempt.h defines the accessors and mutators of preempt_count. But there is a messy dependency game around those two header files: * preempt_mask.h includes preempt.h in order to access preempt_count() * preempt_mask.h defines all preempt_count semantic and symbols except PREEMPT_NEED_RESCHED that is needed by asm/preempt.h Thus we need to define it from preempt.h, right before including asm/preempt.h, instead of defining it to preempt_mask.h with the other preempt_count symbols. Therefore the preempt_count semantics happen to be spread out. * We plan to introduce preempt_active_[enter,exit]() to consolidate preempt_schedule*() code. But we'll need to access both preempt_count mutators (preempt_count_add()) and preempt_count symbols (PREEMPT_ACTIVE, PREEMPT_OFFSET). 
The usual place to define preempt operations is in preempt.h but then we'll need symbols in preempt_mask.h which already includes preempt.h. So we end up with a ressource circle dependency. Lets merge preempt_mask.h into preempt.h to solve these dependency issues. This way we gather semantic symbols and operation definition of preempt_count in a single file. This is a dumb copy-paste merge. Further merge re-arrangments are performed in a subsequent patch to ease review. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/bottom_half.h | 1 - include/linux/hardirq.h | 2 +- include/linux/preempt.h | 111 ++++++++++++++++++++++++++++++++++++++++ include/linux/preempt_mask.h | 117 ------------------------------------------- include/linux/sched.h | 2 +- 5 files changed, 113 insertions(+), 120 deletions(-) delete mode 100644 include/linux/preempt_mask.h (limited to 'include/linux') diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 86c12c93e3cf..8fdcb783197d 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -2,7 +2,6 @@ #define _LINUX_BH_H #include -#include #ifdef CONFIG_TRACE_IRQFLAGS extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f4af03404b97..dfd59d6bc6f0 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -1,7 +1,7 @@ #ifndef LINUX_HARDIRQ_H #define LINUX_HARDIRQ_H -#include +#include #include #include #include diff --git a/include/linux/preempt.h b/include/linux/preempt.h index de83b4eb1642..8cc0338a5e9a 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -17,6 +17,117 @@ #include +/* + * We put the hardirq and softirq counter into the preemption + * counter. The bitmask has the following meaning: + * + * - bits 0-7 are the preemption count (max preemption depth: 256) + * - bits 8-15 are the softirq count (max # of softirqs: 256) + * + * The hardirq count could in theory be the same as the number of + * interrupts in the system, but we run all interrupt handlers with + * interrupts disabled, so we cannot have nesting interrupts. Though + * there are a few palaeontologic drivers which reenable interrupts in + * the handler, so we need more than one bit here. 
+ * + * PREEMPT_MASK: 0x000000ff + * SOFTIRQ_MASK: 0x0000ff00 + * HARDIRQ_MASK: 0x000f0000 + * NMI_MASK: 0x00100000 + * PREEMPT_ACTIVE: 0x00200000 + */ +#define PREEMPT_BITS 8 +#define SOFTIRQ_BITS 8 +#define HARDIRQ_BITS 4 +#define NMI_BITS 1 + +#define PREEMPT_SHIFT 0 +#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) +#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) +#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) + +#define __IRQ_MASK(x) ((1UL << (x))-1) + +#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) +#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) +#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) +#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) + +#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) +#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) +#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) +#define NMI_OFFSET (1UL << NMI_SHIFT) + +#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) + +#define PREEMPT_ACTIVE_BITS 1 +#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) +#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) + +#define hardirq_count() (preempt_count() & HARDIRQ_MASK) +#define softirq_count() (preempt_count() & SOFTIRQ_MASK) +#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ + | NMI_MASK)) + +/* + * Are we doing bottom half or hardware interrupt processing? + * Are we in a softirq context? Interrupt context? + * in_softirq - Are we currently processing softirq or have bh disabled? + * in_serving_softirq - Are we currently processing softirq? + */ +#define in_irq() (hardirq_count()) +#define in_softirq() (softirq_count()) +#define in_interrupt() (irq_count()) +#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) + +/* + * Are we in NMI context? + */ +#define in_nmi() (preempt_count() & NMI_MASK) + +#if defined(CONFIG_PREEMPT_COUNT) +# define PREEMPT_CHECK_OFFSET 1 +#else +# define PREEMPT_CHECK_OFFSET 0 +#endif + +/* + * The preempt_count offset needed for things like: + * + * spin_lock_bh() + * + * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and + * softirqs, such that unlock sequences of: + * + * spin_unlock(); + * local_bh_enable(); + * + * Work as expected. + */ +#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) + +/* + * Are we running in atomic context? WARNING: this macro cannot + * always detect atomic context; in particular, it cannot know about + * held spinlocks in non-preemptible kernels. Thus it should not be + * used in the general case to determine whether sleeping is possible. + * Do not use in_atomic() in driver code. 
+ */ +#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) + +/* + * Check whether we were atomic before we did preempt_disable(): + * (used by the scheduler, *after* releasing the kernel lock) + */ +#define in_atomic_preempt_off() \ + ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) + +#ifdef CONFIG_PREEMPT_COUNT +# define preemptible() (preempt_count() == 0 && !irqs_disabled()) +#else +# define preemptible() 0 +#endif + #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h deleted file mode 100644 index dbeec4d4a3be..000000000000 --- a/include/linux/preempt_mask.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef LINUX_PREEMPT_MASK_H -#define LINUX_PREEMPT_MASK_H - -#include - -/* - * We put the hardirq and softirq counter into the preemption - * counter. The bitmask has the following meaning: - * - * - bits 0-7 are the preemption count (max preemption depth: 256) - * - bits 8-15 are the softirq count (max # of softirqs: 256) - * - * The hardirq count could in theory be the same as the number of - * interrupts in the system, but we run all interrupt handlers with - * interrupts disabled, so we cannot have nesting interrupts. Though - * there are a few palaeontologic drivers which reenable interrupts in - * the handler, so we need more than one bit here. - * - * PREEMPT_MASK: 0x000000ff - * SOFTIRQ_MASK: 0x0000ff00 - * HARDIRQ_MASK: 0x000f0000 - * NMI_MASK: 0x00100000 - * PREEMPT_ACTIVE: 0x00200000 - */ -#define PREEMPT_BITS 8 -#define SOFTIRQ_BITS 8 -#define HARDIRQ_BITS 4 -#define NMI_BITS 1 - -#define PREEMPT_SHIFT 0 -#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) -#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) -#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) - -#define __IRQ_MASK(x) ((1UL << (x))-1) - -#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) -#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) -#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) -#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) - -#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) -#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) -#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) -#define NMI_OFFSET (1UL << NMI_SHIFT) - -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) - -#define PREEMPT_ACTIVE_BITS 1 -#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) -#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) - -#define hardirq_count() (preempt_count() & HARDIRQ_MASK) -#define softirq_count() (preempt_count() & SOFTIRQ_MASK) -#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ - | NMI_MASK)) - -/* - * Are we doing bottom half or hardware interrupt processing? - * Are we in a softirq context? Interrupt context? - * in_softirq - Are we currently processing softirq or have bh disabled? - * in_serving_softirq - Are we currently processing softirq? - */ -#define in_irq() (hardirq_count()) -#define in_softirq() (softirq_count()) -#define in_interrupt() (irq_count()) -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) - -/* - * Are we in NMI context? 
- */ -#define in_nmi() (preempt_count() & NMI_MASK) - -#if defined(CONFIG_PREEMPT_COUNT) -# define PREEMPT_CHECK_OFFSET 1 -#else -# define PREEMPT_CHECK_OFFSET 0 -#endif - -/* - * The preempt_count offset needed for things like: - * - * spin_lock_bh() - * - * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and - * softirqs, such that unlock sequences of: - * - * spin_unlock(); - * local_bh_enable(); - * - * Work as expected. - */ -#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) - -/* - * Are we running in atomic context? WARNING: this macro cannot - * always detect atomic context; in particular, it cannot know about - * held spinlocks in non-preemptible kernels. Thus it should not be - * used in the general case to determine whether sleeping is possible. - * Do not use in_atomic() in driver code. - */ -#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) - -/* - * Check whether we were atomic before we did preempt_disable(): - * (used by the scheduler, *after* releasing the kernel lock) - */ -#define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) - -#ifdef CONFIG_PREEMPT_COUNT -# define preemptible() (preempt_count() == 0 && !irqs_disabled()) -#else -# define preemptible() 0 -#endif - -#endif /* LINUX_PREEMPT_MASK_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5f8defa155cf..c53a1784d7a9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -25,7 +25,7 @@ struct sched_param { #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 2e10e71ce88e3eaccfd09a045ae6ecebe657ba09 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:47 +0200 Subject: sched/preempt: Rearrange a few symbols after headers merge Adjust a few comments, and further integrate a few definitions after the dumb headers copy. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 8cc0338a5e9a..37974cd4f092 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -9,14 +9,6 @@ #include #include -/* - * We use the MSB mostly because its available; see for - * the other bits -- can't include that header due to inclusion hell. - */ -#define PREEMPT_NEED_RESCHED 0x80000000 - -#include - /* * We put the hardirq and softirq counter into the preemption * counter. The bitmask has the following meaning: @@ -30,11 +22,12 @@ * there are a few palaeontologic drivers which reenable interrupts in * the handler, so we need more than one bit here. 
* - * PREEMPT_MASK: 0x000000ff - * SOFTIRQ_MASK: 0x0000ff00 - * HARDIRQ_MASK: 0x000f0000 - * NMI_MASK: 0x00100000 - * PREEMPT_ACTIVE: 0x00200000 + * PREEMPT_MASK: 0x000000ff + * SOFTIRQ_MASK: 0x0000ff00 + * HARDIRQ_MASK: 0x000f0000 + * NMI_MASK: 0x00100000 + * PREEMPT_ACTIVE: 0x00200000 + * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 @@ -64,6 +57,12 @@ #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) #define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) +/* We use the MSB mostly because its available */ +#define PREEMPT_NEED_RESCHED 0x80000000 + +/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ +#include + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ @@ -122,12 +121,6 @@ #define in_atomic_preempt_off() \ ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) -#ifdef CONFIG_PREEMPT_COUNT -# define preemptible() (preempt_count() == 0 && !irqs_disabled()) -#else -# define preemptible() 0 -#endif - #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); @@ -160,6 +153,8 @@ do { \ #define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#define preemptible() (preempt_count() == 0 && !irqs_disabled()) + #ifdef CONFIG_PREEMPT #define preempt_enable() \ do { \ @@ -232,6 +227,7 @@ do { \ #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() +#define preemptible() 0 #endif /* CONFIG_PREEMPT_COUNT */ -- cgit v1.2.3 From 90b62b5129d5cb50f62f40e684de7a1961e57197 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:48 +0200 Subject: sched/preempt: Rename PREEMPT_CHECK_OFFSET to PREEMPT_DISABLE_OFFSET "CHECK" suggests it's only used as a comparison mask. But now it's used further as a config-conditional preempt disabler offset. Lets disambiguate this name. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 37974cd4f092..4689ef210a13 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -85,9 +85,9 @@ #define in_nmi() (preempt_count() & NMI_MASK) #if defined(CONFIG_PREEMPT_COUNT) -# define PREEMPT_CHECK_OFFSET 1 +# define PREEMPT_DISABLE_OFFSET 1 #else -# define PREEMPT_CHECK_OFFSET 0 +# define PREEMPT_DISABLE_OFFSET 0 #endif /* @@ -103,7 +103,7 @@ * * Work as expected. */ -#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET) +#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_DISABLE_OFFSET) /* * Are we running in atomic context? 
WARNING: this macro cannot @@ -119,7 +119,7 @@ * (used by the scheduler, *after* releasing the kernel lock) */ #define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) + ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void preempt_count_add(int val); -- cgit v1.2.3 From b30f0e3ffedfa52b1d67a302ae5860c49998e5e2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:49 +0200 Subject: sched/preempt: Optimize preemption operations on __schedule() callers __schedule() disables preemption and some of its callers (the preempt_schedule*() family) also set PREEMPT_ACTIVE. So we have two preempt_count() modifications that could be performed at once. Lets remove the preemption disablement from __schedule() and pull this responsibility to its callers in order to optimize preempt_count() operations in a single place. Suggested-by: Linus Torvalds Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 4689ef210a13..45da394f2779 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -137,6 +137,18 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) +#define preempt_active_enter() \ +do { \ + preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ + barrier(); \ +} while (0) + +#define preempt_active_exit() \ +do { \ + barrier(); \ + preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \ +} while (0) + #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ -- cgit v1.2.3 From e017cf21ae82e0b36f026b22083a8ae67926f465 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:50 +0200 Subject: sched/preempt: Fix out of date comment Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 45da394f2779..4057696c641c 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -116,7 +116,7 @@ /* * Check whether we were atomic before we did preempt_disable(): - * (used by the scheduler, *after* releasing the kernel lock) + * (used by the scheduler) */ #define in_atomic_preempt_off() \ ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET) -- cgit v1.2.3 From 3e51f3c4004c9b01f66da03214a3e206f5ed627b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 12 May 2015 16:41:51 +0200 Subject: sched/preempt: Remove PREEMPT_ACTIVE unmasking off in_atomic() Now that PREEMPT_ACTIVE implies PREEMPT_DISABLE_OFFSET, ignoring PREEMPT_ACTIVE from in_atomic() check isn't useful anymore. 
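[ Editor's note: the caller-side changes of 'Optimize preemption operations on __schedule() callers' above live in kernel/sched/core.c and are not shown in this include/linux-limited series; a simplified sketch of how a preempt_schedule*()-style caller uses the new helpers: ]

	preempt_active_enter();		/* PREEMPT_ACTIVE + disable offset in one preempt_count_add() */
	__schedule();			/* no longer disables preemption by itself */
	preempt_active_exit();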
Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1431441711-29753-7-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 4057696c641c..a1a00e14c14f 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -112,7 +112,7 @@ * used in the general case to determine whether sleeping is possible. * Do not use in_atomic() in driver code. */ -#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) +#define in_atomic() (preempt_count() != 0) /* * Check whether we were atomic before we did preempt_disable(): -- cgit v1.2.3 From 8bcbde5480f9777f8b74d71493722c663e22c21b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:06 +0200 Subject: sched/preempt, mm/fault: Count pagefault_disable() levels in pagefault_disabled Until now, pagefault_disable()/pagefault_enabled() used the preempt count to track whether in an environment with pagefaults disabled (can be queried via in_atomic()). This patch introduces a separate counter in task_struct to count the level of pagefault_disable() calls. We'll keep manipulating the preempt count to retain compatibility to existing pagefault handlers. It is now possible to verify whether in a pagefault_disable() envionment by calling pagefault_disabled(). In contrast to in_atomic() it will not be influenced by preempt_enable()/preempt_disable(). This patch is based on a patch from Ingo Molnar. Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-2-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + include/linux/uaccess.h | 36 +++++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c53a1784d7a9..dd07ac03f82a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1788,6 +1788,7 @@ struct task_struct { #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; #endif + int pagefault_disabled; }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ecd3319dac33..23290cc93a24 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,20 +2,36 @@ #define __LINUX_UACCESS_H__ #include +#include #include +static __always_inline void pagefault_disabled_inc(void) +{ + current->pagefault_disabled++; +} + +static __always_inline void pagefault_disabled_dec(void) +{ + current->pagefault_disabled--; + WARN_ON(current->pagefault_disabled < 0); +} + /* - * These routines enable/disable the pagefault handler in that - * it will not take any locks and go straight to the fixup table. 
+ * These routines enable/disable the pagefault handler. If disabled, it will + * not take any locks and go straight to the fixup table. + * + * We increase the preempt and the pagefault count, to be able to distinguish + * whether we run in simple atomic context or in a real pagefault_disable() + * context. + * + * For now, after pagefault_disabled() has been called, we run in atomic + * context. User access methods will not sleep. * - * They have great resemblance to the preempt_disable/enable calls - * and in fact they are identical; this is because currently there is - * no other way to make the pagefault handlers do this. So we do - * disable preemption but we don't necessarily care about that. */ static inline void pagefault_disable(void) { preempt_count_inc(); + pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault * can hit. @@ -25,18 +41,24 @@ static inline void pagefault_disable(void) static inline void pagefault_enable(void) { -#ifndef CONFIG_PREEMPT /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); + pagefault_disabled_dec(); +#ifndef CONFIG_PREEMPT preempt_count_dec(); #else preempt_enable(); #endif } +/* + * Is the pagefault handler disabled? If so, user access methods will not sleep. + */ +#define pagefault_disabled() (current->pagefault_disabled != 0) + #ifndef ARCH_HAS_NOCACHE_UACCESS static inline unsigned long __copy_from_user_inatomic_nocache(void *to, -- cgit v1.2.3 From 9ec23531fd48031d1b6ca5366f5f967d17a8bc28 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:07 +0200 Subject: sched/preempt, mm/fault: Trigger might_sleep() in might_fault() with disabled pagefaults Commit 662bbcb2747c ("mm, sched: Allow uaccess in atomic with pagefault_disable()") removed might_sleep() checks for all user access code (that uses might_fault()). The reason was to disable wrong "sleep in atomic" warnings in the following scenario: pagefault_disable() rc = copy_to_user(...) pagefault_enable() Which is valid, as pagefault_disable() increments the preempt counter and therefore disables the pagefault handler. copy_to_user() will not sleep and return an error code if a page is not available. However, as all might_sleep() checks are removed, CONFIG_DEBUG_ATOMIC_SLEEP would no longer detect the following scenario: spin_lock(&lock); rc = copy_to_user(...) spin_unlock(&lock) If the kernel is compiled with preemption turned on, preempt_disable() will make in_atomic() detect disabled preemption. The fault handler would correctly never sleep on user access. However, with preemption turned off, preempt_disable() is usually a NOP (with !CONFIG_PREEMPT_COUNT), therefore in_atomic() will not be able to detect disabled preemption nor disabled pagefaults. The fault handler could sleep. We really want to enable CONFIG_DEBUG_ATOMIC_SLEEP checks for user access functions again, otherwise we can end up with horrible deadlocks. Root of all evil is that pagefault_disable() acts almost as preempt_disable(), depending on preemption being turned on/off. As we now have pagefault_disabled(), we can use it to distinguish whether user acces functions might sleep. Convert might_fault() into a makro that calls __might_fault(), to allow proper file + line messages in case of a might_sleep() warning. 
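[ Editor's note: the two patterns from the changelog above, annotated; an illustrative sketch only, 'lock', 'ubuf' and 'data' are placeholders: ]

	pagefault_disable();
	rc = copy_to_user(ubuf, &data, sizeof(data));	/* OK: fails instead of sleeping */
	pagefault_enable();

	spin_lock(&lock);
	rc = copy_to_user(ubuf, &data, sizeof(data));	/* bug: may sleep under a spinlock; caught
							   again by might_fault() -> might_sleep()
							   with this series */
	spin_unlock(&lock);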
Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-3-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3a5b48e52a9e..060dd7b61c6d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -244,7 +244,8 @@ static inline u32 reciprocal_scale(u32 val, u32 ep_ro) #if defined(CONFIG_MMU) && \ (defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)) -void might_fault(void); +#define might_fault() __might_fault(__FILE__, __LINE__) +void __might_fault(const char *file, int line); #else static inline void might_fault(void) { } #endif -- cgit v1.2.3 From 2cb7c9cb426660b5ed58b643d9e7dd5d50ba901f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:09 +0200 Subject: sched/preempt, mm/kmap: Explicitly disable/enable preemption in kmap_atomic_* The existing code relies on pagefault_disable() implicitly disabling preemption, so that no schedule will happen between kmap_atomic() and kunmap_atomic(). Let's make this explicit, to prepare for pagefault_disable() not touching preemption anymore. 
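[ Editor's note: illustrative effect of the change above on a kmap_atomic() section; 'page', 'offset', 'src' and 'len' are placeholders: ]

	void *vaddr = kmap_atomic(page);	/* now explicitly: preempt_disable() + pagefault_disable() */
	memcpy(vaddr + offset, src, len);	/* no scheduling, no faulting in here */
	kunmap_atomic(vaddr);			/* pagefault_enable() + preempt_enable() */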
Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-5-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/highmem.h | 2 ++ include/linux/io-mapping.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 9286a46b7d69..6aefcd0031a6 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -65,6 +65,7 @@ static inline void kunmap(struct page *page) static inline void *kmap_atomic(struct page *page) { + preempt_disable(); pagefault_disable(); return page_address(page); } @@ -73,6 +74,7 @@ static inline void *kmap_atomic(struct page *page) static inline void __kunmap_atomic(void *addr) { pagefault_enable(); + preempt_enable(); } #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 657fab4efab3..c27dde7215b5 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -141,6 +141,7 @@ static inline void __iomem * io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) { + preempt_disable(); pagefault_disable(); return ((char __force __iomem *) mapping) + offset; } @@ -149,6 +150,7 @@ static inline void io_mapping_unmap_atomic(void __iomem *vaddr) { pagefault_enable(); + preempt_enable(); } /* Non-atomic map/unmap */ -- cgit v1.2.3 From 70ffdb9393a7264a069265edded729078dcf0425 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:11 +0200 Subject: mm/fault, arch: Use pagefault_disable() to check for disabled pagefaults in the handler Introduce faulthandler_disabled() and use it to check for irq context and disabled pagefaults (via pagefault_disable()) in the pagefault handlers. Please note that we keep the in_atomic() checks in place - to detect whether in irq context (in which case preemption is always properly disabled). In contrast, preempt_disable() should never be used to disable pagefaults. With !CONFIG_PREEMPT_COUNT, preempt_disable() doesn't modify the preempt counter, and therefore the result of in_atomic() differs. We validate that condition by using might_fault() checks when calling might_sleep(). Therefore, add a comment to faulthandler_disabled(), describing why this is needed. faulthandler_disabled() and pagefault_disable() are defined in linux/uaccess.h, so let's properly add that include to all relevant files. This patch is based on a patch from Thomas Gleixner. 
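[ Editor's note: sketch of the entry check an architecture's fault handler performs with the new helper; arch code is outside this include/linux-limited series and the snippet is simplified: ]

	/* in an arch do_page_fault() */
	if (faulthandler_disabled() || !current->mm) {
		/*
		 * Pagefaults disabled or no user context: we must not sleep
		 * or take mmap_sem, so resolve via the exception fixup path.
		 */
		...
	}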
Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-7-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/uaccess.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 23290cc93a24..90786d2d74e5 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -59,6 +59,18 @@ static inline void pagefault_enable(void) */ #define pagefault_disabled() (current->pagefault_disabled != 0) +/* + * The pagefault handler is in general disabled by pagefault_disable() or + * when in irq context (via in_atomic()). + * + * This function should only be used by the fault handlers. Other users should + * stick to pagefault_disabled(). + * Please NEVER use preempt_disable() to disable the fault handler. With + * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled. + * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT. + */ +#define faulthandler_disabled() (pagefault_disabled() || in_atomic()) + #ifndef ARCH_HAS_NOCACHE_UACCESS static inline unsigned long __copy_from_user_inatomic_nocache(void *to, -- cgit v1.2.3 From 8222dbe21e79338de92d5e1956cd1e3994cc9f93 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 May 2015 17:52:20 +0200 Subject: sched/preempt, mm/fault: Decouple preemption from the page fault logic As the fault handlers now all rely on the pagefault_disabled() checks, and the implicit preempt_disable() calls by pagefault_disable() have been made explicit, we can rely completely on the pagefault_disabled counter. So let's no longer touch the preempt count when disabling/enabling pagefaults. After a call to pagefault_disable(), pagefault_disabled() will return true, but in_atomic() won't.
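[ Editor's illustration, not part of the patch: what the decoupled semantics look like to a caller. peek_user_word() is a made-up futex-style helper; pagefault_disable()/pagefault_enable() and __copy_from_user_inatomic() are the real interfaces. ]

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/uaccess.h>

static int peek_user_word(u32 __user *uaddr, u32 *val)
{
	unsigned long left;

	pagefault_disable();	/* only increments current->pagefault_disabled now */
	left = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
	pagefault_enable();

	/*
	 * Inside the section above pagefault_disabled() was true, so the
	 * copy could not sleep; the preempt count was left untouched, so
	 * in_atomic() need not have been true and the task stayed
	 * preemptible (unless preemption was disabled for other reasons).
	 */
	return left ? -EFAULT : 0;
}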
Reviewed-and-tested-by: Thomas Gleixner Signed-off-by: David Hildenbrand Signed-off-by: Peter Zijlstra (Intel) Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bigeasy@linutronix.de Cc: borntraeger@de.ibm.com Cc: daniel.vetter@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: hocko@suse.cz Cc: hughd@google.com Cc: mst@redhat.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: schwidefsky@de.ibm.com Cc: yang.shi@windriver.com Link: http://lkml.kernel.org/r/1431359540-32227-16-git-send-email-dahi@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/linux/uaccess.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 90786d2d74e5..ae572c138607 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -1,7 +1,6 @@ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ -#include <linux/preempt.h> #include <linux/sched.h> #include <asm/uaccess.h> @@ -20,17 +19,11 @@ static __always_inline void pagefault_disabled_dec(void) * These routines enable/disable the pagefault handler. If disabled, it will * not take any locks and go straight to the fixup table. * - * We increase the preempt and the pagefault count, to be able to distinguish - * whether we run in simple atomic context or in a real pagefault_disable() - * context. - * - * For now, after pagefault_disabled() has been called, we run in atomic - * context. User access methods will not sleep. - * + * User access methods will not sleep when called from a pagefault_disabled() + * environment. */ static inline void pagefault_disable(void) { - preempt_count_inc(); pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault @@ -47,11 +40,6 @@ static inline void pagefault_enable(void) */ barrier(); pagefault_disabled_dec(); -#ifndef CONFIG_PREEMPT - preempt_count_dec(); -#else - preempt_enable(); -#endif } /* -- cgit v1.2.3 From 80ed87c8a9ca0cad7ca66cf3bbdfb17559a66dcf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2015 14:23:45 +0200 Subject: sched/wait: Introduce TASK_NOLOAD and TASK_IDLE Currently people use TASK_INTERRUPTIBLE to idle kthreads and wait for 'work' because TASK_UNINTERRUPTIBLE contributes to the loadavg. Having all idle kthreads contribute to the loadavg is somewhat silly. Now mostly this works OK, because kthreads have all their signals masked. However, there are a few sites where this is causing problems and TASK_UNINTERRUPTIBLE should be used, except for that loadavg issue. This patch adds TASK_NOLOAD which, when combined with TASK_UNINTERRUPTIBLE, avoids the loadavg accounting. As most of the imagined usage sites are loops where a thread wants to idle while waiting for work, a TASK_IDLE helper is introduced.
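[ Editor's illustration, not part of the patch: the intended TASK_IDLE wait loop in a kthread. idle_worker_fn(), have_work() and do_work() are placeholders; TASK_IDLE and the set_current_state()/schedule() pattern are the real interface. ]

#include <linux/kthread.h>
#include <linux/sched.h>

/* placeholders for a driver's own work-availability check and processing */
static bool have_work(void *data);
static void do_work(void *data);

static int idle_worker_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* uninterruptible sleep that does not count towards the loadavg */
		set_current_state(TASK_IDLE);
		if (!have_work(data) && !kthread_should_stop())
			schedule();
		__set_current_state(TASK_RUNNING);

		while (have_work(data))
			do_work(data);
	}
	return 0;
}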
Signed-off-by: Peter Zijlstra (Intel) Cc: Julian Anastasov Cc: Linus Torvalds Cc: NeilBrown Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index dd07ac03f82a..7de815c6fa78 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -218,9 +218,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); #define TASK_WAKEKILL 128 #define TASK_WAKING 256 #define TASK_PARKED 512 -#define TASK_STATE_MAX 1024 +#define TASK_NOLOAD 1024 +#define TASK_STATE_MAX 2048 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP" +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; @@ -230,6 +231,8 @@ extern char ___assert_task_state[1 - 2*!!( #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) +#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) + /* Convenience macros for the sake of wake_up */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) @@ -245,7 +248,8 @@ extern char ___assert_task_state[1 - 2*!!( ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ - (task->flags & PF_FROZEN) == 0) + (task->flags & PF_FROZEN) == 0 && \ + (task->state & TASK_NOLOAD) == 0) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -- cgit v1.2.3 From 06931e62246844c73fba24d7aeb4a5dc897a2739 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 May 2015 15:11:28 +0200 Subject: sched/topology: Rename topology_thread_cpumask() to topology_sibling_cpumask() Rename topology_thread_cpumask() to topology_sibling_cpumask() for more consistency with scheduler code. Signed-off-by: Bartosz Golaszewski Reviewed-by: Thomas Gleixner Acked-by: Russell King Acked-by: Catalin Marinas Cc: Benoit Cousson Cc: Fenghua Yu Cc: Guenter Roeck Cc: Jean Delvare Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Oleg Drokin Cc: Peter Zijlstra Cc: Rafael J. 
Wysocki Cc: Russell King Cc: Viresh Kumar Link: http://lkml.kernel.org/r/1432645896-12588-2-git-send-email-bgolaszewski@baylibre.com Signed-off-by: Ingo Molnar --- include/linux/topology.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 909b6e43b694..73ddad1e0fa3 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -191,8 +191,8 @@ static inline int cpu_to_mem(int cpu) #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif -#ifndef topology_thread_cpumask -#define topology_thread_cpumask(cpu) cpumask_of(cpu) +#ifndef topology_sibling_cpumask +#define topology_sibling_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) @@ -201,7 +201,7 @@ static inline int cpu_to_mem(int cpu) #ifdef CONFIG_SCHED_SMT static inline const struct cpumask *cpu_smt_mask(int cpu) { - return topology_thread_cpumask(cpu); + return topology_sibling_cpumask(cpu); } #endif -- cgit v1.2.3 From 4eaca0a887eaee04fc7a3866d0f5b51b34030dfa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Jun 2015 17:39:08 +0200 Subject: preempt: Use preempt_schedule_context() as the official tracing preemption point preempt_schedule_context() is a tracing-safe preemption point but it's only used when CONFIG_CONTEXT_TRACKING=y. Other configs have tracing recursion issues since commit: b30f0e3ffedf ("sched/preempt: Optimize preemption operations on __schedule() callers") introduced function based preempt_count_*() ops. Let's make it available on all configs and give it a more appropriate name for its new position. Reported-by: Fengguang Wu Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433432349-1021-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index a1a00e14c14f..7686dd63bc35 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -204,15 +204,11 @@ do { \ #ifdef CONFIG_PREEMPT -#ifndef CONFIG_CONTEXT_TRACKING -#define __preempt_schedule_context() __preempt_schedule() -#endif - #define preempt_enable_notrace() \ do { \ barrier(); \ if (unlikely(__preempt_count_dec_and_test())) \ - __preempt_schedule_context(); \ + __preempt_schedule_notrace(); \ } while (0) #else #define preempt_enable_notrace() \ -- cgit v1.2.3 From 9a92e3dc6ad02208a014d0d8404ebbd697e3d5ef Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Jun 2015 17:39:09 +0200 Subject: preempt: Reorganize the notrace definitions a bit preempt.h has two separate "#ifdef CONFIG_PREEMPT" sections: one to define preempt_enable() and another to define preempt_enable_notrace(). Let's gather both. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Fengguang Wu Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1433432349-1021-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 7686dd63bc35..0f1534acaf60 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -175,48 +175,46 @@ do { \ __preempt_schedule(); \ } while (0) +#define preempt_enable_notrace() \ +do { \ + barrier(); \ + if (unlikely(__preempt_count_dec_and_test())) \ + __preempt_schedule_notrace(); \ +} while (0) + #define preempt_check_resched() \ do { \ if (should_resched()) \ __preempt_schedule(); \ } while (0) -#else +#else /* !CONFIG_PREEMPT */ #define preempt_enable() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) -#define preempt_check_resched() do { } while (0) -#endif - -#define preempt_disable_notrace() \ -do { \ - __preempt_count_inc(); \ - barrier(); \ -} while (0) -#define preempt_enable_no_resched_notrace() \ +#define preempt_enable_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) -#ifdef CONFIG_PREEMPT +#define preempt_check_resched() do { } while (0) +#endif /* CONFIG_PREEMPT */ -#define preempt_enable_notrace() \ +#define preempt_disable_notrace() \ do { \ + __preempt_count_inc(); \ barrier(); \ - if (unlikely(__preempt_count_dec_and_test())) \ - __preempt_schedule_notrace(); \ } while (0) -#else -#define preempt_enable_notrace() \ + +#define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) -#endif #else /* !CONFIG_PREEMPT_COUNT */ -- cgit v1.2.3 From b17718d02f54b90978d0e0146368b512b11c3e84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2015 17:30:23 +0200 Subject: sched/stop_machine: Fix deadlock between multiple stop_two_cpus() Jiri reported a machine stuck in multi_cpu_stop() with migrate_swap_stop() as function and with the following src,dst cpu pairs: {11, 4} {13, 11} { 4, 13}

                 4       11      13

        cpuM: queue(4 ,13)
                *Ma
        cpuN: queue(13,11)
                        *N       Na
                *M               Mb
        cpuO: queue(11, 4)
                *O       Oa
                        *Nb
                *Ob

Where *X denotes the cpu running the queueing of cpu-X and X[ab] denotes the first/second queued work. You'll observe the top of the workqueue for each cpu: 4,11,13 to be work from cpus: M, O, N resp. IOW. deadlock. Do away with the queueing trickery and introduce lg_double_lock() to lock both CPUs and fully serialize the stop_two_cpus() callers instead of the partial (and buggy) serialization we have now. Reported-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Borislav Petkov Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150605153023.GH19282@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/lglock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 0081f000e34b..c92ebd100d9b 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -52,10 +52,15 @@ struct lglock { static struct lglock name = { .lock = &name ## _lock } void lg_lock_init(struct lglock *lg, char *name); + void lg_local_lock(struct lglock *lg); void lg_local_unlock(struct lglock *lg); void lg_local_lock_cpu(struct lglock *lg, int cpu); void lg_local_unlock_cpu(struct lglock *lg, int cpu); + +void lg_double_lock(struct lglock *lg, int cpu1, int cpu2); +void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2); + void lg_global_lock(struct lglock *lg); void lg_global_unlock(struct lglock *lg); -- cgit v1.2.3
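[ Editor's illustration, not part of the patch: the kind of serialization lg_double_lock()/lg_double_unlock() provide. This loosely mirrors what kernel/stop_machine.c's stop_two_cpus() does after this change; the lock name two_cpu_stop_lock and the queue_stop_work() helper are placeholders. ]

#include <linux/lglock.h>
#include <linux/stop_machine.h>

DEFINE_STATIC_LGLOCK(two_cpu_stop_lock);

/* placeholder for the real per-CPU stopper enqueue */
static void queue_stop_work(int cpu, struct cpu_stop_work *work);

static void queue_two_stop_works(int cpu1, struct cpu_stop_work *w1,
				 int cpu2, struct cpu_stop_work *w2)
{
	/*
	 * Hold both CPUs' locks before queueing either work, so that two
	 * callers can never interleave their queueing; that interleaving
	 * is exactly the circular "who is on top of whose queue" deadlock
	 * described in the changelog above.
	 */
	lg_double_lock(&two_cpu_stop_lock, cpu1, cpu2);
	queue_stop_work(cpu1, w1);
	queue_stop_work(cpu2, w2);
	lg_double_unlock(&two_cpu_stop_lock, cpu1, cpu2);
}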