From 7bd36014460f793c19e7d6c94dab67b0afcfcb7f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Sep 2013 16:50:56 -0700 Subject: timekeeping: Fix HRTICK related deadlock from ntp lock changes Gerlando Falauto reported that when HRTICK is enabled, it is possible to trigger system deadlocks. These were hard to reproduce, as HRTICK has been broken in the past, but seemed to be connected to the timekeeping_seq lock. Since seqlock/seqcount's aren't supported w/ lockdep, I added some extra spinlock based locking and triggered the following lockdep output: [ 15.849182] ntpd/4062 is trying to acquire lock: [ 15.849765] (&(&pool->lock)->rlock){..-...}, at: [] __queue_work+0x145/0x480 [ 15.850051] [ 15.850051] but task is already holding lock: [ 15.850051] (timekeeper_lock){-.-.-.}, at: [] do_adjtimex+0x7f/0x100 [ 15.850051] Chain exists of: &(&pool->lock)->rlock --> &p->pi_lock --> timekeeper_lock [ 15.850051] Possible unsafe locking scenario: [ 15.850051] [ 15.850051] CPU0 CPU1 [ 15.850051] ---- ---- [ 15.850051] lock(timekeeper_lock); [ 15.850051] lock(&p->pi_lock); [ 15.850051] lock(timekeeper_lock); [ 15.850051] lock(&(&pool->lock)->rlock); [ 15.850051] [ 15.850051] *** DEADLOCK *** The deadlock was introduced by 06c017fdd4dc48451a ("timekeeping: Hold timekeepering locks in do_adjtimex and hardpps") in 3.10 This patch avoids this deadlock, by moving the call to schedule_delayed_work() outside of the timekeeper lock critical section. Reported-by: Gerlando Falauto Tested-by: Lin Ming Signed-off-by: John Stultz Cc: Mathieu Desnoyers Cc: stable #3.11, 3.10 Link: http://lkml.kernel.org/r/1378943457-27314-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 6 ++---- kernel/time/timekeeping.c | 2 ++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8f5b3b98577b..bb2215174f05 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -516,13 +516,13 @@ static void sync_cmos_clock(struct work_struct *work) schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } -static void notify_cmos_timer(void) +void ntp_notify_cmos_timer(void) { schedule_delayed_work(&sync_cmos_work, 0); } #else -static inline void notify_cmos_timer(void) { } +void ntp_notify_cmos_timer(void) { } #endif @@ -687,8 +687,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; - notify_cmos_timer(); - return result; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 48b9fffabdc2..947ba25a95a0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1703,6 +1703,8 @@ int do_adjtimex(struct timex *txc) write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + ntp_notify_cmos_timer(); + return ret; } -- cgit v1.2.3 From 6c9a27f5da9609fca46cb2b183724531b48f71ad Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Tue, 10 Sep 2013 18:16:36 +0900 Subject: sched/fair: Fix small race where child->se.parent,cfs_rq might point to invalid ones There is a small race between copy_process() and cgroup_attach_task() where child->se.parent,cfs_rq points to invalid (old) ones. parent doing fork() | someone moving the parent to another cgroup -------------------------------+--------------------------------------------- copy_process() + dup_task_struct() -> parent->se is copied to child->se. se.parent,cfs_rq of them point to old ones. 
cgroup_attach_task() + cgroup_task_migrate() -> parent->cgroup is updated. + cpu_cgroup_attach() + sched_move_task() + task_move_group_fair() +- set_task_rq() -> se.parent,cfs_rq of parent are updated. + cgroup_fork() -> parent->cgroup is copied to child->cgroup. (*1) + sched_fork() + task_fork_fair() -> se.parent,cfs_rq of child are accessed while they point to old ones. (*2) In the worst case, this bug can lead to "use-after-free" and cause a panic, because it's new cgroup's refcount that is incremented at (*1), so the old cgroup(and related data) can be freed before (*2). In fact, a panic caused by this bug was originally caught in RHEL6.4. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] sched_slice+0x6e/0xa0 [...] Call Trace: [] place_entity+0x75/0xa0 [] task_fork_fair+0xaa/0x160 [] sched_fork+0x6b/0x140 [] copy_process+0x5b2/0x1450 [] ? wake_up_new_task+0xd9/0x130 [] do_fork+0x94/0x460 [] ? sys_wait4+0xae/0x100 [] sys_clone+0x28/0x30 [] stub_clone+0x13/0x20 [] ? system_call_fastpath+0x16/0x1b Signed-off-by: Daisuke Nishimura Signed-off-by: Peter Zijlstra Cc: Link: http://lkml.kernel.org/r/039601ceae06$733d3130$59b79390$@mxp.nes.nec.co.jp Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b3fe1cd8f40..11cd13667359 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5928,11 +5928,15 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - if (unlikely(task_cpu(p) != this_cpu)) { - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - } + /* + * Not only the cpu but also the task_group of the parent might have + * been changed after parent->se.parent,cfs_rq were copied to + * child->se.parent,cfs_rq. So call __set_task_cpu() to make those + * of child point to valid ones. + */ + rcu_read_lock(); + __set_task_cpu(p, this_cpu); + rcu_read_unlock(); update_curr(cfs_rq); -- cgit v1.2.3 From fc840914e9b07ab4685c195e1e54e58de4f84c03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Sep 2013 13:01:41 +0200 Subject: sched/debug: Take PID namespace into account Emmanuel reported that /proc/sched_debug didn't report the right PIDs when using namespaces, cure this. 
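To see why "the PID" becomes ambiguous once PID namespaces are involved, here is a small standalone demonstration (an illustration only, not part of the patch; the program, its names and its 64 KiB stack are invented for this sketch, and CLONE_NEWPID needs CAP_SYS_ADMIN, so run it as root): the same task reports itself as PID 1 inside its own namespace while the parent sees a different, global number, so any reporting interface has to pick one numbering deliberately, which is what the switch from the raw p->pid field to the task_pid_nr() helper in the hunks below is about.

    #define _GNU_SOURCE
    #include <sched.h>
    #include <signal.h>
    #include <stdio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static char child_stack[64 * 1024];

    static int child_fn(void *arg)
    {
        (void)arg;
        /* Inside the new PID namespace this prints 1. */
        printf("child : getpid() = %d\n", getpid());
        return 0;
    }

    int main(void)
    {
        /* CLONE_NEWPID requires CAP_SYS_ADMIN. */
        pid_t pid = clone(child_fn, child_stack + sizeof(child_stack),
                          CLONE_NEWPID | SIGCHLD, NULL);

        if (pid < 0) {
            perror("clone");
            return 1;
        }
        /* The parent sees the very same task under its global number. */
        printf("parent: child pid  = %d\n", (int)pid);
        waitpid(pid, NULL, 0);
        return 0;
    }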
Reported-by: Emmanuel Deloget Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130909110141.GM31370@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e076bddd4c66..196559994f7c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -124,7 +124,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, " "); SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", - p->comm, p->pid, + p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); @@ -289,7 +289,7 @@ do { \ P(nr_load_updates); P(nr_uninterruptible); PN(next_balance); - P(curr->pid); + SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); P(cpu_load[0]); P(cpu_load[1]); @@ -492,7 +492,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), get_nr_threads(p)); SEQ_printf(m, "---------------------------------------------------------" -- cgit v1.2.3 From 13b62e46d5407c7d619aea1dc9c3e0991b631b57 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 16 Sep 2013 11:30:36 +0300 Subject: sched: Fix comment for sched_info_depart sched_info_depart seems to be only called from sched_info_switch(), so only on involuntary task switch. Fix the comment to match. Signed-off-by: Michael S. Tsirkin Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: KOSAKI Motohiro Link: http://lkml.kernel.org/r/20130916083036.GA1113@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/stats.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 5aef494fc8b4..c7edee71bce8 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -104,8 +104,9 @@ static inline void sched_info_queued(struct task_struct *t) } /* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. + * Called when a process ceases being the active-running process involuntarily + * due, typically, to expiring its time slice (this may also be called when + * switching to the idle task). Now we can calculate how long we ran. * Also, if the process is still in the TASK_RUNNING state, call * sched_info_queued() to mark that it has now again started waiting on * the runqueue. -- cgit v1.2.3 From fa7315871046b9a4c48627905691dbde57e51033 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Sep 2013 10:16:42 +0200 Subject: perf: Fix capabilities bitfield compatibility in 'struct perf_event_mmap_page' Solve the problems around the broken definition of perf_event_mmap_page:: cap_usr_time and cap_usr_rdpmc fields which used to overlap, partially fixed by: 860f085b74e9 ("perf: Fix broken union in 'struct perf_event_mmap_page'") The problem with the fix (merged in v3.12-rc1 and not yet released officially), noticed by Vince Weaver is that the new behavior is not detectable by new user-space, and that due to the reuse of the field names it's easy to mis-compile a binary if old headers are used on a new kernel or new headers are used on an old kernel. 
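The overlap in the pre-860f085b74e9 layout is easy to reproduce in isolation. The toy union below is a simplified stand-in, not the real uapi header: declaring the two capability flags as sibling bit-field members of a union makes both of them start at bit 0, so they alias the same storage and an old binary cannot tell them apart.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Simplified stand-in for the broken layout: two one-bit members of a
     * union both live at bit offset 0 and therefore alias each other. */
    union broken_caps {
        uint64_t capabilities;
        uint64_t cap_usr_time  : 1,
                 cap_usr_rdpmc : 1;
    };

    int main(void)
    {
        union broken_caps c;

        memset(&c, 0, sizeof(c));
        c.cap_usr_time = 1;

        /* On common ABIs both flags now read back as 1, although only
         * one of them was set. */
        printf("cap_usr_time=%d cap_usr_rdpmc=%d capabilities=%#llx\n",
               (int)c.cap_usr_time, (int)c.cap_usr_rdpmc,
               (unsigned long long)c.capabilities);
        return 0;
    }

Because binaries built against that layout are already in the wild, a quiet fix of the header is not enough; the bit has to stay permanently clear and the new capabilities have to be advertised somewhere detectable, which is what the rework described next does.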
To solve all that make this change explicit, detectable and self-contained, by iterating the ABI the following way: - Always clear bit 0, and rename it to usrpage->cap_bit0, to at least not confuse old user-space binaries. RDPMC will be marked as unavailable to old binaries but that's within the ABI, this is a capability bit. - Rename bit 1 to ->cap_bit0_is_deprecated and always set it to 1, so new libraries can reliably detect that bit 0 is deprecated and perma-zero without having to check the kernel version. - Use bits 2, 3, 4 for the newly defined, correct functionality: cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ cap_user_time : 1, /* The time_* fields are used */ cap_user_time_zero : 1, /* The time_zero field is used */ - Rename all the bitfield names in perf_event.h to be different from the old names, to make sure it's not possible to mis-compile it accidentally with old assumptions. The 'size' field can then be used in the future to add new fields and it will act as a natural ABI version indicator as well. Also adjust tools/perf/ userspace for the new definitions, noticed by Adrian Hunter. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Also-Fixed-by: Adrian Hunter Link: http://lkml.kernel.org/n/tip-zr03yxjrpXesOzzupszqglbv@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index dd236b66ca3a..cb4238e85b38 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3660,6 +3660,26 @@ static void calc_timer_values(struct perf_event *event, *running = ctx_time - event->tstamp_running; } +static void perf_event_init_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb) + goto unlock; + + userpg = rb->user_page; + + /* Allow new userspace to detect that bit 0 is deprecated */ + userpg->cap_bit0_is_deprecated = 1; + userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + +unlock: + rcu_read_unlock(); +} + void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { } @@ -4044,6 +4064,7 @@ again: ring_buffer_attach(event, rb); rcu_assign_pointer(event->rb, rb); + perf_event_init_userpage(event); perf_event_update_userpage(event); unlock: -- cgit v1.2.3 From b18855500fc40da050512d9df82d2f1471e59642 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sun, 15 Sep 2013 17:49:13 +0400 Subject: sched/balancing: Fix 'local->avg_load > sds->avg_load' case in calculate_imbalance() In busiest->group_imb case we can come to calculate_imbalance() with local->avg_load >= busiest->avg_load >= sds->avg_load. This can result in imbalance overflow, because it is calculated as follows env->imbalance = min( max_pull * busiest->group_power, (sds->avg_load - local->avg_load) * local->group_power) / SCHED_POWER_SCALE; As a result we can end up constantly bouncing tasks from one cpu to another if there are pinned tasks. Fix this by skipping the assignment and assuming imbalance=0 in case local->avg_load > sds->avg_load. [ The bug can be caught by running 2*N cpuhogs pinned to two logical cpus belonging to different cores on an HT-enabled machine with N logical cpus: just look at se.nr_migrations growth. 
] Signed-off-by: Vladimir Davydov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/8f596cc6bc0e5e655119dc892c9bfcad26e971f4.1379252740.git.vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11cd13667359..0b99aae339cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4896,7 +4896,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * max load less than avg load(as we skip the groups at or below * its cpu_power, while calculating max_load..) */ - if (busiest->avg_load < sds->avg_load) { + if (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load) { env->imbalance = 0; return fix_small_imbalance(env, sds); } -- cgit v1.2.3 From 3029ede39373c368f402a76896600d85a4f7121b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sun, 15 Sep 2013 17:49:14 +0400 Subject: sched/balancing: Fix 'local->avg_load > busiest->avg_load' case in fix_small_imbalance() In busiest->group_imb case we can come to fix_small_imbalance() with local->avg_load > busiest->avg_load. This can result in wrong imbalance fix-up, because there is the following check there where all the members are unsigned: if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { env->imbalance = busiest->load_per_task; return; } As a result we can end up constantly bouncing tasks from one cpu to another if there are pinned tasks. Fix it by substituting the subtraction with an equivalent addition in the check. [ The bug can be caught by running 2*N cpuhogs pinned to two logical cpus belonging to different cores on an HT-enabled machine with N logical cpus: just look at se.nr_migrations growth. ] Signed-off-by: Vladimir Davydov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/ef167822e5c5b2d96cf5b0e3e4f4bdff3f0414a2.1379252740.git.vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b99aae339cb..2aedaccebcc8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4823,8 +4823,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) (busiest->load_per_task * SCHED_POWER_SCALE) / busiest->group_power; - if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= - (scaled_busy_load_per_task * imbn)) { + if (busiest->avg_load + scaled_busy_load_per_task >= + local->avg_load + (scaled_busy_load_per_task * imbn)) { env->imbalance = busiest->load_per_task; return; } -- cgit v1.2.3 From 7e3115ef5149fc502e3a2e80719dba54a8e7409d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sat, 14 Sep 2013 19:39:46 +0400 Subject: sched/balancing: Fix cfs_rq->task_h_load calculation Patch a003a2 (sched: Consider runnable load average in move_tasks()) sets all top-level cfs_rqs' h_load to rq->avg.load_avg_contrib, which is always 0. This mistype leads to all tasks having weight 0 when load balancing in a cpu-cgroup enabled setup. There obviously should be sum of weights of all runnable tasks there instead. Fix it. 
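The effect of seeding the walk with zero can be seen in a toy model of the hierarchical h_load computation (a deliberately simplified sketch with made-up numbers, not the kernel's exact arithmetic): each level scales the parent's h_load by the group entity's share of the parent's runnable load, so a zero at the top zeroes every task's apparent load below it.

    #include <stdio.h>

    /* Toy model: h_load propagates from the root down, each level scaled
     * by the group's share of its parent's runnable load. */
    static unsigned long child_h_load(unsigned long parent_h_load,
                                      unsigned long group_contrib,
                                      unsigned long parent_runnable)
    {
        return parent_h_load * group_contrib / (parent_runnable + 1);
    }

    int main(void)
    {
        unsigned long runnable_load_avg = 2048; /* sum over runnable tasks */
        unsigned long group_contrib = 1024;     /* the group entity's share */

        /* Buggy seed: rq->avg.load_avg_contrib, which is always 0 here. */
        unsigned long h_bug = child_h_load(0, group_contrib,
                                           runnable_load_avg);

        /* Fixed seed: the root cfs_rq's runnable_load_avg. */
        unsigned long h_fix = child_h_load(runnable_load_avg, group_contrib,
                                           runnable_load_avg);

        printf("buggy seed -> group h_load = %lu\n", h_bug); /* 0     */
        printf("fixed seed -> group h_load = %lu\n", h_fix); /* ~1024 */
        return 0;
    }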
Signed-off-by: Vladimir Davydov Reviewed-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1379173186-11944-1-git-send-email-vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2aedaccebcc8..7c70201fbc61 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4242,7 +4242,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = rq->avg.load_avg_contrib; + cfs_rq->h_load = cfs_rq->runnable_load_avg; cfs_rq->last_h_load_update = now; } -- cgit v1.2.3 From 359e6fab6600562073162348cd4c18c5958296d8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 24 Sep 2013 15:27:29 -0700 Subject: watchdog: update watchdog attributes atomically proc_dowatchdog doesn't synchronize multiple callers which might lead to confusion when two parallel callers might confuse watchdog_enable_all_cpus resp watchdog_disable_all_cpus (eg watchdog gets enabled even if watchdog_thresh was set to 0 already). This patch adds a local mutex which synchronizes callers to the sysctl handler. Signed-off-by: Michal Hocko Cc: Frederic Weisbecker Acked-by: Don Zickus Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 51c4f34d258e..ced7d0609931 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -520,13 +520,15 @@ int proc_dowatchdog(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int err, old_thresh, old_enabled; + static DEFINE_MUTEX(watchdog_proc_mutex); + mutex_lock(&watchdog_proc_mutex); old_thresh = ACCESS_ONCE(watchdog_thresh); old_enabled = ACCESS_ONCE(watchdog_user_enabled); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (err || !write) - return err; + goto out; set_sample_period(); /* @@ -544,7 +546,8 @@ int proc_dowatchdog(struct ctl_table *table, int write, watchdog_thresh = old_thresh; watchdog_user_enabled = old_enabled; } - +out: + mutex_unlock(&watchdog_proc_mutex); return err; } #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3 From 9809b18fcf6b8d8ec4d3643677345907e6b50eca Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 24 Sep 2013 15:27:30 -0700 Subject: watchdog: update watchdog_thresh properly watchdog_tresh controls how often nmi perf event counter checks per-cpu hrtimer_interrupts counter and blows up if the counter hasn't changed since the last check. The counter is updated by per-cpu watchdog_hrtimer hrtimer which is scheduled with 2/5 watchdog_thresh period which guarantees that hrtimer is scheduled 2 times per the main period. Both hrtimer and perf event are started together when the watchdog is enabled. So far so good. But... But what happens when watchdog_thresh is updated from sysctl handler? proc_dowatchdog will set a new sampling period and hrtimer callback (watchdog_timer_fn) will use the new value in the next round. The problem, however, is that nobody tells the perf event that the sampling period has changed so it is ticking with the period configured when it has been set up. This might result in an ear ripping dissonance between perf and hrtimer parts if the watchdog_thresh is increased. And even worse it might lead to KABOOM if the watchdog is configured to panic on such a spurious lockup. 
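The dissonance is purely arithmetic, as the toy timeline below shows (a simplified model with invented numbers, not the kernel's actual bookkeeping): after the threshold is raised, the hrtimer advances hrtimer_interrupts only every 2/5 of the new threshold, while the NMI keeps sampling at the old, shorter period, so the NMI repeatedly finds the counter unchanged and reports a hard lockup although nothing is stuck.

    #include <stdio.h>

    int main(void)
    {
        int old_thresh = 10, new_thresh = 60;    /* seconds             */
        int nmi_period = old_thresh;             /* perf event, stale   */
        int hrtimer_period = new_thresh * 2 / 5; /* 24s after update    */
        int last_ticks = 0;

        for (int t = nmi_period; t <= 120; t += nmi_period) {
            int ticks = t / hrtimer_period;      /* hrtimer_interrupts  */

            if (ticks == last_ticks)
                printf("t=%3ds: counter unchanged -> spurious hard lockup\n", t);
            else
                printf("t=%3ds: counter advanced, all good\n", t);
            last_ticks = ticks;
        }
        return 0;
    }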
This patch fixes the issue by updating both nmi perf even counter and hrtimers if the threshold value has changed. The nmi one is disabled and then reinitialized from scratch. This has an unpleasant side effect that the allocation of the new event might fail theoretically so the hard lockup detector would be disabled for such cpus. On the other hand such a memory allocation failure is very unlikely because the original event is deallocated right before. It would be much nicer if we just changed perf event period but there doesn't seem to be any API to do that right now. It is also unfortunate that perf_event_alloc uses GFP_KERNEL allocation unconditionally so we cannot use on_each_cpu() and do the same thing from the per-cpu context. The update from the current CPU should be safe because perf_event_disable removes the event atomically before it clears the per-cpu watchdog_ev so it cannot change anything under running handler feet. The hrtimer is simply restarted (thanks to Don Zickus who has pointed this out) if it is queued because we cannot rely it will fire&adopt to the new sampling period before a new nmi event triggers (when the treshold is decreased). [akpm@linux-foundation.org: the UP version of __smp_call_function_single ended up in the wrong place] Signed-off-by: Michal Hocko Acked-by: Don Zickus Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Fabio Estevam Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ced7d0609931..4431610f049a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -static int watchdog_enable_all_cpus(void) +static void restart_watchdog_hrtimer(void *info) +{ + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + int ret; + + /* + * No need to cancel and restart hrtimer if it is currently executing + * because it will reprogram itself with the new period now. + * We should never see it unqueued here because we are running per-cpu + * with interrupts disabled. + */ + ret = hrtimer_try_to_cancel(hrtimer); + if (ret == 1) + hrtimer_start(hrtimer, ns_to_ktime(sample_period), + HRTIMER_MODE_REL_PINNED); +} + +static void update_timers(int cpu) +{ + struct call_single_data data = {.func = restart_watchdog_hrtimer}; + /* + * Make sure that perf event counter will adopt to a new + * sampling period. Updating the sampling period directly would + * be much nicer but we do not have an API for that now so + * let's use a big hammer. + * Hrtimer will adopt the new period on the next tick but this + * might be late already so we have to restart the timer as well. 
+ */ + watchdog_nmi_disable(cpu); + __smp_call_function_single(cpu, &data, 1); + watchdog_nmi_enable(cpu); +} + +static void update_timers_all_cpus(void) +{ + int cpu; + + get_online_cpus(); + preempt_disable(); + for_each_online_cpu(cpu) + update_timers(cpu); + preempt_enable(); + put_online_cpus(); +} + +static int watchdog_enable_all_cpus(bool sample_period_changed) { int err = 0; @@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void) pr_err("Failed to create watchdog threads, disabled\n"); else watchdog_running = 1; + } else if (sample_period_changed) { + update_timers_all_cpus(); } return err; @@ -537,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, * watchdog_*_all_cpus() function takes care of this. */ if (watchdog_user_enabled && watchdog_thresh) - err = watchdog_enable_all_cpus(); + err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); else watchdog_disable_all_cpus(); @@ -557,5 +604,5 @@ void __init lockup_detector_init(void) set_sample_period(); if (watchdog_user_enabled) - watchdog_enable_all_cpus(); + watchdog_enable_all_cpus(false); } -- cgit v1.2.3 From 8ac1c8d5deba65513b6a82c35e89e73996c8e0d6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 24 Sep 2013 15:27:42 -0700 Subject: audit: fix endless wait in audit_log_start() After commit 829199197a43 ("kernel/audit.c: avoid negative sleep durations") audit emitters will block forever if userspace daemon cannot handle backlog. After the timeout the waiting loop turns into busy loop and runs until daemon dies or returns back to work. This is a minimal patch for that bug. Signed-off-by: Konstantin Khlebnikov Cc: Luiz Capitulino Cc: Richard Guy Briggs Cc: Eric Paris Cc: Chuck Anderson Cc: Dan Duval Cc: Dave Kleikamp Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 91e53d04b6a9..7b0e23a740ce 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1117,9 +1117,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, sleep_time = timeout_start + audit_backlog_wait_time - jiffies; - if ((long)sleep_time > 0) + if ((long)sleep_time > 0) { wait_for_auditd(sleep_time); - continue; + continue; + } } if (audit_rate_check() && printk_ratelimit()) printk(KERN_WARNING -- cgit v1.2.3 From e2f0b88e84eed9cf9797f0a88c8012ee0b885a6d Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Tue, 24 Sep 2013 15:27:43 -0700 Subject: kernel/reboot.c: re-enable the function of variable reboot_default Commit 1b3a5d02ee07 ("reboot: move arch/x86 reboot= handling to generic kernel") did some cleanup for reboot= command line, but it made the reboot_default inoperative. The default value of variable reboot_default should be 1, and if command line reboot= is not set, system will use the default reboot mode. 
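The intent is the classic "default until overridden" flag, sketched schematically below (the names mirror the kernel's, but the parsing and quirk hooks are simplified stand-ins, not copies of kernel/reboot.c or the x86 DMI code): the flag starts at 1, seeing reboot= on the command line clears it, and quirk code only overrides the reboot type while the flag is still set.

    #include <stdio.h>

    static int reboot_default = 1;            /* 1 until reboot= is seen   */
    static char reboot_type = 'a';            /* 'a' standing in for ACPI  */

    static void reboot_setup(const char *arg) /* stand-in for the __setup hook */
    {
        reboot_default = 0;                   /* an explicit reboot= wins  */
        reboot_type = arg[0];
    }

    static void apply_dmi_quirk(char quirk_type)
    {
        /* Quirks must not override an explicit command-line choice. */
        if (reboot_default)
            reboot_type = quirk_type;
    }

    int main(void)
    {
        apply_dmi_quirk('b');                 /* no reboot=: quirk applies */
        printf("no cmdline   : type=%c\n", reboot_type);

        reboot_setup("triple");               /* reboot=triple given       */
        apply_dmi_quirk('b');                 /* quirk is now suppressed   */
        printf("reboot=triple: type=%c\n", reboot_type);
        return 0;
    }

With reboot_default initialized to 0, the second branch never fires either, so quirks stop working even when no reboot= option was given, which is exactly the regression this patch undoes.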
[akpm@linux-foundation.org: fix comment layout] Signed-off-by: Li Fei Signed-off-by: liu chuansheng Acked-by: Robin Holt Cc: [3.11.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 269ed9384cc4..f813b3474646 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -32,7 +32,14 @@ EXPORT_SYMBOL(cad_pid); #endif enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; -int reboot_default; +/* + * This variable is used privately to keep track of whether or not + * reboot_type is still set to its default value (i.e., reboot= hasn't + * been set on the command line). This is needed so that we can + * suppress DMI scanning for reboot quirks. Without it, it's + * impossible to override a faulty reboot quirk without recompiling. + */ +int reboot_default = 1; int reboot_cpu; enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; -- cgit v1.2.3 From 0c06a5d4b13cd66c833805a0d1db76b977944aac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Sep 2013 00:54:17 +0200 Subject: arm: Fix build error with context tracking calls ad65782fba50 (context_tracking: Optimize main APIs off case with static key) converted context tracking main APIs to inline function and left ARM asm callers behind. This can be easily fixed by making ARM calling the post static keys context tracking function. We just need to replicate the static key checks there. We'll remove these later when ARM will support the context tracking static keys. Reported-by: Guenter Roeck Reported-by: Russell King Signed-off-by: Frederic Weisbecker Tested-by: Kevin Hilman Cc: Nicolas Pitre Cc: Anil Kumar Cc: Tony Lindgren Cc: Benoit Cousson Cc: Guenter Roeck Cc: Russell King Cc: Kevin Hilman --- kernel/context_tracking.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 247091bf0587..859c8dfd78a1 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -50,6 +50,15 @@ void context_tracking_user_enter(void) { unsigned long flags; + /* + * Repeat the user_enter() check here because some archs may be calling + * this from asm and if no CPU needs context tracking, they shouldn't + * go further. Repeat the check here until they support the static key + * check. + */ + if (!static_key_false(&context_tracking_enabled)) + return; + /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: @@ -151,6 +160,9 @@ void context_tracking_user_exit(void) { unsigned long flags; + if (!static_key_false(&context_tracking_enabled)) + return; + if (in_interrupt()) return; -- cgit v1.2.3 From 3a126f85e015701e56240884f27f97543580d5f7 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 27 Sep 2013 13:17:39 -0700 Subject: kernel/params: fix handling of signed integer types Commit 6072ddc8520b ("kernel: replace strict_strto*() with kstrto*()") broke the handling of signed integer types, fix it. 
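The breakage is the classic signed-value-through-an-unsigned-parser mistake. The sketch below models it in userspace with toy stand-ins (toy_kstrtoul and toy_kstrtol are not the kernel implementations; the point is only that the kernel's unsigned helpers refuse negative input rather than wrapping it the way userspace strtoul() does): once a signed parameter is routed through the unsigned helper, a value such as -1 can no longer be set.

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Toy stand-ins: the "unsigned" parser rejects a sign outright,
     * the "signed" one accepts it. */
    static int toy_kstrtoul(const char *s, unsigned long *res)
    {
        char *end;

        if (*s == '-')
            return -EINVAL;          /* unsigned parse: no sign allowed */
        errno = 0;
        *res = strtoul(s, &end, 0);
        return (errno || *end) ? -EINVAL : 0;
    }

    static int toy_kstrtol(const char *s, long *res)
    {
        char *end;

        errno = 0;
        *res = strtol(s, &end, 0);
        return (errno || *end) ? -EINVAL : 0;
    }

    /* A signed int parameter wired (wrongly) to the unsigned helper ... */
    static int set_int_buggy(const char *val, int *out)
    {
        unsigned long tmp;
        int err = toy_kstrtoul(val, &tmp);

        if (!err)
            *out = (int)tmp;
        return err;
    }

    /* ... and (correctly) to the signed helper, as the patch does. */
    static int set_int_fixed(const char *val, int *out)
    {
        long tmp;
        int err = toy_kstrtol(val, &tmp);

        if (!err)
            *out = (int)tmp;
        return err;
    }

    int main(void)
    {
        int v = 0, err;

        err = set_int_buggy("-1", &v);
        printf("buggy parser: \"-1\" -> err=%d\n", err);        /* rejected  */

        err = set_int_fixed("-1", &v);
        printf("fixed parser: \"-1\" -> err=%d, v=%d\n", err, v); /* 0, v=-1 */
        return 0;
    }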
Signed-off-by: Jean Delvare Reported-by: Christian Kujau Tested-by: Christian Kujau Cc: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 81c4e78c8f4c..c00d5b502aa4 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -254,11 +254,11 @@ int parse_args(const char *doing, STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtoul); +STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, kstrtoul); +STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, kstrtoul); +STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); int param_set_charp(const char *val, const struct kernel_param *kp) -- cgit v1.2.3
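With the conversion helpers matching the signedness of their types again, a plain signed module parameter accepts negative values as before. A hypothetical example module, with the module name, parameter name and values invented purely for illustration:

    #include <linux/init.h>
    #include <linux/kernel.h>
    #include <linux/module.h>

    /* Hypothetical demo module: a signed parameter that should accept
     * negative values, e.g. "insmod demo.ko level=-1". */
    static int level = -1;
    module_param(level, int, 0644);
    MODULE_PARM_DESC(level, "demo level, -1 means auto");

    static int __init demo_init(void)
    {
        pr_info("demo: level=%d\n", level);
        return 0;
    }

    static void __exit demo_exit(void)
    {
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");

Before this fix, passing level=-1 at load time (or writing it to /sys/module/demo/parameters/level) would fail to parse, because the int handler generated by STANDARD_PARAM_DEF went through the unsigned kstrtoul; with kstrtol restored, the sign is handled and the value is stored as -1.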