From fbd44a607a1a5019bc32c3615cead8c5ee8f89c9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 3 May 2013 20:22:36 +0200
Subject: tick: Use zalloc_cpumask_var for allocating offstack cpumasks

commit b352bc1cbc (tick: Convert broadcast cpu bitmaps to
cpumask_var_t) broke CONFIG_CPUMASK_OFFSTACK in a very subtle way.

Instead of allocating the cpumasks with zalloc_cpumask_var it uses
alloc_cpumask_var, so we can get random data there, which of course
confuses the logic completely and causes random failures.

Reported-and-tested-by: Dave Jones <davej@redhat.com>
Reported-and-tested-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305032015060.2990@ionos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-broadcast.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 61d00a8cdf2f..d70cdc42c829 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -785,11 +785,11 @@ bool tick_broadcast_oneshot_available(void)
 
 void __init tick_broadcast_init(void)
 {
-	alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
-	alloc_cpumask_var(&tmpmask, GFP_NOWAIT);
+	zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
 #ifdef CONFIG_TICK_ONESHOT
-	alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
-	alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
-	alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
 #endif
 }
-- 
cgit v1.2.3


From 524eff183f51d080a83b348d0ea97c08b3607b9a Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Mon, 6 May 2013 18:27:17 +0200
Subject: perf: Fix EXIT event notification

The perf_event_task_ctx() function needs to be called with
preemption disabled, since it's checking for currently
scheduled cpu against event cpu.

We disable preemption for task related perf event context
if there's one defined, leaving up to the chance which cpu
it gets scheduled in.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Link: http://lkml.kernel.org/r/1367857638-27631-2-git-send-email-jolsa@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6b41c1899a8b..38b68a05c3c6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4474,7 +4474,7 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
 static void perf_event_task_event(struct perf_task_event *task_event)
 {
 	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
+	struct perf_event_context *ctx, *task_ctx = task_event->task_ctx;
 	struct pmu *pmu;
 	int ctxn;
 
@@ -4485,20 +4485,22 @@ static void perf_event_task_event(struct perf_task_event *task_event)
 			goto next;
 		perf_event_task_ctx(&cpuctx->ctx, task_event);
 
-		ctx = task_event->task_ctx;
-		if (!ctx) {
-			ctxn = pmu->task_ctx_nr;
-			if (ctxn < 0)
-				goto next;
-			ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-			if (ctx)
-				perf_event_task_ctx(ctx, task_event);
-		}
+		if (task_ctx)
+			goto next;
+		ctxn = pmu->task_ctx_nr;
+		if (ctxn < 0)
+			goto next;
+		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		if (ctx)
+			perf_event_task_ctx(ctx, task_event);
 next:
 		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
-	if (task_event->task_ctx)
-		perf_event_task_ctx(task_event->task_ctx, task_event);
+	if (task_ctx) {
+		preempt_disable();
+		perf_event_task_ctx(task_ctx, task_event);
+		preempt_enable();
+	}
 
 	rcu_read_unlock();
 }
-- 
cgit v1.2.3


From 52d857a8784a09576215c71cebf368d61c12a754 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Mon, 6 May 2013 18:27:18 +0200
Subject: perf: Factor out auxiliary events notification

Add perf_event_aux() function to send out all types of
auxiliary events - mmap, task, comm events. For each type
there's match and output functions defined and used as
callbacks during perf_event_aux processing.

This way we can centralize the pmu/context iterating and
event matching logic. Also since lot of the code was
duplicated, this patch reduces the .text size about 2kB
on my setup:

  snipped output from 'objdump -x kernel/events/core.o'

  before:
  Idx Name          Size
    0 .text         0000d313

  after:
  Idx Name          Size
    0 .text         0000cad3

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Link: http://lkml.kernel.org/r/1367857638-27631-3-git-send-email-jolsa@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 242 +++++++++++++++++++--------------------------------
 1 file changed, 89 insertions(+), 153 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 38b68a05c3c6..9dc297faf7c0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4394,6 +4394,64 @@ perf_event_read_event(struct perf_event *event,
 	perf_output_end(&handle);
 }
 
+typedef int  (perf_event_aux_match_cb)(struct perf_event *event, void *data);
+typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+
+static void
+perf_event_aux_ctx(struct perf_event_context *ctx,
+		   perf_event_aux_match_cb match,
+		   perf_event_aux_output_cb output,
+		   void *data)
+{
+	struct perf_event *event;
+
+	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+		if (event->state < PERF_EVENT_STATE_INACTIVE)
+			continue;
+		if (!event_filter_match(event))
+			continue;
+		if (match(event, data))
+			output(event, data);
+	}
+}
+
+static void
+perf_event_aux(perf_event_aux_match_cb match,
+	       perf_event_aux_output_cb output,
+	       void *data,
+	       struct perf_event_context *task_ctx)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+	struct pmu *pmu;
+	int ctxn;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		if (cpuctx->unique_pmu != pmu)
+			goto next;
+		perf_event_aux_ctx(&cpuctx->ctx, match, output, data);
+		if (task_ctx)
+			goto next;
+		ctxn = pmu->task_ctx_nr;
+		if (ctxn < 0)
+			goto next;
+		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		if (ctx)
+			perf_event_aux_ctx(ctx, match, output, data);
+next:
+		put_cpu_ptr(pmu->pmu_cpu_context);
+	}
+
+	if (task_ctx) {
+		preempt_disable();
+		perf_event_aux_ctx(task_ctx, match, output, data);
+		preempt_enable();
+	}
+	rcu_read_unlock();
+}
+
 /*
  * task tracking -- fork/exit
  *
@@ -4416,8 +4474,9 @@ struct perf_task_event {
 };
 
 static void perf_event_task_output(struct perf_event *event,
-				     struct perf_task_event *task_event)
+				   void *data)
 {
+	struct perf_task_event *task_event = data;
 	struct perf_output_handle handle;
 	struct perf_sample_data	sample;
 	struct task_struct *task = task_event->task;
@@ -4445,64 +4504,11 @@ out:
 	task_event->event_id.header.size = size;
 }
 
-static int perf_event_task_match(struct perf_event *event)
-{
-	if (event->state < PERF_EVENT_STATE_INACTIVE)
-		return 0;
-
-	if (!event_filter_match(event))
-		return 0;
-
-	if (event->attr.comm || event->attr.mmap ||
-	    event->attr.mmap_data || event->attr.task)
-		return 1;
-
-	return 0;
-}
-
-static void perf_event_task_ctx(struct perf_event_context *ctx,
-				  struct perf_task_event *task_event)
-{
-	struct perf_event *event;
-
-	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-		if (perf_event_task_match(event))
-			perf_event_task_output(event, task_event);
-	}
-}
-
-static void perf_event_task_event(struct perf_task_event *task_event)
+static int perf_event_task_match(struct perf_event *event,
+				 void *data __maybe_unused)
 {
-	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx, *task_ctx = task_event->task_ctx;
-	struct pmu *pmu;
-	int ctxn;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			goto next;
-		perf_event_task_ctx(&cpuctx->ctx, task_event);
-
-		if (task_ctx)
-			goto next;
-		ctxn = pmu->task_ctx_nr;
-		if (ctxn < 0)
-			goto next;
-		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-		if (ctx)
-			perf_event_task_ctx(ctx, task_event);
-next:
-		put_cpu_ptr(pmu->pmu_cpu_context);
-	}
-	if (task_ctx) {
-		preempt_disable();
-		perf_event_task_ctx(task_ctx, task_event);
-		preempt_enable();
-	}
-
-	rcu_read_unlock();
+	return event->attr.comm || event->attr.mmap ||
+	       event->attr.mmap_data || event->attr.task;
 }
 
 static void perf_event_task(struct task_struct *task,
@@ -4533,7 +4539,10 @@ static void perf_event_task(struct task_struct *task,
 		},
 	};
 
-	perf_event_task_event(&task_event);
+	perf_event_aux(perf_event_task_match,
+		       perf_event_task_output,
+		       &task_event,
+		       task_ctx);
 }
 
 void perf_event_fork(struct task_struct *task)
@@ -4559,8 +4568,9 @@ struct perf_comm_event {
 };
 
 static void perf_event_comm_output(struct perf_event *event,
-				     struct perf_comm_event *comm_event)
+				   void *data)
 {
+	struct perf_comm_event *comm_event = data;
 	struct perf_output_handle handle;
 	struct perf_sample_data sample;
 	int size = comm_event->event_id.header.size;
@@ -4587,39 +4597,16 @@ out:
 	comm_event->event_id.header.size = size;
 }
 
-static int perf_event_comm_match(struct perf_event *event)
-{
-	if (event->state < PERF_EVENT_STATE_INACTIVE)
-		return 0;
-
-	if (!event_filter_match(event))
-		return 0;
-
-	if (event->attr.comm)
-		return 1;
-
-	return 0;
-}
-
-static void perf_event_comm_ctx(struct perf_event_context *ctx,
-				  struct perf_comm_event *comm_event)
+static int perf_event_comm_match(struct perf_event *event,
+				 void *data __maybe_unused)
 {
-	struct perf_event *event;
-
-	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-		if (perf_event_comm_match(event))
-			perf_event_comm_output(event, comm_event);
-	}
+	return event->attr.comm;
 }
 
 static void perf_event_comm_event(struct perf_comm_event *comm_event)
 {
-	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
 	char comm[TASK_COMM_LEN];
 	unsigned int size;
-	struct pmu *pmu;
-	int ctxn;
 
 	memset(comm, 0, sizeof(comm));
 	strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -4629,24 +4616,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 	comm_event->comm_size = size;
 
 	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
-	rcu_read_lock();
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			goto next;
-		perf_event_comm_ctx(&cpuctx->ctx, comm_event);
 
-		ctxn = pmu->task_ctx_nr;
-		if (ctxn < 0)
-			goto next;
-
-		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-		if (ctx)
-			perf_event_comm_ctx(ctx, comm_event);
-next:
-		put_cpu_ptr(pmu->pmu_cpu_context);
-	}
-	rcu_read_unlock();
+	perf_event_aux(perf_event_comm_match,
+		       perf_event_comm_output,
+		       comm_event,
+		       NULL);
 }
 
 void perf_event_comm(struct task_struct *task)
@@ -4708,8 +4682,9 @@ struct perf_mmap_event {
 };
 
 static void perf_event_mmap_output(struct perf_event *event,
-				     struct perf_mmap_event *mmap_event)
+				   void *data)
 {
+	struct perf_mmap_event *mmap_event = data;
 	struct perf_output_handle handle;
 	struct perf_sample_data sample;
 	int size = mmap_event->event_id.header.size;
@@ -4736,46 +4711,24 @@ out:
 }
 
 static int perf_event_mmap_match(struct perf_event *event,
-				   struct perf_mmap_event *mmap_event,
-				   int executable)
+				 void *data)
 {
-	if (event->state < PERF_EVENT_STATE_INACTIVE)
-		return 0;
-
-	if (!event_filter_match(event))
-		return 0;
-
-	if ((!executable && event->attr.mmap_data) ||
-	    (executable && event->attr.mmap))
-		return 1;
-
-	return 0;
-}
-
-static void perf_event_mmap_ctx(struct perf_event_context *ctx,
-				  struct perf_mmap_event *mmap_event,
-				  int executable)
-{
-	struct perf_event *event;
+	struct perf_mmap_event *mmap_event = data;
+	struct vm_area_struct *vma = mmap_event->vma;
+	int executable = vma->vm_flags & VM_EXEC;
 
-	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-		if (perf_event_mmap_match(event, mmap_event, executable))
-			perf_event_mmap_output(event, mmap_event);
-	}
+	return (!executable && event->attr.mmap_data) ||
+	       (executable && event->attr.mmap);
 }
 
 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 {
-	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
 	struct vm_area_struct *vma = mmap_event->vma;
 	struct file *file = vma->vm_file;
 	unsigned int size;
 	char tmp[16];
 	char *buf = NULL;
 	const char *name;
-	struct pmu *pmu;
-	int ctxn;
 
 	memset(tmp, 0, sizeof(tmp));
 
@@ -4831,27 +4784,10 @@ got_name:
 
 	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			goto next;
-		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
-					vma->vm_flags & VM_EXEC);
-
-		ctxn = pmu->task_ctx_nr;
-		if (ctxn < 0)
-			goto next;
-
-		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
-		if (ctx) {
-			perf_event_mmap_ctx(ctx, mmap_event,
-					vma->vm_flags & VM_EXEC);
-		}
-next:
-		put_cpu_ptr(pmu->pmu_cpu_context);
-	}
-	rcu_read_unlock();
+	perf_event_aux(perf_event_mmap_match,
+		       perf_event_mmap_output,
+		       mmap_event,
+		       NULL);
 
 	kfree(buf);
 }
-- 
cgit v1.2.3


From d3251859168b0b12841e1b90d6d768ab478dc23d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 10 May 2013 11:10:17 -0700
Subject: workqueue: workqueue_congested() shouldn't translate WORK_CPU_UNBOUND
 into node number

df2d5ae499 ("workqueue: map an unbound workqueues to multiple per-node
pool_workqueues") made unbound workqueues to map to multiple per-node
pool_workqueues and accordingly updated workqueue_contested() so that,
for unbound workqueues, it maps the specified @cpu to the NUMA node
number to obtain the matching pool_workqueue to query the congested
state.

Before this change, workqueue_congested() ignored @cpu for unbound
workqueues as there was only one pool_workqueue and some users
(fscache) called it with WORK_CPU_UNBOUND.  After the commit, this
causes the following oops as WORK_CPU_UNBOUND gets translated to
garbage by cpu_to_node().

  BUG: unable to handle kernel paging request at ffff8803598d98b8
  IP: [<ffffffff81043b7e>] unbound_pwq_by_node+0xa1/0xfa
  PGD 2421067 PUD 0
  Oops: 0000 [#1] SMP
  CPU: 1 PID: 2689 Comm: cat Tainted: GF            3.9.0-fsdevel+ #4
  task: ffff88003d801040 ti: ffff880025806000 task.ti: ffff880025806000
  RIP: 0010:[<ffffffff81043b7e>]  [<ffffffff81043b7e>] unbound_pwq_by_node+0xa1/0xfa
  RSP: 0018:ffff880025807ad8  EFLAGS: 00010202
  RAX: 0000000000000001 RBX: ffff8800388a2400 RCX: 0000000000000003
  RDX: ffff880025807fd8 RSI: ffffffff81a31420 RDI: ffff88003d8016e0
  RBP: ffff880025807ae8 R08: ffff88003d801730 R09: ffffffffa00b4898
  R10: ffffffff81044217 R11: ffff88003d801040 R12: 0000000064206e97
  R13: ffff880036059d98 R14: ffff880038cc8080 R15: ffff880038cc82d0
  FS:  00007f21afd9c740(0000) GS:ffff88003d100000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: ffff8803598d98b8 CR3: 000000003df49000 CR4: 00000000000007e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
  Stack:
   ffff8800388a2400 0000000000000002 ffff880025807b18 ffffffff810442ce
   ffffffff81044217 ffff880000000002 ffff8800371b4080 ffff88003d112ec0
   ffff880025807b38 ffffffffa00810b0 ffff880036059d88 ffff880036059be8
  Call Trace:
   [<ffffffff810442ce>] workqueue_congested+0xb7/0x12c
   [<ffffffffa00810b0>] fscache_enqueue_object+0xb2/0xe8 [fscache]
   [<ffffffffa007facd>] __fscache_acquire_cookie+0x3b9/0x56c [fscache]
   [<ffffffffa00ad8fe>] nfs_fscache_set_inode_cookie+0xee/0x132 [nfs]
   [<ffffffffa009e112>] do_open+0x9/0xd [nfs]
   [<ffffffff810e804a>] do_dentry_open+0x175/0x24b
   [<ffffffff810e8298>] finish_open+0x41/0x51

Fix it by using smp_processor_id() if @cpu is WORK_CPU_UNBOUND.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: David Howells <dhowells@redhat.com>
Tested-and-Acked-by: David Howells <dhowells@redhat.com>
---
 kernel/workqueue.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4aa9f5bc6b2d..1ae602809efb 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4311,6 +4311,12 @@ bool current_is_workqueue_rescuer(void)
  * no synchronization around this function and the test result is
  * unreliable and only useful as advisory hints or for debugging.
  *
+ * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
+ * Note that both per-cpu and unbound workqueues may be associated with
+ * multiple pool_workqueues which have separate congested states.  A
+ * workqueue being congested on one CPU doesn't mean the workqueue is also
+ * contested on other CPUs / NUMA nodes.
+ *
  * RETURNS:
  * %true if congested, %false otherwise.
  */
@@ -4321,6 +4327,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
 
 	rcu_read_lock_sched();
 
+	if (cpu == WORK_CPU_UNBOUND)
+		cpu = smp_processor_id();
+
 	if (!(wq->flags & WQ_UNBOUND))
 		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
 	else
-- 
cgit v1.2.3


From 4b0c0f294f60abcdd20994a8341a95c8ac5eeb96 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 3 May 2013 15:02:50 +0200
Subject: tick: Cleanup NOHZ per cpu data on cpu down

Prarit reported a crash on CPU offline/online. The reason is that on
CPU down the NOHZ related per cpu data of the dead cpu is not cleaned
up. If at cpu online an interrupt happens before the per cpu tick
device is registered the irq_enter() check potentially sees stale data
and dereferences a NULL pointer.

Cleanup the data after the cpu is dead.

Reported-by: Prarit Bhargava <prarit@redhat.com>
Cc: stable@vger.kernel.org
Cc: Mike Galbraith <bitbucket@online.de>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305031451561.2886@ionos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 225f8bf19095..0eed1db2d792 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -904,7 +904,7 @@ void tick_cancel_sched_timer(int cpu)
 		hrtimer_cancel(&ts->sched_timer);
 # endif
 
-	ts->nohz_mode = NOHZ_MODE_INACTIVE;
+	memset(ts, 0, sizeof(*ts));
 }
 #endif
 
-- 
cgit v1.2.3


From d6cbf35dac8a3dadb9103379820c96d7c85df3d9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 14 May 2013 19:44:20 +0800
Subject: cgroup: initialize xattr before calling d_instantiate()

cgroup_create_file() calls d_instantiate(), which may decide to look
at the xattrs on the file. Smack always does this and SELinux can be
configured to do so.

But cgroup_add_file() didn't initialize xattrs before calling
cgroup_create_file(), which finally leads to dereferencing NULL
dentry->d_fsdata.

This bug has been there since cgroup xattr was introduced.

Cc: <stable@vger.kernel.org> # 3.8.x
Reported-by: Ivan Bulatovic <combuster@archlinux.us>
Reported-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2a9926275f80..38b136553044 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2699,13 +2699,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 		goto out;
 	}
 
+	cfe->type = (void *)cft;
+	cfe->dentry = dentry;
+	dentry->d_fsdata = cfe;
+	simple_xattrs_init(&cfe->xattrs);
+
 	mode = cgroup_file_mode(cft);
 	error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
 	if (!error) {
-		cfe->type = (void *)cft;
-		cfe->dentry = dentry;
-		dentry->d_fsdata = cfe;
-		simple_xattrs_init(&cfe->xattrs);
 		list_add_tail(&cfe->node, &parent->files);
 		cfe = NULL;
 	}
-- 
cgit v1.2.3


From f7ea0fd639c2c48d3c61b6eec75362be290c6874 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 13 May 2013 21:40:27 +0200
Subject: tick: Don't invoke tick_nohz_stop_sched_tick() if the cpu is offline

commit 5b39939a4 (nohz: Move ts->idle_calls incrementation into strict
idle logic) moved code out of tick_nohz_stop_sched_tick() and missed
to bail out when the cpu is offline. That's causing subsequent
failures as an offline CPU is supposed to die and not to fiddle with
nohz magic.

Return false in can_stop_idle_tick() if the cpu is offline.

Reported-and-tested-by: Jiri Kosina <jkosina@suse.cz>
Reported-and-tested-by: Prarit Bhargava <prarit@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: x86@kernel.org
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305132138160.2863@ionos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 0eed1db2d792..012142187db9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -469,6 +469,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 	if (unlikely(!cpu_online(cpu))) {
 		if (cpu == tick_do_timer_cpu)
 			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+		return false;
 	}
 
 	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
-- 
cgit v1.2.3


From b47430d3adbedbfdb5979ba4874f5dadf94f16b1 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Tue, 14 May 2013 04:01:27 +0530
Subject: rcu/idle: Wrap cpu-idle poll mode within rcu_idle_enter/exit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bjørn Mork reported the following warning when running powertop.

[   49.289034] ------------[ cut here ]------------
[   49.289055] WARNING: at kernel/rcutree.c:502 rcu_eqs_exit_common.isra.48+0x3d/0x125()
[   49.289244] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.10.0-bisect-rcu-warn+ #107
[   49.289251]  ffffffff8157d8c8 ffffffff81801e28 ffffffff8137e4e3 ffffffff81801e68
[   49.289260]  ffffffff8103094f ffffffff81801e68 0000000000000000 ffff88023afcd9b0
[   49.289268]  0000000000000000 0140000000000000 ffff88023bee7700 ffffffff81801e78
[   49.289276] Call Trace:
[   49.289285]  [<ffffffff8137e4e3>] dump_stack+0x19/0x1b
[   49.289293]  [<ffffffff8103094f>] warn_slowpath_common+0x62/0x7b
[   49.289300]  [<ffffffff8103097d>] warn_slowpath_null+0x15/0x17
[   49.289306]  [<ffffffff810a9006>] rcu_eqs_exit_common.isra.48+0x3d/0x125
[   49.289314]  [<ffffffff81079b49>] ? trace_hardirqs_off_caller+0x37/0xa6
[   49.289320]  [<ffffffff810a9692>] rcu_idle_exit+0x85/0xa8
[   49.289327]  [<ffffffff8107076e>] trace_cpu_idle_rcuidle+0xae/0xff
[   49.289334]  [<ffffffff810708b1>] cpu_startup_entry+0x72/0x115
[   49.289341]  [<ffffffff813689e5>] rest_init+0x149/0x150
[   49.289347]  [<ffffffff8136889c>] ? csum_partial_copy_generic+0x16c/0x16c
[   49.289355]  [<ffffffff81a82d34>] start_kernel+0x3f0/0x3fd
[   49.289362]  [<ffffffff81a8274c>] ? repair_env_string+0x5a/0x5a
[   49.289368]  [<ffffffff81a82481>] x86_64_start_reservations+0x2a/0x2c
[   49.289375]  [<ffffffff81a82550>] x86_64_start_kernel+0xcd/0xd1
[   49.289379] ---[ end trace 07a1cc95e29e9036 ]---

The warning is that 'rdtp->dynticks' has an unexpected value, which roughly
translates to - the calls to rcu_idle_enter() and rcu_idle_exit() were not
made in the correct order, or otherwise messed up.

And Bjørn's painstaking debugging indicated that this happens when the idle
loop enters the poll mode. Looking at the poll function cpu_idle_poll(), and
the implementation of trace_cpu_idle_rcuidle(), the problem becomes very clear:
cpu_idle_poll() lacks calls to rcu_idle_enter/exit(), and trace_cpu_idle_rcuidle()
calls them in the reverse order - first rcu_idle_exit(), and then rcu_idle_enter().
Hence the even/odd alternative sequencing of rdtp->dynticks goes for a toss.

And powertop readily triggers this because powertop uses the idle-tracing
infrastructure extensively.

So, to fix this, wrap the code in cpu_idle_poll() within rcu_idle_enter/exit(),
so that it blends properly with the calls inside trace_cpu_idle_rcuidle() and
thus get the function ordering right.

Reported-and-tested-by: Bjørn Mork <bjorn@mork.no>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/519169BF.4080208@linux.vnet.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpu/idle.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 8b86c0c68edf..d5585f5e038e 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -40,11 +40,13 @@ __setup("hlt", cpu_idle_nopoll_setup);
 
 static inline int cpu_idle_poll(void)
 {
+	rcu_idle_enter();
 	trace_cpu_idle_rcuidle(0, smp_processor_id());
 	local_irq_enable();
 	while (!need_resched())
 		cpu_relax();
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+	rcu_idle_exit();
 	return 1;
 }
 
-- 
cgit v1.2.3


From 42a5cf46cd56f46267d2a9fcf2655f4078cd3042 Mon Sep 17 00:00:00 2001
From: Tirupathi Reddy <tirupath@codeaurora.org>
Date: Tue, 14 May 2013 13:59:02 +0530
Subject: timer: Don't reinitialize the cpu base lock during CPU_UP_PREPARE

An inactive timer's base can refer to a offline cpu's base.

In the current code, cpu_base's lock is blindly reinitialized each
time a CPU is brought up. If a CPU is brought online during the period
that another thread is trying to modify an inactive timer on that CPU
with holding its timer base lock, then the lock will be reinitialized
under its feet. This leads to following SPIN_BUG().

<0> BUG: spinlock already unlocked on CPU#3, kworker/u:3/1466
<0> lock: 0xe3ebe000, .magic: dead4ead, .owner: kworker/u:3/1466, .owner_cpu: 1
<4> [<c0013dc4>] (unwind_backtrace+0x0/0x11c) from [<c026e794>] (do_raw_spin_unlock+0x40/0xcc)
<4> [<c026e794>] (do_raw_spin_unlock+0x40/0xcc) from [<c076c160>] (_raw_spin_unlock+0x8/0x30)
<4> [<c076c160>] (_raw_spin_unlock+0x8/0x30) from [<c009b858>] (mod_timer+0x294/0x310)
<4> [<c009b858>] (mod_timer+0x294/0x310) from [<c00a5e04>] (queue_delayed_work_on+0x104/0x120)
<4> [<c00a5e04>] (queue_delayed_work_on+0x104/0x120) from [<c04eae00>] (sdhci_msm_bus_voting+0x88/0x9c)
<4> [<c04eae00>] (sdhci_msm_bus_voting+0x88/0x9c) from [<c04d8780>] (sdhci_disable+0x40/0x48)
<4> [<c04d8780>] (sdhci_disable+0x40/0x48) from [<c04bf300>] (mmc_release_host+0x4c/0xb0)
<4> [<c04bf300>] (mmc_release_host+0x4c/0xb0) from [<c04c7aac>] (mmc_sd_detect+0x90/0xfc)
<4> [<c04c7aac>] (mmc_sd_detect+0x90/0xfc) from [<c04c2504>] (mmc_rescan+0x7c/0x2c4)
<4> [<c04c2504>] (mmc_rescan+0x7c/0x2c4) from [<c00a6a7c>] (process_one_work+0x27c/0x484)
<4> [<c00a6a7c>] (process_one_work+0x27c/0x484) from [<c00a6e94>] (worker_thread+0x210/0x3b0)
<4> [<c00a6e94>] (worker_thread+0x210/0x3b0) from [<c00aad9c>] (kthread+0x80/0x8c)
<4> [<c00aad9c>] (kthread+0x80/0x8c) from [<c000ea80>] (kernel_thread_exit+0x0/0x8)

As an example, this particular crash occurred when CPU #3 is executing
mod_timer() on an inactive timer whose base is refered to offlined CPU
#2.  The code locked the timer_base corresponding to CPU #2. Before it
could proceed, CPU #2 came online and reinitialized the spinlock
corresponding to its base. Thus now CPU #3 held a lock which was
reinitialized. When CPU #3 finally ended up unlocking the old cpu_base
corresponding to CPU #2, we hit the above SPIN_BUG().

CPU #0		CPU #3				       CPU #2
------		-------				       -------
.....		 ......				      <Offline>
		mod_timer()
		 lock_timer_base
		   spin_lock_irqsave(&base->lock)

cpu_up(2)	 .....				        ......
							init_timers_cpu()
....		 .....				    	spin_lock_init(&base->lock)
.....		   spin_unlock_irqrestore(&base->lock)  ......
		   <spin_bug>

Allocation of per_cpu timer vector bases is done only once under
"tvec_base_done[]" check. In the current code, spinlock_initialization
of base->lock isn't under this check. When a CPU is up each time the
base lock is reinitialized. Move base spinlock initialization under
the check.

Signed-off-by: Tirupathi Reddy <tirupath@codeaurora.org>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1368520142-4136-1-git-send-email-tirupath@codeaurora.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 09bca8ce9771..7376589adc28 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1539,12 +1539,12 @@ static int __cpuinit init_timers_cpu(int cpu)
 			boot_done = 1;
 			base = &boot_tvec_bases;
 		}
+		spin_lock_init(&base->lock);
 		tvec_base_done[cpu] = 1;
 	} else {
 		base = per_cpu(tvec_bases, cpu);
 	}
 
-	spin_lock_init(&base->lock);
 
 	for (j = 0; j < TVN_SIZE; j++) {
 		INIT_LIST_HEAD(base->tv5.vec + j);
-- 
cgit v1.2.3


From 6faf72834d9d0c0dc6632604eaeffb621e87fcf9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 13 May 2013 06:53:37 -0700
Subject: rcu: Fix comparison sense in rcu_needs_cpu()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit c0f4dfd4f (rcu: Make RCU_FAST_NO_HZ take advantage of numbered
callbacks) introduced a bug that can result in excessively long grace
periods.  This bug reverse the senes of the "if" statement checking
for lazy callbacks, so that RCU takes a lazy approach when there are
in fact non-lazy callbacks.  This can result in excessive boot, suspend,
and resume times.

This commit therefore fixes the sense of this "if" statement.

Reported-by: Borislav Petkov <bp@alien8.de>
Reported-by: Bjørn Mork <bjorn@mork.no>
Reported-by: Joerg Roedel <joro@8bytes.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Bjørn Mork <bjorn@mork.no>
Tested-by: Joerg Roedel <joro@8bytes.org>
---
 kernel/rcutree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 170814dc418f..6d939a645da1 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1667,7 +1667,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
 	rdtp->last_accelerate = jiffies;
 
 	/* Request timer delay depending on laziness, and round. */
-	if (rdtp->all_lazy) {
+	if (!rdtp->all_lazy) {
 		*dj = round_up(rcu_idle_gp_delay + jiffies,
 			       rcu_idle_gp_delay) - jiffies;
 	} else {
-- 
cgit v1.2.3


From 8f174b1175a10903ade40f36eb6c896412877ca0 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <js1304@gmail.com>
Date: Wed, 1 May 2013 00:07:00 +0900
Subject: workqueue: correct handling of the pool spin_lock

When we fail to mutex_trylock(), we release the pool spin_lock and do
mutex_lock(). After that, we should regrab the pool spin_lock, but,
regrabbing is missed in current code. So correct it.

Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1ae602809efb..286847b90225 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2059,6 +2059,7 @@ static bool manage_workers(struct worker *worker)
 	if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
 		spin_unlock_irq(&pool->lock);
 		mutex_lock(&pool->manager_mutex);
+		spin_lock_irq(&pool->lock);
 		ret = true;
 	}
 
-- 
cgit v1.2.3


From ad7b1f841f8a54c6d61ff181451f55b68175e15a Mon Sep 17 00:00:00 2001
From: Marc Dionne <marc.c.dionne@gmail.com>
Date: Mon, 6 May 2013 17:44:55 -0400
Subject: workqueue: Make schedule_work() available again to non GPL modules

Commit 8425e3d5bdbe ("workqueue: inline trivial wrappers") changed
schedule_work() and schedule_delayed_work() to inline wrappers,
but these rely on some symbols that are EXPORT_SYMBOL_GPL, while
the original functions were EXPORT_SYMBOL.  This has the effect of
changing the licensing requirement for these functions and making
them unavailable to non GPL modules.

Make them available again by removing the restriction on the
required symbols.

Signed-off-by: Marc Dionne <marc.dionne@your-file-system.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 286847b90225..02916f421385 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -296,7 +296,7 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
 static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
 
 struct workqueue_struct *system_wq __read_mostly;
-EXPORT_SYMBOL_GPL(system_wq);
+EXPORT_SYMBOL(system_wq);
 struct workqueue_struct *system_highpri_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_highpri_wq);
 struct workqueue_struct *system_long_wq __read_mostly;
@@ -1411,7 +1411,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
 	local_irq_restore(flags);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(queue_work_on);
+EXPORT_SYMBOL(queue_work_on);
 
 void delayed_work_timer_fn(unsigned long __data)
 {
@@ -1485,7 +1485,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	local_irq_restore(flags);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(queue_delayed_work_on);
+EXPORT_SYMBOL(queue_delayed_work_on);
 
 /**
  * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
-- 
cgit v1.2.3


From b4f711ee03d28f776fd2324fd0bd999cc428e4d2 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 24 Apr 2013 11:32:56 -0700
Subject: time: Revert ALWAYS_USE_PERSISTENT_CLOCK compile time optimizaitons

Kay Sievers noted that the ALWAYS_USE_PERSISTENT_CLOCK config,
which enables some minor compile time optimization to avoid
uncessary code in mostly the suspend/resume path could cause
problems for userland.

In particular, the dependency for RTC_HCTOSYS on
!ALWAYS_USE_PERSISTENT_CLOCK, which avoids setting the time
twice and simplifies suspend/resume, has the side effect
of causing the /sys/class/rtc/rtcN/hctosys flag to always be
zero, and this flag is commonly used by udev to setup the
/dev/rtc symlink to /dev/rtcN, which can cause pain for
older applications.

While the udev rules could use some work to be less fragile,
breaking userland should strongly be avoided. Additionally
the compile time optimizations are fairly minor, and the code
being optimized is likely to be reworked in the future, so
lets revert this change.

Reported-by: Kay Sievers <kay@vrfy.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: stable <stable@vger.kernel.org> #3.9
Cc: Feng Tang <feng.tang@intel.com>
Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Link: http://lkml.kernel.org/r/1366828376-18124-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/Kconfig | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 24510d84efd7..b69692250af4 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,11 +12,6 @@ config CLOCKSOURCE_WATCHDOG
 config ARCH_CLOCKSOURCE_DATA
 	bool
 
-# Platforms has a persistent clock
-config ALWAYS_USE_PERSISTENT_CLOCK
-	bool
-	default n
-
 # Timekeeping vsyscall support
 config GENERIC_TIME_VSYSCALL
 	bool
-- 
cgit v1.2.3


From 615ee5443ff9bedd356dc6865f3e9c276ce434ea Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Tue, 26 Mar 2013 11:35:16 -0400
Subject: rcu: Don't allocate bootmem from rcu_init()

When rcu_init() is called we already have slab working, allocating
bootmem at that point results in warnings and an allocation from
slab.  This commit therefore changes alloc_bootmem_cpumask_var() to
alloc_cpumask_var() in rcu_bootup_announce_oddness(), which is called
from rcu_init().

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Tested-by: Robin Holt <holt@sgi.com>

[paulmck: convert to zalloc_cpumask_var(), as suggested by Yinghai Lu.]
---
 kernel/rcutree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 6d939a645da1..3db5a375d8dd 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -88,7 +88,7 @@ static void __init rcu_bootup_announce_oddness(void)
 #ifdef CONFIG_RCU_NOCB_CPU
 #ifndef CONFIG_RCU_NOCB_CPU_NONE
 	if (!have_rcu_nocb_mask) {
-		alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+		zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
 		have_rcu_nocb_mask = true;
 	}
 #ifdef CONFIG_RCU_NOCB_CPU_ZERO
-- 
cgit v1.2.3


From 60705c89460fdc7227f2d153b68b3f34814738a4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 14 May 2013 15:40:48 -0400
Subject: tracing: Fix leaks of filter preds

Special preds are created when folding a series of preds that
can be done in serial. These are allocated in an ops field of
the pred structure. But they were never freed, causing memory
leaks.

This was discovered using the kmemleak checker:

unreferenced object 0xffff8800797fd5e0 (size 32):
  comm "swapper/0", pid 1, jiffies 4294690605 (age 104.608s)
  hex dump (first 32 bytes):
    00 00 01 00 03 00 05 00 07 00 09 00 0b 00 0d 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffff814b52af>] kmemleak_alloc+0x73/0x98
    [<ffffffff8111ff84>] kmemleak_alloc_recursive.constprop.42+0x16/0x18
    [<ffffffff81120e68>] __kmalloc+0xd7/0x125
    [<ffffffff810d47eb>] kcalloc.constprop.24+0x2d/0x2f
    [<ffffffff810d4896>] fold_pred_tree_cb+0xa9/0xf4
    [<ffffffff810d3781>] walk_pred_tree+0x47/0xcc
    [<ffffffff810d5030>] replace_preds.isra.20+0x6f8/0x72f
    [<ffffffff810d50b5>] create_filter+0x4e/0x8b
    [<ffffffff81b1c30d>] ftrace_test_event_filter+0x5a/0x155
    [<ffffffff8100028d>] do_one_initcall+0xa0/0x137
    [<ffffffff81afbedf>] kernel_init_freeable+0x14d/0x1dc
    [<ffffffff814b24b7>] kernel_init+0xe/0xdb
    [<ffffffff814d539c>] ret_from_fork+0x7c/0xb0
    [<ffffffffffffffff>] 0xffffffffffffffff

Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: stable@vger.kernel.org # 2.6.39+
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index a6361178de5a..e1b653f7e1ca 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -750,7 +750,11 @@ static int filter_set_pred(struct event_filter *filter,
 
 static void __free_preds(struct event_filter *filter)
 {
+	int i;
+
 	if (filter->preds) {
+		for (i = 0; i < filter->n_preds; i++)
+			kfree(filter->preds[i].ops);
 		kfree(filter->preds);
 		filter->preds = NULL;
 	}
-- 
cgit v1.2.3


From c02c7e65d9b13670e34bc523744cf4f6e99c198a Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Mon, 13 May 2013 20:58:34 +0900
Subject: tracing/kprobes: Use rcu_dereference_raw for tp->files

Use rcu_dereference_raw() for accessing tp->files. Because the
write-side uses rcu_assign_pointer() for memory barrier,
the read-side also has to use rcu_dereference_raw() with
read memory barrier.

Link: http://lkml.kernel.org/r/20130513115834.6545.17022.stgit@mhiramat-M0-7522

Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 47 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 636d45fe69b3..0a3d8d5c483d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -185,9 +185,14 @@ static struct trace_probe *find_trace_probe(const char *event,
 
 static int trace_probe_nr_files(struct trace_probe *tp)
 {
-	struct ftrace_event_file **file = tp->files;
+	struct ftrace_event_file **file;
 	int ret = 0;
 
+	/*
+	 * Since all tp->files updater is protected by probe_enable_lock,
+	 * we don't need to lock an rcu_read_lock.
+	 */
+	file = rcu_dereference_raw(tp->files);
 	if (file)
 		while (*(file++))
 			ret++;
@@ -209,9 +214,10 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
 	mutex_lock(&probe_enable_lock);
 
 	if (file) {
-		struct ftrace_event_file **new, **old = tp->files;
+		struct ftrace_event_file **new, **old;
 		int n = trace_probe_nr_files(tp);
 
+		old = rcu_dereference_raw(tp->files);
 		/* 1 is for new one and 1 is for stopper */
 		new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *),
 			      GFP_KERNEL);
@@ -251,11 +257,17 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
 static int
 trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file)
 {
+	struct ftrace_event_file **files;
 	int i;
 
-	if (tp->files) {
-		for (i = 0; tp->files[i]; i++)
-			if (tp->files[i] == file)
+	/*
+	 * Since all tp->files updater is protected by probe_enable_lock,
+	 * we don't need to lock an rcu_read_lock.
+	 */
+	files = rcu_dereference_raw(tp->files);
+	if (files) {
+		for (i = 0; files[i]; i++)
+			if (files[i] == file)
 				return i;
 	}
 
@@ -274,10 +286,11 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
 	mutex_lock(&probe_enable_lock);
 
 	if (file) {
-		struct ftrace_event_file **new, **old = tp->files;
+		struct ftrace_event_file **new, **old;
 		int n = trace_probe_nr_files(tp);
 		int i, j;
 
+		old = rcu_dereference_raw(tp->files);
 		if (n == 0 || trace_probe_file_index(tp, file) < 0) {
 			ret = -EINVAL;
 			goto out_unlock;
@@ -872,9 +885,16 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
 static __kprobes void
 kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
 {
-	struct ftrace_event_file **file = tp->files;
+	/*
+	 * Note: preempt is already disabled around the kprobe handler.
+	 * However, we still need an smp_read_barrier_depends() corresponding
+	 * to smp_wmb() in rcu_assign_pointer() to access the pointer.
+	 */
+	struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
+
+	if (unlikely(!file))
+		return;
 
-	/* Note: preempt is already disabled around the kprobe handler */
 	while (*file) {
 		__kprobe_trace_func(tp, regs, *file);
 		file++;
@@ -925,9 +945,16 @@ static __kprobes void
 kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
 		     struct pt_regs *regs)
 {
-	struct ftrace_event_file **file = tp->files;
+	/*
+	 * Note: preempt is already disabled around the kprobe handler.
+	 * However, we still need an smp_read_barrier_depends() corresponding
+	 * to smp_wmb() in rcu_assign_pointer() to access the pointer.
+	 */
+	struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
+
+	if (unlikely(!file))
+		return;
 
-	/* Note: preempt is already disabled around the kprobe handler */
 	while (*file) {
 		__kretprobe_trace_func(tp, ri, regs, *file);
 		file++;
-- 
cgit v1.2.3


From 3d1fc7b0880c4db612a3d3211a808659e28af588 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Mon, 13 May 2013 20:58:37 +0900
Subject: tracing/kprobes: Fix a sparse warning for incorrect type in
 assignment

Fix a sparse warning about the rcu operated pointer is
defined without __rcu address space.

Link: http://lkml.kernel.org/r/20130513115837.6545.23322.stgit@mhiramat-M0-7522

Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 0a3d8d5c483d..81c5109b9d00 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,7 +35,7 @@ struct trace_probe {
 	const char		*symbol;	/* symbol name */
 	struct ftrace_event_class	class;
 	struct ftrace_event_call	call;
-	struct ftrace_event_file	**files;
+	struct ftrace_event_file * __rcu *files;
 	ssize_t			size;		/* trace entry size */
 	unsigned int		nr_args;
 	struct probe_arg	args[];
-- 
cgit v1.2.3


From b62fdd97fcae17e483b005bafd13fadbd9840672 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Mon, 13 May 2013 20:58:39 +0900
Subject: tracing/kprobes: Make print_*probe_event static

According to sparse warning, print_*probe_event static because
those functions are not directly called from outside.

Link: http://lkml.kernel.org/r/20130513115839.6545.83067.stgit@mhiramat-M0-7522

Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 81c5109b9d00..9f46e98ba8f2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -962,7 +962,7 @@ kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
 }
 
 /* Event entry printers */
-enum print_line_t
+static enum print_line_t
 print_kprobe_event(struct trace_iterator *iter, int flags,
 		   struct trace_event *event)
 {
@@ -998,7 +998,7 @@ partial:
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-enum print_line_t
+static enum print_line_t
 print_kretprobe_event(struct trace_iterator *iter, int flags,
 		      struct trace_event *event)
 {
-- 
cgit v1.2.3


From 1be0c25da56e860992af972a60321563ca2cfcd1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 15 May 2013 14:24:24 -0700
Subject: workqueue: don't perform NUMA-aware allocations on offline nodes in
 wq_numa_init()

wq_numa_init() builds per-node cpumasks which are later used to make
unbound workqueues NUMA-aware.  The cpumasks are allocated using
alloc_cpumask_var_node() for all possible nodes.  Unfortunately, on
machines with off-line nodes, this leads to NUMA-aware allocations on
existing bug offline nodes, which in turn triggers BUG in the memory
allocation code.

Fix it by using NUMA_NO_NODE for cpumask allocations for offline
nodes.

  kernel BUG at include/linux/gfp.h:323!
  invalid opcode: 0000 [#1] SMP
  Modules linked in:
  CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.9.0+ #1
  Hardware name: ProLiant BL465c G7, BIOS A19 12/10/2011
  task: ffff880234608000 ti: ffff880234602000 task.ti: ffff880234602000
  RIP: 0010:[<ffffffff8117495d>]  [<ffffffff8117495d>] new_slab+0x2ad/0x340
  RSP: 0000:ffff880234603bf8  EFLAGS: 00010246
  RAX: 0000000000000000 RBX: ffff880237404b40 RCX: 00000000000000d0
  RDX: 0000000000000001 RSI: 0000000000000003 RDI: 00000000002052d0
  RBP: ffff880234603c28 R08: 0000000000000000 R09: 0000000000000001
  R10: 0000000000000001 R11: ffffffff812e3aa8 R12: 0000000000000001
  R13: ffff8802378161c0 R14: 0000000000030027 R15: 00000000000040d0
  FS:  0000000000000000(0000) GS:ffff880237800000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: ffff88043fdff000 CR3: 00000000018d5000 CR4: 00000000000007f0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
  Stack:
   ffff880234603c28 0000000000000001 00000000000000d0 ffff8802378161c0
   ffff880237404b40 ffff880237404b40 ffff880234603d28 ffffffff815edba1
   ffff880237816140 0000000000000000 ffff88023740e1c0
  Call Trace:
   [<ffffffff815edba1>] __slab_alloc+0x330/0x4f2
   [<ffffffff81174b25>] kmem_cache_alloc_node_trace+0xa5/0x200
   [<ffffffff812e3aa8>] alloc_cpumask_var_node+0x28/0x90
   [<ffffffff81a0bdb3>] wq_numa_init+0x10d/0x1be
   [<ffffffff81a0bec8>] init_workqueues+0x64/0x341
   [<ffffffff810002ea>] do_one_initcall+0xea/0x1a0
   [<ffffffff819f1f31>] kernel_init_freeable+0xb7/0x1ec
   [<ffffffff815d50de>] kernel_init+0xe/0xf0
   [<ffffffff815ff89c>] ret_from_fork+0x7c/0xb0
  Code: 45  84 ac 00 00 00 f0 41 80 4d 00 40 e9 f6 fe ff ff 66 0f 1f 84 00 00 00 00 00 e8 eb 4b ff ff 49 89 c5 e9 05 fe ff ff <0f> 0b 4c 8b 73 38 44 89 ff 81 cf 00 00 20 00 4c 89 f6 48 c1 ee

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-Tested-by: Lingzhu Xiang <lxiang@redhat.com>
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 02916f421385..ee8e29a2320c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4905,7 +4905,8 @@ static void __init wq_numa_init(void)
 	BUG_ON(!tbl);
 
 	for_each_node(node)
-		BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node));
+		BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
+				node_online(node) ? node : NUMA_NO_NODE));
 
 	for_each_possible_cpu(cpu) {
 		node = cpu_to_node(cpu);
-- 
cgit v1.2.3


From 6ed0106667d76589cb648c27edb4f4ffbf9d59ca Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 16 May 2013 20:48:49 +0900
Subject: tracing: Return -EBUSY when event_enable_func() fails to get module

Since try_module_get() returns false( = 0) when it fails to
pindown a module, event_enable_func() returns 0 which means
"succeed". This can cause a kernel panic when the entry
is removed, because the event is already released.

This fixes the bug by returning -EBUSY, because the reason
why it fails is that the module is being removed at that time.

Link: http://lkml.kernel.org/r/20130516114848.13508.97899.stgit@mhiramat-M0-7522

Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7a0cf68027cc..27963e2bf4bf 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2072,8 +2072,10 @@ event_enable_func(struct ftrace_hash *hash,
  out_reg:
 	/* Don't let event modules unload while probe registered */
 	ret = try_module_get(file->event_call->mod);
-	if (!ret)
+	if (!ret) {
+		ret = -EBUSY;
 		goto out_free;
+	}
 
 	ret = __ftrace_event_enable_disable(file, 1, 1);
 	if (ret < 0)
-- 
cgit v1.2.3


From 264b83c07a84223f0efd0d1db9ccc66d6f88288f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 16 May 2013 17:43:55 +0200
Subject: usermodehelper: check subprocess_info->path != NULL

argv_split(empty_or_all_spaces) happily succeeds, it simply returns
argc == 0 and argv[0] == NULL. Change call_usermodehelper_exec() to
check sub_info->path != NULL to avoid the crash.

This is the minimal fix, todo:

 - perhaps we should change argv_split() to return NULL or change the
   callers.

 - kill or justify ->path[0] check

 - narrow the scope of helper_lock()

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-By: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kmod.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 1296e72e4161..8241906c4b61 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -569,6 +569,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 	int retval = 0;
 
 	helper_lock();
+	if (!sub_info->path) {
+		retval = -EINVAL;
+		goto out;
+	}
+
 	if (sub_info->path[0] == '\0')
 		goto out;
 
-- 
cgit v1.2.3


From 06c9494c0e9bdfcaa14d6d2b096a0ff7abe8494f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 15 May 2013 20:33:01 +0100
Subject: kmemleak: Scan all allocated, writeable and not executable module
 sections

Instead of just picking data sections by name (names that start
with .data, .bss or .ref.data), use the section flags and scan all
sections that are allocated, writable and not executable. Which should
cover all sections of a module that might reference data.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
[catalin.marinas@arm.com: removed unused 'name' variable]
[catalin.marinas@arm.com: collapsed 'if' blocks]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index b049939177f6..06f496a5d182 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2431,10 +2431,10 @@ static void kmemleak_load_module(const struct module *mod,
 	kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
 
 	for (i = 1; i < info->hdr->e_shnum; i++) {
-		const char *name = info->secstrings + info->sechdrs[i].sh_name;
-		if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
-			continue;
-		if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
+		/* Scan all writable sections that's not executable */
+		if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
+		    !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
+		    (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
 			continue;
 
 		kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
-- 
cgit v1.2.3


From 89c837351db0b9b52fd572ec8b0445a42e59b75c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 15 May 2013 20:46:23 +0100
Subject: kmemleak: No need for scanning specific module sections

As kmemleak now scans all module sections that are allocated, writable
and non executable, there's no need to scan individual sections that
might reference data.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 06f496a5d182..cab4bce49c23 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2769,24 +2769,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
 	mod->trace_events = section_objs(info, "_ftrace_events",
 					 sizeof(*mod->trace_events),
 					 &mod->num_trace_events);
-	/*
-	 * This section contains pointers to allocated objects in the trace
-	 * code and not scanning it leads to false positives.
-	 */
-	kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
-			   mod->num_trace_events, GFP_KERNEL);
 #endif
 #ifdef CONFIG_TRACING
 	mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
 					 sizeof(*mod->trace_bprintk_fmt_start),
 					 &mod->num_trace_bprintk_fmt);
-	/*
-	 * This section contains pointers to allocated objects in the trace
-	 * code and not scanning it leads to false positives.
-	 */
-	kmemleak_scan_area(mod->trace_bprintk_fmt_start,
-			   sizeof(*mod->trace_bprintk_fmt_start) *
-			   mod->num_trace_bprintk_fmt, GFP_KERNEL);
 #endif
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 	/* sechdrs[0].sh_size is always zero */
-- 
cgit v1.2.3


From fbe06b7bae7c9cf6ab05168fce5ee93b2f4bae7c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 17 May 2013 11:49:10 -0700
Subject: x86, range: fix missing merge during add range
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Christian found v3.9 does not work with E350 with EFI is enabled.

[    1.658832] Trying to unpack rootfs image as initramfs...
[    1.679935] BUG: unable to handle kernel paging request at ffff88006e3fd000
[    1.686940] IP: [<ffffffff813661df>] memset+0x1f/0xb0
[    1.692010] PGD 1f77067 PUD 1f7a067 PMD 61420067 PTE 0

but early memtest report all memory could be accessed without problem.

early page table is set in following sequence:
[    0.000000] init_memory_mapping: [mem 0x00000000-0x000fffff]
[    0.000000] init_memory_mapping: [mem 0x6e600000-0x6e7fffff]
[    0.000000] init_memory_mapping: [mem 0x6c000000-0x6e5fffff]
[    0.000000] init_memory_mapping: [mem 0x00100000-0x6bffffff]
[    0.000000] init_memory_mapping: [mem 0x6e800000-0x6ea07fff]
but later efi_enter_virtual_mode try set mapping again wrongly.
[    0.010644] pid_max: default: 32768 minimum: 301
[    0.015302] init_memory_mapping: [mem 0x640c5000-0x6e3fcfff]
that means it fails with pfn_range_is_mapped.

It turns out that we have a bug in add_range_with_merge and it does not
merge range properly when new add one fill the hole between two exsiting
ranges. In the case when [mem 0x00100000-0x6bffffff] is the hole between
[mem 0x00000000-0x000fffff] and [mem 0x6c000000-0x6e7fffff].

Fix the add_range_with_merge by calling itself recursively.

Reported-by: "Christian König" <christian.koenig@amd.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/CAE9FiQVofGoSk7q5-0irjkBxemqK729cND4hov-1QCBJDhxpgQ@mail.gmail.com
Cc: <stable@vger.kernel.org> v3.9
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 kernel/range.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/range.c b/kernel/range.c
index 071b0ab455cb..eb911dbce267 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -48,9 +48,11 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
 		final_start = min(range[i].start, start);
 		final_end = max(range[i].end, end);
 
-		range[i].start = final_start;
-		range[i].end =  final_end;
-		return nr_range;
+		/* clear it and add it back for further merge */
+		range[i].start = 0;
+		range[i].end =  0;
+		return add_range_with_merge(range, az, nr_range,
+			final_start, final_end);
 	}
 
 	/* Need to add it: */
-- 
cgit v1.2.3


From ca1643186d3dce6171d8f171e516b02496360a9e Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 23 May 2013 11:51:10 -0400
Subject: tracing: Fix crash when ftrace=nop on the kernel command line

If ftrace=<tracer> is on the kernel command line, when that tracer is
registered, it will be initiated by tracing_set_tracer() to execute that
tracer.

The nop tracer is just a stub tracer that is used to have no tracer
enabled. It is assigned at early bootup as it is the default tracer.

But if ftrace=nop is on the kernel command line, the registering of the
nop tracer will call tracing_set_tracer() which will try to execute
the nop tracer. But it expects tr->current_trace to be assigned something
as it usually is assigned to the nop tracer. As it hasn't been assigned
to anything yet, it causes the system to crash.

The simple fix is to move the tr->current_trace = nop before registering
the nop tracer. The functionality is still the same as the nop tracer
doesn't do anything anyway.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ae6fa2d1cdf7..4d79485b3237 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6216,10 +6216,15 @@ __init static int tracer_alloc_buffers(void)
 
 	trace_init_cmdlines();
 
-	register_tracer(&nop_trace);
-
+	/*
+	 * register_tracer() might reference current_trace, so it
+	 * needs to be set before we register anything. This is
+	 * just a bootstrap of current_trace anyway.
+	 */
 	global_trace.current_trace = &nop_trace;
 
+	register_tracer(&nop_trace);
+
 	/* All seems OK, enable tracing */
 	tracing_disabled = 0;
 
-- 
cgit v1.2.3


From 7805d000db30a3787a4c969bab6ae4d8a5fd8ce6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 24 May 2013 10:50:24 +0900
Subject: cgroup: fix a subtle bug in descendant pre-order walk

When cgroup_next_descendant_pre() initiates a walk, it checks whether
the subtree root doesn't have any children and if not returns NULL.
Later code assumes that the subtree isn't empty.  This is broken
because the subtree may become empty inbetween, which can lead to the
traversal escaping the subtree by walking to the sibling of the
subtree root.

There's no reason to have the early exit path.  Remove it along with
the later assumption that the subtree isn't empty.  This simplifies
the code a bit and fixes the subtle bug.

While at it, fix the comment of cgroup_for_each_descendant_pre() which
was incorrectly referring to ->css_offline() instead of
->css_online().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: stable@vger.kernel.org
---
 kernel/cgroup.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 38b136553044..31e9ef319070 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2954,11 +2954,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	/* if first iteration, pretend we just visited @cgroup */
-	if (!pos) {
-		if (list_empty(&cgroup->children))
-			return NULL;
+	if (!pos)
 		pos = cgroup;
-	}
 
 	/* visit the first child if exists */
 	next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
@@ -2966,14 +2963,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 		return next;
 
 	/* no child, visit my or the closest ancestor's next sibling */
-	do {
+	while (pos != cgroup) {
 		next = list_entry_rcu(pos->sibling.next, struct cgroup,
 				      sibling);
 		if (&next->sibling != &pos->parent->children)
 			return next;
 
 		pos = pos->parent;
-	} while (pos != cgroup);
+	}
 
 	return NULL;
 }
-- 
cgit v1.2.3


From 387b8b3e37cb1c257fb607787f73815c30d22859 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 24 May 2013 15:55:25 -0700
Subject: auditfilter.c: fix kernel-doc warnings

Fix kernel-doc warnings in kernel/auditfilter.c:

  Warning(kernel/auditfilter.c:1029): Excess function parameter 'loginuid' description in 'audit_receive_filter'
  Warning(kernel/auditfilter.c:1029): Excess function parameter 'sessionid' description in 'audit_receive_filter'
  Warning(kernel/auditfilter.c:1029): Excess function parameter 'sid' description in 'audit_receive_filter'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/auditfilter.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 83a2970295d1..6bd4a90d1991 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1021,9 +1021,6 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
  * @seq: netlink audit message sequence (serial) number
  * @data: payload data
  * @datasz: size of payload data
- * @loginuid: loginuid of sender
- * @sessionid: sessionid for netlink audit message
- * @sid: SE Linux Security ID of sender
  */
 int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz)
 {
-- 
cgit v1.2.3


From 2938d2757fc99c26aa678ce4eba910c4a77c3a55 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 28 May 2013 09:33:01 +0200
Subject: tick: Cure broadcast false positive pending bit warning

commit 26517f3e (tick: Avoid programming the local cpu timer if
broadcast pending) added a warning if the cpu enters broadcast mode
again while the pending bit is still set. Meelis reported that the
warning triggers. There are two corner cases which have been not
considered:

1) cpuidle calls clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER)
   twice. That can result in the following scenario

   CPU0                    CPU1
                           cpuidle_idle_call()
                             clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER)
                               set cpu in tick_broadcast_oneshot_mask

   broadcast interrupt
     event expired for cpu1
     set pending bit

                             acpi_idle_enter_simple()
                               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER)
                                 WARN_ON(pending bit)

  Move the WARN_ON into the section where we enter broadcast mode so
  it wont provide false positives on the second call.

2) safe_halt() enables interrupts, so a broadcast interrupt can be
   delivered befor the broadcast mode is disabled. That sets the
   pending bit for the CPU which receives the broadcast
   interrupt. Though the interrupt is delivered right away from the
   broadcast handler and leaves the pending bit stale.

   Clear the pending bit for the current cpu in the broadcast handler.

Reported-and-tested-by: Meelis Roos <mroos@linux.ee>
Cc: Len Brown <lenb@kernel.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305271841130.4220@ionos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 24938d577669..0c739423b0f9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -511,6 +511,12 @@ again:
 		}
 	}
 
+	/*
+	 * Remove the current cpu from the pending mask. The event is
+	 * delivered immediately in tick_do_broadcast() !
+	 */
+	cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask);
+
 	/* Take care of enforced broadcast requests */
 	cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
 	cpumask_clear(tick_broadcast_force_mask);
@@ -575,8 +581,8 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 
 	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
-		WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
 		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
+			WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
 			/*
 			 * We only reprogram the broadcast timer if we
-- 
cgit v1.2.3


From 7b959fc582741227a1c4cba710d6aff8fb183128 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Wed, 22 May 2013 18:34:09 +0900
Subject: kprobes: Fix to free gone and unused optprobes

Fix to free gone and unused optprobes. This bug will
cause a kernel panic if the user reuses the killed and
unused probe.

Reported at:

  http://sourceware.org/ml/systemtap/2013-q2/msg00142.html

In the normal path, an optprobe on an init function is
unregistered when a module goes live.

unregister_kprobe(kp)
 -> __unregister_kprobe_top
   ->__disable_kprobe
     ->disarm_kprobe(ap == op)
       ->__disarm_kprobe
        ->unoptimize_kprobe : the op is queued
                              on unoptimizing_list
and do nothing in __unregister_kprobe_bottom

After a while (usually wait 5 jiffies), kprobe_optimizer
runs to unoptimize and free optprobe.

kprobe_optimizer
 ->do_unoptimize_kprobes
   ->arch_unoptimize_kprobes : moved to free_list
 ->do_free_cleaned_kprobes
   ->hlist_del: the op is removed
   ->free_aggr_kprobe
     ->arch_remove_optimized_kprobe
     ->arch_remove_kprobe
     ->kfree: the op is freed

Here, if kprobes_module_callback is called and the delayed
unoptimizing probe is picked BEFORE kprobe_optimizer runs,

kprobes_module_callback
 ->kill_kprobe
   ->kill_optimized_kprobe : dequeued from unoptimizing_list <=!!!
     ->arch_remove_optimized_kprobe
   ->arch_remove_kprobe
   (but op is not freed, and on the kprobe hash table)

This doesn't happen if the probe unregistration is done AFTER
kprobes_module_callback is called (because at that time the op
is gone), and kprobe-tracer does it.

To fix this bug, this patch changes kprobes_module_callback to
enqueue the op to freeing_list at kill_optimized_kprobe only
if the op is unused. The unused probes on freeing_list will
be freed in do_free_cleaned_kprobes.

Note that this calls arch_remove_*kprobe twice on the
same probe. Thus those functions have to check the double free.
Fortunately, most of arch codes already checked that except
for mips. This will be fixed in the next patch.

Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Timo Juhani Lindfors <timo.lindfors@iki.fi>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: systemtap@sourceware.org
Cc: yrl.pp-manager.tt@hitachi.com
Cc: David S. Miller <davem@davemloft.net>
Cc: "David S. Miller" <davem@davemloft.net>
Link: http://lkml.kernel.org/r/20130522093409.9084.63554.stgit@mhiramat-M0-7522
[ Minor edits. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/kprobes.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3fed7f0cbcdf..bddf3b201a48 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
 /* Optimization staging list, protected by kprobe_mutex */
 static LIST_HEAD(optimizing_list);
 static LIST_HEAD(unoptimizing_list);
+static LIST_HEAD(freeing_list);
 
 static void kprobe_optimizer(struct work_struct *work);
 static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
@@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void)
  * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
  * if need) kprobes listed on unoptimizing_list.
  */
-static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
+static __kprobes void do_unoptimize_kprobes(void)
 {
 	struct optimized_kprobe *op, *tmp;
 
@@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
 	/* Ditto to do_optimize_kprobes */
 	get_online_cpus();
 	mutex_lock(&text_mutex);
-	arch_unoptimize_kprobes(&unoptimizing_list, free_list);
+	arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
 	/* Loop free_list for disarming */
-	list_for_each_entry_safe(op, tmp, free_list, list) {
+	list_for_each_entry_safe(op, tmp, &freeing_list, list) {
 		/* Disarm probes if marked disabled */
 		if (kprobe_disabled(&op->kp))
 			arch_disarm_kprobe(&op->kp);
@@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
 }
 
 /* Reclaim all kprobes on the free_list */
-static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
+static __kprobes void do_free_cleaned_kprobes(void)
 {
 	struct optimized_kprobe *op, *tmp;
 
-	list_for_each_entry_safe(op, tmp, free_list, list) {
+	list_for_each_entry_safe(op, tmp, &freeing_list, list) {
 		BUG_ON(!kprobe_unused(&op->kp));
 		list_del_init(&op->list);
 		free_aggr_kprobe(&op->kp);
@@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void)
 /* Kprobe jump optimizer */
 static __kprobes void kprobe_optimizer(struct work_struct *work)
 {
-	LIST_HEAD(free_list);
-
 	mutex_lock(&kprobe_mutex);
 	/* Lock modules while optimizing kprobes */
 	mutex_lock(&module_mutex);
@@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
 	 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
 	 * kprobes before waiting for quiesence period.
 	 */
-	do_unoptimize_kprobes(&free_list);
+	do_unoptimize_kprobes();
 
 	/*
 	 * Step 2: Wait for quiesence period to ensure all running interrupts
@@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
 	do_optimize_kprobes();
 
 	/* Step 4: Free cleaned kprobes after quiesence period */
-	do_free_cleaned_kprobes(&free_list);
+	do_free_cleaned_kprobes();
 
 	mutex_unlock(&module_mutex);
 	mutex_unlock(&kprobe_mutex);
@@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
 	if (!list_empty(&op->list))
 		/* Dequeue from the (un)optimization queue */
 		list_del_init(&op->list);
-
 	op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+
+	if (kprobe_unused(p)) {
+		/* Enqueue if it is unused */
+		list_add(&op->list, &freeing_list);
+		/*
+		 * Remove unused probes from the hash list. After waiting
+		 * for synchronization, this probe is reclaimed.
+		 * (reclaiming is done by do_free_cleaned_kprobes().)
+		 */
+		hlist_del_rcu(&op->kp.hlist);
+	}
+
 	/* Don't touch the code, because it is already freed. */
 	arch_remove_optimized_kprobe(op);
 }
-- 
cgit v1.2.3


From 26cb63ad11e04047a64309362674bcbbd6a6f246 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 28 May 2013 10:55:48 +0200
Subject: perf: Fix perf mmap bugs

Vince reported a problem found by his perf specific trinity
fuzzer.

Al noticed 2 problems with perf's mmap():

 - it has issues against fork() since we use vma->vm_mm for accounting.
 - it has an rb refcount leak on double mmap().

We fix the issues against fork() by using VM_DONTCOPY; I don't
think there's code out there that uses this; we didn't hear
about weird accounting problems/crashes. If we do need this to
work, the previously proposed VM_PINNED could make this work.

Aside from the rb reference leak spotted by Al, Vince's example
prog was indeed doing a double mmap() through the use of
perf_event_set_output().

This exposes another problem, since we now have 2 events with
one buffer, the accounting gets screwy because we account per
event. Fix this by making the buffer responsible for its own
accounting.

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Link: http://lkml.kernel.org/r/20130528085548.GA12193@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c     | 37 ++++++++++++++++++++-----------------
 kernel/events/internal.h |  3 +++
 2 files changed, 23 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9dc297faf7c0..ae752cd4a086 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2917,7 +2917,7 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static void ring_buffer_put(struct ring_buffer *rb);
+static bool ring_buffer_put(struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -3582,13 +3582,13 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static void ring_buffer_put(struct ring_buffer *rb)
+static bool ring_buffer_put(struct ring_buffer *rb)
 {
 	struct perf_event *event, *n;
 	unsigned long flags;
 
 	if (!atomic_dec_and_test(&rb->refcount))
-		return;
+		return false;
 
 	spin_lock_irqsave(&rb->event_lock, flags);
 	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
@@ -3598,6 +3598,7 @@ static void ring_buffer_put(struct ring_buffer *rb)
 	spin_unlock_irqrestore(&rb->event_lock, flags);
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
+	return true;
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
@@ -3612,18 +3613,20 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		unsigned long size = perf_data_size(event->rb);
-		struct user_struct *user = event->mmap_user;
 		struct ring_buffer *rb = event->rb;
+		struct user_struct *mmap_user = rb->mmap_user;
+		int mmap_locked = rb->mmap_locked;
+		unsigned long size = perf_data_size(rb);
 
-		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
-		vma->vm_mm->pinned_vm -= event->mmap_locked;
 		rcu_assign_pointer(event->rb, NULL);
 		ring_buffer_detach(event, rb);
 		mutex_unlock(&event->mmap_mutex);
 
-		ring_buffer_put(rb);
-		free_uid(user);
+		if (ring_buffer_put(rb)) {
+			atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+			vma->vm_mm->pinned_vm -= mmap_locked;
+			free_uid(mmap_user);
+		}
 	}
 }
 
@@ -3676,9 +3679,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	WARN_ON_ONCE(event->ctx->parent_ctx);
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages == nr_pages)
-			atomic_inc(&event->rb->refcount);
-		else
+		if (event->rb->nr_pages != nr_pages)
 			ret = -EINVAL;
 		goto unlock;
 	}
@@ -3720,12 +3721,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		ret = -ENOMEM;
 		goto unlock;
 	}
-	rcu_assign_pointer(event->rb, rb);
+
+	rb->mmap_locked = extra;
+	rb->mmap_user = get_current_user();
 
 	atomic_long_add(user_extra, &user->locked_vm);
-	event->mmap_locked = extra;
-	event->mmap_user = get_current_user();
-	vma->vm_mm->pinned_vm += event->mmap_locked;
+	vma->vm_mm->pinned_vm += extra;
+
+	rcu_assign_pointer(event->rb, rb);
 
 	perf_event_update_userpage(event);
 
@@ -3734,7 +3737,7 @@ unlock:
 		atomic_inc(&event->mmap_count);
 	mutex_unlock(&event->mmap_mutex);
 
-	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &perf_mmap_vmops;
 
 	return ret;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index eb675c4d59df..5bc6c8e9b851 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,6 +31,9 @@ struct ring_buffer {
 	spinlock_t			event_lock;
 	struct list_head		event_list;
 
+	int				mmap_locked;
+	struct user_struct		*mmap_user;
+
 	struct perf_event_mmap_page	*user_page;
 	void				*data_pages[0];
 };
-- 
cgit v1.2.3


From 6721cb60022629ae76365551f05d9658b8d14c55 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 23 May 2013 14:21:36 -0400
Subject: ring-buffer: Do not poll non allocated cpu buffers

The tracing infrastructure sets up for possible CPUs, but it uses
the ring buffer polling, it is possible to call the ring buffer
polling code with a CPU that hasn't been allocated. This will cause
a kernel oops when it access a ring buffer cpu buffer that is part
of the possible cpus but hasn't been allocated yet as the CPU has never
been online.

Reported-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Tested-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b59aea2c48c2..e444ff88f0a4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -620,6 +620,9 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
 	if (cpu == RING_BUFFER_ALL_CPUS)
 		work = &buffer->irq_work;
 	else {
+		if (!cpumask_test_cpu(cpu, buffer->cpumask))
+			return -EINVAL;
+
 		cpu_buffer = buffer->buffers[cpu];
 		work = &cpu_buffer->irq_work;
 	}
-- 
cgit v1.2.3


From aa848233f740abbabfa7669daca0ab94aaa37bcd Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Fri, 3 May 2013 23:27:07 +0200
Subject: ntp: Remove unused variable flags in __hardpps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

kernel/time/ntp.c: In function ‘__hardpps’:
kernel/time/ntp.c:877: warning: unused variable ‘flags’

commit a076b2146fabb0894cae5e0189a8ba3f1502d737 ("ntp: Remove ntp_lock,
using the timekeeping locks to protect ntp state") removed its users,
but not the actual variable.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/ntp.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 12ff13a838c6..8f5b3b98577b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -874,7 +874,6 @@ static void hardpps_update_phase(long error)
 void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
 {
 	struct pps_normtime pts_norm, freq_norm;
-	unsigned long flags;
 
 	pts_norm = pps_normalize_ts(*phase_ts);
 
-- 
cgit v1.2.3


From 0d6bd9953f739dad96d9a0de65383e479ab4e10d Mon Sep 17 00:00:00 2001
From: Zoran Markovic <zoran.markovic@linaro.org>
Date: Fri, 17 May 2013 11:24:05 -0700
Subject: timekeeping: Correct run-time detection of persistent_clock.

Since commit 31ade30692dc9680bfc95700d794818fa3f754ac, timekeeping_init()
checks for presence of persistent clock by attempting to read a non-zero
time value. This is an issue on platforms where persistent_clock (instead
is implemented as a free-running counter (instead of an RTC) starting
from zero on each boot and running during suspend. Examples are some ARM
platforms (e.g. PandaBoard).

An attempt to read such a clock during timekeeping_init() may return zero
value and falsely declare persistent clock as missing. Additionally, in
the above case suspend times may be accounted twice (once from
timekeeping_resume() and once from rtc_resume()), resulting in a gradual
drift of system time.

This patch does a run-time correction of the issue by doing the same check
during timekeeping_suspend().

A better long-term solution would have to return error when trying to read
non-existing clock and zero when trying to read an uninitialized clock, but
that would require changing all persistent_clock implementations.

This patch addresses the immediate breakage, for now.

Cc: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Feng Tang <feng.tang@intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Zoran Markovic <zoran.markovic@linaro.org>
[jstultz: Tweaked commit message and subject]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/timekeeping.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 98cd470bbe49..baeeb5c87cf1 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -975,6 +975,14 @@ static int timekeeping_suspend(void)
 
 	read_persistent_clock(&timekeeping_suspend_time);
 
+	/*
+	 * On some systems the persistent_clock can not be detected at
+	 * timekeeping_init by its return value, so if we see a valid
+	 * value returned, update the persistent_clock_exists flag.
+	 */
+	if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
+		persistent_clock_exist = true;
+
 	raw_spin_lock_irqsave(&timekeeper_lock, flags);
 	write_seqcount_begin(&timekeeper_seq);
 	timekeeping_forward_now(tk);
-- 
cgit v1.2.3


From 2a0ff3fbe39bc93f719ff857e5a359d9780579ff Mon Sep 17 00:00:00 2001
From: Jeff Liu <jeff.liu@oracle.com>
Date: Sun, 26 May 2013 21:33:09 +0800
Subject: cgroup: warn about mismatching options of a new mount of an existing
 hierarchy

With the new __DEVEL__sane_behavior mount option was introduced,
if the root cgroup is alive with no xattr function, to mount a
new cgroup with xattr will be rejected in terms of design which
just fine.  However, if the root cgroup does not mounted with
__DEVEL__sane_hehavior, to create a new cgroup with xattr option
will succeed although after that the EA function does not works
as expected but will get ENOTSUPP for setting up attributes under
either cgroup. e.g.

setfattr: /cgroup2/test: Operation not supported

Instead of keeping silence in this case, it's better to drop a log
entry in warning level.  That would be helpful to understand the
reason behind the scene from the user's perspective, and this is
essentially an improvement does not break the backward compatibilities.

With this fix, above mount attemption will keep up works as usual but
the following line cound be found at the system log:

[ ...] cgroup: new mount options do not match the existing superblock

tj: minor formatting / message updates.

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Reported-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
---
 kernel/cgroup.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 31e9ef319070..a7c9e6ddb979 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1686,11 +1686,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 */
 		cgroup_drop_root(opts.new_root);
 
-		if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) &&
-		    root->flags != opts.flags) {
-			pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
-			ret = -EINVAL;
-			goto drop_new_super;
+		if (root->flags != opts.flags) {
+			if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
+				pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
+				ret = -EINVAL;
+				goto drop_new_super;
+			} else {
+				pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
+			}
 		}
 
 		/* no subsys rebinding, so refcounts don't change */
-- 
cgit v1.2.3


From 1bb539ca36e21c2f4fce0865e11df384bc7b7656 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 28 May 2013 14:38:43 -0400
Subject: ftrace: Use the rcu _notrace variants for rcu_dereference_raw() and
 friends

As rcu_dereference_raw() under RCU debug config options can add quite a
bit of checks, and that tracing uses rcu_dereference_raw(), these checks
happen with the function tracer. The function tracer also happens to trace
these debug checks too. This added overhead can livelock the system.

Have the function tracer use the new RCU _notrace equivalents that do
not do the debug checks for RCU.

Link: http://lkml.kernel.org/r/20130528184209.467603904@goodmis.org

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b549b0f5b977..6c508ff33c62 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -120,22 +120,22 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
 
 /*
  * Traverse the ftrace_global_list, invoking all entries.  The reason that we
- * can use rcu_dereference_raw() is that elements removed from this list
+ * can use rcu_dereference_raw_notrace() is that elements removed from this list
  * are simply leaked, so there is no need to interact with a grace-period
- * mechanism.  The rcu_dereference_raw() calls are needed to handle
+ * mechanism.  The rcu_dereference_raw_notrace() calls are needed to handle
  * concurrent insertions into the ftrace_global_list.
  *
  * Silly Alpha and silly pointer-speculation compiler optimizations!
  */
 #define do_for_each_ftrace_op(op, list)			\
-	op = rcu_dereference_raw(list);			\
+	op = rcu_dereference_raw_notrace(list);			\
 	do
 
 /*
  * Optimized for just a single item in the list (as that is the normal case).
  */
 #define while_for_each_ftrace_op(op)				\
-	while (likely(op = rcu_dereference_raw((op)->next)) &&	\
+	while (likely(op = rcu_dereference_raw_notrace((op)->next)) &&	\
 	       unlikely((op) != &ftrace_list_end))
 
 static inline void ftrace_ops_init(struct ftrace_ops *ops)
@@ -779,7 +779,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
 	if (hlist_empty(hhd))
 		return NULL;
 
-	hlist_for_each_entry_rcu(rec, hhd, node) {
+	hlist_for_each_entry_rcu_notrace(rec, hhd, node) {
 		if (rec->ip == ip)
 			return rec;
 	}
@@ -1165,7 +1165,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
 
 	hhd = &hash->buckets[key];
 
-	hlist_for_each_entry_rcu(entry, hhd, hlist) {
+	hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) {
 		if (entry->ip == ip)
 			return entry;
 	}
@@ -1422,8 +1422,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
 	struct ftrace_hash *notrace_hash;
 	int ret;
 
-	filter_hash = rcu_dereference_raw(ops->filter_hash);
-	notrace_hash = rcu_dereference_raw(ops->notrace_hash);
+	filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
+	notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
 
 	if ((ftrace_hash_empty(filter_hash) ||
 	     ftrace_lookup_ip(filter_hash, ip)) &&
@@ -2920,7 +2920,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
 	 * on the hash. rcu_read_lock is too dangerous here.
 	 */
 	preempt_disable_notrace();
-	hlist_for_each_entry_rcu(entry, hhd, node) {
+	hlist_for_each_entry_rcu_notrace(entry, hhd, node) {
 		if (entry->ip == ip)
 			entry->ops->func(ip, parent_ip, &entry->data);
 	}
-- 
cgit v1.2.3


From 0184d50f9fd17658c232d6ee6d465a87f989d706 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 29 May 2013 15:56:49 -0400
Subject: tracing: Fix bad parameter passed in branch selftest

The branch selftest calls trace_test_buffer(), but with the new code
it expects the first parameter to be a pointer to a struct trace_buffer.
All self tests were changed but the branch selftest was missed.

This caused either a crash or failed test when the branch selftest was
enabled.

Link: http://lkml.kernel.org/r/20130529141333.GA24064@localhost

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_selftest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 55e2cf66967b..2901e3b88590 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1159,7 +1159,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 	/* stop the tracing. */
 	tracing_stop();
 	/* check the trace buffer */
-	ret = trace_test_buffer(tr, &count);
+	ret = trace_test_buffer(&tr->trace_buffer, &count);
 	trace->reset(tr);
 	tracing_start();
 
-- 
cgit v1.2.3


From 45eacc692771bd2b1ea3d384e6345cab3da10861 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 15 May 2013 22:16:32 +0200
Subject: vtime: Use consistent clocks among nohz accounting

While computing the cputime delta of dynticks CPUs,
we are mixing up clocks of differents natures:

* local_clock() which takes care of unstable clock
sources and fix these if needed.

* sched_clock() which is the weaker version of
local_clock(). It doesn't compute any fixup in case
of unstable source.

If the clock source is stable, those two clocks are the
same and we can safely compute the difference against
two random points.

Otherwise it results in random deltas as sched_clock()
can randomly drift away, back or forward, from local_clock().

As a consequence, some strange behaviour with unstable tsc
has been observed such as non progressing constant zero cputime.
(The 'top' command showing no load).

Fix this by only using local_clock(), or its irq safe/remote
equivalent, in vtime code.

Reported-by: Mike Galbraith <efault@gmx.de>
Suggested-by: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Li Zhong <zhong@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c    | 2 +-
 kernel/sched/cputime.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 58453b8272fd..e1a27f918723 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4745,7 +4745,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 */
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
-	vtime_init_idle(idle);
+	vtime_init_idle(idle, cpu);
 #if defined(CONFIG_SMP)
 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index cc2dc3eea8a3..b5ccba22603b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -747,17 +747,17 @@ void arch_vtime_task_switch(struct task_struct *prev)
 
 	write_seqlock(&current->vtime_seqlock);
 	current->vtime_snap_whence = VTIME_SYS;
-	current->vtime_snap = sched_clock();
+	current->vtime_snap = sched_clock_cpu(smp_processor_id());
 	write_sequnlock(&current->vtime_seqlock);
 }
 
-void vtime_init_idle(struct task_struct *t)
+void vtime_init_idle(struct task_struct *t, int cpu)
 {
 	unsigned long flags;
 
 	write_seqlock_irqsave(&t->vtime_seqlock, flags);
 	t->vtime_snap_whence = VTIME_SYS;
-	t->vtime_snap = sched_clock();
+	t->vtime_snap = sched_clock_cpu(cpu);
 	write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
 }
 
-- 
cgit v1.2.3


From 521921bad1192fb1b8f9b6a5aa673635848b8b5f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 16 May 2013 01:21:38 +0200
Subject: kvm: Move guest entry/exit APIs to context_tracking

The kvm_host.h header file doesn't handle well
inclusion when archs don't support KVM.

This results in build crashes for such archs when they
want to implement context tracking because this subsystem
includes kvm_host.h in order to implement the
guest_enter/exit APIs but it doesn't handle KVM off case.

To fix this, move the guest_enter()/guest_exit()
declarations and generic implementation to the context
tracking headers. These generic APIs actually belong to
this subsystem, besides other domains boundary tracking
like user_enter() et al.

KVM now properly becomes a user of this library, not the
other buggy way around.

Reported-by: Kevin Hilman <khilman@linaro.org>
Reviewed-by: Kevin Hilman <khilman@linaro.org>
Tested-by: Kevin Hilman <khilman@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/context_tracking.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 65349f07b878..85bdde1137eb 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/context_tracking.h>
-#include <linux/kvm_host.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/hardirq.h>
-- 
cgit v1.2.3


From 1a7f829f094dd7951e7d46c571a18080e455a436 Mon Sep 17 00:00:00 2001
From: Li Zhong <zhong@linux.vnet.ibm.com>
Date: Fri, 17 May 2013 16:44:04 +0800
Subject: nohz: Fix notifier return val that enforce timekeeping

In tick_nohz_cpu_down_callback() if the cpu is the one handling
timekeeping, we must return something that stops the CPU_DOWN_PREPARE
notifiers and then start notify CPU_DOWN_FAILED on the already called
notifier call backs.

However traditional errno values are not handled by the notifier unless
these are encapsulated using errno_to_notifier().

Hence the current -EINVAL is misinterpreted and converted to junk after
notifier_to_errno(), leaving the notifier subsystem to random behaviour
such as eventually allowing the cpu to go down.

Fix this by using the standard NOTIFY_BAD instead.

Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
Reviewed-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4208138fbf4..0cf1c1453181 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
 		 * we can't safely shutdown that CPU.
 		 */
 		if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
-			return -EINVAL;
+			return NOTIFY_BAD;
 		break;
 	}
 	return NOTIFY_OK;
-- 
cgit v1.2.3


From f5d00c1f9adb350c24c5301600f7bf2da99b66de Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Tue, 28 May 2013 15:29:03 +0200
Subject: tick: Remove useless timekeeping duty attribution to broadcast source

Since 7300711e ("clockevents: broadcast fixup possible waiters"),
the timekeeping duty is assigned to the CPU that handles the tick
broadcast clock device by the time it is set in one shot mode.

This is an issue in full dynticks mode where the timekeeping duty
must stay handled by the boot CPU for now. Otherwise it prevents
secondary CPUs from offlining and this breaks
suspend/shutdown/reboot/...

As it appears there is no reason for this timekeeping duty to be
moved to the broadcast CPU, besides nothing prevent it from being
later re-assigned to another target, let's simply remove it.

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-broadcast.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 0c739423b0f9..b4c245580b79 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -698,10 +698,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 
 		bc->event_handler = tick_handle_oneshot_broadcast;
 
-		/* Take the do_timer update */
-		if (!tick_nohz_full_cpu(cpu))
-			tick_do_timer_cpu = cpu;
-
 		/*
 		 * We must be careful here. There might be other CPUs
 		 * waiting for periodic broadcast. We need to set the
-- 
cgit v1.2.3


From f17a5194859a82afe4164e938b92035b86c55794 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 30 May 2013 21:10:37 -0400
Subject: tracing: Use current_uid() for critical time tracing

The irqsoff tracer records the max time that interrupts are disabled.
There are hooks in the assembly code that calls back into the tracer when
interrupts are disabled or enabled.

When they are enabled, the tracer checks if the amount of time they
were disabled is larger than the previous recorded max interrupts off
time. If it is, it creates a snapshot of the currently running trace
to store where the last largest interrupts off time was held and how
it happened.

During testing, this RCU lockdep dump appeared:

[ 1257.829021] ===============================
[ 1257.829021] [ INFO: suspicious RCU usage. ]
[ 1257.829021] 3.10.0-rc1-test+ #171 Tainted: G        W
[ 1257.829021] -------------------------------
[ 1257.829021] /home/rostedt/work/git/linux-trace.git/include/linux/rcupdate.h:780 rcu_read_lock() used illegally while idle!
[ 1257.829021]
[ 1257.829021] other info that might help us debug this:
[ 1257.829021]
[ 1257.829021]
[ 1257.829021] RCU used illegally from idle CPU!
[ 1257.829021] rcu_scheduler_active = 1, debug_locks = 0
[ 1257.829021] RCU used illegally from extended quiescent state!
[ 1257.829021] 2 locks held by trace-cmd/4831:
[ 1257.829021]  #0:  (max_trace_lock){......}, at: [<ffffffff810e2b77>] stop_critical_timing+0x1a3/0x209
[ 1257.829021]  #1:  (rcu_read_lock){.+.+..}, at: [<ffffffff810dae5a>] __update_max_tr+0x88/0x1ee
[ 1257.829021]
[ 1257.829021] stack backtrace:
[ 1257.829021] CPU: 3 PID: 4831 Comm: trace-cmd Tainted: G        W    3.10.0-rc1-test+ #171
[ 1257.829021] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007
[ 1257.829021]  0000000000000001 ffff880065f49da8 ffffffff8153dd2b ffff880065f49dd8
[ 1257.829021]  ffffffff81092a00 ffff88006bd78680 ffff88007add7500 0000000000000003
[ 1257.829021]  ffff88006bd78680 ffff880065f49e18 ffffffff810daebf ffffffff810dae5a
[ 1257.829021] Call Trace:
[ 1257.829021]  [<ffffffff8153dd2b>] dump_stack+0x19/0x1b
[ 1257.829021]  [<ffffffff81092a00>] lockdep_rcu_suspicious+0x109/0x112
[ 1257.829021]  [<ffffffff810daebf>] __update_max_tr+0xed/0x1ee
[ 1257.829021]  [<ffffffff810dae5a>] ? __update_max_tr+0x88/0x1ee
[ 1257.829021]  [<ffffffff811002b9>] ? user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff810dbf85>] update_max_tr_single+0x11d/0x12d
[ 1257.829021]  [<ffffffff811002b9>] ? user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff810e2b15>] stop_critical_timing+0x141/0x209
[ 1257.829021]  [<ffffffff8109569a>] ? trace_hardirqs_on+0xd/0xf
[ 1257.829021]  [<ffffffff811002b9>] ? user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff810e3057>] time_hardirqs_on+0x2a/0x2f
[ 1257.829021]  [<ffffffff811002b9>] ? user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff8109550c>] trace_hardirqs_on_caller+0x16/0x197
[ 1257.829021]  [<ffffffff8109569a>] trace_hardirqs_on+0xd/0xf
[ 1257.829021]  [<ffffffff811002b9>] user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff810029b4>] do_notify_resume+0x92/0x97
[ 1257.829021]  [<ffffffff8154bdca>] int_signal+0x12/0x17

What happened was entering into the user code, the interrupts were enabled
and a max interrupts off was recorded. The trace buffer was saved along with
various information about the task: comm, pid, uid, priority, etc.

The uid is recorded with task_uid(tsk). But this is a macro that uses rcu_read_lock()
to retrieve the data, and this happened to happen where RCU is blind (user_enter).

As only the preempt and irqs off tracers can have this happen, and they both
only have the tsk == current, if tsk == current, use current_uid() instead of
task_uid(), as current_uid() does not use RCU as only current can change its uid.

This fixes the RCU suspicious splat.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4d79485b3237..1a41023a1f88 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -843,7 +843,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
 	max_data->pid = tsk->pid;
-	max_data->uid = task_uid(tsk);
+	/*
+	 * If tsk == current, then use current_uid(), as that does not use
+	 * RCU. The irq tracer can be called out of RCU scope.
+	 */
+	if (tsk == current)
+		max_data->uid = current_uid();
+	else
+		max_data->uid = task_uid(tsk);
+
 	max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
 	max_data->policy = tsk->policy;
 	max_data->rt_priority = tsk->rt_priority;
-- 
cgit v1.2.3


From 346dbb79ea0118ebb0df372b35cab9d5805216cd Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 25 Apr 2013 19:28:54 +0200
Subject: irqdomain: export irq_domain_add_simple

All other irq_domain_add_* functions are exported already, and apparently
this one got left out by mistake, which causes build errors for ARM
allmodconfig kernels:

ERROR: "irq_domain_add_simple" [drivers/gpio/gpio-rcar.ko] undefined!
ERROR: "irq_domain_add_simple" [drivers/gpio/gpio-em.ko] undefined!

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Grant Likely <grant.likely@linaro.org>
---
 kernel/irq/irqdomain.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 5a83dde8ca0c..d1adaedb435f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -191,6 +191,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
 	/* A linear domain is the default */
 	return irq_domain_add_linear(of_node, size, ops, host_data);
 }
+EXPORT_SYMBOL_GPL(irq_domain_add_simple);
 
 /**
  * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
-- 
cgit v1.2.3


From 275e31b10ce20613aedceaa5160129c64b260a98 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Tue, 14 May 2013 19:02:45 +0800
Subject: kernel/irq/irqdomain.c: before use 'irq_data', need check it whether
 valid.

Since irq_data may be NULL, if so, we WARN_ON(), and continue, 'hwirq'
which related with 'irq_data' has to initialize later, or it will cause
issue.

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Signed-off-by: Grant Likely <grant.likely@linaro.org>
---
 kernel/irq/irqdomain.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d1adaedb435f..8c4c8ea6a205 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -398,11 +398,12 @@ static void irq_domain_disassociate_many(struct irq_domain *domain,
 	while (count--) {
 		int irq = irq_base + count;
 		struct irq_data *irq_data = irq_get_irq_data(irq);
-		irq_hw_number_t hwirq = irq_data->hwirq;
+		irq_hw_number_t hwirq;
 
 		if (WARN_ON(!irq_data || irq_data->domain != domain))
 			continue;
 
+		hwirq = irq_data->hwirq;
 		irq_set_status_flags(irq, IRQ_NOREQUEST);
 
 		/* remove chip and handler */
-- 
cgit v1.2.3


From 94a63da0ac1a67bfb8b30aec1086523c5031ea5a Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 6 Jun 2013 12:10:23 +0100
Subject: irqdomain: document the simple domain first_irq

The first_irq needs to be zero to get a linear domain and that
comes with special semantics. We want to simplify this going
forward but some documentation never hurts.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Grant Likely <grant.likely@linaro.org>
---
 kernel/irq/irqdomain.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8c4c8ea6a205..54a4d5223238 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -143,7 +143,10 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
  * irq_domain_add_simple() - Allocate and register a simple irq_domain.
  * @of_node: pointer to interrupt controller's device tree node.
  * @size: total number of irqs in mapping
- * @first_irq: first number of irq block assigned to the domain
+ * @first_irq: first number of irq block assigned to the domain,
+ *	pass zero to assign irqs on-the-fly. This will result in a
+ *	linear IRQ domain so it is important to use irq_create_mapping()
+ *	for each used IRQ, especially when SPARSE_IRQ is enabled.
  * @ops: map/unmap domain callbacks
  * @host_data: Controller private data pointer
  *
-- 
cgit v1.2.3


From 016a8d5be6ddcc72ef0432d82d9f6fa34f61b907 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 28 May 2013 17:32:53 -0400
Subject: rcu: Don't call wakeup() with rcu_node structure ->lock held

This commit fixes a lockdep-detected deadlock by moving a wake_up()
call out from a rnp->lock critical section.  Please see below for
the long version of this story.

On Tue, 2013-05-28 at 16:13 -0400, Dave Jones wrote:

> [12572.705832] ======================================================
> [12572.750317] [ INFO: possible circular locking dependency detected ]
> [12572.796978] 3.10.0-rc3+ #39 Not tainted
> [12572.833381] -------------------------------------------------------
> [12572.862233] trinity-child17/31341 is trying to acquire lock:
> [12572.870390]  (rcu_node_0){..-.-.}, at: [<ffffffff811054ff>] rcu_read_unlock_special+0x9f/0x4c0
> [12572.878859]
> but task is already holding lock:
> [12572.894894]  (&ctx->lock){-.-...}, at: [<ffffffff811390ed>] perf_lock_task_context+0x7d/0x2d0
> [12572.903381]
> which lock already depends on the new lock.
>
> [12572.927541]
> the existing dependency chain (in reverse order) is:
> [12572.943736]
> -> #4 (&ctx->lock){-.-...}:
> [12572.960032]        [<ffffffff810b9851>] lock_acquire+0x91/0x1f0
> [12572.968337]        [<ffffffff816ebc90>] _raw_spin_lock+0x40/0x80
> [12572.976633]        [<ffffffff8113c987>] __perf_event_task_sched_out+0x2e7/0x5e0
> [12572.984969]        [<ffffffff81088953>] perf_event_task_sched_out+0x93/0xa0
> [12572.993326]        [<ffffffff816ea0bf>] __schedule+0x2cf/0x9c0
> [12573.001652]        [<ffffffff816eacfe>] schedule_user+0x2e/0x70
> [12573.009998]        [<ffffffff816ecd64>] retint_careful+0x12/0x2e
> [12573.018321]
> -> #3 (&rq->lock){-.-.-.}:
> [12573.034628]        [<ffffffff810b9851>] lock_acquire+0x91/0x1f0
> [12573.042930]        [<ffffffff816ebc90>] _raw_spin_lock+0x40/0x80
> [12573.051248]        [<ffffffff8108e6a7>] wake_up_new_task+0xb7/0x260
> [12573.059579]        [<ffffffff810492f5>] do_fork+0x105/0x470
> [12573.067880]        [<ffffffff81049686>] kernel_thread+0x26/0x30
> [12573.076202]        [<ffffffff816cee63>] rest_init+0x23/0x140
> [12573.084508]        [<ffffffff81ed8e1f>] start_kernel+0x3f1/0x3fe
> [12573.092852]        [<ffffffff81ed856f>] x86_64_start_reservations+0x2a/0x2c
> [12573.101233]        [<ffffffff81ed863d>] x86_64_start_kernel+0xcc/0xcf
> [12573.109528]
> -> #2 (&p->pi_lock){-.-.-.}:
> [12573.125675]        [<ffffffff810b9851>] lock_acquire+0x91/0x1f0
> [12573.133829]        [<ffffffff816ebe9b>] _raw_spin_lock_irqsave+0x4b/0x90
> [12573.141964]        [<ffffffff8108e881>] try_to_wake_up+0x31/0x320
> [12573.150065]        [<ffffffff8108ebe2>] default_wake_function+0x12/0x20
> [12573.158151]        [<ffffffff8107bbf8>] autoremove_wake_function+0x18/0x40
> [12573.166195]        [<ffffffff81085398>] __wake_up_common+0x58/0x90
> [12573.174215]        [<ffffffff81086909>] __wake_up+0x39/0x50
> [12573.182146]        [<ffffffff810fc3da>] rcu_start_gp_advanced.isra.11+0x4a/0x50
> [12573.190119]        [<ffffffff810fdb09>] rcu_start_future_gp+0x1c9/0x1f0
> [12573.198023]        [<ffffffff810fe2c4>] rcu_nocb_kthread+0x114/0x930
> [12573.205860]        [<ffffffff8107a91d>] kthread+0xed/0x100
> [12573.213656]        [<ffffffff816f4b1c>] ret_from_fork+0x7c/0xb0
> [12573.221379]
> -> #1 (&rsp->gp_wq){..-.-.}:
> [12573.236329]        [<ffffffff810b9851>] lock_acquire+0x91/0x1f0
> [12573.243783]        [<ffffffff816ebe9b>] _raw_spin_lock_irqsave+0x4b/0x90
> [12573.251178]        [<ffffffff810868f3>] __wake_up+0x23/0x50
> [12573.258505]        [<ffffffff810fc3da>] rcu_start_gp_advanced.isra.11+0x4a/0x50
> [12573.265891]        [<ffffffff810fdb09>] rcu_start_future_gp+0x1c9/0x1f0
> [12573.273248]        [<ffffffff810fe2c4>] rcu_nocb_kthread+0x114/0x930
> [12573.280564]        [<ffffffff8107a91d>] kthread+0xed/0x100
> [12573.287807]        [<ffffffff816f4b1c>] ret_from_fork+0x7c/0xb0

Notice the above call chain.

rcu_start_future_gp() is called with the rnp->lock held. Then it calls
rcu_start_gp_advance, which does a wakeup.

You can't do wakeups while holding the rnp->lock, as that would mean
that you could not do a rcu_read_unlock() while holding the rq lock, or
any lock that was taken while holding the rq lock. This is because...
(See below).

> [12573.295067]
> -> #0 (rcu_node_0){..-.-.}:
> [12573.309293]        [<ffffffff810b8d36>] __lock_acquire+0x1786/0x1af0
> [12573.316568]        [<ffffffff810b9851>] lock_acquire+0x91/0x1f0
> [12573.323825]        [<ffffffff816ebc90>] _raw_spin_lock+0x40/0x80
> [12573.331081]        [<ffffffff811054ff>] rcu_read_unlock_special+0x9f/0x4c0
> [12573.338377]        [<ffffffff810760a6>] __rcu_read_unlock+0x96/0xa0
> [12573.345648]        [<ffffffff811391b3>] perf_lock_task_context+0x143/0x2d0
> [12573.352942]        [<ffffffff8113938e>] find_get_context+0x4e/0x1f0
> [12573.360211]        [<ffffffff811403f4>] SYSC_perf_event_open+0x514/0xbd0
> [12573.367514]        [<ffffffff81140e49>] SyS_perf_event_open+0x9/0x10
> [12573.374816]        [<ffffffff816f4dd4>] tracesys+0xdd/0xe2

Notice the above trace.

perf took its own ctx->lock, which can be taken while holding the rq
lock. While holding this lock, it did a rcu_read_unlock(). The
perf_lock_task_context() basically looks like:

rcu_read_lock();
raw_spin_lock(ctx->lock);
rcu_read_unlock();

Now, what looks to have happened, is that we scheduled after taking that
first rcu_read_lock() but before taking the spin lock. When we scheduled
back in and took the ctx->lock, the following rcu_read_unlock()
triggered the "special" code.

The rcu_read_unlock_special() takes the rnp->lock, which gives us a
possible deadlock scenario.

	CPU0		CPU1		CPU2
	----		----		----

				     rcu_nocb_kthread()
    lock(rq->lock);
		    lock(ctx->lock);
				     lock(rnp->lock);

				     wake_up();

				     lock(rq->lock);

		    rcu_read_unlock();

		    rcu_read_unlock_special();

		    lock(rnp->lock);
    lock(ctx->lock);

**** DEADLOCK ****

> [12573.382068]
> other info that might help us debug this:
>
> [12573.403229] Chain exists of:
>   rcu_node_0 --> &rq->lock --> &ctx->lock
>
> [12573.424471]  Possible unsafe locking scenario:
>
> [12573.438499]        CPU0                    CPU1
> [12573.445599]        ----                    ----
> [12573.452691]   lock(&ctx->lock);
> [12573.459799]                                lock(&rq->lock);
> [12573.467010]                                lock(&ctx->lock);
> [12573.474192]   lock(rcu_node_0);
> [12573.481262]
>  *** DEADLOCK ***
>
> [12573.501931] 1 lock held by trinity-child17/31341:
> [12573.508990]  #0:  (&ctx->lock){-.-...}, at: [<ffffffff811390ed>] perf_lock_task_context+0x7d/0x2d0
> [12573.516475]
> stack backtrace:
> [12573.530395] CPU: 1 PID: 31341 Comm: trinity-child17 Not tainted 3.10.0-rc3+ #39
> [12573.545357]  ffffffff825b4f90 ffff880219f1dbc0 ffffffff816e375b ffff880219f1dc00
> [12573.552868]  ffffffff816dfa5d ffff880219f1dc50 ffff88023ce4d1f8 ffff88023ce4ca40
> [12573.560353]  0000000000000001 0000000000000001 ffff88023ce4d1f8 ffff880219f1dcc0
> [12573.567856] Call Trace:
> [12573.575011]  [<ffffffff816e375b>] dump_stack+0x19/0x1b
> [12573.582284]  [<ffffffff816dfa5d>] print_circular_bug+0x200/0x20f
> [12573.589637]  [<ffffffff810b8d36>] __lock_acquire+0x1786/0x1af0
> [12573.596982]  [<ffffffff810918f5>] ? sched_clock_cpu+0xb5/0x100
> [12573.604344]  [<ffffffff810b9851>] lock_acquire+0x91/0x1f0
> [12573.611652]  [<ffffffff811054ff>] ? rcu_read_unlock_special+0x9f/0x4c0
> [12573.619030]  [<ffffffff816ebc90>] _raw_spin_lock+0x40/0x80
> [12573.626331]  [<ffffffff811054ff>] ? rcu_read_unlock_special+0x9f/0x4c0
> [12573.633671]  [<ffffffff811054ff>] rcu_read_unlock_special+0x9f/0x4c0
> [12573.640992]  [<ffffffff811390ed>] ? perf_lock_task_context+0x7d/0x2d0
> [12573.648330]  [<ffffffff810b429e>] ? put_lock_stats.isra.29+0xe/0x40
> [12573.655662]  [<ffffffff813095a0>] ? delay_tsc+0x90/0xe0
> [12573.662964]  [<ffffffff810760a6>] __rcu_read_unlock+0x96/0xa0
> [12573.670276]  [<ffffffff811391b3>] perf_lock_task_context+0x143/0x2d0
> [12573.677622]  [<ffffffff81139070>] ? __perf_event_enable+0x370/0x370
> [12573.684981]  [<ffffffff8113938e>] find_get_context+0x4e/0x1f0
> [12573.692358]  [<ffffffff811403f4>] SYSC_perf_event_open+0x514/0xbd0
> [12573.699753]  [<ffffffff8108cd9d>] ? get_parent_ip+0xd/0x50
> [12573.707135]  [<ffffffff810b71fd>] ? trace_hardirqs_on_caller+0xfd/0x1c0
> [12573.714599]  [<ffffffff81140e49>] SyS_perf_event_open+0x9/0x10
> [12573.721996]  [<ffffffff816f4dd4>] tracesys+0xdd/0xe2

This commit delays the wakeup via irq_work(), which is what
perf and ftrace use to perform wakeups in critical sections.

Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 17 +++++++++++++++--
 kernel/rcutree.h |  2 ++
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 16ea67925015..b61d20c5ee7b 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1613,6 +1613,14 @@ static int __noreturn rcu_gp_kthread(void *arg)
 	}
 }
 
+static void rsp_wakeup(struct irq_work *work)
+{
+	struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
+
+	/* Wake up rcu_gp_kthread() to start the grace period. */
+	wake_up(&rsp->gp_wq);
+}
+
 /*
  * Start a new RCU grace period if warranted, re-initializing the hierarchy
  * in preparation for detecting the next grace period.  The caller must hold
@@ -1637,8 +1645,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 	}
 	rsp->gp_flags = RCU_GP_FLAG_INIT;
 
-	/* Wake up rcu_gp_kthread() to start the grace period. */
-	wake_up(&rsp->gp_wq);
+	/*
+	 * We can't do wakeups while holding the rnp->lock, as that
+	 * could cause possible deadlocks with the rq->lock. Deter
+	 * the wakeup to interrupt context.
+	 */
+	irq_work_queue(&rsp->wakeup_work);
 }
 
 /*
@@ -3235,6 +3247,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 
 	rsp->rda = rda;
 	init_waitqueue_head(&rsp->gp_wq);
+	init_irq_work(&rsp->wakeup_work, rsp_wakeup);
 	rnp = rsp->level[rcu_num_lvls - 1];
 	for_each_possible_cpu(i) {
 		while (i > rnp->grphi)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index da77a8f57ff9..4df503470e42 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -27,6 +27,7 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
+#include <linux/irq_work.h>
 
 /*
  * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -442,6 +443,7 @@ struct rcu_state {
 	char *name;				/* Name of structure. */
 	char abbr;				/* Abbreviated name. */
 	struct list_head flavors;		/* List of RCU flavors. */
+	struct irq_work wakeup_work;		/* Postponed wakeups */
 };
 
 /* Values for rcu_state structure's gp_flags field. */
-- 
cgit v1.2.3


From 971394f389992f8462c4e5ae0e3b49a10a9534a3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 2 Jun 2013 07:13:57 -0700
Subject: rcu: Fix deadlock with CPU hotplug, RCU GP init, and timer migration

In Steven Rostedt's words:

> I've been debugging the last couple of days why my tests have been
> locking up. One of my tracing tests, runs all available tracers. The
> lockup always happened with the mmiotrace, which is used to trace
> interactions between priority drivers and the kernel. But to do this
> easily, when the tracer gets registered, it disables all but the boot
> CPUs. The lockup always happened after it got done disabling the CPUs.
>
> Then I decided to try this:
>
> while :; do
> 	for i in 1 2 3; do
> 		echo 0 > /sys/devices/system/cpu/cpu$i/online
> 	done
> 	for i in 1 2 3; do
> 		echo 1 > /sys/devices/system/cpu/cpu$i/online
> 	done
> done
>
> Well, sure enough, that locked up too, with the same users. Doing a
> sysrq-w (showing all blocked tasks):
>
> [ 2991.344562]   task                        PC stack   pid father
> [ 2991.344562] rcu_preempt     D ffff88007986fdf8     0    10      2 0x00000000
> [ 2991.344562]  ffff88007986fc98 0000000000000002 ffff88007986fc48 0000000000000908
> [ 2991.344562]  ffff88007986c280 ffff88007986ffd8 ffff88007986ffd8 00000000001d3c80
> [ 2991.344562]  ffff880079248a40 ffff88007986c280 0000000000000000 00000000fffd4295
> [ 2991.344562] Call Trace:
> [ 2991.344562]  [<ffffffff815437ba>] schedule+0x64/0x66
> [ 2991.344562]  [<ffffffff81541750>] schedule_timeout+0xbc/0xf9
> [ 2991.344562]  [<ffffffff8154bec0>] ? ftrace_call+0x5/0x2f
> [ 2991.344562]  [<ffffffff81049513>] ? cascade+0xa8/0xa8
> [ 2991.344562]  [<ffffffff815417ab>] schedule_timeout_uninterruptible+0x1e/0x20
> [ 2991.344562]  [<ffffffff810c980c>] rcu_gp_kthread+0x502/0x94b
> [ 2991.344562]  [<ffffffff81062791>] ? __init_waitqueue_head+0x50/0x50
> [ 2991.344562]  [<ffffffff810c930a>] ? rcu_gp_fqs+0x64/0x64
> [ 2991.344562]  [<ffffffff81061cdb>] kthread+0xb1/0xb9
> [ 2991.344562]  [<ffffffff81091e31>] ? lock_release_holdtime.part.23+0x4e/0x55
> [ 2991.344562]  [<ffffffff81061c2a>] ? __init_kthread_worker+0x58/0x58
> [ 2991.344562]  [<ffffffff8154c1dc>] ret_from_fork+0x7c/0xb0
> [ 2991.344562]  [<ffffffff81061c2a>] ? __init_kthread_worker+0x58/0x58
> [ 2991.344562] kworker/0:1     D ffffffff81a30680     0    47      2 0x00000000
> [ 2991.344562] Workqueue: events cpuset_hotplug_workfn
> [ 2991.344562]  ffff880078dbbb58 0000000000000002 0000000000000006 00000000000000d8
> [ 2991.344562]  ffff880078db8100 ffff880078dbbfd8 ffff880078dbbfd8 00000000001d3c80
> [ 2991.344562]  ffff8800779ca5c0 ffff880078db8100 ffffffff81541fcf 0000000000000000
> [ 2991.344562] Call Trace:
> [ 2991.344562]  [<ffffffff81541fcf>] ? __mutex_lock_common+0x3d4/0x609
> [ 2991.344562]  [<ffffffff815437ba>] schedule+0x64/0x66
> [ 2991.344562]  [<ffffffff81543a39>] schedule_preempt_disabled+0x18/0x24
> [ 2991.344562]  [<ffffffff81541fcf>] __mutex_lock_common+0x3d4/0x609
> [ 2991.344562]  [<ffffffff8103d11b>] ? get_online_cpus+0x3c/0x50
> [ 2991.344562]  [<ffffffff8103d11b>] ? get_online_cpus+0x3c/0x50
> [ 2991.344562]  [<ffffffff815422ff>] mutex_lock_nested+0x3b/0x40
> [ 2991.344562]  [<ffffffff8103d11b>] get_online_cpus+0x3c/0x50
> [ 2991.344562]  [<ffffffff810af7e6>] rebuild_sched_domains_locked+0x6e/0x3a8
> [ 2991.344562]  [<ffffffff810b0ec6>] rebuild_sched_domains+0x1c/0x2a
> [ 2991.344562]  [<ffffffff810b109b>] cpuset_hotplug_workfn+0x1c7/0x1d3
> [ 2991.344562]  [<ffffffff810b0ed9>] ? cpuset_hotplug_workfn+0x5/0x1d3
> [ 2991.344562]  [<ffffffff81058e07>] process_one_work+0x2d4/0x4d1
> [ 2991.344562]  [<ffffffff81058d3a>] ? process_one_work+0x207/0x4d1
> [ 2991.344562]  [<ffffffff8105964c>] worker_thread+0x2e7/0x3b5
> [ 2991.344562]  [<ffffffff81059365>] ? rescuer_thread+0x332/0x332
> [ 2991.344562]  [<ffffffff81061cdb>] kthread+0xb1/0xb9
> [ 2991.344562]  [<ffffffff81061c2a>] ? __init_kthread_worker+0x58/0x58
> [ 2991.344562]  [<ffffffff8154c1dc>] ret_from_fork+0x7c/0xb0
> [ 2991.344562]  [<ffffffff81061c2a>] ? __init_kthread_worker+0x58/0x58
> [ 2991.344562] bash            D ffffffff81a4aa80     0  2618   2612 0x10000000
> [ 2991.344562]  ffff8800379abb58 0000000000000002 0000000000000006 0000000000000c2c
> [ 2991.344562]  ffff880077fea140 ffff8800379abfd8 ffff8800379abfd8 00000000001d3c80
> [ 2991.344562]  ffff8800779ca5c0 ffff880077fea140 ffffffff81541fcf 0000000000000000
> [ 2991.344562] Call Trace:
> [ 2991.344562]  [<ffffffff81541fcf>] ? __mutex_lock_common+0x3d4/0x609
> [ 2991.344562]  [<ffffffff815437ba>] schedule+0x64/0x66
> [ 2991.344562]  [<ffffffff81543a39>] schedule_preempt_disabled+0x18/0x24
> [ 2991.344562]  [<ffffffff81541fcf>] __mutex_lock_common+0x3d4/0x609
> [ 2991.344562]  [<ffffffff81530078>] ? rcu_cpu_notify+0x2f5/0x86e
> [ 2991.344562]  [<ffffffff81530078>] ? rcu_cpu_notify+0x2f5/0x86e
> [ 2991.344562]  [<ffffffff815422ff>] mutex_lock_nested+0x3b/0x40
> [ 2991.344562]  [<ffffffff81530078>] rcu_cpu_notify+0x2f5/0x86e
> [ 2991.344562]  [<ffffffff81091c99>] ? __lock_is_held+0x32/0x53
> [ 2991.344562]  [<ffffffff81548912>] notifier_call_chain+0x6b/0x98
> [ 2991.344562]  [<ffffffff810671fd>] __raw_notifier_call_chain+0xe/0x10
> [ 2991.344562]  [<ffffffff8103cf64>] __cpu_notify+0x20/0x32
> [ 2991.344562]  [<ffffffff8103cf8d>] cpu_notify_nofail+0x17/0x36
> [ 2991.344562]  [<ffffffff815225de>] _cpu_down+0x154/0x259
> [ 2991.344562]  [<ffffffff81522710>] cpu_down+0x2d/0x3a
> [ 2991.344562]  [<ffffffff81526351>] store_online+0x4e/0xe7
> [ 2991.344562]  [<ffffffff8134d764>] dev_attr_store+0x20/0x22
> [ 2991.344562]  [<ffffffff811b3c5f>] sysfs_write_file+0x108/0x144
> [ 2991.344562]  [<ffffffff8114c5ef>] vfs_write+0xfd/0x158
> [ 2991.344562]  [<ffffffff8114c928>] SyS_write+0x5c/0x83
> [ 2991.344562]  [<ffffffff8154c494>] tracesys+0xdd/0xe2
>
> As well as held locks:
>
> [ 3034.728033] Showing all locks held in the system:
> [ 3034.728033] 1 lock held by rcu_preempt/10:
> [ 3034.728033]  #0:  (rcu_preempt_state.onoff_mutex){+.+...}, at: [<ffffffff810c9471>] rcu_gp_kthread+0x167/0x94b
> [ 3034.728033] 4 locks held by kworker/0:1/47:
> [ 3034.728033]  #0:  (events){.+.+.+}, at: [<ffffffff81058d3a>] process_one_work+0x207/0x4d1
> [ 3034.728033]  #1:  (cpuset_hotplug_work){+.+.+.}, at: [<ffffffff81058d3a>] process_one_work+0x207/0x4d1
> [ 3034.728033]  #2:  (cpuset_mutex){+.+.+.}, at: [<ffffffff810b0ec1>] rebuild_sched_domains+0x17/0x2a
> [ 3034.728033]  #3:  (cpu_hotplug.lock){+.+.+.}, at: [<ffffffff8103d11b>] get_online_cpus+0x3c/0x50
> [ 3034.728033] 1 lock held by mingetty/2563:
> [ 3034.728033]  #0:  (&ldata->atomic_read_lock){+.+...}, at: [<ffffffff8131e28a>] n_tty_read+0x252/0x7e8
> [ 3034.728033] 1 lock held by mingetty/2565:
> [ 3034.728033]  #0:  (&ldata->atomic_read_lock){+.+...}, at: [<ffffffff8131e28a>] n_tty_read+0x252/0x7e8
> [ 3034.728033] 1 lock held by mingetty/2569:
> [ 3034.728033]  #0:  (&ldata->atomic_read_lock){+.+...}, at: [<ffffffff8131e28a>] n_tty_read+0x252/0x7e8
> [ 3034.728033] 1 lock held by mingetty/2572:
> [ 3034.728033]  #0:  (&ldata->atomic_read_lock){+.+...}, at: [<ffffffff8131e28a>] n_tty_read+0x252/0x7e8
> [ 3034.728033] 1 lock held by mingetty/2575:
> [ 3034.728033]  #0:  (&ldata->atomic_read_lock){+.+...}, at: [<ffffffff8131e28a>] n_tty_read+0x252/0x7e8
> [ 3034.728033] 7 locks held by bash/2618:
> [ 3034.728033]  #0:  (sb_writers#5){.+.+.+}, at: [<ffffffff8114bc3f>] file_start_write+0x2a/0x2c
> [ 3034.728033]  #1:  (&buffer->mutex#2){+.+.+.}, at: [<ffffffff811b3b93>] sysfs_write_file+0x3c/0x144
> [ 3034.728033]  #2:  (s_active#54){.+.+.+}, at: [<ffffffff811b3c3e>] sysfs_write_file+0xe7/0x144
> [ 3034.728033]  #3:  (x86_cpu_hotplug_driver_mutex){+.+.+.}, at: [<ffffffff810217c2>] cpu_hotplug_driver_lock+0x17/0x19
> [ 3034.728033]  #4:  (cpu_add_remove_lock){+.+.+.}, at: [<ffffffff8103d196>] cpu_maps_update_begin+0x17/0x19
> [ 3034.728033]  #5:  (cpu_hotplug.lock){+.+.+.}, at: [<ffffffff8103cfd8>] cpu_hotplug_begin+0x2c/0x6d
> [ 3034.728033]  #6:  (rcu_preempt_state.onoff_mutex){+.+...}, at: [<ffffffff81530078>] rcu_cpu_notify+0x2f5/0x86e
> [ 3034.728033] 1 lock held by bash/2980:
> [ 3034.728033]  #0:  (&ldata->atomic_read_lock){+.+...}, at: [<ffffffff8131e28a>] n_tty_read+0x252/0x7e8
>
> Things looked a little weird. Also, this is a deadlock that lockdep did
> not catch. But what we have here does not look like a circular lock
> issue:
>
> Bash is blocked in rcu_cpu_notify():
>
> 1961		/* Exclude any attempts to start a new grace period. */
> 1962		mutex_lock(&rsp->onoff_mutex);
>
>
> kworker is blocked in get_online_cpus(), which makes sense as we are
> currently taking down a CPU.
>
> But rcu_preempt is not blocked on anything. It is simply sleeping in
> rcu_gp_kthread (really rcu_gp_init) here:
>
> 1453	#ifdef CONFIG_PROVE_RCU_DELAY
> 1454			if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
> 1455			    system_state == SYSTEM_RUNNING)
> 1456				schedule_timeout_uninterruptible(2);
> 1457	#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
>
> And it does this while holding the onoff_mutex that bash is waiting for.
>
> Doing a function trace, it showed me where it happened:
>
> [  125.940066] rcu_pree-10      3.... 28384115273: schedule_timeout_uninterruptible <-rcu_gp_kthread
> [...]
> [  125.940066] rcu_pree-10      3d..3 28384202439: sched_switch: prev_comm=rcu_preempt prev_pid=10 prev_prio=120 prev_state=D ==> next_comm=watchdog/3 next_pid=38 next_prio=120
>
> The watchdog ran, and then:
>
> [  125.940066] watchdog-38      3d..3 28384692863: sched_switch: prev_comm=watchdog/3 prev_pid=38 prev_prio=120 prev_state=P ==> next_comm=modprobe next_pid=2848 next_prio=118
>
> Not sure what modprobe was doing, but shortly after that:
>
> [  125.940066] modprobe-2848    3d..3 28385041749: sched_switch: prev_comm=modprobe prev_pid=2848 prev_prio=118 prev_state=R+ ==> next_comm=migration/3 next_pid=40 next_prio=0
>
> Where the migration thread took down the CPU:
>
> [  125.940066] migratio-40      3d..3 28389148276: sched_switch: prev_comm=migration/3 prev_pid=40 prev_prio=0 prev_state=P ==> next_comm=swapper/3 next_pid=0 next_prio=120
>
> which finally did:
>
> [  125.940066]   <idle>-0       3...1 28389282142: arch_cpu_idle_dead <-cpu_startup_entry
> [  125.940066]   <idle>-0       3...1 28389282548: native_play_dead <-arch_cpu_idle_dead
> [  125.940066]   <idle>-0       3...1 28389282924: play_dead_common <-native_play_dead
> [  125.940066]   <idle>-0       3...1 28389283468: idle_task_exit <-play_dead_common
> [  125.940066]   <idle>-0       3...1 28389284644: amd_e400_remove_cpu <-play_dead_common
>
>
> CPU 3 is now offline, the rcu_preempt thread that ran on CPU 3 is still
> doing a schedule_timeout_uninterruptible() and it registered it's
> timeout to the timer base for CPU 3. You would think that it would get
> migrated right? The issue here is that the timer migration happens at
> the CPU notifier for CPU_DEAD. The problem is that the rcu notifier for
> CPU_DOWN is blocked waiting for the onoff_mutex to be released, which is
> held by the thread that just put itself into a uninterruptible sleep,
> that wont wake up until the CPU_DEAD notifier of the timer
> infrastructure is called, which wont happen until the rcu notifier
> finishes. Here's our deadlock!

This commit breaks this deadlock cycle by substituting a shorter udelay()
for the previous schedule_timeout_uninterruptible(), while at the same
time increasing the probability of the delay.  This maintains the intensity
of the testing.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/rcutree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index b61d20c5ee7b..35380019f0fc 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1451,9 +1451,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
 					    rnp->grphi, rnp->qsmask);
 		raw_spin_unlock_irq(&rnp->lock);
 #ifdef CONFIG_PROVE_RCU_DELAY
-		if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
+		if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
 		    system_state == SYSTEM_RUNNING)
-			schedule_timeout_uninterruptible(2);
+			udelay(200);
 #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
 		cond_resched();
 	}
-- 
cgit v1.2.3


From 34376a50fb1fa095b9d0636fa41ed2e73125f214 Mon Sep 17 00:00:00 2001
From: Ben Greear <greearb@candelatech.com>
Date: Thu, 6 Jun 2013 14:29:49 -0700
Subject: Fix lockup related to stop_machine being stuck in __do_softirq.

The stop machine logic can lock up if all but one of the migration
threads make it through the disable-irq step and the one remaining
thread gets stuck in __do_softirq.  The reason __do_softirq can hang is
that it has a bail-out based on jiffies timeout, but in the lockup case,
jiffies itself is not incremented.

To work around this, re-add the max_restart counter in __do_irq and stop
processing irqs after 10 restarts.

Thanks to Tejun Heo and Rusty Russell and others for helping me track
this down.

This was introduced in 3.9 by commit c10d73671ad3 ("softirq: reduce
latencies").

It may be worth looking into ath9k to see if it has issues with its irq
handler at a later date.

The hang stack traces look something like this:

    ------------[ cut here ]------------
    WARNING: at kernel/watchdog.c:245 watchdog_overflow_callback+0x9c/0xa7()
    Watchdog detected hard LOCKUP on cpu 2
    Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen lockd sunrpc]
    Pid: 23, comm: migration/2 Tainted: G         C   3.9.4+ #11
    Call Trace:
     <NMI>   warn_slowpath_common+0x85/0x9f
      warn_slowpath_fmt+0x46/0x48
      watchdog_overflow_callback+0x9c/0xa7
      __perf_event_overflow+0x137/0x1cb
      perf_event_overflow+0x14/0x16
      intel_pmu_handle_irq+0x2dc/0x359
      perf_event_nmi_handler+0x19/0x1b
      nmi_handle+0x7f/0xc2
      do_nmi+0xbc/0x304
      end_repeat_nmi+0x1e/0x2e
     <<EOE>>
      cpu_stopper_thread+0xae/0x162
      smpboot_thread_fn+0x258/0x260
      kthread+0xc7/0xcf
      ret_from_fork+0x7c/0xb0
    ---[ end trace 4947dfa9b0a4cec3 ]---
    BUG: soft lockup - CPU#1 stuck for 22s! [migration/1:17]
    Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen lockd sunrpc]
    irq event stamp: 835637905
    hardirqs last  enabled at (835637904): __do_softirq+0x9f/0x257
    hardirqs last disabled at (835637905): apic_timer_interrupt+0x6d/0x80
    softirqs last  enabled at (5654720): __do_softirq+0x1ff/0x257
    softirqs last disabled at (5654725): irq_exit+0x5f/0xbb
    CPU 1
    Pid: 17, comm: migration/1 Tainted: G        WC   3.9.4+ #11 To be filled by O.E.M. To be filled by O.E.M./To be filled by O.E.M.
    RIP: tasklet_hi_action+0xf0/0xf0
    Process migration/1
    Call Trace:
     <IRQ>
      __do_softirq+0x117/0x257
      irq_exit+0x5f/0xbb
      smp_apic_timer_interrupt+0x8a/0x98
      apic_timer_interrupt+0x72/0x80
     <EOI>
      printk+0x4d/0x4f
      stop_machine_cpu_stop+0x22c/0x274
      cpu_stopper_thread+0xae/0x162
      smpboot_thread_fn+0x258/0x260
      kthread+0xc7/0xcf
      ret_from_fork+0x7c/0xb0

Signed-off-by: Ben Greear <greearb@candelatech.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Pekka Riikonen <priikone@iki.fi>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/softirq.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index b5197dcb0dad..3d6833f125d3 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,8 +195,12 @@ void local_bh_enable_ip(unsigned long ip)
 EXPORT_SYMBOL(local_bh_enable_ip);
 
 /*
- * We restart softirq processing for at most 2 ms,
- * and if need_resched() is not set.
+ * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
+ * but break the loop if need_resched() is set or after 2 ms.
+ * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in
+ * certain cases, such as stop_machine(), jiffies may cease to
+ * increment and so we need the MAX_SOFTIRQ_RESTART limit as
+ * well to make sure we eventually return from this method.
  *
  * These limits have been established via experimentation.
  * The two things to balance is latency against fairness -
@@ -204,6 +208,7 @@ EXPORT_SYMBOL(local_bh_enable_ip);
  * should not be able to lock up the box.
  */
 #define MAX_SOFTIRQ_TIME  msecs_to_jiffies(2)
+#define MAX_SOFTIRQ_RESTART 10
 
 asmlinkage void __do_softirq(void)
 {
@@ -212,6 +217,7 @@ asmlinkage void __do_softirq(void)
 	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
 	int cpu;
 	unsigned long old_flags = current->flags;
+	int max_restart = MAX_SOFTIRQ_RESTART;
 
 	/*
 	 * Mask out PF_MEMALLOC s current task context is borrowed for the
@@ -265,7 +271,8 @@ restart:
 
 	pending = local_softirq_pending();
 	if (pending) {
-		if (time_before(jiffies, end) && !need_resched())
+		if (time_before(jiffies, end) && !need_resched() &&
+		    --max_restart)
 			goto restart;
 
 		wakeup_softirqd();
-- 
cgit v1.2.3


From 58e8eedf18577c7eac722d5d1f190507ea263d1b Mon Sep 17 00:00:00 2001
From: Yoshihiro YUNOMAE <yoshihiro.yunomae.ez@hitachi.com>
Date: Tue, 23 Apr 2013 10:32:39 +0900
Subject: tracing: Fix outputting formats of x86-tsc and counter when use
 trace_clock

Outputting formats of x86-tsc and counter should be a raw format, but after
applying the patch(2b6080f28c7cc3efc8625ab71495aae89aeb63a0), the format was
changed to nanosec. This is because the global variable trace_clock_id was used.
When we use multiple buffers, clock_id of each sub-buffer should be used. Then,
this patch uses tr->clock_id instead of the global variable trace_clock_id.

[ Basically, this fixes a regression where the multibuffer code changed the
  trace_clock file to update tr->clock_id but the traces still use the old
  global trace_clock_id variable, negating the file's effect. The global
  trace_clock_id variable is obsolete and removed. - SR ]

Link: http://lkml.kernel.org/r/20130423013239.22334.7394.stgit@yunodevel

Signed-off-by: Yoshihiro YUNOMAE <yoshihiro.yunomae.ez@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 8 +++-----
 kernel/trace/trace.h | 2 --
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1a41023a1f88..e71a8be4a6ee 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -652,8 +652,6 @@ static struct {
 	ARCH_TRACE_CLOCKS
 };
 
-int trace_clock_id;
-
 /*
  * trace_parser_get_init - gets the buffer for trace parser
  */
@@ -2826,7 +2824,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 		iter->iter_flags |= TRACE_FILE_ANNOTATE;
 
 	/* Output in nanoseconds only if we are using a clock in nanoseconds. */
-	if (trace_clocks[trace_clock_id].in_ns)
+	if (trace_clocks[tr->clock_id].in_ns)
 		iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
 
 	/* stop the trace while dumping if we are not opening "snapshot" */
@@ -3825,7 +3823,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 		iter->iter_flags |= TRACE_FILE_LAT_FMT;
 
 	/* Output in nanoseconds only if we are using a clock in nanoseconds. */
-	if (trace_clocks[trace_clock_id].in_ns)
+	if (trace_clocks[tr->clock_id].in_ns)
 		iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
 
 	iter->cpu_file = tc->cpu;
@@ -5095,7 +5093,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
 	cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
 	trace_seq_printf(s, "bytes: %ld\n", cnt);
 
-	if (trace_clocks[trace_clock_id].in_ns) {
+	if (trace_clocks[tr->clock_id].in_ns) {
 		/* local or global for trace_clock */
 		t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
 		usec_rem = do_div(t, USEC_PER_SEC);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 711ca7d3e7f1..20572ed88c5c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -700,8 +700,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
 
 extern unsigned long trace_flags;
 
-extern int trace_clock_id;
-
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
-- 
cgit v1.2.3


From d7880812b3594d3c6dcbe3cfd71dabb17347d082 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 10 Jun 2013 16:52:03 +0200
Subject: idle: Add the stack canary init to cpu_startup_entry()

Moving x86 to the generic idle implementation (commit 7d1a9417 "x86:
Use generic idle loop") wreckaged the stack protector.

I stupidly missed that boot_init_stack_canary() must be inlined from a
function which never returns, but I put that call into
arch_cpu_idle_prepare() which of course returns.

I pondered to play tricks with arch_cpu_idle_prepare() first, but then
I noticed, that the other archs which have implemented the
stackprotector (ARM and SH) do not initialize the canary for the
non-boot cpus.

So I decided to move the boot_init_stack_canary() call into
cpu_startup_entry() ifdeffed with an CONFIG_X86 for now. This #ifdef
is just a temporary measure as I don't want to inflict the
boot_init_stack_canary() call on ARM and SH that late in the cycle.

I'll queue a patch for 3.11 which removes the #ifdef if the ARM/SH
maintainers have no objection.

Reported-by: Wouter van Kesteren <woutershep@gmail.com>
Cc: x86@kernel.org
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpu/idle.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index d5585f5e038e..bf2ee1aafa0e 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -5,6 +5,7 @@
 #include <linux/cpu.h>
 #include <linux/tick.h>
 #include <linux/mm.h>
+#include <linux/stackprotector.h>
 
 #include <asm/tlb.h>
 
@@ -112,6 +113,21 @@ static void cpu_idle_loop(void)
 
 void cpu_startup_entry(enum cpuhp_state state)
 {
+	/*
+	 * This #ifdef needs to die, but it's too late in the cycle to
+	 * make this generic (arm and sh have never invoked the canary
+	 * init for the non boot cpus!). Will be fixed in 3.11
+	 */
+#ifdef CONFIG_X86
+	/*
+	 * If we're the non-boot CPU, nothing set the stack canary up
+	 * for us. The boot CPU already has it initialized but no harm
+	 * in doing it again. This is a good place for updating it, as
+	 * we wont ever return from this function (so the invalid
+	 * canaries already on the stack wont ever trigger).
+	 */
+	boot_init_stack_canary();
+#endif
 	current_set_polling();
 	arch_cpu_idle_prepare();
 	cpu_idle_loop();
-- 
cgit v1.2.3


From 16e53dbf10a2d7e228709a7286310e629ede5e45 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Wed, 12 Jun 2013 14:04:36 -0700
Subject: CPU hotplug: provide a generic helper to disable/enable CPU hotplug

There are instances in the kernel where we would like to disable CPU
hotplug (from sysfs) during some important operation.  Today the freezer
code depends on this and the code to do it was kinda tailor-made for
that.

Restructure the code and make it generic enough to be useful for other
usecases too.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Robin Holt <holt@sgi.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Shawn Guo <shawn.guo@linaro.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpu.c | 55 +++++++++++++++++++++++--------------------------------
 1 file changed, 23 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index b5e4ab2d427e..198a38883e64 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -133,6 +133,27 @@ static void cpu_hotplug_done(void)
 	mutex_unlock(&cpu_hotplug.lock);
 }
 
+/*
+ * Wait for currently running CPU hotplug operations to complete (if any) and
+ * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
+ * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
+ * hotplug path before performing hotplug operations. So acquiring that lock
+ * guarantees mutual exclusion from any currently running hotplug operations.
+ */
+void cpu_hotplug_disable(void)
+{
+	cpu_maps_update_begin();
+	cpu_hotplug_disabled = 1;
+	cpu_maps_update_done();
+}
+
+void cpu_hotplug_enable(void)
+{
+	cpu_maps_update_begin();
+	cpu_hotplug_disabled = 0;
+	cpu_maps_update_done();
+}
+
 #else /* #if CONFIG_HOTPLUG_CPU */
 static void cpu_hotplug_begin(void) {}
 static void cpu_hotplug_done(void) {}
@@ -540,36 +561,6 @@ static int __init alloc_frozen_cpus(void)
 }
 core_initcall(alloc_frozen_cpus);
 
-/*
- * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
- * hotplug when tasks are about to be frozen. Also, don't allow the freezer
- * to continue until any currently running CPU hotplug operation gets
- * completed.
- * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
- * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
- * CPU hotplug path and released only after it is complete. Thus, we
- * (and hence the freezer) will block here until any currently running CPU
- * hotplug operation gets completed.
- */
-void cpu_hotplug_disable_before_freeze(void)
-{
-	cpu_maps_update_begin();
-	cpu_hotplug_disabled = 1;
-	cpu_maps_update_done();
-}
-
-
-/*
- * When tasks have been thawed, re-enable regular CPU hotplug (which had been
- * disabled while beginning to freeze tasks).
- */
-void cpu_hotplug_enable_after_thaw(void)
-{
-	cpu_maps_update_begin();
-	cpu_hotplug_disabled = 0;
-	cpu_maps_update_done();
-}
-
 /*
  * When callbacks for CPU hotplug notifications are being executed, we must
  * ensure that the state of the system with respect to the tasks being frozen
@@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
 
 	case PM_SUSPEND_PREPARE:
 	case PM_HIBERNATION_PREPARE:
-		cpu_hotplug_disable_before_freeze();
+		cpu_hotplug_disable();
 		break;
 
 	case PM_POST_SUSPEND:
 	case PM_POST_HIBERNATION:
-		cpu_hotplug_enable_after_thaw();
+		cpu_hotplug_enable();
 		break;
 
 	default:
-- 
cgit v1.2.3


From cf7df378aa4ff7da3a44769b7ff6e9eef1a9f3db Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Wed, 12 Jun 2013 14:04:37 -0700
Subject: reboot: rigrate shutdown/reboot to boot cpu

We recently noticed that reboot of a 1024 cpu machine takes approx 16
minutes of just stopping the cpus.  The slowdown was tracked to commit
f96972f2dc63 ("kernel/sys.c: call disable_nonboot_cpus() in
kernel_restart()").

The current implementation does all the work of hot removing the cpus
before halting the system.  We are switching to just migrating to the
boot cpu and then continuing with shutdown/reboot.

This also has the effect of not breaking x86's command line parameter
for specifying the reboot cpu.  Note, this code was shamelessly copied
from arch/x86/kernel/reboot.c with bits removed pertaining to the
reboot_cpu command line parameter.

Signed-off-by: Robin Holt <holt@sgi.com>
Tested-by: Shawn Guo <shawn.guo@linaro.org>
Cc: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index b95d3c72ba21..2bbd9a73b54c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
+/* Add backwards compatibility for stable trees. */
+#ifndef PF_NO_SETAFFINITY
+#define PF_NO_SETAFFINITY		PF_THREAD_BOUND
+#endif
+
+static void migrate_to_reboot_cpu(void)
+{
+	/* The boot cpu is always logical cpu 0 */
+	int cpu = 0;
+
+	cpu_hotplug_disable();
+
+	/* Make certain the cpu I'm about to reboot on is online */
+	if (!cpu_online(cpu))
+		cpu = cpumask_first(cpu_online_mask);
+
+	/* Prevent races with other tasks migrating this task */
+	current->flags |= PF_NO_SETAFFINITY;
+
+	/* Make certain I only run on the appropriate processor */
+	set_cpus_allowed_ptr(current, cpumask_of(cpu));
+}
+
 /**
  *	kernel_restart - reboot the system
  *	@cmd: pointer to buffer containing command to execute for restart
@@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 void kernel_restart(char *cmd)
 {
 	kernel_restart_prepare(cmd);
-	disable_nonboot_cpus();
+	migrate_to_reboot_cpu();
 	syscore_shutdown();
 	if (!cmd)
 		printk(KERN_EMERG "Restarting system.\n");
@@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state)
 void kernel_halt(void)
 {
 	kernel_shutdown_prepare(SYSTEM_HALT);
-	disable_nonboot_cpus();
+	migrate_to_reboot_cpu();
 	syscore_shutdown();
 	printk(KERN_EMERG "System halted.\n");
 	kmsg_dump(KMSG_DUMP_HALT);
@@ -419,7 +442,7 @@ void kernel_power_off(void)
 	kernel_shutdown_prepare(SYSTEM_POWER_OFF);
 	if (pm_power_off_prepare)
 		pm_power_off_prepare();
-	disable_nonboot_cpus();
+	migrate_to_reboot_cpu();
 	syscore_shutdown();
 	printk(KERN_EMERG "Power down.\n");
 	kmsg_dump(KMSG_DUMP_POWEROFF);
-- 
cgit v1.2.3


From 637241a900cbd982f744d44646b48a273d609b34 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 12 Jun 2013 14:04:39 -0700
Subject: kmsg: honor dmesg_restrict sysctl on /dev/kmsg

The dmesg_restrict sysctl currently covers the syslog method for access
dmesg, however /dev/kmsg isn't covered by the same protections.  Most
people haven't noticed because util-linux dmesg(1) defaults to using the
syslog method for access in older versions.  With util-linux dmesg(1)
defaults to reading directly from /dev/kmsg.

To fix /dev/kmsg, let's compare the existing interfaces and what they
allow:

 - /proc/kmsg allows:
  - open (SYSLOG_ACTION_OPEN) if CAP_SYSLOG since it uses a destructive
    single-reader interface (SYSLOG_ACTION_READ).
  - everything, after an open.

 - syslog syscall allows:
  - anything, if CAP_SYSLOG.
  - SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER, if
    dmesg_restrict==0.
  - nothing else (EPERM).

The use-cases were:
 - dmesg(1) needs to do non-destructive SYSLOG_ACTION_READ_ALLs.
 - sysklog(1) needs to open /proc/kmsg, drop privs, and still issue the
   destructive SYSLOG_ACTION_READs.

AIUI, dmesg(1) is moving to /dev/kmsg, and systemd-journald doesn't
clear the ring buffer.

Based on the comments in devkmsg_llseek, it sounds like actions besides
reading aren't going to be supported by /dev/kmsg (i.e.
SYSLOG_ACTION_CLEAR), so we have a strict subset of the non-destructive
syslog syscall actions.

To this end, move the check as Josh had done, but also rename the
constants to reflect their new uses (SYSLOG_FROM_CALL becomes
SYSLOG_FROM_READER, and SYSLOG_FROM_FILE becomes SYSLOG_FROM_PROC).
SYSLOG_FROM_READER allows non-destructive actions, and SYSLOG_FROM_PROC
allows destructive actions after a capabilities-constrained
SYSLOG_ACTION_OPEN check.

 - /dev/kmsg allows:
  - open if CAP_SYSLOG or dmesg_restrict==0
  - reading/polling, after open

Addresses https://bugzilla.redhat.com/show_bug.cgi?id=903192

[akpm@linux-foundation.org: use pr_warn_once()]
Signed-off-by: Kees Cook <keescook@chromium.org>
Reported-by: Christian Kujau <lists@nerdbynature.de>
Tested-by: Josh Boyer <jwboyer@redhat.com>
Cc: Kay Sievers <kay@vrfy.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 91 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 50 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index fa36e1494420..8212c1aef125 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -363,6 +363,53 @@ static void log_store(int facility, int level,
 	log_next_seq++;
 }
 
+#ifdef CONFIG_SECURITY_DMESG_RESTRICT
+int dmesg_restrict = 1;
+#else
+int dmesg_restrict;
+#endif
+
+static int syslog_action_restricted(int type)
+{
+	if (dmesg_restrict)
+		return 1;
+	/*
+	 * Unless restricted, we allow "read all" and "get buffer size"
+	 * for everybody.
+	 */
+	return type != SYSLOG_ACTION_READ_ALL &&
+	       type != SYSLOG_ACTION_SIZE_BUFFER;
+}
+
+static int check_syslog_permissions(int type, bool from_file)
+{
+	/*
+	 * If this is from /proc/kmsg and we've already opened it, then we've
+	 * already done the capabilities checks at open time.
+	 */
+	if (from_file && type != SYSLOG_ACTION_OPEN)
+		return 0;
+
+	if (syslog_action_restricted(type)) {
+		if (capable(CAP_SYSLOG))
+			return 0;
+		/*
+		 * For historical reasons, accept CAP_SYS_ADMIN too, with
+		 * a warning.
+		 */
+		if (capable(CAP_SYS_ADMIN)) {
+			pr_warn_once("%s (%d): Attempt to access syslog with "
+				     "CAP_SYS_ADMIN but no CAP_SYSLOG "
+				     "(deprecated).\n",
+				 current->comm, task_pid_nr(current));
+			return 0;
+		}
+		return -EPERM;
+	}
+	return security_syslog(type);
+}
+
+
 /* /dev/kmsg - userspace message inject/listen interface */
 struct devkmsg_user {
 	u64 seq;
@@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file)
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		return 0;
 
-	err = security_syslog(SYSLOG_ACTION_READ_ALL);
+	err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+				       SYSLOG_FROM_READER);
 	if (err)
 		return err;
 
@@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level)
 }
 #endif
 
-#ifdef CONFIG_SECURITY_DMESG_RESTRICT
-int dmesg_restrict = 1;
-#else
-int dmesg_restrict;
-#endif
-
-static int syslog_action_restricted(int type)
-{
-	if (dmesg_restrict)
-		return 1;
-	/* Unless restricted, we allow "read all" and "get buffer size" for everybody */
-	return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
-}
-
-static int check_syslog_permissions(int type, bool from_file)
-{
-	/*
-	 * If this is from /proc/kmsg and we've already opened it, then we've
-	 * already done the capabilities checks at open time.
-	 */
-	if (from_file && type != SYSLOG_ACTION_OPEN)
-		return 0;
-
-	if (syslog_action_restricted(type)) {
-		if (capable(CAP_SYSLOG))
-			return 0;
-		/* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
-		if (capable(CAP_SYS_ADMIN)) {
-			printk_once(KERN_WARNING "%s (%d): "
-				 "Attempt to access syslog with CAP_SYS_ADMIN "
-				 "but no CAP_SYSLOG (deprecated).\n",
-				 current->comm, task_pid_nr(current));
-			return 0;
-		}
-		return -EPERM;
-	}
-	return 0;
-}
-
 #if defined(CONFIG_PRINTK_TIME)
 static bool printk_time = 1;
 #else
@@ -1249,7 +1258,7 @@ out:
 
 SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
 {
-	return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
+	return do_syslog(type, buf, len, SYSLOG_FROM_READER);
 }
 
 /*
-- 
cgit v1.2.3


From f000cfdde5de4fc15dead5ccf524359c07eadf2b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 12 Jun 2013 14:04:46 -0700
Subject: audit: wait_for_auditd() should use TASK_UNINTERRUPTIBLE

audit_log_start() does wait_for_auditd() in a loop until
audit_backlog_wait_time passes or audit_skb_queue has a room.

If signal_pending() is true this becomes a busy-wait loop, schedule() in
TASK_INTERRUPTIBLE won't block.

Thanks to Guy for fully investigating and explaining the problem.

(akpm: that'll cause the system to lock up on a non-preemptible
uniprocessor kernel)

(Guy: "Our customer was in fact running a uniprocessor machine, and they
reported a system hang.")

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Guy Streeter <streeter@redhat.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 21c7fa615bd3..91e53d04b6a9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
 static void wait_for_auditd(unsigned long sleep_time)
 {
 	DECLARE_WAITQUEUE(wait, current);
-	set_current_state(TASK_INTERRUPTIBLE);
+	set_current_state(TASK_UNINTERRUPTIBLE);
 	add_wait_queue(&audit_backlog_wait, &wait);
 
 	if (audit_backlog_limit &&
-- 
cgit v1.2.3


From 736f3203a06eafd0944103775a98584082744c6b Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Wed, 12 Jun 2013 14:05:07 -0700
Subject: kernel/audit_tree.c:audit_add_tree_rule(): protect `rule' from
 kill_rules()

audit_add_tree_rule() must set 'rule->tree = NULL;' firstly, to protect
the rule itself freed in kill_rules().

The reason is when it is killed, the 'rule' itself may have already
released, we should not access it.  one example: we add a rule to an
inode, just at the same time the other task is deleting this inode.

The work flow for adding a rule:

    audit_receive() -> (need audit_cmd_mutex lock)
      audit_receive_skb() ->
        audit_receive_msg() ->
          audit_receive_filter() ->
            audit_add_rule() ->
              audit_add_tree_rule() -> (need audit_filter_mutex lock)
                ...
                unlock audit_filter_mutex
                get_tree()
                ...
                iterate_mounts() -> (iterate all related inodes)
                  tag_mount() ->
                    tag_trunk() ->
                      create_trunk() -> (assume it is 1st rule)
                        fsnotify_add_mark() ->
                          fsnotify_add_inode_mark() ->  (add mark to inode->i_fsnotify_marks)
                        ...
                        get_tree(); (each inode will get one)
                ...
                lock audit_filter_mutex

The work flow for deleting an inode:

    __destroy_inode() ->
     fsnotify_inode_delete() ->
       __fsnotify_inode_delete() ->
        fsnotify_clear_marks_by_inode() ->  (get mark from inode->i_fsnotify_marks)
          fsnotify_destroy_mark() ->
           fsnotify_destroy_mark_locked() ->
             audit_tree_freeing_mark() ->
               evict_chunk() ->
                 ...
                 tree->goner = 1
                 ...
                 kill_rules() ->   (assume current->audit_context == NULL)
                   call_rcu() ->   (rule->tree != NULL)
                     audit_free_rule_rcu() ->
                       audit_free_rule()
                 ...
                 audit_schedule_prune() ->  (assume current->audit_context == NULL)
                   kthread_run() ->    (need audit_cmd_mutex and audit_filter_mutex lock)
                     prune_one() ->    (delete it from prue_list)
                       put_tree(); (match the original get_tree above)

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/audit_tree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index a291aa23fb3f..43c307dc9453 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
 	struct vfsmount *mnt;
 	int err;
 
+	rule->tree = NULL;
 	list_for_each_entry(tree, &tree_list, list) {
 		if (!strcmp(seed->pathname, tree->pathname)) {
 			put_tree(seed);
-- 
cgit v1.2.3


From 29ce3785b22da47c49f4ef6e14b9014fa5dee261 Mon Sep 17 00:00:00 2001
From: James Bottomley <JBottomley@Parallels.com>
Date: Wed, 8 May 2013 14:05:34 -0700
Subject: idle: Enable interrupts in the weak arch_cpu_idle() implementation

PARISC bootup triggers the warning at kernel/cpu/idle.c:96. That's
caused by the weak arch_cpu_idle() implementation, which is provided
to avoid that architectures implement idle_poll over and over.

The switchover to polling mode happens in the first call of the weak
arch_cpu_idle() implementation, but that code fails to reenable
interrupts and therefor triggers the warning.

Fix this by enabling interrupts in the weak arch_cpu_idle() code.

[ tglx: Made the changelog match the patch ]

Signed-off-by: James Bottomley <JBottomley@Parallels.com>
Reviewed-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1371236142.2726.43.camel@dabdike
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpu/idle.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index bf2ee1aafa0e..e695c0a0bcb5 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -59,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { }
 void __weak arch_cpu_idle(void)
 {
 	cpu_idle_force_poll = 1;
+	local_irq_enable();
 }
 
 /*
-- 
cgit v1.2.3


From 8aac62706adaaf0fab02c4327761561c8bda9448 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 14 Jun 2013 21:09:49 +0200
Subject: move exit_task_namespaces() outside of exit_notify()

exit_notify() does exit_task_namespaces() after
forget_original_parent(). This was needed to ensure that ->nsproxy
can't be cleared prematurely, an exiting child we are going to
reparent can do do_notify_parent() and use the parent's (ours) pid_ns.

However, after 32084504 "pidns: use task_active_pid_ns in
do_notify_parent" ->nsproxy != NULL is no longer needed, we rely
on task_active_pid_ns().

Move exit_task_namespaces() from exit_notify() to do_exit(), after
exit_fs() and before exit_task_work().

This solves the problem reported by Andrey, free_ipc_ns()->shm_destroy()
does fput() which needs task_work_add().

Note: this particular problem can be fixed if we change fput(), and
that change makes sense anyway. But there is another reason to move
the callsite. The original reason for exit_task_namespaces() from
the middle of exit_notify() was subtle and it has already gone away,
now this looks confusing. And this allows us do simplify exit_notify(),
we can avoid unlock/lock(tasklist) and we can use ->exit_state instead
of PF_EXITING in forget_original_parent().

Reported-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index af2eb3cbd499..7bb73f9d09db 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -649,7 +649,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 	 */
 	forget_original_parent(tsk);
-	exit_task_namespaces(tsk);
 
 	write_lock_irq(&tasklist_lock);
 	if (group_dead)
@@ -795,6 +794,7 @@ void do_exit(long code)
 	exit_shm(tsk);
 	exit_files(tsk);
 	exit_fs(tsk);
+	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	check_stack_usage();
 	exit_thread();
-- 
cgit v1.2.3


From 0541881502a1276149889fe468662ff6a8fc8f6d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 13 Jun 2013 13:17:02 -0700
Subject: range: Do not add new blank slot with add_range_with_merge

Joshua reported: Commit cd7b304dfaf1 (x86, range: fix missing merge
during add range) broke mtrr cleanup on his setup in 3.9.5.
corresponding commit in upstream is fbe06b7bae7c.

The reason is add_range_with_merge could generate blank spot.

We could avoid that by searching new expanded start/end, that
new range should include all connected ranges in range array.
At last add the new expanded start/end to the range array.
Also move up left array so do not add new blank slot in the
range array.

-v2: move left array to avoid enhance add_range()
-v3: include fix from Joshua about memmove declaring when
     DYN_DEBUG is used.

Reported-by: Joshua Covington <joshuacov@googlemail.com>
Tested-by: Joshua Covington <joshuacov@googlemail.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1371154622-8929-3-git-send-email-yinghai@kernel.org
Cc: <stable@vger.kernel.org> v3.9
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 kernel/range.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/range.c b/kernel/range.c
index eb911dbce267..322ea8e93e4b 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -4,7 +4,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/sort.h>
-
+#include <linux/string.h>
 #include <linux/range.h>
 
 int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
@@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
 	if (start >= end)
 		return nr_range;
 
-	/* Try to merge it with old one: */
+	/* get new start/end: */
 	for (i = 0; i < nr_range; i++) {
-		u64 final_start, final_end;
 		u64 common_start, common_end;
 
 		if (!range[i].end)
@@ -45,14 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
 		if (common_start > common_end)
 			continue;
 
-		final_start = min(range[i].start, start);
-		final_end = max(range[i].end, end);
+		/* new start/end, will add it back at last */
+		start = min(range[i].start, start);
+		end = max(range[i].end, end);
 
-		/* clear it and add it back for further merge */
-		range[i].start = 0;
-		range[i].end =  0;
-		return add_range_with_merge(range, az, nr_range,
-			final_start, final_end);
+		memmove(&range[i], &range[i + 1],
+			(nr_range - (i + 1)) * sizeof(range[i]));
+		range[nr_range - 1].start = 0;
+		range[nr_range - 1].end   = 0;
+		nr_range--;
+		i--;
 	}
 
 	/* Need to add it: */
-- 
cgit v1.2.3


From 9bb5d40cd93c9dd4be74834b1dcb1ba03629716b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 4 Jun 2013 10:44:21 +0200
Subject: perf: Fix mmap() accounting hole

Vince's fuzzer once again found holes. This time it spotted a leak in
the locked page accounting.

When an event had redirected output and its close() was the last
reference to the buffer we didn't have a vm context to undo accounting.

Change the code to destroy the buffer on the last munmap() and detach
all redirected events at that time. This provides us the right context
to undo the vm accounting.

Reported-and-tested-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130604084421.GI8923@twins.programming.kicks-ass.net
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c     | 228 ++++++++++++++++++++++++++++++++---------------
 kernel/events/internal.h |   3 +-
 2 files changed, 159 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ae752cd4a086..b391907d5352 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-			       struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void)	{ }
 
 extern __weak const char *perf_pmu_name(void)
@@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event)
 		if (has_branch_stack(event)) {
 			static_key_slow_dec_deferred(&perf_sched_events);
 			/* is system-wide event */
-			if (!(event->attach_state & PERF_ATTACH_TASK))
+			if (!(event->attach_state & PERF_ATTACH_TASK)) {
 				atomic_dec(&per_cpu(perf_branch_stack_events,
 						    event->cpu));
+			}
 		}
 	}
 
 	if (event->rb) {
-		ring_buffer_put(event->rb);
-		event->rb = NULL;
+		struct ring_buffer *rb;
+
+		/*
+		 * Can happen when we close an event with re-directed output.
+		 *
+		 * Since we have a 0 refcount, perf_mmap_close() will skip
+		 * over us; possibly making our ring_buffer_put() the last.
+		 */
+		mutex_lock(&event->mmap_mutex);
+		rb = event->rb;
+		if (rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* could be last */
+		}
+		mutex_unlock(&event->mmap_mutex);
 	}
 
 	if (is_cgroup_event(event))
@@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	unsigned int events = POLL_HUP;
 
 	/*
-	 * Race between perf_event_set_output() and perf_poll(): perf_poll()
-	 * grabs the rb reference but perf_event_set_output() overrides it.
-	 * Here is the timeline for two threads T1, T2:
-	 * t0: T1, rb = rcu_dereference(event->rb)
-	 * t1: T2, old_rb = event->rb
-	 * t2: T2, event->rb = new rb
-	 * t3: T2, ring_buffer_detach(old_rb)
-	 * t4: T1, ring_buffer_attach(rb1)
-	 * t5: T1, poll_wait(event->waitq)
-	 *
-	 * To avoid this problem, we grab mmap_mutex in perf_poll()
-	 * thereby ensuring that the assignment of the new ring buffer
-	 * and the detachment of the old buffer appear atomic to perf_poll()
+	 * Pin the event->rb by taking event->mmap_mutex; otherwise
+	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
 	 */
 	mutex_lock(&event->mmap_mutex);
-
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (rb) {
-		ring_buffer_attach(event, rb);
+	rb = event->rb;
+	if (rb)
 		events = atomic_xchg(&rb->poll, 0);
-	}
-	rcu_read_unlock();
-
 	mutex_unlock(&event->mmap_mutex);
 
 	poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event,
 		return;
 
 	spin_lock_irqsave(&rb->event_lock, flags);
-	if (!list_empty(&event->rb_entry))
-		goto unlock;
-
-	list_add(&event->rb_entry, &rb->event_list);
-unlock:
+	if (list_empty(&event->rb_entry))
+		list_add(&event->rb_entry, &rb->event_list);
 	spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-			       struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
 	unsigned long flags;
 
@@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
 	rcu_read_lock();
 	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
-
-	list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-		wake_up_all(&event->waitq);
-
-unlock:
+	if (rb) {
+		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+			wake_up_all(&event->waitq);
+	}
 	rcu_read_unlock();
 }
 
@@ -3582,23 +3571,14 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb)
+static void ring_buffer_put(struct ring_buffer *rb)
 {
-	struct perf_event *event, *n;
-	unsigned long flags;
-
 	if (!atomic_dec_and_test(&rb->refcount))
-		return false;
+		return;
 
-	spin_lock_irqsave(&rb->event_lock, flags);
-	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-		list_del_init(&event->rb_entry);
-		wake_up_all(&event->waitq);
-	}
-	spin_unlock_irqrestore(&rb->event_lock, flags);
+	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
-	return true;
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
@@ -3606,28 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	atomic_inc(&event->mmap_count);
+	atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
-	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		struct ring_buffer *rb = event->rb;
-		struct user_struct *mmap_user = rb->mmap_user;
-		int mmap_locked = rb->mmap_locked;
-		unsigned long size = perf_data_size(rb);
+	struct ring_buffer *rb = event->rb;
+	struct user_struct *mmap_user = rb->mmap_user;
+	int mmap_locked = rb->mmap_locked;
+	unsigned long size = perf_data_size(rb);
 
-		rcu_assign_pointer(event->rb, NULL);
-		ring_buffer_detach(event, rb);
-		mutex_unlock(&event->mmap_mutex);
+	atomic_dec(&rb->mmap_count);
+
+	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+		return;
+
+	/* Detach current event from the buffer. */
+	rcu_assign_pointer(event->rb, NULL);
+	ring_buffer_detach(event, rb);
+	mutex_unlock(&event->mmap_mutex);
+
+	/* If there's still other mmap()s of this buffer, we're done. */
+	if (atomic_read(&rb->mmap_count)) {
+		ring_buffer_put(rb); /* can't be last */
+		return;
+	}
 
-		if (ring_buffer_put(rb)) {
-			atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-			vma->vm_mm->pinned_vm -= mmap_locked;
-			free_uid(mmap_user);
+	/*
+	 * No other mmap()s, detach from all other events that might redirect
+	 * into the now unreachable buffer. Somewhat complicated by the
+	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
+	 */
+again:
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+		if (!atomic_long_inc_not_zero(&event->refcount)) {
+			/*
+			 * This event is en-route to free_event() which will
+			 * detach it and remove it from the list.
+			 */
+			continue;
 		}
+		rcu_read_unlock();
+
+		mutex_lock(&event->mmap_mutex);
+		/*
+		 * Check we didn't race with perf_event_set_output() which can
+		 * swizzle the rb from under us while we were waiting to
+		 * acquire mmap_mutex.
+		 *
+		 * If we find a different rb; ignore this event, a next
+		 * iteration will no longer find it on the list. We have to
+		 * still restart the iteration to make sure we're not now
+		 * iterating the wrong list.
+		 */
+		if (event->rb == rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* can't be last, we still have one */
+		}
+		mutex_unlock(&event->mmap_mutex);
+		put_event(event);
+
+		/*
+		 * Restart the iteration; either we're on the wrong list or
+		 * destroyed its integrity by doing a deletion.
+		 */
+		goto again;
 	}
+	rcu_read_unlock();
+
+	/*
+	 * It could be there's still a few 0-ref events on the list; they'll
+	 * get cleaned up by free_event() -- they'll also still have their
+	 * ref on the rb and will free it whenever they are done with it.
+	 *
+	 * Aside from that, this buffer is 'fully' detached and unmapped,
+	 * undo the VM accounting.
+	 */
+
+	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+	vma->vm_mm->pinned_vm -= mmap_locked;
+	free_uid(mmap_user);
+
+	ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages != nr_pages)
+		if (event->rb->nr_pages != nr_pages) {
 			ret = -EINVAL;
+			goto unlock;
+		}
+
+		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+			/*
+			 * Raced against perf_mmap_close() through
+			 * perf_event_set_output(). Try again, hope for better
+			 * luck.
+			 */
+			mutex_unlock(&event->mmap_mutex);
+			goto again;
+		}
+
 		goto unlock;
 	}
 
@@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
+	atomic_set(&rb->mmap_count, 1);
 	rb->mmap_locked = extra;
 	rb->mmap_user = get_current_user();
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->pinned_vm += extra;
 
+	ring_buffer_attach(event, rb);
 	rcu_assign_pointer(event->rb, rb);
 
 	perf_event_update_userpage(event);
@@ -3737,6 +3805,10 @@ unlock:
 		atomic_inc(&event->mmap_count);
 	mutex_unlock(&event->mmap_mutex);
 
+	/*
+	 * Since pinned accounting is per vm we cannot allow fork() to copy our
+	 * vma.
+	 */
 	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &perf_mmap_vmops;
 
@@ -6415,6 +6487,8 @@ set:
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
 
+	old_rb = event->rb;
+
 	if (output_event) {
 		/* get the rb we want to redirect to */
 		rb = ring_buffer_get(output_event);
@@ -6422,16 +6496,28 @@ set:
 			goto unlock;
 	}
 
-	old_rb = event->rb;
-	rcu_assign_pointer(event->rb, rb);
 	if (old_rb)
 		ring_buffer_detach(event, old_rb);
+
+	if (rb)
+		ring_buffer_attach(event, rb);
+
+	rcu_assign_pointer(event->rb, rb);
+
+	if (old_rb) {
+		ring_buffer_put(old_rb);
+		/*
+		 * Since we detached before setting the new rb, so that we
+		 * could attach the new rb, we could have missed a wakeup.
+		 * Provide it now.
+		 */
+		wake_up_all(&event->waitq);
+	}
+
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
 
-	if (old_rb)
-		ring_buffer_put(old_rb);
 out:
 	return ret;
 }
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 5bc6c8e9b851..ca6599723be5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,7 +31,8 @@ struct ring_buffer {
 	spinlock_t			event_lock;
 	struct list_head		event_list;
 
-	int				mmap_locked;
+	atomic_t			mmap_count;
+	unsigned long			mmap_locked;
 	struct user_struct		*mmap_user;
 
 	struct perf_event_mmap_page	*user_page;
-- 
cgit v1.2.3


From 873b4c65b519fd769940eb281f77848227d4e5c1 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 5 Jun 2013 10:13:11 +0200
Subject: sched: Fix clear NOHZ_BALANCE_KICK

I have faced a sequence where the Idle Load Balance was sometime not
triggered for a while on my platform, in the following scenario:

 CPU 0 and CPU 1 are running tasks and CPU 2 is idle

 CPU 1 kicks the Idle Load Balance
 CPU 1 selects CPU 2 as the new Idle Load Balancer
 CPU 2 sets NOHZ_BALANCE_KICK for CPU 2
 CPU 2 sends a reschedule IPI to CPU 2

 While CPU 3 wakes up, CPU 0 or CPU 1 migrates a waking up task A on CPU 2

 CPU 2 finally wakes up, runs task A and discards the Idle Load Balance
       task A quickly goes back to sleep (before a tick occurs on CPU 2)
 CPU 2 goes back to idle with NOHZ_BALANCE_KICK set

Whenever CPU 2 will be selected as the ILB, no reschedule IPI will be sent
because NOHZ_BALANCE_KICK is already set and no Idle Load Balance will be
performed.

We must wait for the sched softirq to be raised on CPU 2 thanks to another
part the kernel to come back to clear NOHZ_BALANCE_KICK.

The proposed solution clears NOHZ_BALANCE_KICK in schedule_ipi if
we can't raise the sched_softirq for the Idle Load Balance.

Change since V1:

- move the clear of NOHZ_BALANCE_KICK in got_nohz_idle_kick if the ILB
  can't run on this CPU (as suggested by Peter)

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1370419991-13870-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 58453b8272fd..919bee68032b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu)
 static inline bool got_nohz_idle_kick(void)
 {
 	int cpu = smp_processor_id();
-	return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+
+	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+		return false;
+
+	if (idle_cpu(cpu) && !need_resched())
+		return true;
+
+	/*
+	 * We can't run Idle Load Balance on this CPU for this time so we
+	 * cancel it and clear NOHZ_BALANCE_KICK
+	 */
+	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+	return false;
 }
 
 #else /* CONFIG_NO_HZ_COMMON */
@@ -1393,8 +1405,9 @@ static void sched_ttwu_pending(void)
 
 void scheduler_ipi(void)
 {
-	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
-	    && !tick_nohz_full_cpu(smp_processor_id()))
+	if (llist_empty(&this_rq()->wake_list)
+			&& !tick_nohz_full_cpu(smp_processor_id())
+			&& !got_nohz_idle_kick())
 		return;
 
 	/*
@@ -1417,7 +1430,7 @@ void scheduler_ipi(void)
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
 	 */
-	if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
-- 
cgit v1.2.3


From 29bb9e5a75684106a37593ad75ec75ff8312731b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 24 May 2013 15:23:40 -0400
Subject: tracing/context-tracking: Add preempt_schedule_context() for tracing

Dave Jones hit the following bug report:

 ===============================
 [ INFO: suspicious RCU usage. ]
 3.10.0-rc2+ #1 Not tainted
 -------------------------------
 include/linux/rcupdate.h:771 rcu_read_lock() used illegally while idle!
 other info that might help us debug this:
 RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 0
 RCU used illegally from extended quiescent state!
 2 locks held by cc1/63645:
  #0:  (&rq->lock){-.-.-.}, at: [<ffffffff816b39fd>] __schedule+0xed/0x9b0
  #1:  (rcu_read_lock){.+.+..}, at: [<ffffffff8109d645>] cpuacct_charge+0x5/0x1f0

 CPU: 1 PID: 63645 Comm: cc1 Not tainted 3.10.0-rc2+ #1 [loadavg: 40.57 27.55 13.39 25/277 64369]
 Hardware name: Gigabyte Technology Co., Ltd. GA-MA78GM-S2H/GA-MA78GM-S2H, BIOS F12a 04/23/2010
  0000000000000000 ffff88010f78fcf8 ffffffff816ae383 ffff88010f78fd28
  ffffffff810b698d ffff88011c092548 000000000023d073 ffff88011c092500
  0000000000000001 ffff88010f78fd60 ffffffff8109d7c5 ffffffff8109d645
 Call Trace:
  [<ffffffff816ae383>] dump_stack+0x19/0x1b
  [<ffffffff810b698d>] lockdep_rcu_suspicious+0xfd/0x130
  [<ffffffff8109d7c5>] cpuacct_charge+0x185/0x1f0
  [<ffffffff8109d645>] ? cpuacct_charge+0x5/0x1f0
  [<ffffffff8108dffc>] update_curr+0xec/0x240
  [<ffffffff8108f528>] put_prev_task_fair+0x228/0x480
  [<ffffffff816b3a71>] __schedule+0x161/0x9b0
  [<ffffffff816b4721>] preempt_schedule+0x51/0x80
  [<ffffffff816b4800>] ? __cond_resched_softirq+0x60/0x60
  [<ffffffff816b6824>] ? retint_careful+0x12/0x2e
  [<ffffffff810ff3cc>] ftrace_ops_control_func+0x1dc/0x210
  [<ffffffff816be280>] ftrace_call+0x5/0x2f
  [<ffffffff816b681d>] ? retint_careful+0xb/0x2e
  [<ffffffff816b4805>] ? schedule_user+0x5/0x70
  [<ffffffff816b4805>] ? schedule_user+0x5/0x70
  [<ffffffff816b6824>] ? retint_careful+0x12/0x2e
 ------------[ cut here ]------------

What happened was that the function tracer traced the schedule_user() code
that tells RCU that the system is coming back from userspace, and to
add the CPU back to the RCU monitoring.

Because the function tracer does a preempt_disable/enable_notrace() calls
the preempt_enable_notrace() checks the NEED_RESCHED flag. If it is set,
then preempt_schedule() is called. But this is called before the user_exit()
function can inform the kernel that the CPU is no longer in user mode and
needs to be accounted for by RCU.

The fix is to create a new preempt_schedule_context() that checks if
the kernel is still in user mode and if so to switch it to kernel mode
before calling schedule. It also switches back to user mode coming back
from schedule in need be.

The only user of this currently is the preempt_enable_notrace(), which is
only used by the tracing subsystem.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1369423420.6828.226.camel@gandalf.local.home
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/context_tracking.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'kernel')

diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 65349f07b878..66677003e223 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -71,6 +71,46 @@ void user_enter(void)
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_PREEMPT
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+void __sched notrace preempt_schedule_context(void)
+{
+	struct thread_info *ti = current_thread_info();
+	enum ctx_state prev_ctx;
+
+	if (likely(ti->preempt_count || irqs_disabled()))
+		return;
+
+	/*
+	 * Need to disable preemption in case user_exit() is traced
+	 * and the tracer calls preempt_enable_notrace() causing
+	 * an infinite recursion.
+	 */
+	preempt_disable_notrace();
+	prev_ctx = exception_enter();
+	preempt_enable_no_resched_notrace();
+
+	preempt_schedule();
+
+	preempt_disable_notrace();
+	exception_exit(prev_ctx);
+	preempt_enable_notrace();
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_PREEMPT */
 
 /**
  * user_exit - Inform the context tracking that the CPU is
-- 
cgit v1.2.3