From d6cbf35dac8a3dadb9103379820c96d7c85df3d9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 14 May 2013 19:44:20 +0800
Subject: cgroup: initialize xattr before calling d_instantiate()

cgroup_create_file() calls d_instantiate(), which may decide to look
at the xattrs on the file. Smack always does this and SELinux can be
configured to do so.

But cgroup_add_file() didn't initialize xattrs before calling
cgroup_create_file(), which finally leads to dereferencing NULL
dentry->d_fsdata.

This bug has been there since cgroup xattr was introduced.

Cc: <stable@vger.kernel.org> # 3.8.x
Reported-by: Ivan Bulatovic <combuster@archlinux.us>
Reported-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2a9926275f80..38b136553044 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2699,13 +2699,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 		goto out;
 	}
 
+	cfe->type = (void *)cft;
+	cfe->dentry = dentry;
+	dentry->d_fsdata = cfe;
+	simple_xattrs_init(&cfe->xattrs);
+
 	mode = cgroup_file_mode(cft);
 	error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
 	if (!error) {
-		cfe->type = (void *)cft;
-		cfe->dentry = dentry;
-		dentry->d_fsdata = cfe;
-		simple_xattrs_init(&cfe->xattrs);
 		list_add_tail(&cfe->node, &parent->files);
 		cfe = NULL;
 	}
-- 
cgit v1.2.3


From 6faf72834d9d0c0dc6632604eaeffb621e87fcf9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 13 May 2013 06:53:37 -0700
Subject: rcu: Fix comparison sense in rcu_needs_cpu()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit c0f4dfd4f (rcu: Make RCU_FAST_NO_HZ take advantage of numbered
callbacks) introduced a bug that can result in excessively long grace
periods.  This bug reverse the senes of the "if" statement checking
for lazy callbacks, so that RCU takes a lazy approach when there are
in fact non-lazy callbacks.  This can result in excessive boot, suspend,
and resume times.

This commit therefore fixes the sense of this "if" statement.

Reported-by: Borislav Petkov <bp@alien8.de>
Reported-by: Bjørn Mork <bjorn@mork.no>
Reported-by: Joerg Roedel <joro@8bytes.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Bjørn Mork <bjorn@mork.no>
Tested-by: Joerg Roedel <joro@8bytes.org>
---
 kernel/rcutree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 170814dc418f..6d939a645da1 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1667,7 +1667,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
 	rdtp->last_accelerate = jiffies;
 
 	/* Request timer delay depending on laziness, and round. */
-	if (rdtp->all_lazy) {
+	if (!rdtp->all_lazy) {
 		*dj = round_up(rcu_idle_gp_delay + jiffies,
 			       rcu_idle_gp_delay) - jiffies;
 	} else {
-- 
cgit v1.2.3


From 8f174b1175a10903ade40f36eb6c896412877ca0 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <js1304@gmail.com>
Date: Wed, 1 May 2013 00:07:00 +0900
Subject: workqueue: correct handling of the pool spin_lock

When we fail to mutex_trylock(), we release the pool spin_lock and do
mutex_lock(). After that, we should regrab the pool spin_lock, but,
regrabbing is missed in current code. So correct it.

Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1ae602809efb..286847b90225 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2059,6 +2059,7 @@ static bool manage_workers(struct worker *worker)
 	if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
 		spin_unlock_irq(&pool->lock);
 		mutex_lock(&pool->manager_mutex);
+		spin_lock_irq(&pool->lock);
 		ret = true;
 	}
 
-- 
cgit v1.2.3


From ad7b1f841f8a54c6d61ff181451f55b68175e15a Mon Sep 17 00:00:00 2001
From: Marc Dionne <marc.c.dionne@gmail.com>
Date: Mon, 6 May 2013 17:44:55 -0400
Subject: workqueue: Make schedule_work() available again to non GPL modules

Commit 8425e3d5bdbe ("workqueue: inline trivial wrappers") changed
schedule_work() and schedule_delayed_work() to inline wrappers,
but these rely on some symbols that are EXPORT_SYMBOL_GPL, while
the original functions were EXPORT_SYMBOL.  This has the effect of
changing the licensing requirement for these functions and making
them unavailable to non GPL modules.

Make them available again by removing the restriction on the
required symbols.

Signed-off-by: Marc Dionne <marc.dionne@your-file-system.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 286847b90225..02916f421385 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -296,7 +296,7 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
 static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
 
 struct workqueue_struct *system_wq __read_mostly;
-EXPORT_SYMBOL_GPL(system_wq);
+EXPORT_SYMBOL(system_wq);
 struct workqueue_struct *system_highpri_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_highpri_wq);
 struct workqueue_struct *system_long_wq __read_mostly;
@@ -1411,7 +1411,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
 	local_irq_restore(flags);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(queue_work_on);
+EXPORT_SYMBOL(queue_work_on);
 
 void delayed_work_timer_fn(unsigned long __data)
 {
@@ -1485,7 +1485,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	local_irq_restore(flags);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(queue_delayed_work_on);
+EXPORT_SYMBOL(queue_delayed_work_on);
 
 /**
  * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
-- 
cgit v1.2.3


From 615ee5443ff9bedd356dc6865f3e9c276ce434ea Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Tue, 26 Mar 2013 11:35:16 -0400
Subject: rcu: Don't allocate bootmem from rcu_init()

When rcu_init() is called we already have slab working, allocating
bootmem at that point results in warnings and an allocation from
slab.  This commit therefore changes alloc_bootmem_cpumask_var() to
alloc_cpumask_var() in rcu_bootup_announce_oddness(), which is called
from rcu_init().

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Tested-by: Robin Holt <holt@sgi.com>

[paulmck: convert to zalloc_cpumask_var(), as suggested by Yinghai Lu.]
---
 kernel/rcutree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 6d939a645da1..3db5a375d8dd 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -88,7 +88,7 @@ static void __init rcu_bootup_announce_oddness(void)
 #ifdef CONFIG_RCU_NOCB_CPU
 #ifndef CONFIG_RCU_NOCB_CPU_NONE
 	if (!have_rcu_nocb_mask) {
-		alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+		zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
 		have_rcu_nocb_mask = true;
 	}
 #ifdef CONFIG_RCU_NOCB_CPU_ZERO
-- 
cgit v1.2.3


From 1be0c25da56e860992af972a60321563ca2cfcd1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 15 May 2013 14:24:24 -0700
Subject: workqueue: don't perform NUMA-aware allocations on offline nodes in
 wq_numa_init()

wq_numa_init() builds per-node cpumasks which are later used to make
unbound workqueues NUMA-aware.  The cpumasks are allocated using
alloc_cpumask_var_node() for all possible nodes.  Unfortunately, on
machines with off-line nodes, this leads to NUMA-aware allocations on
existing bug offline nodes, which in turn triggers BUG in the memory
allocation code.

Fix it by using NUMA_NO_NODE for cpumask allocations for offline
nodes.

  kernel BUG at include/linux/gfp.h:323!
  invalid opcode: 0000 [#1] SMP
  Modules linked in:
  CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.9.0+ #1
  Hardware name: ProLiant BL465c G7, BIOS A19 12/10/2011
  task: ffff880234608000 ti: ffff880234602000 task.ti: ffff880234602000
  RIP: 0010:[<ffffffff8117495d>]  [<ffffffff8117495d>] new_slab+0x2ad/0x340
  RSP: 0000:ffff880234603bf8  EFLAGS: 00010246
  RAX: 0000000000000000 RBX: ffff880237404b40 RCX: 00000000000000d0
  RDX: 0000000000000001 RSI: 0000000000000003 RDI: 00000000002052d0
  RBP: ffff880234603c28 R08: 0000000000000000 R09: 0000000000000001
  R10: 0000000000000001 R11: ffffffff812e3aa8 R12: 0000000000000001
  R13: ffff8802378161c0 R14: 0000000000030027 R15: 00000000000040d0
  FS:  0000000000000000(0000) GS:ffff880237800000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
  CR2: ffff88043fdff000 CR3: 00000000018d5000 CR4: 00000000000007f0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
  Stack:
   ffff880234603c28 0000000000000001 00000000000000d0 ffff8802378161c0
   ffff880237404b40 ffff880237404b40 ffff880234603d28 ffffffff815edba1
   ffff880237816140 0000000000000000 ffff88023740e1c0
  Call Trace:
   [<ffffffff815edba1>] __slab_alloc+0x330/0x4f2
   [<ffffffff81174b25>] kmem_cache_alloc_node_trace+0xa5/0x200
   [<ffffffff812e3aa8>] alloc_cpumask_var_node+0x28/0x90
   [<ffffffff81a0bdb3>] wq_numa_init+0x10d/0x1be
   [<ffffffff81a0bec8>] init_workqueues+0x64/0x341
   [<ffffffff810002ea>] do_one_initcall+0xea/0x1a0
   [<ffffffff819f1f31>] kernel_init_freeable+0xb7/0x1ec
   [<ffffffff815d50de>] kernel_init+0xe/0xf0
   [<ffffffff815ff89c>] ret_from_fork+0x7c/0xb0
  Code: 45  84 ac 00 00 00 f0 41 80 4d 00 40 e9 f6 fe ff ff 66 0f 1f 84 00 00 00 00 00 e8 eb 4b ff ff 49 89 c5 e9 05 fe ff ff <0f> 0b 4c 8b 73 38 44 89 ff 81 cf 00 00 20 00 4c 89 f6 48 c1 ee

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-Tested-by: Lingzhu Xiang <lxiang@redhat.com>
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 02916f421385..ee8e29a2320c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4905,7 +4905,8 @@ static void __init wq_numa_init(void)
 	BUG_ON(!tbl);
 
 	for_each_node(node)
-		BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node));
+		BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
+				node_online(node) ? node : NUMA_NO_NODE));
 
 	for_each_possible_cpu(cpu) {
 		node = cpu_to_node(cpu);
-- 
cgit v1.2.3


From 6ed0106667d76589cb648c27edb4f4ffbf9d59ca Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Thu, 16 May 2013 20:48:49 +0900
Subject: tracing: Return -EBUSY when event_enable_func() fails to get module

Since try_module_get() returns false( = 0) when it fails to
pindown a module, event_enable_func() returns 0 which means
"succeed". This can cause a kernel panic when the entry
is removed, because the event is already released.

This fixes the bug by returning -EBUSY, because the reason
why it fails is that the module is being removed at that time.

Link: http://lkml.kernel.org/r/20130516114848.13508.97899.stgit@mhiramat-M0-7522

Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tom Zanussi <tom.zanussi@intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7a0cf68027cc..27963e2bf4bf 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2072,8 +2072,10 @@ event_enable_func(struct ftrace_hash *hash,
  out_reg:
 	/* Don't let event modules unload while probe registered */
 	ret = try_module_get(file->event_call->mod);
-	if (!ret)
+	if (!ret) {
+		ret = -EBUSY;
 		goto out_free;
+	}
 
 	ret = __ftrace_event_enable_disable(file, 1, 1);
 	if (ret < 0)
-- 
cgit v1.2.3


From 264b83c07a84223f0efd0d1db9ccc66d6f88288f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 16 May 2013 17:43:55 +0200
Subject: usermodehelper: check subprocess_info->path != NULL

argv_split(empty_or_all_spaces) happily succeeds, it simply returns
argc == 0 and argv[0] == NULL. Change call_usermodehelper_exec() to
check sub_info->path != NULL to avoid the crash.

This is the minimal fix, todo:

 - perhaps we should change argv_split() to return NULL or change the
   callers.

 - kill or justify ->path[0] check

 - narrow the scope of helper_lock()

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-By: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kmod.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 1296e72e4161..8241906c4b61 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -569,6 +569,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 	int retval = 0;
 
 	helper_lock();
+	if (!sub_info->path) {
+		retval = -EINVAL;
+		goto out;
+	}
+
 	if (sub_info->path[0] == '\0')
 		goto out;
 
-- 
cgit v1.2.3


From 06c9494c0e9bdfcaa14d6d2b096a0ff7abe8494f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 15 May 2013 20:33:01 +0100
Subject: kmemleak: Scan all allocated, writeable and not executable module
 sections

Instead of just picking data sections by name (names that start
with .data, .bss or .ref.data), use the section flags and scan all
sections that are allocated, writable and not executable. Which should
cover all sections of a module that might reference data.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
[catalin.marinas@arm.com: removed unused 'name' variable]
[catalin.marinas@arm.com: collapsed 'if' blocks]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index b049939177f6..06f496a5d182 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2431,10 +2431,10 @@ static void kmemleak_load_module(const struct module *mod,
 	kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
 
 	for (i = 1; i < info->hdr->e_shnum; i++) {
-		const char *name = info->secstrings + info->sechdrs[i].sh_name;
-		if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
-			continue;
-		if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
+		/* Scan all writable sections that's not executable */
+		if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
+		    !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
+		    (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
 			continue;
 
 		kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
-- 
cgit v1.2.3


From 89c837351db0b9b52fd572ec8b0445a42e59b75c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 15 May 2013 20:46:23 +0100
Subject: kmemleak: No need for scanning specific module sections

As kmemleak now scans all module sections that are allocated, writable
and non executable, there's no need to scan individual sections that
might reference data.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 06f496a5d182..cab4bce49c23 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2769,24 +2769,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
 	mod->trace_events = section_objs(info, "_ftrace_events",
 					 sizeof(*mod->trace_events),
 					 &mod->num_trace_events);
-	/*
-	 * This section contains pointers to allocated objects in the trace
-	 * code and not scanning it leads to false positives.
-	 */
-	kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
-			   mod->num_trace_events, GFP_KERNEL);
 #endif
 #ifdef CONFIG_TRACING
 	mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
 					 sizeof(*mod->trace_bprintk_fmt_start),
 					 &mod->num_trace_bprintk_fmt);
-	/*
-	 * This section contains pointers to allocated objects in the trace
-	 * code and not scanning it leads to false positives.
-	 */
-	kmemleak_scan_area(mod->trace_bprintk_fmt_start,
-			   sizeof(*mod->trace_bprintk_fmt_start) *
-			   mod->num_trace_bprintk_fmt, GFP_KERNEL);
 #endif
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 	/* sechdrs[0].sh_size is always zero */
-- 
cgit v1.2.3


From fbe06b7bae7c9cf6ab05168fce5ee93b2f4bae7c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 17 May 2013 11:49:10 -0700
Subject: x86, range: fix missing merge during add range
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Christian found v3.9 does not work with E350 with EFI is enabled.

[    1.658832] Trying to unpack rootfs image as initramfs...
[    1.679935] BUG: unable to handle kernel paging request at ffff88006e3fd000
[    1.686940] IP: [<ffffffff813661df>] memset+0x1f/0xb0
[    1.692010] PGD 1f77067 PUD 1f7a067 PMD 61420067 PTE 0

but early memtest report all memory could be accessed without problem.

early page table is set in following sequence:
[    0.000000] init_memory_mapping: [mem 0x00000000-0x000fffff]
[    0.000000] init_memory_mapping: [mem 0x6e600000-0x6e7fffff]
[    0.000000] init_memory_mapping: [mem 0x6c000000-0x6e5fffff]
[    0.000000] init_memory_mapping: [mem 0x00100000-0x6bffffff]
[    0.000000] init_memory_mapping: [mem 0x6e800000-0x6ea07fff]
but later efi_enter_virtual_mode try set mapping again wrongly.
[    0.010644] pid_max: default: 32768 minimum: 301
[    0.015302] init_memory_mapping: [mem 0x640c5000-0x6e3fcfff]
that means it fails with pfn_range_is_mapped.

It turns out that we have a bug in add_range_with_merge and it does not
merge range properly when new add one fill the hole between two exsiting
ranges. In the case when [mem 0x00100000-0x6bffffff] is the hole between
[mem 0x00000000-0x000fffff] and [mem 0x6c000000-0x6e7fffff].

Fix the add_range_with_merge by calling itself recursively.

Reported-by: "Christian König" <christian.koenig@amd.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/CAE9FiQVofGoSk7q5-0irjkBxemqK729cND4hov-1QCBJDhxpgQ@mail.gmail.com
Cc: <stable@vger.kernel.org> v3.9
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 kernel/range.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/range.c b/kernel/range.c
index 071b0ab455cb..eb911dbce267 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -48,9 +48,11 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
 		final_start = min(range[i].start, start);
 		final_end = max(range[i].end, end);
 
-		range[i].start = final_start;
-		range[i].end =  final_end;
-		return nr_range;
+		/* clear it and add it back for further merge */
+		range[i].start = 0;
+		range[i].end =  0;
+		return add_range_with_merge(range, az, nr_range,
+			final_start, final_end);
 	}
 
 	/* Need to add it: */
-- 
cgit v1.2.3


From ca1643186d3dce6171d8f171e516b02496360a9e Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 23 May 2013 11:51:10 -0400
Subject: tracing: Fix crash when ftrace=nop on the kernel command line

If ftrace=<tracer> is on the kernel command line, when that tracer is
registered, it will be initiated by tracing_set_tracer() to execute that
tracer.

The nop tracer is just a stub tracer that is used to have no tracer
enabled. It is assigned at early bootup as it is the default tracer.

But if ftrace=nop is on the kernel command line, the registering of the
nop tracer will call tracing_set_tracer() which will try to execute
the nop tracer. But it expects tr->current_trace to be assigned something
as it usually is assigned to the nop tracer. As it hasn't been assigned
to anything yet, it causes the system to crash.

The simple fix is to move the tr->current_trace = nop before registering
the nop tracer. The functionality is still the same as the nop tracer
doesn't do anything anyway.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ae6fa2d1cdf7..4d79485b3237 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6216,10 +6216,15 @@ __init static int tracer_alloc_buffers(void)
 
 	trace_init_cmdlines();
 
-	register_tracer(&nop_trace);
-
+	/*
+	 * register_tracer() might reference current_trace, so it
+	 * needs to be set before we register anything. This is
+	 * just a bootstrap of current_trace anyway.
+	 */
 	global_trace.current_trace = &nop_trace;
 
+	register_tracer(&nop_trace);
+
 	/* All seems OK, enable tracing */
 	tracing_disabled = 0;
 
-- 
cgit v1.2.3


From 7805d000db30a3787a4c969bab6ae4d8a5fd8ce6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 24 May 2013 10:50:24 +0900
Subject: cgroup: fix a subtle bug in descendant pre-order walk

When cgroup_next_descendant_pre() initiates a walk, it checks whether
the subtree root doesn't have any children and if not returns NULL.
Later code assumes that the subtree isn't empty.  This is broken
because the subtree may become empty inbetween, which can lead to the
traversal escaping the subtree by walking to the sibling of the
subtree root.

There's no reason to have the early exit path.  Remove it along with
the later assumption that the subtree isn't empty.  This simplifies
the code a bit and fixes the subtle bug.

While at it, fix the comment of cgroup_for_each_descendant_pre() which
was incorrectly referring to ->css_offline() instead of
->css_online().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: stable@vger.kernel.org
---
 kernel/cgroup.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 38b136553044..31e9ef319070 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2954,11 +2954,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	/* if first iteration, pretend we just visited @cgroup */
-	if (!pos) {
-		if (list_empty(&cgroup->children))
-			return NULL;
+	if (!pos)
 		pos = cgroup;
-	}
 
 	/* visit the first child if exists */
 	next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
@@ -2966,14 +2963,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 		return next;
 
 	/* no child, visit my or the closest ancestor's next sibling */
-	do {
+	while (pos != cgroup) {
 		next = list_entry_rcu(pos->sibling.next, struct cgroup,
 				      sibling);
 		if (&next->sibling != &pos->parent->children)
 			return next;
 
 		pos = pos->parent;
-	} while (pos != cgroup);
+	}
 
 	return NULL;
 }
-- 
cgit v1.2.3


From 387b8b3e37cb1c257fb607787f73815c30d22859 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 24 May 2013 15:55:25 -0700
Subject: auditfilter.c: fix kernel-doc warnings

Fix kernel-doc warnings in kernel/auditfilter.c:

  Warning(kernel/auditfilter.c:1029): Excess function parameter 'loginuid' description in 'audit_receive_filter'
  Warning(kernel/auditfilter.c:1029): Excess function parameter 'sessionid' description in 'audit_receive_filter'
  Warning(kernel/auditfilter.c:1029): Excess function parameter 'sid' description in 'audit_receive_filter'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/auditfilter.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 83a2970295d1..6bd4a90d1991 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1021,9 +1021,6 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
  * @seq: netlink audit message sequence (serial) number
  * @data: payload data
  * @datasz: size of payload data
- * @loginuid: loginuid of sender
- * @sessionid: sessionid for netlink audit message
- * @sid: SE Linux Security ID of sender
  */
 int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz)
 {
-- 
cgit v1.2.3


From 2938d2757fc99c26aa678ce4eba910c4a77c3a55 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 28 May 2013 09:33:01 +0200
Subject: tick: Cure broadcast false positive pending bit warning

commit 26517f3e (tick: Avoid programming the local cpu timer if
broadcast pending) added a warning if the cpu enters broadcast mode
again while the pending bit is still set. Meelis reported that the
warning triggers. There are two corner cases which have been not
considered:

1) cpuidle calls clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER)
   twice. That can result in the following scenario

   CPU0                    CPU1
                           cpuidle_idle_call()
                             clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER)
                               set cpu in tick_broadcast_oneshot_mask

   broadcast interrupt
     event expired for cpu1
     set pending bit

                             acpi_idle_enter_simple()
                               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER)
                                 WARN_ON(pending bit)

  Move the WARN_ON into the section where we enter broadcast mode so
  it wont provide false positives on the second call.

2) safe_halt() enables interrupts, so a broadcast interrupt can be
   delivered befor the broadcast mode is disabled. That sets the
   pending bit for the CPU which receives the broadcast
   interrupt. Though the interrupt is delivered right away from the
   broadcast handler and leaves the pending bit stale.

   Clear the pending bit for the current cpu in the broadcast handler.

Reported-and-tested-by: Meelis Roos <mroos@linux.ee>
Cc: Len Brown <lenb@kernel.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305271841130.4220@ionos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 24938d577669..0c739423b0f9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -511,6 +511,12 @@ again:
 		}
 	}
 
+	/*
+	 * Remove the current cpu from the pending mask. The event is
+	 * delivered immediately in tick_do_broadcast() !
+	 */
+	cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask);
+
 	/* Take care of enforced broadcast requests */
 	cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
 	cpumask_clear(tick_broadcast_force_mask);
@@ -575,8 +581,8 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 
 	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
-		WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
 		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
+			WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
 			/*
 			 * We only reprogram the broadcast timer if we
-- 
cgit v1.2.3


From 7b959fc582741227a1c4cba710d6aff8fb183128 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Date: Wed, 22 May 2013 18:34:09 +0900
Subject: kprobes: Fix to free gone and unused optprobes

Fix to free gone and unused optprobes. This bug will
cause a kernel panic if the user reuses the killed and
unused probe.

Reported at:

  http://sourceware.org/ml/systemtap/2013-q2/msg00142.html

In the normal path, an optprobe on an init function is
unregistered when a module goes live.

unregister_kprobe(kp)
 -> __unregister_kprobe_top
   ->__disable_kprobe
     ->disarm_kprobe(ap == op)
       ->__disarm_kprobe
        ->unoptimize_kprobe : the op is queued
                              on unoptimizing_list
and do nothing in __unregister_kprobe_bottom

After a while (usually wait 5 jiffies), kprobe_optimizer
runs to unoptimize and free optprobe.

kprobe_optimizer
 ->do_unoptimize_kprobes
   ->arch_unoptimize_kprobes : moved to free_list
 ->do_free_cleaned_kprobes
   ->hlist_del: the op is removed
   ->free_aggr_kprobe
     ->arch_remove_optimized_kprobe
     ->arch_remove_kprobe
     ->kfree: the op is freed

Here, if kprobes_module_callback is called and the delayed
unoptimizing probe is picked BEFORE kprobe_optimizer runs,

kprobes_module_callback
 ->kill_kprobe
   ->kill_optimized_kprobe : dequeued from unoptimizing_list <=!!!
     ->arch_remove_optimized_kprobe
   ->arch_remove_kprobe
   (but op is not freed, and on the kprobe hash table)

This doesn't happen if the probe unregistration is done AFTER
kprobes_module_callback is called (because at that time the op
is gone), and kprobe-tracer does it.

To fix this bug, this patch changes kprobes_module_callback to
enqueue the op to freeing_list at kill_optimized_kprobe only
if the op is unused. The unused probes on freeing_list will
be freed in do_free_cleaned_kprobes.

Note that this calls arch_remove_*kprobe twice on the
same probe. Thus those functions have to check the double free.
Fortunately, most of arch codes already checked that except
for mips. This will be fixed in the next patch.

Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Timo Juhani Lindfors <timo.lindfors@iki.fi>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: systemtap@sourceware.org
Cc: yrl.pp-manager.tt@hitachi.com
Cc: David S. Miller <davem@davemloft.net>
Cc: "David S. Miller" <davem@davemloft.net>
Link: http://lkml.kernel.org/r/20130522093409.9084.63554.stgit@mhiramat-M0-7522
[ Minor edits. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/kprobes.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3fed7f0cbcdf..bddf3b201a48 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
 /* Optimization staging list, protected by kprobe_mutex */
 static LIST_HEAD(optimizing_list);
 static LIST_HEAD(unoptimizing_list);
+static LIST_HEAD(freeing_list);
 
 static void kprobe_optimizer(struct work_struct *work);
 static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
@@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void)
  * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
  * if need) kprobes listed on unoptimizing_list.
  */
-static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
+static __kprobes void do_unoptimize_kprobes(void)
 {
 	struct optimized_kprobe *op, *tmp;
 
@@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
 	/* Ditto to do_optimize_kprobes */
 	get_online_cpus();
 	mutex_lock(&text_mutex);
-	arch_unoptimize_kprobes(&unoptimizing_list, free_list);
+	arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
 	/* Loop free_list for disarming */
-	list_for_each_entry_safe(op, tmp, free_list, list) {
+	list_for_each_entry_safe(op, tmp, &freeing_list, list) {
 		/* Disarm probes if marked disabled */
 		if (kprobe_disabled(&op->kp))
 			arch_disarm_kprobe(&op->kp);
@@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
 }
 
 /* Reclaim all kprobes on the free_list */
-static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
+static __kprobes void do_free_cleaned_kprobes(void)
 {
 	struct optimized_kprobe *op, *tmp;
 
-	list_for_each_entry_safe(op, tmp, free_list, list) {
+	list_for_each_entry_safe(op, tmp, &freeing_list, list) {
 		BUG_ON(!kprobe_unused(&op->kp));
 		list_del_init(&op->list);
 		free_aggr_kprobe(&op->kp);
@@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void)
 /* Kprobe jump optimizer */
 static __kprobes void kprobe_optimizer(struct work_struct *work)
 {
-	LIST_HEAD(free_list);
-
 	mutex_lock(&kprobe_mutex);
 	/* Lock modules while optimizing kprobes */
 	mutex_lock(&module_mutex);
@@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
 	 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
 	 * kprobes before waiting for quiesence period.
 	 */
-	do_unoptimize_kprobes(&free_list);
+	do_unoptimize_kprobes();
 
 	/*
 	 * Step 2: Wait for quiesence period to ensure all running interrupts
@@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
 	do_optimize_kprobes();
 
 	/* Step 4: Free cleaned kprobes after quiesence period */
-	do_free_cleaned_kprobes(&free_list);
+	do_free_cleaned_kprobes();
 
 	mutex_unlock(&module_mutex);
 	mutex_unlock(&kprobe_mutex);
@@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
 	if (!list_empty(&op->list))
 		/* Dequeue from the (un)optimization queue */
 		list_del_init(&op->list);
-
 	op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+
+	if (kprobe_unused(p)) {
+		/* Enqueue if it is unused */
+		list_add(&op->list, &freeing_list);
+		/*
+		 * Remove unused probes from the hash list. After waiting
+		 * for synchronization, this probe is reclaimed.
+		 * (reclaiming is done by do_free_cleaned_kprobes().)
+		 */
+		hlist_del_rcu(&op->kp.hlist);
+	}
+
 	/* Don't touch the code, because it is already freed. */
 	arch_remove_optimized_kprobe(op);
 }
-- 
cgit v1.2.3


From 26cb63ad11e04047a64309362674bcbbd6a6f246 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 28 May 2013 10:55:48 +0200
Subject: perf: Fix perf mmap bugs

Vince reported a problem found by his perf specific trinity
fuzzer.

Al noticed 2 problems with perf's mmap():

 - it has issues against fork() since we use vma->vm_mm for accounting.
 - it has an rb refcount leak on double mmap().

We fix the issues against fork() by using VM_DONTCOPY; I don't
think there's code out there that uses this; we didn't hear
about weird accounting problems/crashes. If we do need this to
work, the previously proposed VM_PINNED could make this work.

Aside from the rb reference leak spotted by Al, Vince's example
prog was indeed doing a double mmap() through the use of
perf_event_set_output().

This exposes another problem, since we now have 2 events with
one buffer, the accounting gets screwy because we account per
event. Fix this by making the buffer responsible for its own
accounting.

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Link: http://lkml.kernel.org/r/20130528085548.GA12193@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c     | 37 ++++++++++++++++++++-----------------
 kernel/events/internal.h |  3 +++
 2 files changed, 23 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9dc297faf7c0..ae752cd4a086 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2917,7 +2917,7 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static void ring_buffer_put(struct ring_buffer *rb);
+static bool ring_buffer_put(struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -3582,13 +3582,13 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static void ring_buffer_put(struct ring_buffer *rb)
+static bool ring_buffer_put(struct ring_buffer *rb)
 {
 	struct perf_event *event, *n;
 	unsigned long flags;
 
 	if (!atomic_dec_and_test(&rb->refcount))
-		return;
+		return false;
 
 	spin_lock_irqsave(&rb->event_lock, flags);
 	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
@@ -3598,6 +3598,7 @@ static void ring_buffer_put(struct ring_buffer *rb)
 	spin_unlock_irqrestore(&rb->event_lock, flags);
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
+	return true;
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
@@ -3612,18 +3613,20 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		unsigned long size = perf_data_size(event->rb);
-		struct user_struct *user = event->mmap_user;
 		struct ring_buffer *rb = event->rb;
+		struct user_struct *mmap_user = rb->mmap_user;
+		int mmap_locked = rb->mmap_locked;
+		unsigned long size = perf_data_size(rb);
 
-		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
-		vma->vm_mm->pinned_vm -= event->mmap_locked;
 		rcu_assign_pointer(event->rb, NULL);
 		ring_buffer_detach(event, rb);
 		mutex_unlock(&event->mmap_mutex);
 
-		ring_buffer_put(rb);
-		free_uid(user);
+		if (ring_buffer_put(rb)) {
+			atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+			vma->vm_mm->pinned_vm -= mmap_locked;
+			free_uid(mmap_user);
+		}
 	}
 }
 
@@ -3676,9 +3679,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	WARN_ON_ONCE(event->ctx->parent_ctx);
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages == nr_pages)
-			atomic_inc(&event->rb->refcount);
-		else
+		if (event->rb->nr_pages != nr_pages)
 			ret = -EINVAL;
 		goto unlock;
 	}
@@ -3720,12 +3721,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		ret = -ENOMEM;
 		goto unlock;
 	}
-	rcu_assign_pointer(event->rb, rb);
+
+	rb->mmap_locked = extra;
+	rb->mmap_user = get_current_user();
 
 	atomic_long_add(user_extra, &user->locked_vm);
-	event->mmap_locked = extra;
-	event->mmap_user = get_current_user();
-	vma->vm_mm->pinned_vm += event->mmap_locked;
+	vma->vm_mm->pinned_vm += extra;
+
+	rcu_assign_pointer(event->rb, rb);
 
 	perf_event_update_userpage(event);
 
@@ -3734,7 +3737,7 @@ unlock:
 		atomic_inc(&event->mmap_count);
 	mutex_unlock(&event->mmap_mutex);
 
-	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &perf_mmap_vmops;
 
 	return ret;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index eb675c4d59df..5bc6c8e9b851 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,6 +31,9 @@ struct ring_buffer {
 	spinlock_t			event_lock;
 	struct list_head		event_list;
 
+	int				mmap_locked;
+	struct user_struct		*mmap_user;
+
 	struct perf_event_mmap_page	*user_page;
 	void				*data_pages[0];
 };
-- 
cgit v1.2.3


From 6721cb60022629ae76365551f05d9658b8d14c55 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 23 May 2013 14:21:36 -0400
Subject: ring-buffer: Do not poll non allocated cpu buffers

The tracing infrastructure sets up for possible CPUs, but it uses
the ring buffer polling, it is possible to call the ring buffer
polling code with a CPU that hasn't been allocated. This will cause
a kernel oops when it access a ring buffer cpu buffer that is part
of the possible cpus but hasn't been allocated yet as the CPU has never
been online.

Reported-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Tested-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b59aea2c48c2..e444ff88f0a4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -620,6 +620,9 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
 	if (cpu == RING_BUFFER_ALL_CPUS)
 		work = &buffer->irq_work;
 	else {
+		if (!cpumask_test_cpu(cpu, buffer->cpumask))
+			return -EINVAL;
+
 		cpu_buffer = buffer->buffers[cpu];
 		work = &cpu_buffer->irq_work;
 	}
-- 
cgit v1.2.3


From aa848233f740abbabfa7669daca0ab94aaa37bcd Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Fri, 3 May 2013 23:27:07 +0200
Subject: ntp: Remove unused variable flags in __hardpps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

kernel/time/ntp.c: In function ‘__hardpps’:
kernel/time/ntp.c:877: warning: unused variable ‘flags’

commit a076b2146fabb0894cae5e0189a8ba3f1502d737 ("ntp: Remove ntp_lock,
using the timekeeping locks to protect ntp state") removed its users,
but not the actual variable.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/ntp.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 12ff13a838c6..8f5b3b98577b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -874,7 +874,6 @@ static void hardpps_update_phase(long error)
 void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
 {
 	struct pps_normtime pts_norm, freq_norm;
-	unsigned long flags;
 
 	pts_norm = pps_normalize_ts(*phase_ts);
 
-- 
cgit v1.2.3


From 0d6bd9953f739dad96d9a0de65383e479ab4e10d Mon Sep 17 00:00:00 2001
From: Zoran Markovic <zoran.markovic@linaro.org>
Date: Fri, 17 May 2013 11:24:05 -0700
Subject: timekeeping: Correct run-time detection of persistent_clock.

Since commit 31ade30692dc9680bfc95700d794818fa3f754ac, timekeeping_init()
checks for presence of persistent clock by attempting to read a non-zero
time value. This is an issue on platforms where persistent_clock (instead
is implemented as a free-running counter (instead of an RTC) starting
from zero on each boot and running during suspend. Examples are some ARM
platforms (e.g. PandaBoard).

An attempt to read such a clock during timekeeping_init() may return zero
value and falsely declare persistent clock as missing. Additionally, in
the above case suspend times may be accounted twice (once from
timekeeping_resume() and once from rtc_resume()), resulting in a gradual
drift of system time.

This patch does a run-time correction of the issue by doing the same check
during timekeeping_suspend().

A better long-term solution would have to return error when trying to read
non-existing clock and zero when trying to read an uninitialized clock, but
that would require changing all persistent_clock implementations.

This patch addresses the immediate breakage, for now.

Cc: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Feng Tang <feng.tang@intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Zoran Markovic <zoran.markovic@linaro.org>
[jstultz: Tweaked commit message and subject]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/timekeeping.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 98cd470bbe49..baeeb5c87cf1 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -975,6 +975,14 @@ static int timekeeping_suspend(void)
 
 	read_persistent_clock(&timekeeping_suspend_time);
 
+	/*
+	 * On some systems the persistent_clock can not be detected at
+	 * timekeeping_init by its return value, so if we see a valid
+	 * value returned, update the persistent_clock_exists flag.
+	 */
+	if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
+		persistent_clock_exist = true;
+
 	raw_spin_lock_irqsave(&timekeeper_lock, flags);
 	write_seqcount_begin(&timekeeper_seq);
 	timekeeping_forward_now(tk);
-- 
cgit v1.2.3


From 2a0ff3fbe39bc93f719ff857e5a359d9780579ff Mon Sep 17 00:00:00 2001
From: Jeff Liu <jeff.liu@oracle.com>
Date: Sun, 26 May 2013 21:33:09 +0800
Subject: cgroup: warn about mismatching options of a new mount of an existing
 hierarchy

With the new __DEVEL__sane_behavior mount option was introduced,
if the root cgroup is alive with no xattr function, to mount a
new cgroup with xattr will be rejected in terms of design which
just fine.  However, if the root cgroup does not mounted with
__DEVEL__sane_hehavior, to create a new cgroup with xattr option
will succeed although after that the EA function does not works
as expected but will get ENOTSUPP for setting up attributes under
either cgroup. e.g.

setfattr: /cgroup2/test: Operation not supported

Instead of keeping silence in this case, it's better to drop a log
entry in warning level.  That would be helpful to understand the
reason behind the scene from the user's perspective, and this is
essentially an improvement does not break the backward compatibilities.

With this fix, above mount attemption will keep up works as usual but
the following line cound be found at the system log:

[ ...] cgroup: new mount options do not match the existing superblock

tj: minor formatting / message updates.

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Reported-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
---
 kernel/cgroup.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 31e9ef319070..a7c9e6ddb979 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1686,11 +1686,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 */
 		cgroup_drop_root(opts.new_root);
 
-		if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) &&
-		    root->flags != opts.flags) {
-			pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
-			ret = -EINVAL;
-			goto drop_new_super;
+		if (root->flags != opts.flags) {
+			if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
+				pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
+				ret = -EINVAL;
+				goto drop_new_super;
+			} else {
+				pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
+			}
 		}
 
 		/* no subsys rebinding, so refcounts don't change */
-- 
cgit v1.2.3


From 1bb539ca36e21c2f4fce0865e11df384bc7b7656 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 28 May 2013 14:38:43 -0400
Subject: ftrace: Use the rcu _notrace variants for rcu_dereference_raw() and
 friends

As rcu_dereference_raw() under RCU debug config options can add quite a
bit of checks, and that tracing uses rcu_dereference_raw(), these checks
happen with the function tracer. The function tracer also happens to trace
these debug checks too. This added overhead can livelock the system.

Have the function tracer use the new RCU _notrace equivalents that do
not do the debug checks for RCU.

Link: http://lkml.kernel.org/r/20130528184209.467603904@goodmis.org

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b549b0f5b977..6c508ff33c62 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -120,22 +120,22 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
 
 /*
  * Traverse the ftrace_global_list, invoking all entries.  The reason that we
- * can use rcu_dereference_raw() is that elements removed from this list
+ * can use rcu_dereference_raw_notrace() is that elements removed from this list
  * are simply leaked, so there is no need to interact with a grace-period
- * mechanism.  The rcu_dereference_raw() calls are needed to handle
+ * mechanism.  The rcu_dereference_raw_notrace() calls are needed to handle
  * concurrent insertions into the ftrace_global_list.
  *
  * Silly Alpha and silly pointer-speculation compiler optimizations!
  */
 #define do_for_each_ftrace_op(op, list)			\
-	op = rcu_dereference_raw(list);			\
+	op = rcu_dereference_raw_notrace(list);			\
 	do
 
 /*
  * Optimized for just a single item in the list (as that is the normal case).
  */
 #define while_for_each_ftrace_op(op)				\
-	while (likely(op = rcu_dereference_raw((op)->next)) &&	\
+	while (likely(op = rcu_dereference_raw_notrace((op)->next)) &&	\
 	       unlikely((op) != &ftrace_list_end))
 
 static inline void ftrace_ops_init(struct ftrace_ops *ops)
@@ -779,7 +779,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
 	if (hlist_empty(hhd))
 		return NULL;
 
-	hlist_for_each_entry_rcu(rec, hhd, node) {
+	hlist_for_each_entry_rcu_notrace(rec, hhd, node) {
 		if (rec->ip == ip)
 			return rec;
 	}
@@ -1165,7 +1165,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
 
 	hhd = &hash->buckets[key];
 
-	hlist_for_each_entry_rcu(entry, hhd, hlist) {
+	hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) {
 		if (entry->ip == ip)
 			return entry;
 	}
@@ -1422,8 +1422,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
 	struct ftrace_hash *notrace_hash;
 	int ret;
 
-	filter_hash = rcu_dereference_raw(ops->filter_hash);
-	notrace_hash = rcu_dereference_raw(ops->notrace_hash);
+	filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
+	notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
 
 	if ((ftrace_hash_empty(filter_hash) ||
 	     ftrace_lookup_ip(filter_hash, ip)) &&
@@ -2920,7 +2920,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
 	 * on the hash. rcu_read_lock is too dangerous here.
 	 */
 	preempt_disable_notrace();
-	hlist_for_each_entry_rcu(entry, hhd, node) {
+	hlist_for_each_entry_rcu_notrace(entry, hhd, node) {
 		if (entry->ip == ip)
 			entry->ops->func(ip, parent_ip, &entry->data);
 	}
-- 
cgit v1.2.3


From 0184d50f9fd17658c232d6ee6d465a87f989d706 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 29 May 2013 15:56:49 -0400
Subject: tracing: Fix bad parameter passed in branch selftest

The branch selftest calls trace_test_buffer(), but with the new code
it expects the first parameter to be a pointer to a struct trace_buffer.
All self tests were changed but the branch selftest was missed.

This caused either a crash or failed test when the branch selftest was
enabled.

Link: http://lkml.kernel.org/r/20130529141333.GA24064@localhost

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_selftest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 55e2cf66967b..2901e3b88590 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1159,7 +1159,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 	/* stop the tracing. */
 	tracing_stop();
 	/* check the trace buffer */
-	ret = trace_test_buffer(tr, &count);
+	ret = trace_test_buffer(&tr->trace_buffer, &count);
 	trace->reset(tr);
 	tracing_start();
 
-- 
cgit v1.2.3


From 45eacc692771bd2b1ea3d384e6345cab3da10861 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 15 May 2013 22:16:32 +0200
Subject: vtime: Use consistent clocks among nohz accounting

While computing the cputime delta of dynticks CPUs,
we are mixing up clocks of differents natures:

* local_clock() which takes care of unstable clock
sources and fix these if needed.

* sched_clock() which is the weaker version of
local_clock(). It doesn't compute any fixup in case
of unstable source.

If the clock source is stable, those two clocks are the
same and we can safely compute the difference against
two random points.

Otherwise it results in random deltas as sched_clock()
can randomly drift away, back or forward, from local_clock().

As a consequence, some strange behaviour with unstable tsc
has been observed such as non progressing constant zero cputime.
(The 'top' command showing no load).

Fix this by only using local_clock(), or its irq safe/remote
equivalent, in vtime code.

Reported-by: Mike Galbraith <efault@gmx.de>
Suggested-by: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Li Zhong <zhong@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c    | 2 +-
 kernel/sched/cputime.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 58453b8272fd..e1a27f918723 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4745,7 +4745,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 */
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
-	vtime_init_idle(idle);
+	vtime_init_idle(idle, cpu);
 #if defined(CONFIG_SMP)
 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index cc2dc3eea8a3..b5ccba22603b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -747,17 +747,17 @@ void arch_vtime_task_switch(struct task_struct *prev)
 
 	write_seqlock(&current->vtime_seqlock);
 	current->vtime_snap_whence = VTIME_SYS;
-	current->vtime_snap = sched_clock();
+	current->vtime_snap = sched_clock_cpu(smp_processor_id());
 	write_sequnlock(&current->vtime_seqlock);
 }
 
-void vtime_init_idle(struct task_struct *t)
+void vtime_init_idle(struct task_struct *t, int cpu)
 {
 	unsigned long flags;
 
 	write_seqlock_irqsave(&t->vtime_seqlock, flags);
 	t->vtime_snap_whence = VTIME_SYS;
-	t->vtime_snap = sched_clock();
+	t->vtime_snap = sched_clock_cpu(cpu);
 	write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
 }
 
-- 
cgit v1.2.3


From 521921bad1192fb1b8f9b6a5aa673635848b8b5f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 16 May 2013 01:21:38 +0200
Subject: kvm: Move guest entry/exit APIs to context_tracking

The kvm_host.h header file doesn't handle well
inclusion when archs don't support KVM.

This results in build crashes for such archs when they
want to implement context tracking because this subsystem
includes kvm_host.h in order to implement the
guest_enter/exit APIs but it doesn't handle KVM off case.

To fix this, move the guest_enter()/guest_exit()
declarations and generic implementation to the context
tracking headers. These generic APIs actually belong to
this subsystem, besides other domains boundary tracking
like user_enter() et al.

KVM now properly becomes a user of this library, not the
other buggy way around.

Reported-by: Kevin Hilman <khilman@linaro.org>
Reviewed-by: Kevin Hilman <khilman@linaro.org>
Tested-by: Kevin Hilman <khilman@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/context_tracking.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 65349f07b878..85bdde1137eb 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/context_tracking.h>
-#include <linux/kvm_host.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/hardirq.h>
-- 
cgit v1.2.3


From 1a7f829f094dd7951e7d46c571a18080e455a436 Mon Sep 17 00:00:00 2001
From: Li Zhong <zhong@linux.vnet.ibm.com>
Date: Fri, 17 May 2013 16:44:04 +0800
Subject: nohz: Fix notifier return val that enforce timekeeping

In tick_nohz_cpu_down_callback() if the cpu is the one handling
timekeeping, we must return something that stops the CPU_DOWN_PREPARE
notifiers and then start notify CPU_DOWN_FAILED on the already called
notifier call backs.

However traditional errno values are not handled by the notifier unless
these are encapsulated using errno_to_notifier().

Hence the current -EINVAL is misinterpreted and converted to junk after
notifier_to_errno(), leaving the notifier subsystem to random behaviour
such as eventually allowing the cpu to go down.

Fix this by using the standard NOTIFY_BAD instead.

Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
Reviewed-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4208138fbf4..0cf1c1453181 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
 		 * we can't safely shutdown that CPU.
 		 */
 		if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
-			return -EINVAL;
+			return NOTIFY_BAD;
 		break;
 	}
 	return NOTIFY_OK;
-- 
cgit v1.2.3


From f5d00c1f9adb350c24c5301600f7bf2da99b66de Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Tue, 28 May 2013 15:29:03 +0200
Subject: tick: Remove useless timekeeping duty attribution to broadcast source

Since 7300711e ("clockevents: broadcast fixup possible waiters"),
the timekeeping duty is assigned to the CPU that handles the tick
broadcast clock device by the time it is set in one shot mode.

This is an issue in full dynticks mode where the timekeeping duty
must stay handled by the boot CPU for now. Otherwise it prevents
secondary CPUs from offlining and this breaks
suspend/shutdown/reboot/...

As it appears there is no reason for this timekeeping duty to be
moved to the broadcast CPU, besides nothing prevent it from being
later re-assigned to another target, let's simply remove it.

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-broadcast.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 0c739423b0f9..b4c245580b79 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -698,10 +698,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 
 		bc->event_handler = tick_handle_oneshot_broadcast;
 
-		/* Take the do_timer update */
-		if (!tick_nohz_full_cpu(cpu))
-			tick_do_timer_cpu = cpu;
-
 		/*
 		 * We must be careful here. There might be other CPUs
 		 * waiting for periodic broadcast. We need to set the
-- 
cgit v1.2.3


From f17a5194859a82afe4164e938b92035b86c55794 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 30 May 2013 21:10:37 -0400
Subject: tracing: Use current_uid() for critical time tracing

The irqsoff tracer records the max time that interrupts are disabled.
There are hooks in the assembly code that calls back into the tracer when
interrupts are disabled or enabled.

When they are enabled, the tracer checks if the amount of time they
were disabled is larger than the previous recorded max interrupts off
time. If it is, it creates a snapshot of the currently running trace
to store where the last largest interrupts off time was held and how
it happened.

During testing, this RCU lockdep dump appeared:

[ 1257.829021] ===============================
[ 1257.829021] [ INFO: suspicious RCU usage. ]
[ 1257.829021] 3.10.0-rc1-test+ #171 Tainted: G        W
[ 1257.829021] -------------------------------
[ 1257.829021] /home/rostedt/work/git/linux-trace.git/include/linux/rcupdate.h:780 rcu_read_lock() used illegally while idle!
[ 1257.829021]
[ 1257.829021] other info that might help us debug this:
[ 1257.829021]
[ 1257.829021]
[ 1257.829021] RCU used illegally from idle CPU!
[ 1257.829021] rcu_scheduler_active = 1, debug_locks = 0
[ 1257.829021] RCU used illegally from extended quiescent state!
[ 1257.829021] 2 locks held by trace-cmd/4831:
[ 1257.829021]  #0:  (max_trace_lock){......}, at: [<ffffffff810e2b77>] stop_critical_timing+0x1a3/0x209
[ 1257.829021]  #1:  (rcu_read_lock){.+.+..}, at: [<ffffffff810dae5a>] __update_max_tr+0x88/0x1ee
[ 1257.829021]
[ 1257.829021] stack backtrace:
[ 1257.829021] CPU: 3 PID: 4831 Comm: trace-cmd Tainted: G        W    3.10.0-rc1-test+ #171
[ 1257.829021] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007
[ 1257.829021]  0000000000000001 ffff880065f49da8 ffffffff8153dd2b ffff880065f49dd8
[ 1257.829021]  ffffffff81092a00 ffff88006bd78680 ffff88007add7500 0000000000000003
[ 1257.829021]  ffff88006bd78680 ffff880065f49e18 ffffffff810daebf ffffffff810dae5a
[ 1257.829021] Call Trace:
[ 1257.829021]  [<ffffffff8153dd2b>] dump_stack+0x19/0x1b
[ 1257.829021]  [<ffffffff81092a00>] lockdep_rcu_suspicious+0x109/0x112
[ 1257.829021]  [<ffffffff810daebf>] __update_max_tr+0xed/0x1ee
[ 1257.829021]  [<ffffffff810dae5a>] ? __update_max_tr+0x88/0x1ee
[ 1257.829021]  [<ffffffff811002b9>] ? user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff810dbf85>] update_max_tr_single+0x11d/0x12d
[ 1257.829021]  [<ffffffff811002b9>] ? user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff810e2b15>] stop_critical_timing+0x141/0x209
[ 1257.829021]  [<ffffffff8109569a>] ? trace_hardirqs_on+0xd/0xf
[ 1257.829021]  [<ffffffff811002b9>] ? user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff810e3057>] time_hardirqs_on+0x2a/0x2f
[ 1257.829021]  [<ffffffff811002b9>] ? user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff8109550c>] trace_hardirqs_on_caller+0x16/0x197
[ 1257.829021]  [<ffffffff8109569a>] trace_hardirqs_on+0xd/0xf
[ 1257.829021]  [<ffffffff811002b9>] user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff810029b4>] do_notify_resume+0x92/0x97
[ 1257.829021]  [<ffffffff8154bdca>] int_signal+0x12/0x17

What happened was entering into the user code, the interrupts were enabled
and a max interrupts off was recorded. The trace buffer was saved along with
various information about the task: comm, pid, uid, priority, etc.

The uid is recorded with task_uid(tsk). But this is a macro that uses rcu_read_lock()
to retrieve the data, and this happened to happen where RCU is blind (user_enter).

As only the preempt and irqs off tracers can have this happen, and they both
only have the tsk == current, if tsk == current, use current_uid() instead of
task_uid(), as current_uid() does not use RCU as only current can change its uid.

This fixes the RCU suspicious splat.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4d79485b3237..1a41023a1f88 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -843,7 +843,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
 	max_data->pid = tsk->pid;
-	max_data->uid = task_uid(tsk);
+	/*
+	 * If tsk == current, then use current_uid(), as that does not use
+	 * RCU. The irq tracer can be called out of RCU scope.
+	 */
+	if (tsk == current)
+		max_data->uid = current_uid();
+	else
+		max_data->uid = task_uid(tsk);
+
 	max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
 	max_data->policy = tsk->policy;
 	max_data->rt_priority = tsk->rt_priority;
-- 
cgit v1.2.3


From 346dbb79ea0118ebb0df372b35cab9d5805216cd Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 25 Apr 2013 19:28:54 +0200
Subject: irqdomain: export irq_domain_add_simple

All other irq_domain_add_* functions are exported already, and apparently
this one got left out by mistake, which causes build errors for ARM
allmodconfig kernels:

ERROR: "irq_domain_add_simple" [drivers/gpio/gpio-rcar.ko] undefined!
ERROR: "irq_domain_add_simple" [drivers/gpio/gpio-em.ko] undefined!

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Grant Likely <grant.likely@linaro.org>
---
 kernel/irq/irqdomain.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 5a83dde8ca0c..d1adaedb435f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -191,6 +191,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
 	/* A linear domain is the default */
 	return irq_domain_add_linear(of_node, size, ops, host_data);
 }
+EXPORT_SYMBOL_GPL(irq_domain_add_simple);
 
 /**
  * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
-- 
cgit v1.2.3


From 275e31b10ce20613aedceaa5160129c64b260a98 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Tue, 14 May 2013 19:02:45 +0800
Subject: kernel/irq/irqdomain.c: before use 'irq_data', need check it whether
 valid.

Since irq_data may be NULL, if so, we WARN_ON(), and continue, 'hwirq'
which related with 'irq_data' has to initialize later, or it will cause
issue.

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Signed-off-by: Grant Likely <grant.likely@linaro.org>
---
 kernel/irq/irqdomain.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d1adaedb435f..8c4c8ea6a205 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -398,11 +398,12 @@ static void irq_domain_disassociate_many(struct irq_domain *domain,
 	while (count--) {
 		int irq = irq_base + count;
 		struct irq_data *irq_data = irq_get_irq_data(irq);
-		irq_hw_number_t hwirq = irq_data->hwirq;
+		irq_hw_number_t hwirq;
 
 		if (WARN_ON(!irq_data || irq_data->domain != domain))
 			continue;
 
+		hwirq = irq_data->hwirq;
 		irq_set_status_flags(irq, IRQ_NOREQUEST);
 
 		/* remove chip and handler */
-- 
cgit v1.2.3


From 94a63da0ac1a67bfb8b30aec1086523c5031ea5a Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 6 Jun 2013 12:10:23 +0100
Subject: irqdomain: document the simple domain first_irq

The first_irq needs to be zero to get a linear domain and that
comes with special semantics. We want to simplify this going
forward but some documentation never hurts.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Grant Likely <grant.likely@linaro.org>
---
 kernel/irq/irqdomain.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8c4c8ea6a205..54a4d5223238 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -143,7 +143,10 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
  * irq_domain_add_simple() - Allocate and register a simple irq_domain.
  * @of_node: pointer to interrupt controller's device tree node.
  * @size: total number of irqs in mapping
- * @first_irq: first number of irq block assigned to the domain
+ * @first_irq: first number of irq block assigned to the domain,
+ *	pass zero to assign irqs on-the-fly. This will result in a
+ *	linear IRQ domain so it is important to use irq_create_mapping()
+ *	for each used IRQ, especially when SPARSE_IRQ is enabled.
  * @ops: map/unmap domain callbacks
  * @host_data: Controller private data pointer
  *
-- 
cgit v1.2.3


From 016a8d5be6ddcc72ef0432d82d9f6fa34f61b907 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 28 May 2013 17:32:53 -0400
Subject: rcu: Don't call wakeup() with rcu_node structure ->lock held

This commit fixes a lockdep-detected deadlock by moving a wake_up()
call out from a rnp->lock critical section.  Please see below for
the long version of this story.

On Tue, 2013-05-28 at 16:13 -0400, Dave Jones wrote:

> [12572.705832] ======================================================
> [12572.750317] [ INFO: possible circular locking dependency detected ]
> [12572.796978] 3.10.0-rc3+ #39 Not tainted
> [12572.833381] -------------------------------------------------------
> [12572.862233] trinity-child17/31341 is trying to acquire lock:
> [12572.870390]  (rcu_node_0){..-.-.}, at: [<ffffffff811054ff>] rcu_read_unlock_special+0x9f/0x4c0
> [12572.878859]
> but task is already holding lock:
> [12572.894894]  (&ctx->lock){-.-...}, at: [<ffffffff811390ed>] perf_lock_task_context+0x7d/0x2d0
> [12572.903381]
> which lock already depends on the new lock.
>
> [12572.927541]
> the existing dependency chain (in reverse order) is:
> [12572.943736]
> -> #4 (&ctx->lock){-.-...}:
> [12572.960032]        [<ffffffff810b9851>] lock_acquire+0x91/0x1f0
> [12572.968337]        [<ffffffff816ebc90>] _raw_spin_lock+0x40/0x80
> [12572.976633]        [<ffffffff8113c987>] __perf_event_task_sched_out+0x2e7/0x5e0
> [12572.984969]        [<ffffffff81088953>] perf_event_task_sched_out+0x93/0xa0
> [12572.993326]        [<ffffffff816ea0bf>] __schedule+0x2cf/0x9c0
> [12573.001652]        [<ffffffff816eacfe>] schedule_user+0x2e/0x70
> [12573.009998]        [<ffffffff816ecd64>] retint_careful+0x12/0x2e
> [12573.018321]
> -> #3 (&rq->lock){-.-.-.}:
> [12573.034628]        [<ffffffff810b9851>] lock_acquire+0x91/0x1f0
> [12573.042930]        [<ffffffff816ebc90>] _raw_spin_lock+0x40/0x80
> [12573.051248]        [<ffffffff8108e6a7>] wake_up_new_task+0xb7/0x260
> [12573.059579]        [<ffffffff810492f5>] do_fork+0x105/0x470
> [12573.067880]        [<ffffffff81049686>] kernel_thread+0x26/0x30
> [12573.076202]        [<ffffffff816cee63>] rest_init+0x23/0x140
> [12573.084508]        [<ffffffff81ed8e1f>] start_kernel+0x3f1/0x3fe
> [12573.092852]        [<ffffffff81ed856f>] x86_64_start_reservations+0x2a/0x2c
> [12573.101233]        [<ffffffff81ed863d>] x86_64_start_kernel+0xcc/0xcf
> [12573.109528]
> -> #2 (&p->pi_lock){-.-.-.}:
> [12573.125675]        [<ffffffff810b9851>] lock_acquire+0x91/0x1f0
> [12573.133829]        [<ffffffff816ebe9b>] _raw_spin_lock_irqsave+0x4b/0x90
> [12573.141964]        [<ffffffff8108e881>] try_to_wake_up+0x31/0x320
> [12573.150065]        [<ffffffff8108ebe2>] default_wake_function+0x12/0x20
> [12573.158151]        [<ffffffff8107bbf8>] autoremove_wake_function+0x18/0x40
> [12573.166195]        [<ffffffff81085398>] __wake_up_common+0x58/0x90
> [12573.174215]        [<ffffffff81086909>] __wake_up+0x39/0x50
> [12573.182146]        [<ffffffff810fc3da>] rcu_start_gp_advanced.isra.11+0x4a/0x50
> [12573.190119]        [<ffffffff810fdb09>] rcu_start_future_gp+0x1c9/0x1f0
> [12573.198023]        [<ffffffff810fe2c4>] rcu_nocb_kthread+0x114/0x930
> [12573.205860]        [<ffffffff8107a91d>] kthread+0xed/0x100
> [12573.213656]        [<ffffffff816f4b1c>] ret_from_fork+0x7c/0xb0
> [12573.221379]
> -> #1 (&rsp->gp_wq){..-.-.}:
> [12573.236329]        [<ffffffff810b9851>] lock_acquire+0x91/0x1f0
> [12573.243783]        [<ffffffff816ebe9b>] _raw_spin_lock_irqsave+0x4b/0x90
> [12573.251178]        [<ffffffff810868f3>] __wake_up+0x23/0x50
> [12573.258505]        [<ffffffff810fc3da>] rcu_start_gp_advanced.isra.11+0x4a/0x50
> [12573.265891]        [<ffffffff810fdb09>] rcu_start_future_gp+0x1c9/0x1f0
> [12573.273248]        [<ffffffff810fe2c4>] rcu_nocb_kthread+0x114/0x930
> [12573.280564]        [<ffffffff8107a91d>] kthread+0xed/0x100
> [12573.287807]        [<ffffffff816f4b1c>] ret_from_fork+0x7c/0xb0

Notice the above call chain.

rcu_start_future_gp() is called with the rnp->lock held. Then it calls
rcu_start_gp_advance, which does a wakeup.

You can't do wakeups while holding the rnp->lock, as that would mean
that you could not do a rcu_read_unlock() while holding the rq lock, or
any lock that was taken while holding the rq lock. This is because...
(See below).

> [12573.295067]
> -> #0 (rcu_node_0){..-.-.}:
> [12573.309293]        [<ffffffff810b8d36>] __lock_acquire+0x1786/0x1af0
> [12573.316568]        [<ffffffff810b9851>] lock_acquire+0x91/0x1f0
> [12573.323825]        [<ffffffff816ebc90>] _raw_spin_lock+0x40/0x80
> [12573.331081]        [<ffffffff811054ff>] rcu_read_unlock_special+0x9f/0x4c0
> [12573.338377]        [<ffffffff810760a6>] __rcu_read_unlock+0x96/0xa0
> [12573.345648]        [<ffffffff811391b3>] perf_lock_task_context+0x143/0x2d0
> [12573.352942]        [<ffffffff8113938e>] find_get_context+0x4e/0x1f0
> [12573.360211]        [<ffffffff811403f4>] SYSC_perf_event_open+0x514/0xbd0
> [12573.367514]        [<ffffffff81140e49>] SyS_perf_event_open+0x9/0x10
> [12573.374816]        [<ffffffff816f4dd4>] tracesys+0xdd/0xe2

Notice the above trace.

perf took its own ctx->lock, which can be taken while holding the rq
lock. While holding this lock, it did a rcu_read_unlock(). The
perf_lock_task_context() basically looks like:

rcu_read_lock();
raw_spin_lock(ctx->lock);
rcu_read_unlock();

Now, what looks to have happened, is that we scheduled after taking that
first rcu_read_lock() but before taking the spin lock. When we scheduled
back in and took the ctx->lock, the following rcu_read_unlock()
triggered the "special" code.

The rcu_read_unlock_special() takes the rnp->lock, which gives us a
possible deadlock scenario.

	CPU0		CPU1		CPU2
	----		----		----

				     rcu_nocb_kthread()
    lock(rq->lock);
		    lock(ctx->lock);
				     lock(rnp->lock);

				     wake_up();

				     lock(rq->lock);

		    rcu_read_unlock();

		    rcu_read_unlock_special();

		    lock(rnp->lock);
    lock(ctx->lock);

**** DEADLOCK ****

> [12573.382068]
> other info that might help us debug this:
>
> [12573.403229] Chain exists of:
>   rcu_node_0 --> &rq->lock --> &ctx->lock
>
> [12573.424471]  Possible unsafe locking scenario:
>
> [12573.438499]        CPU0                    CPU1
> [12573.445599]        ----                    ----
> [12573.452691]   lock(&ctx->lock);
> [12573.459799]                                lock(&rq->lock);
> [12573.467010]                                lock(&ctx->lock);
> [12573.474192]   lock(rcu_node_0);
> [12573.481262]
>  *** DEADLOCK ***
>
> [12573.501931] 1 lock held by trinity-child17/31341:
> [12573.508990]  #0:  (&ctx->lock){-.-...}, at: [<ffffffff811390ed>] perf_lock_task_context+0x7d/0x2d0
> [12573.516475]
> stack backtrace:
> [12573.530395] CPU: 1 PID: 31341 Comm: trinity-child17 Not tainted 3.10.0-rc3+ #39
> [12573.545357]  ffffffff825b4f90 ffff880219f1dbc0 ffffffff816e375b ffff880219f1dc00
> [12573.552868]  ffffffff816dfa5d ffff880219f1dc50 ffff88023ce4d1f8 ffff88023ce4ca40
> [12573.560353]  0000000000000001 0000000000000001 ffff88023ce4d1f8 ffff880219f1dcc0
> [12573.567856] Call Trace:
> [12573.575011]  [<ffffffff816e375b>] dump_stack+0x19/0x1b
> [12573.582284]  [<ffffffff816dfa5d>] print_circular_bug+0x200/0x20f
> [12573.589637]  [<ffffffff810b8d36>] __lock_acquire+0x1786/0x1af0
> [12573.596982]  [<ffffffff810918f5>] ? sched_clock_cpu+0xb5/0x100
> [12573.604344]  [<ffffffff810b9851>] lock_acquire+0x91/0x1f0
> [12573.611652]  [<ffffffff811054ff>] ? rcu_read_unlock_special+0x9f/0x4c0
> [12573.619030]  [<ffffffff816ebc90>] _raw_spin_lock+0x40/0x80
> [12573.626331]  [<ffffffff811054ff>] ? rcu_read_unlock_special+0x9f/0x4c0
> [12573.633671]  [<ffffffff811054ff>] rcu_read_unlock_special+0x9f/0x4c0
> [12573.640992]  [<ffffffff811390ed>] ? perf_lock_task_context+0x7d/0x2d0
> [12573.648330]  [<ffffffff810b429e>] ? put_lock_stats.isra.29+0xe/0x40
> [12573.655662]  [<ffffffff813095a0>] ? delay_tsc+0x90/0xe0
> [12573.662964]  [<ffffffff810760a6>] __rcu_read_unlock+0x96/0xa0
> [12573.670276]  [<ffffffff811391b3>] perf_lock_task_context+0x143/0x2d0
> [12573.677622]  [<ffffffff81139070>] ? __perf_event_enable+0x370/0x370
> [12573.684981]  [<ffffffff8113938e>] find_get_context+0x4e/0x1f0
> [12573.692358]  [<ffffffff811403f4>] SYSC_perf_event_open+0x514/0xbd0
> [12573.699753]  [<ffffffff8108cd9d>] ? get_parent_ip+0xd/0x50
> [12573.707135]  [<ffffffff810b71fd>] ? trace_hardirqs_on_caller+0xfd/0x1c0
> [12573.714599]  [<ffffffff81140e49>] SyS_perf_event_open+0x9/0x10
> [12573.721996]  [<ffffffff816f4dd4>] tracesys+0xdd/0xe2

This commit delays the wakeup via irq_work(), which is what
perf and ftrace use to perform wakeups in critical sections.

Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 17 +++++++++++++++--
 kernel/rcutree.h |  2 ++
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 16ea67925015..b61d20c5ee7b 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1613,6 +1613,14 @@ static int __noreturn rcu_gp_kthread(void *arg)
 	}
 }
 
+static void rsp_wakeup(struct irq_work *work)
+{
+	struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
+
+	/* Wake up rcu_gp_kthread() to start the grace period. */
+	wake_up(&rsp->gp_wq);
+}
+
 /*
  * Start a new RCU grace period if warranted, re-initializing the hierarchy
  * in preparation for detecting the next grace period.  The caller must hold
@@ -1637,8 +1645,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 	}
 	rsp->gp_flags = RCU_GP_FLAG_INIT;
 
-	/* Wake up rcu_gp_kthread() to start the grace period. */
-	wake_up(&rsp->gp_wq);
+	/*
+	 * We can't do wakeups while holding the rnp->lock, as that
+	 * could cause possible deadlocks with the rq->lock. Deter
+	 * the wakeup to interrupt context.
+	 */
+	irq_work_queue(&rsp->wakeup_work);
 }
 
 /*
@@ -3235,6 +3247,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 
 	rsp->rda = rda;
 	init_waitqueue_head(&rsp->gp_wq);
+	init_irq_work(&rsp->wakeup_work, rsp_wakeup);
 	rnp = rsp->level[rcu_num_lvls - 1];
 	for_each_possible_cpu(i) {
 		while (i > rnp->grphi)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index da77a8f57ff9..4df503470e42 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -27,6 +27,7 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
+#include <linux/irq_work.h>
 
 /*
  * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -442,6 +443,7 @@ struct rcu_state {
 	char *name;				/* Name of structure. */
 	char abbr;				/* Abbreviated name. */
 	struct list_head flavors;		/* List of RCU flavors. */
+	struct irq_work wakeup_work;		/* Postponed wakeups */
 };
 
 /* Values for rcu_state structure's gp_flags field. */
-- 
cgit v1.2.3


From 971394f389992f8462c4e5ae0e3b49a10a9534a3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 2 Jun 2013 07:13:57 -0700
Subject: rcu: Fix deadlock with CPU hotplug, RCU GP init, and timer migration

In Steven Rostedt's words:

> I've been debugging the last couple of days why my tests have been
> locking up. One of my tracing tests, runs all available tracers. The
> lockup always happened with the mmiotrace, which is used to trace
> interactions between priority drivers and the kernel. But to do this
> easily, when the tracer gets registered, it disables all but the boot
> CPUs. The lockup always happened after it got done disabling the CPUs.
>
> Then I decided to try this:
>
> while :; do
> 	for i in 1 2 3; do
> 		echo 0 > /sys/devices/system/cpu/cpu$i/online
> 	done
> 	for i in 1 2 3; do
> 		echo 1 > /sys/devices/system/cpu/cpu$i/online
> 	done
> done
>
> Well, sure enough, that locked up too, with the same users. Doing a
> sysrq-w (showing all blocked tasks):
>
> [ 2991.344562]   task                        PC stack   pid father
> [ 2991.344562] rcu_preempt     D ffff88007986fdf8     0    10      2 0x00000000
> [ 2991.344562]  ffff88007986fc98 0000000000000002 ffff88007986fc48 0000000000000908
> [ 2991.344562]  ffff88007986c280 ffff88007986ffd8 ffff88007986ffd8 00000000001d3c80
> [ 2991.344562]  ffff880079248a40 ffff88007986c280 0000000000000000 00000000fffd4295
> [ 2991.344562] Call Trace:
> [ 2991.344562]  [<ffffffff815437ba>] schedule+0x64/0x66
> [ 2991.344562]  [<ffffffff81541750>] schedule_timeout+0xbc/0xf9
> [ 2991.344562]  [<ffffffff8154bec0>] ? ftrace_call+0x5/0x2f
> [ 2991.344562]  [<ffffffff81049513>] ? cascade+0xa8/0xa8
> [ 2991.344562]  [<ffffffff815417ab>] schedule_timeout_uninterruptible+0x1e/0x20
> [ 2991.344562]  [<ffffffff810c980c>] rcu_gp_kthread+0x502/0x94b
> [ 2991.344562]  [<ffffffff81062791>] ? __init_waitqueue_head+0x50/0x50
> [ 2991.344562]  [<ffffffff810c930a>] ? rcu_gp_fqs+0x64/0x64
> [ 2991.344562]  [<ffffffff81061cdb>] kthread+0xb1/0xb9
> [ 2991.344562]  [<ffffffff81091e31>] ? lock_release_holdtime.part.23+0x4e/0x55
> [ 2991.344562]  [<ffffffff81061c2a>] ? __init_kthread_worker+0x58/0x58
> [ 2991.344562]  [<ffffffff8154c1dc>] ret_from_fork+0x7c/0xb0
> [ 2991.344562]  [<ffffffff81061c2a>] ? __init_kthread_worker+0x58/0x58
> [ 2991.344562] kworker/0:1     D ffffffff81a30680     0    47      2 0x00000000
> [ 2991.344562] Workqueue: events cpuset_hotplug_workfn
> [ 2991.344562]  ffff880078dbbb58 0000000000000002 0000000000000006 00000000000000d8
> [ 2991.344562]  ffff880078db8100 ffff880078dbbfd8 ffff880078dbbfd8 00000000001d3c80
> [ 2991.344562]  ffff8800779ca5c0 ffff880078db8100 ffffffff81541fcf 0000000000000000
> [ 2991.344562] Call Trace:
> [ 2991.344562]  [<ffffffff81541fcf>] ? __mutex_lock_common+0x3d4/0x609
> [ 2991.344562]  [<ffffffff815437ba>] schedule+0x64/0x66
> [ 2991.344562]  [<ffffffff81543a39>] schedule_preempt_disabled+0x18/0x24
> [ 2991.344562]  [<ffffffff81541fcf>] __mutex_lock_common+0x3d4/0x609
> [ 2991.344562]  [<ffffffff8103d11b>] ? get_online_cpus+0x3c/0x50
> [ 2991.344562]  [<ffffffff8103d11b>] ? get_online_cpus+0x3c/0x50
> [ 2991.344562]  [<ffffffff815422ff>] mutex_lock_nested+0x3b/0x40
> [ 2991.344562]  [<ffffffff8103d11b>] get_online_cpus+0x3c/0x50
> [ 2991.344562]  [<ffffffff810af7e6>] rebuild_sched_domains_locked+0x6e/0x3a8
> [ 2991.344562]  [<ffffffff810b0ec6>] rebuild_sched_domains+0x1c/0x2a
> [ 2991.344562]  [<ffffffff810b109b>] cpuset_hotplug_workfn+0x1c7/0x1d3
> [ 2991.344562]  [<ffffffff810b0ed9>] ? cpuset_hotplug_workfn+0x5/0x1d3
> [ 2991.344562]  [<ffffffff81058e07>] process_one_work+0x2d4/0x4d1
> [ 2991.344562]  [<ffffffff81058d3a>] ? process_one_work+0x207/0x4d1
> [ 2991.344562]  [<ffffffff8105964c>] worker_thread+0x2e7/0x3b5
> [ 2991.344562]  [<ffffffff81059365>] ? rescuer_thread+0x332/0x332
> [ 2991.344562]  [<ffffffff81061cdb>] kthread+0xb1/0xb9
> [ 2991.344562]  [<ffffffff81061c2a>] ? __init_kthread_worker+0x58/0x58
> [ 2991.344562]  [<ffffffff8154c1dc>] ret_from_fork+0x7c/0xb0
> [ 2991.344562]  [<ffffffff81061c2a>] ? __init_kthread_worker+0x58/0x58
> [ 2991.344562] bash            D ffffffff81a4aa80     0  2618   2612 0x10000000
> [ 2991.344562]  ffff8800379abb58 0000000000000002 0000000000000006 0000000000000c2c
> [ 2991.344562]  ffff880077fea140 ffff8800379abfd8 ffff8800379abfd8 00000000001d3c80
> [ 2991.344562]  ffff8800779ca5c0 ffff880077fea140 ffffffff81541fcf 0000000000000000
> [ 2991.344562] Call Trace:
> [ 2991.344562]  [<ffffffff81541fcf>] ? __mutex_lock_common+0x3d4/0x609
> [ 2991.344562]  [<ffffffff815437ba>] schedule+0x64/0x66
> [ 2991.344562]  [<ffffffff81543a39>] schedule_preempt_disabled+0x18/0x24
> [ 2991.344562]  [<ffffffff81541fcf>] __mutex_lock_common+0x3d4/0x609
> [ 2991.344562]  [<ffffffff81530078>] ? rcu_cpu_notify+0x2f5/0x86e
> [ 2991.344562]  [<ffffffff81530078>] ? rcu_cpu_notify+0x2f5/0x86e
> [ 2991.344562]  [<ffffffff815422ff>] mutex_lock_nested+0x3b/0x40
> [ 2991.344562]  [<ffffffff81530078>] rcu_cpu_notify+0x2f5/0x86e
> [ 2991.344562]  [<ffffffff81091c99>] ? __lock_is_held+0x32/0x53
> [ 2991.344562]  [<ffffffff81548912>] notifier_call_chain+0x6b/0x98
> [ 2991.344562]  [<ffffffff810671fd>] __raw_notifier_call_chain+0xe/0x10
> [ 2991.344562]  [<ffffffff8103cf64>] __cpu_notify+0x20/0x32
> [ 2991.344562]  [<ffffffff8103cf8d>] cpu_notify_nofail+0x17/0x36
> [ 2991.344562]  [<ffffffff815225de>] _cpu_down+0x154/0x259
> [ 2991.344562]  [<ffffffff81522710>] cpu_down+0x2d/0x3a
> [ 2991.344562]  [<ffffffff81526351>] store_online+0x4e/0xe7
> [ 2991.344562]  [<ffffffff8134d764>] dev_attr_store+0x20/0x22
> [ 2991.344562]  [<ffffffff811b3c5f>] sysfs_write_file+0x108/0x144
> [ 2991.344562]  [<ffffffff8114c5ef>] vfs_write+0xfd/0x158
> [ 2991.344562]  [<ffffffff8114c928>] SyS_write+0x5c/0x83
> [ 2991.344562]  [<ffffffff8154c494>] tracesys+0xdd/0xe2
>
> As well as held locks:
>
> [ 3034.728033] Showing all locks held in the system:
> [ 3034.728033] 1 lock held by rcu_preempt/10:
> [ 3034.728033]  #0:  (rcu_preempt_state.onoff_mutex){+.+...}, at: [<ffffffff810c9471>] rcu_gp_kthread+0x167/0x94b
> [ 3034.728033] 4 locks held by kworker/0:1/47:
> [ 3034.728033]  #0:  (events){.+.+.+}, at: [<ffffffff81058d3a>] process_one_work+0x207/0x4d1
> [ 3034.728033]  #1:  (cpuset_hotplug_work){+.+.+.}, at: [<ffffffff81058d3a>] process_one_work+0x207/0x4d1
> [ 3034.728033]  #2:  (cpuset_mutex){+.+.+.}, at: [<ffffffff810b0ec1>] rebuild_sched_domains+0x17/0x2a
> [ 3034.728033]  #3:  (cpu_hotplug.lock){+.+.+.}, at: [<ffffffff8103d11b>] get_online_cpus+0x3c/0x50
> [ 3034.728033] 1 lock held by mingetty/2563:
> [ 3034.728033]  #0:  (&ldata->atomic_read_lock){+.+...}, at: [<ffffffff8131e28a>] n_tty_read+0x252/0x7e8
> [ 3034.728033] 1 lock held by mingetty/2565:
> [ 3034.728033]  #0:  (&ldata->atomic_read_lock){+.+...}, at: [<ffffffff8131e28a>] n_tty_read+0x252/0x7e8
> [ 3034.728033] 1 lock held by mingetty/2569:
> [ 3034.728033]  #0:  (&ldata->atomic_read_lock){+.+...}, at: [<ffffffff8131e28a>] n_tty_read+0x252/0x7e8
> [ 3034.728033] 1 lock held by mingetty/2572:
> [ 3034.728033]  #0:  (&ldata->atomic_read_lock){+.+...}, at: [<ffffffff8131e28a>] n_tty_read+0x252/0x7e8
> [ 3034.728033] 1 lock held by mingetty/2575:
> [ 3034.728033]  #0:  (&ldata->atomic_read_lock){+.+...}, at: [<ffffffff8131e28a>] n_tty_read+0x252/0x7e8
> [ 3034.728033] 7 locks held by bash/2618:
> [ 3034.728033]  #0:  (sb_writers#5){.+.+.+}, at: [<ffffffff8114bc3f>] file_start_write+0x2a/0x2c
> [ 3034.728033]  #1:  (&buffer->mutex#2){+.+.+.}, at: [<ffffffff811b3b93>] sysfs_write_file+0x3c/0x144
> [ 3034.728033]  #2:  (s_active#54){.+.+.+}, at: [<ffffffff811b3c3e>] sysfs_write_file+0xe7/0x144
> [ 3034.728033]  #3:  (x86_cpu_hotplug_driver_mutex){+.+.+.}, at: [<ffffffff810217c2>] cpu_hotplug_driver_lock+0x17/0x19
> [ 3034.728033]  #4:  (cpu_add_remove_lock){+.+.+.}, at: [<ffffffff8103d196>] cpu_maps_update_begin+0x17/0x19
> [ 3034.728033]  #5:  (cpu_hotplug.lock){+.+.+.}, at: [<ffffffff8103cfd8>] cpu_hotplug_begin+0x2c/0x6d
> [ 3034.728033]  #6:  (rcu_preempt_state.onoff_mutex){+.+...}, at: [<ffffffff81530078>] rcu_cpu_notify+0x2f5/0x86e
> [ 3034.728033] 1 lock held by bash/2980:
> [ 3034.728033]  #0:  (&ldata->atomic_read_lock){+.+...}, at: [<ffffffff8131e28a>] n_tty_read+0x252/0x7e8
>
> Things looked a little weird. Also, this is a deadlock that lockdep did
> not catch. But what we have here does not look like a circular lock
> issue:
>
> Bash is blocked in rcu_cpu_notify():
>
> 1961		/* Exclude any attempts to start a new grace period. */
> 1962		mutex_lock(&rsp->onoff_mutex);
>
>
> kworker is blocked in get_online_cpus(), which makes sense as we are
> currently taking down a CPU.
>
> But rcu_preempt is not blocked on anything. It is simply sleeping in
> rcu_gp_kthread (really rcu_gp_init) here:
>
> 1453	#ifdef CONFIG_PROVE_RCU_DELAY
> 1454			if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
> 1455			    system_state == SYSTEM_RUNNING)
> 1456				schedule_timeout_uninterruptible(2);
> 1457	#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
>
> And it does this while holding the onoff_mutex that bash is waiting for.
>
> Doing a function trace, it showed me where it happened:
>
> [  125.940066] rcu_pree-10      3.... 28384115273: schedule_timeout_uninterruptible <-rcu_gp_kthread
> [...]
> [  125.940066] rcu_pree-10      3d..3 28384202439: sched_switch: prev_comm=rcu_preempt prev_pid=10 prev_prio=120 prev_state=D ==> next_comm=watchdog/3 next_pid=38 next_prio=120
>
> The watchdog ran, and then:
>
> [  125.940066] watchdog-38      3d..3 28384692863: sched_switch: prev_comm=watchdog/3 prev_pid=38 prev_prio=120 prev_state=P ==> next_comm=modprobe next_pid=2848 next_prio=118
>
> Not sure what modprobe was doing, but shortly after that:
>
> [  125.940066] modprobe-2848    3d..3 28385041749: sched_switch: prev_comm=modprobe prev_pid=2848 prev_prio=118 prev_state=R+ ==> next_comm=migration/3 next_pid=40 next_prio=0
>
> Where the migration thread took down the CPU:
>
> [  125.940066] migratio-40      3d..3 28389148276: sched_switch: prev_comm=migration/3 prev_pid=40 prev_prio=0 prev_state=P ==> next_comm=swapper/3 next_pid=0 next_prio=120
>
> which finally did:
>
> [  125.940066]   <idle>-0       3...1 28389282142: arch_cpu_idle_dead <-cpu_startup_entry
> [  125.940066]   <idle>-0       3...1 28389282548: native_play_dead <-arch_cpu_idle_dead
> [  125.940066]   <idle>-0       3...1 28389282924: play_dead_common <-native_play_dead
> [  125.940066]   <idle>-0       3...1 28389283468: idle_task_exit <-play_dead_common
> [  125.940066]   <idle>-0       3...1 28389284644: amd_e400_remove_cpu <-play_dead_common
>
>
> CPU 3 is now offline, the rcu_preempt thread that ran on CPU 3 is still
> doing a schedule_timeout_uninterruptible() and it registered it's
> timeout to the timer base for CPU 3. You would think that it would get
> migrated right? The issue here is that the timer migration happens at
> the CPU notifier for CPU_DEAD. The problem is that the rcu notifier for
> CPU_DOWN is blocked waiting for the onoff_mutex to be released, which is
> held by the thread that just put itself into a uninterruptible sleep,
> that wont wake up until the CPU_DEAD notifier of the timer
> infrastructure is called, which wont happen until the rcu notifier
> finishes. Here's our deadlock!

This commit breaks this deadlock cycle by substituting a shorter udelay()
for the previous schedule_timeout_uninterruptible(), while at the same
time increasing the probability of the delay.  This maintains the intensity
of the testing.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/rcutree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index b61d20c5ee7b..35380019f0fc 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1451,9 +1451,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
 					    rnp->grphi, rnp->qsmask);
 		raw_spin_unlock_irq(&rnp->lock);
 #ifdef CONFIG_PROVE_RCU_DELAY
-		if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
+		if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
 		    system_state == SYSTEM_RUNNING)
-			schedule_timeout_uninterruptible(2);
+			udelay(200);
 #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
 		cond_resched();
 	}
-- 
cgit v1.2.3


From 34376a50fb1fa095b9d0636fa41ed2e73125f214 Mon Sep 17 00:00:00 2001
From: Ben Greear <greearb@candelatech.com>
Date: Thu, 6 Jun 2013 14:29:49 -0700
Subject: Fix lockup related to stop_machine being stuck in __do_softirq.

The stop machine logic can lock up if all but one of the migration
threads make it through the disable-irq step and the one remaining
thread gets stuck in __do_softirq.  The reason __do_softirq can hang is
that it has a bail-out based on jiffies timeout, but in the lockup case,
jiffies itself is not incremented.

To work around this, re-add the max_restart counter in __do_irq and stop
processing irqs after 10 restarts.

Thanks to Tejun Heo and Rusty Russell and others for helping me track
this down.

This was introduced in 3.9 by commit c10d73671ad3 ("softirq: reduce
latencies").

It may be worth looking into ath9k to see if it has issues with its irq
handler at a later date.

The hang stack traces look something like this:

    ------------[ cut here ]------------
    WARNING: at kernel/watchdog.c:245 watchdog_overflow_callback+0x9c/0xa7()
    Watchdog detected hard LOCKUP on cpu 2
    Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen lockd sunrpc]
    Pid: 23, comm: migration/2 Tainted: G         C   3.9.4+ #11
    Call Trace:
     <NMI>   warn_slowpath_common+0x85/0x9f
      warn_slowpath_fmt+0x46/0x48
      watchdog_overflow_callback+0x9c/0xa7
      __perf_event_overflow+0x137/0x1cb
      perf_event_overflow+0x14/0x16
      intel_pmu_handle_irq+0x2dc/0x359
      perf_event_nmi_handler+0x19/0x1b
      nmi_handle+0x7f/0xc2
      do_nmi+0xbc/0x304
      end_repeat_nmi+0x1e/0x2e
     <<EOE>>
      cpu_stopper_thread+0xae/0x162
      smpboot_thread_fn+0x258/0x260
      kthread+0xc7/0xcf
      ret_from_fork+0x7c/0xb0
    ---[ end trace 4947dfa9b0a4cec3 ]---
    BUG: soft lockup - CPU#1 stuck for 22s! [migration/1:17]
    Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen lockd sunrpc]
    irq event stamp: 835637905
    hardirqs last  enabled at (835637904): __do_softirq+0x9f/0x257
    hardirqs last disabled at (835637905): apic_timer_interrupt+0x6d/0x80
    softirqs last  enabled at (5654720): __do_softirq+0x1ff/0x257
    softirqs last disabled at (5654725): irq_exit+0x5f/0xbb
    CPU 1
    Pid: 17, comm: migration/1 Tainted: G        WC   3.9.4+ #11 To be filled by O.E.M. To be filled by O.E.M./To be filled by O.E.M.
    RIP: tasklet_hi_action+0xf0/0xf0
    Process migration/1
    Call Trace:
     <IRQ>
      __do_softirq+0x117/0x257
      irq_exit+0x5f/0xbb
      smp_apic_timer_interrupt+0x8a/0x98
      apic_timer_interrupt+0x72/0x80
     <EOI>
      printk+0x4d/0x4f
      stop_machine_cpu_stop+0x22c/0x274
      cpu_stopper_thread+0xae/0x162
      smpboot_thread_fn+0x258/0x260
      kthread+0xc7/0xcf
      ret_from_fork+0x7c/0xb0

Signed-off-by: Ben Greear <greearb@candelatech.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Pekka Riikonen <priikone@iki.fi>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/softirq.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index b5197dcb0dad..3d6833f125d3 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,8 +195,12 @@ void local_bh_enable_ip(unsigned long ip)
 EXPORT_SYMBOL(local_bh_enable_ip);
 
 /*
- * We restart softirq processing for at most 2 ms,
- * and if need_resched() is not set.
+ * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
+ * but break the loop if need_resched() is set or after 2 ms.
+ * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in
+ * certain cases, such as stop_machine(), jiffies may cease to
+ * increment and so we need the MAX_SOFTIRQ_RESTART limit as
+ * well to make sure we eventually return from this method.
  *
  * These limits have been established via experimentation.
  * The two things to balance is latency against fairness -
@@ -204,6 +208,7 @@ EXPORT_SYMBOL(local_bh_enable_ip);
  * should not be able to lock up the box.
  */
 #define MAX_SOFTIRQ_TIME  msecs_to_jiffies(2)
+#define MAX_SOFTIRQ_RESTART 10
 
 asmlinkage void __do_softirq(void)
 {
@@ -212,6 +217,7 @@ asmlinkage void __do_softirq(void)
 	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
 	int cpu;
 	unsigned long old_flags = current->flags;
+	int max_restart = MAX_SOFTIRQ_RESTART;
 
 	/*
 	 * Mask out PF_MEMALLOC s current task context is borrowed for the
@@ -265,7 +271,8 @@ restart:
 
 	pending = local_softirq_pending();
 	if (pending) {
-		if (time_before(jiffies, end) && !need_resched())
+		if (time_before(jiffies, end) && !need_resched() &&
+		    --max_restart)
 			goto restart;
 
 		wakeup_softirqd();
-- 
cgit v1.2.3


From 58e8eedf18577c7eac722d5d1f190507ea263d1b Mon Sep 17 00:00:00 2001
From: Yoshihiro YUNOMAE <yoshihiro.yunomae.ez@hitachi.com>
Date: Tue, 23 Apr 2013 10:32:39 +0900
Subject: tracing: Fix outputting formats of x86-tsc and counter when use
 trace_clock

Outputting formats of x86-tsc and counter should be a raw format, but after
applying the patch(2b6080f28c7cc3efc8625ab71495aae89aeb63a0), the format was
changed to nanosec. This is because the global variable trace_clock_id was used.
When we use multiple buffers, clock_id of each sub-buffer should be used. Then,
this patch uses tr->clock_id instead of the global variable trace_clock_id.

[ Basically, this fixes a regression where the multibuffer code changed the
  trace_clock file to update tr->clock_id but the traces still use the old
  global trace_clock_id variable, negating the file's effect. The global
  trace_clock_id variable is obsolete and removed. - SR ]

Link: http://lkml.kernel.org/r/20130423013239.22334.7394.stgit@yunodevel

Signed-off-by: Yoshihiro YUNOMAE <yoshihiro.yunomae.ez@hitachi.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 8 +++-----
 kernel/trace/trace.h | 2 --
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1a41023a1f88..e71a8be4a6ee 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -652,8 +652,6 @@ static struct {
 	ARCH_TRACE_CLOCKS
 };
 
-int trace_clock_id;
-
 /*
  * trace_parser_get_init - gets the buffer for trace parser
  */
@@ -2826,7 +2824,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 		iter->iter_flags |= TRACE_FILE_ANNOTATE;
 
 	/* Output in nanoseconds only if we are using a clock in nanoseconds. */
-	if (trace_clocks[trace_clock_id].in_ns)
+	if (trace_clocks[tr->clock_id].in_ns)
 		iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
 
 	/* stop the trace while dumping if we are not opening "snapshot" */
@@ -3825,7 +3823,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 		iter->iter_flags |= TRACE_FILE_LAT_FMT;
 
 	/* Output in nanoseconds only if we are using a clock in nanoseconds. */
-	if (trace_clocks[trace_clock_id].in_ns)
+	if (trace_clocks[tr->clock_id].in_ns)
 		iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
 
 	iter->cpu_file = tc->cpu;
@@ -5095,7 +5093,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
 	cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
 	trace_seq_printf(s, "bytes: %ld\n", cnt);
 
-	if (trace_clocks[trace_clock_id].in_ns) {
+	if (trace_clocks[tr->clock_id].in_ns) {
 		/* local or global for trace_clock */
 		t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
 		usec_rem = do_div(t, USEC_PER_SEC);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 711ca7d3e7f1..20572ed88c5c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -700,8 +700,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
 
 extern unsigned long trace_flags;
 
-extern int trace_clock_id;
-
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
-- 
cgit v1.2.3


From d7880812b3594d3c6dcbe3cfd71dabb17347d082 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 10 Jun 2013 16:52:03 +0200
Subject: idle: Add the stack canary init to cpu_startup_entry()

Moving x86 to the generic idle implementation (commit 7d1a9417 "x86:
Use generic idle loop") wreckaged the stack protector.

I stupidly missed that boot_init_stack_canary() must be inlined from a
function which never returns, but I put that call into
arch_cpu_idle_prepare() which of course returns.

I pondered to play tricks with arch_cpu_idle_prepare() first, but then
I noticed, that the other archs which have implemented the
stackprotector (ARM and SH) do not initialize the canary for the
non-boot cpus.

So I decided to move the boot_init_stack_canary() call into
cpu_startup_entry() ifdeffed with an CONFIG_X86 for now. This #ifdef
is just a temporary measure as I don't want to inflict the
boot_init_stack_canary() call on ARM and SH that late in the cycle.

I'll queue a patch for 3.11 which removes the #ifdef if the ARM/SH
maintainers have no objection.

Reported-by: Wouter van Kesteren <woutershep@gmail.com>
Cc: x86@kernel.org
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpu/idle.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index d5585f5e038e..bf2ee1aafa0e 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -5,6 +5,7 @@
 #include <linux/cpu.h>
 #include <linux/tick.h>
 #include <linux/mm.h>
+#include <linux/stackprotector.h>
 
 #include <asm/tlb.h>
 
@@ -112,6 +113,21 @@ static void cpu_idle_loop(void)
 
 void cpu_startup_entry(enum cpuhp_state state)
 {
+	/*
+	 * This #ifdef needs to die, but it's too late in the cycle to
+	 * make this generic (arm and sh have never invoked the canary
+	 * init for the non boot cpus!). Will be fixed in 3.11
+	 */
+#ifdef CONFIG_X86
+	/*
+	 * If we're the non-boot CPU, nothing set the stack canary up
+	 * for us. The boot CPU already has it initialized but no harm
+	 * in doing it again. This is a good place for updating it, as
+	 * we wont ever return from this function (so the invalid
+	 * canaries already on the stack wont ever trigger).
+	 */
+	boot_init_stack_canary();
+#endif
 	current_set_polling();
 	arch_cpu_idle_prepare();
 	cpu_idle_loop();
-- 
cgit v1.2.3


From 16e53dbf10a2d7e228709a7286310e629ede5e45 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Wed, 12 Jun 2013 14:04:36 -0700
Subject: CPU hotplug: provide a generic helper to disable/enable CPU hotplug

There are instances in the kernel where we would like to disable CPU
hotplug (from sysfs) during some important operation.  Today the freezer
code depends on this and the code to do it was kinda tailor-made for
that.

Restructure the code and make it generic enough to be useful for other
usecases too.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Robin Holt <holt@sgi.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Shawn Guo <shawn.guo@linaro.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpu.c | 55 +++++++++++++++++++++++--------------------------------
 1 file changed, 23 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index b5e4ab2d427e..198a38883e64 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -133,6 +133,27 @@ static void cpu_hotplug_done(void)
 	mutex_unlock(&cpu_hotplug.lock);
 }
 
+/*
+ * Wait for currently running CPU hotplug operations to complete (if any) and
+ * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
+ * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
+ * hotplug path before performing hotplug operations. So acquiring that lock
+ * guarantees mutual exclusion from any currently running hotplug operations.
+ */
+void cpu_hotplug_disable(void)
+{
+	cpu_maps_update_begin();
+	cpu_hotplug_disabled = 1;
+	cpu_maps_update_done();
+}
+
+void cpu_hotplug_enable(void)
+{
+	cpu_maps_update_begin();
+	cpu_hotplug_disabled = 0;
+	cpu_maps_update_done();
+}
+
 #else /* #if CONFIG_HOTPLUG_CPU */
 static void cpu_hotplug_begin(void) {}
 static void cpu_hotplug_done(void) {}
@@ -540,36 +561,6 @@ static int __init alloc_frozen_cpus(void)
 }
 core_initcall(alloc_frozen_cpus);
 
-/*
- * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
- * hotplug when tasks are about to be frozen. Also, don't allow the freezer
- * to continue until any currently running CPU hotplug operation gets
- * completed.
- * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
- * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
- * CPU hotplug path and released only after it is complete. Thus, we
- * (and hence the freezer) will block here until any currently running CPU
- * hotplug operation gets completed.
- */
-void cpu_hotplug_disable_before_freeze(void)
-{
-	cpu_maps_update_begin();
-	cpu_hotplug_disabled = 1;
-	cpu_maps_update_done();
-}
-
-
-/*
- * When tasks have been thawed, re-enable regular CPU hotplug (which had been
- * disabled while beginning to freeze tasks).
- */
-void cpu_hotplug_enable_after_thaw(void)
-{
-	cpu_maps_update_begin();
-	cpu_hotplug_disabled = 0;
-	cpu_maps_update_done();
-}
-
 /*
  * When callbacks for CPU hotplug notifications are being executed, we must
  * ensure that the state of the system with respect to the tasks being frozen
@@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
 
 	case PM_SUSPEND_PREPARE:
 	case PM_HIBERNATION_PREPARE:
-		cpu_hotplug_disable_before_freeze();
+		cpu_hotplug_disable();
 		break;
 
 	case PM_POST_SUSPEND:
 	case PM_POST_HIBERNATION:
-		cpu_hotplug_enable_after_thaw();
+		cpu_hotplug_enable();
 		break;
 
 	default:
-- 
cgit v1.2.3


From cf7df378aa4ff7da3a44769b7ff6e9eef1a9f3db Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Wed, 12 Jun 2013 14:04:37 -0700
Subject: reboot: rigrate shutdown/reboot to boot cpu

We recently noticed that reboot of a 1024 cpu machine takes approx 16
minutes of just stopping the cpus.  The slowdown was tracked to commit
f96972f2dc63 ("kernel/sys.c: call disable_nonboot_cpus() in
kernel_restart()").

The current implementation does all the work of hot removing the cpus
before halting the system.  We are switching to just migrating to the
boot cpu and then continuing with shutdown/reboot.

This also has the effect of not breaking x86's command line parameter
for specifying the reboot cpu.  Note, this code was shamelessly copied
from arch/x86/kernel/reboot.c with bits removed pertaining to the
reboot_cpu command line parameter.

Signed-off-by: Robin Holt <holt@sgi.com>
Tested-by: Shawn Guo <shawn.guo@linaro.org>
Cc: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index b95d3c72ba21..2bbd9a73b54c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
+/* Add backwards compatibility for stable trees. */
+#ifndef PF_NO_SETAFFINITY
+#define PF_NO_SETAFFINITY		PF_THREAD_BOUND
+#endif
+
+static void migrate_to_reboot_cpu(void)
+{
+	/* The boot cpu is always logical cpu 0 */
+	int cpu = 0;
+
+	cpu_hotplug_disable();
+
+	/* Make certain the cpu I'm about to reboot on is online */
+	if (!cpu_online(cpu))
+		cpu = cpumask_first(cpu_online_mask);
+
+	/* Prevent races with other tasks migrating this task */
+	current->flags |= PF_NO_SETAFFINITY;
+
+	/* Make certain I only run on the appropriate processor */
+	set_cpus_allowed_ptr(current, cpumask_of(cpu));
+}
+
 /**
  *	kernel_restart - reboot the system
  *	@cmd: pointer to buffer containing command to execute for restart
@@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 void kernel_restart(char *cmd)
 {
 	kernel_restart_prepare(cmd);
-	disable_nonboot_cpus();
+	migrate_to_reboot_cpu();
 	syscore_shutdown();
 	if (!cmd)
 		printk(KERN_EMERG "Restarting system.\n");
@@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state)
 void kernel_halt(void)
 {
 	kernel_shutdown_prepare(SYSTEM_HALT);
-	disable_nonboot_cpus();
+	migrate_to_reboot_cpu();
 	syscore_shutdown();
 	printk(KERN_EMERG "System halted.\n");
 	kmsg_dump(KMSG_DUMP_HALT);
@@ -419,7 +442,7 @@ void kernel_power_off(void)
 	kernel_shutdown_prepare(SYSTEM_POWER_OFF);
 	if (pm_power_off_prepare)
 		pm_power_off_prepare();
-	disable_nonboot_cpus();
+	migrate_to_reboot_cpu();
 	syscore_shutdown();
 	printk(KERN_EMERG "Power down.\n");
 	kmsg_dump(KMSG_DUMP_POWEROFF);
-- 
cgit v1.2.3


From 637241a900cbd982f744d44646b48a273d609b34 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 12 Jun 2013 14:04:39 -0700
Subject: kmsg: honor dmesg_restrict sysctl on /dev/kmsg

The dmesg_restrict sysctl currently covers the syslog method for access
dmesg, however /dev/kmsg isn't covered by the same protections.  Most
people haven't noticed because util-linux dmesg(1) defaults to using the
syslog method for access in older versions.  With util-linux dmesg(1)
defaults to reading directly from /dev/kmsg.

To fix /dev/kmsg, let's compare the existing interfaces and what they
allow:

 - /proc/kmsg allows:
  - open (SYSLOG_ACTION_OPEN) if CAP_SYSLOG since it uses a destructive
    single-reader interface (SYSLOG_ACTION_READ).
  - everything, after an open.

 - syslog syscall allows:
  - anything, if CAP_SYSLOG.
  - SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER, if
    dmesg_restrict==0.
  - nothing else (EPERM).

The use-cases were:
 - dmesg(1) needs to do non-destructive SYSLOG_ACTION_READ_ALLs.
 - sysklog(1) needs to open /proc/kmsg, drop privs, and still issue the
   destructive SYSLOG_ACTION_READs.

AIUI, dmesg(1) is moving to /dev/kmsg, and systemd-journald doesn't
clear the ring buffer.

Based on the comments in devkmsg_llseek, it sounds like actions besides
reading aren't going to be supported by /dev/kmsg (i.e.
SYSLOG_ACTION_CLEAR), so we have a strict subset of the non-destructive
syslog syscall actions.

To this end, move the check as Josh had done, but also rename the
constants to reflect their new uses (SYSLOG_FROM_CALL becomes
SYSLOG_FROM_READER, and SYSLOG_FROM_FILE becomes SYSLOG_FROM_PROC).
SYSLOG_FROM_READER allows non-destructive actions, and SYSLOG_FROM_PROC
allows destructive actions after a capabilities-constrained
SYSLOG_ACTION_OPEN check.

 - /dev/kmsg allows:
  - open if CAP_SYSLOG or dmesg_restrict==0
  - reading/polling, after open

Addresses https://bugzilla.redhat.com/show_bug.cgi?id=903192

[akpm@linux-foundation.org: use pr_warn_once()]
Signed-off-by: Kees Cook <keescook@chromium.org>
Reported-by: Christian Kujau <lists@nerdbynature.de>
Tested-by: Josh Boyer <jwboyer@redhat.com>
Cc: Kay Sievers <kay@vrfy.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 91 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 50 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index fa36e1494420..8212c1aef125 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -363,6 +363,53 @@ static void log_store(int facility, int level,
 	log_next_seq++;
 }
 
+#ifdef CONFIG_SECURITY_DMESG_RESTRICT
+int dmesg_restrict = 1;
+#else
+int dmesg_restrict;
+#endif
+
+static int syslog_action_restricted(int type)
+{
+	if (dmesg_restrict)
+		return 1;
+	/*
+	 * Unless restricted, we allow "read all" and "get buffer size"
+	 * for everybody.
+	 */
+	return type != SYSLOG_ACTION_READ_ALL &&
+	       type != SYSLOG_ACTION_SIZE_BUFFER;
+}
+
+static int check_syslog_permissions(int type, bool from_file)
+{
+	/*
+	 * If this is from /proc/kmsg and we've already opened it, then we've
+	 * already done the capabilities checks at open time.
+	 */
+	if (from_file && type != SYSLOG_ACTION_OPEN)
+		return 0;
+
+	if (syslog_action_restricted(type)) {
+		if (capable(CAP_SYSLOG))
+			return 0;
+		/*
+		 * For historical reasons, accept CAP_SYS_ADMIN too, with
+		 * a warning.
+		 */
+		if (capable(CAP_SYS_ADMIN)) {
+			pr_warn_once("%s (%d): Attempt to access syslog with "
+				     "CAP_SYS_ADMIN but no CAP_SYSLOG "
+				     "(deprecated).\n",
+				 current->comm, task_pid_nr(current));
+			return 0;
+		}
+		return -EPERM;
+	}
+	return security_syslog(type);
+}
+
+
 /* /dev/kmsg - userspace message inject/listen interface */
 struct devkmsg_user {
 	u64 seq;
@@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file)
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
 		return 0;
 
-	err = security_syslog(SYSLOG_ACTION_READ_ALL);
+	err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+				       SYSLOG_FROM_READER);
 	if (err)
 		return err;
 
@@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level)
 }
 #endif
 
-#ifdef CONFIG_SECURITY_DMESG_RESTRICT
-int dmesg_restrict = 1;
-#else
-int dmesg_restrict;
-#endif
-
-static int syslog_action_restricted(int type)
-{
-	if (dmesg_restrict)
-		return 1;
-	/* Unless restricted, we allow "read all" and "get buffer size" for everybody */
-	return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
-}
-
-static int check_syslog_permissions(int type, bool from_file)
-{
-	/*
-	 * If this is from /proc/kmsg and we've already opened it, then we've
-	 * already done the capabilities checks at open time.
-	 */
-	if (from_file && type != SYSLOG_ACTION_OPEN)
-		return 0;
-
-	if (syslog_action_restricted(type)) {
-		if (capable(CAP_SYSLOG))
-			return 0;
-		/* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
-		if (capable(CAP_SYS_ADMIN)) {
-			printk_once(KERN_WARNING "%s (%d): "
-				 "Attempt to access syslog with CAP_SYS_ADMIN "
-				 "but no CAP_SYSLOG (deprecated).\n",
-				 current->comm, task_pid_nr(current));
-			return 0;
-		}
-		return -EPERM;
-	}
-	return 0;
-}
-
 #if defined(CONFIG_PRINTK_TIME)
 static bool printk_time = 1;
 #else
@@ -1249,7 +1258,7 @@ out:
 
 SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
 {
-	return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
+	return do_syslog(type, buf, len, SYSLOG_FROM_READER);
 }
 
 /*
-- 
cgit v1.2.3


From f000cfdde5de4fc15dead5ccf524359c07eadf2b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 12 Jun 2013 14:04:46 -0700
Subject: audit: wait_for_auditd() should use TASK_UNINTERRUPTIBLE

audit_log_start() does wait_for_auditd() in a loop until
audit_backlog_wait_time passes or audit_skb_queue has a room.

If signal_pending() is true this becomes a busy-wait loop, schedule() in
TASK_INTERRUPTIBLE won't block.

Thanks to Guy for fully investigating and explaining the problem.

(akpm: that'll cause the system to lock up on a non-preemptible
uniprocessor kernel)

(Guy: "Our customer was in fact running a uniprocessor machine, and they
reported a system hang.")

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Guy Streeter <streeter@redhat.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 21c7fa615bd3..91e53d04b6a9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
 static void wait_for_auditd(unsigned long sleep_time)
 {
 	DECLARE_WAITQUEUE(wait, current);
-	set_current_state(TASK_INTERRUPTIBLE);
+	set_current_state(TASK_UNINTERRUPTIBLE);
 	add_wait_queue(&audit_backlog_wait, &wait);
 
 	if (audit_backlog_limit &&
-- 
cgit v1.2.3


From 736f3203a06eafd0944103775a98584082744c6b Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Wed, 12 Jun 2013 14:05:07 -0700
Subject: kernel/audit_tree.c:audit_add_tree_rule(): protect `rule' from
 kill_rules()

audit_add_tree_rule() must set 'rule->tree = NULL;' firstly, to protect
the rule itself freed in kill_rules().

The reason is when it is killed, the 'rule' itself may have already
released, we should not access it.  one example: we add a rule to an
inode, just at the same time the other task is deleting this inode.

The work flow for adding a rule:

    audit_receive() -> (need audit_cmd_mutex lock)
      audit_receive_skb() ->
        audit_receive_msg() ->
          audit_receive_filter() ->
            audit_add_rule() ->
              audit_add_tree_rule() -> (need audit_filter_mutex lock)
                ...
                unlock audit_filter_mutex
                get_tree()
                ...
                iterate_mounts() -> (iterate all related inodes)
                  tag_mount() ->
                    tag_trunk() ->
                      create_trunk() -> (assume it is 1st rule)
                        fsnotify_add_mark() ->
                          fsnotify_add_inode_mark() ->  (add mark to inode->i_fsnotify_marks)
                        ...
                        get_tree(); (each inode will get one)
                ...
                lock audit_filter_mutex

The work flow for deleting an inode:

    __destroy_inode() ->
     fsnotify_inode_delete() ->
       __fsnotify_inode_delete() ->
        fsnotify_clear_marks_by_inode() ->  (get mark from inode->i_fsnotify_marks)
          fsnotify_destroy_mark() ->
           fsnotify_destroy_mark_locked() ->
             audit_tree_freeing_mark() ->
               evict_chunk() ->
                 ...
                 tree->goner = 1
                 ...
                 kill_rules() ->   (assume current->audit_context == NULL)
                   call_rcu() ->   (rule->tree != NULL)
                     audit_free_rule_rcu() ->
                       audit_free_rule()
                 ...
                 audit_schedule_prune() ->  (assume current->audit_context == NULL)
                   kthread_run() ->    (need audit_cmd_mutex and audit_filter_mutex lock)
                     prune_one() ->    (delete it from prue_list)
                       put_tree(); (match the original get_tree above)

Signed-off-by: Chen Gang <gang.chen@asianux.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/audit_tree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index a291aa23fb3f..43c307dc9453 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
 	struct vfsmount *mnt;
 	int err;
 
+	rule->tree = NULL;
 	list_for_each_entry(tree, &tree_list, list) {
 		if (!strcmp(seed->pathname, tree->pathname)) {
 			put_tree(seed);
-- 
cgit v1.2.3


From 29ce3785b22da47c49f4ef6e14b9014fa5dee261 Mon Sep 17 00:00:00 2001
From: James Bottomley <JBottomley@Parallels.com>
Date: Wed, 8 May 2013 14:05:34 -0700
Subject: idle: Enable interrupts in the weak arch_cpu_idle() implementation

PARISC bootup triggers the warning at kernel/cpu/idle.c:96. That's
caused by the weak arch_cpu_idle() implementation, which is provided
to avoid that architectures implement idle_poll over and over.

The switchover to polling mode happens in the first call of the weak
arch_cpu_idle() implementation, but that code fails to reenable
interrupts and therefor triggers the warning.

Fix this by enabling interrupts in the weak arch_cpu_idle() code.

[ tglx: Made the changelog match the patch ]

Signed-off-by: James Bottomley <JBottomley@Parallels.com>
Reviewed-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1371236142.2726.43.camel@dabdike
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpu/idle.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index bf2ee1aafa0e..e695c0a0bcb5 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -59,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { }
 void __weak arch_cpu_idle(void)
 {
 	cpu_idle_force_poll = 1;
+	local_irq_enable();
 }
 
 /*
-- 
cgit v1.2.3


From 8aac62706adaaf0fab02c4327761561c8bda9448 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 14 Jun 2013 21:09:49 +0200
Subject: move exit_task_namespaces() outside of exit_notify()

exit_notify() does exit_task_namespaces() after
forget_original_parent(). This was needed to ensure that ->nsproxy
can't be cleared prematurely, an exiting child we are going to
reparent can do do_notify_parent() and use the parent's (ours) pid_ns.

However, after 32084504 "pidns: use task_active_pid_ns in
do_notify_parent" ->nsproxy != NULL is no longer needed, we rely
on task_active_pid_ns().

Move exit_task_namespaces() from exit_notify() to do_exit(), after
exit_fs() and before exit_task_work().

This solves the problem reported by Andrey, free_ipc_ns()->shm_destroy()
does fput() which needs task_work_add().

Note: this particular problem can be fixed if we change fput(), and
that change makes sense anyway. But there is another reason to move
the callsite. The original reason for exit_task_namespaces() from
the middle of exit_notify() was subtle and it has already gone away,
now this looks confusing. And this allows us do simplify exit_notify(),
we can avoid unlock/lock(tasklist) and we can use ->exit_state instead
of PF_EXITING in forget_original_parent().

Reported-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index af2eb3cbd499..7bb73f9d09db 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -649,7 +649,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 	 */
 	forget_original_parent(tsk);
-	exit_task_namespaces(tsk);
 
 	write_lock_irq(&tasklist_lock);
 	if (group_dead)
@@ -795,6 +794,7 @@ void do_exit(long code)
 	exit_shm(tsk);
 	exit_files(tsk);
 	exit_fs(tsk);
+	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	check_stack_usage();
 	exit_thread();
-- 
cgit v1.2.3


From 0541881502a1276149889fe468662ff6a8fc8f6d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 13 Jun 2013 13:17:02 -0700
Subject: range: Do not add new blank slot with add_range_with_merge

Joshua reported: Commit cd7b304dfaf1 (x86, range: fix missing merge
during add range) broke mtrr cleanup on his setup in 3.9.5.
corresponding commit in upstream is fbe06b7bae7c.

The reason is add_range_with_merge could generate blank spot.

We could avoid that by searching new expanded start/end, that
new range should include all connected ranges in range array.
At last add the new expanded start/end to the range array.
Also move up left array so do not add new blank slot in the
range array.

-v2: move left array to avoid enhance add_range()
-v3: include fix from Joshua about memmove declaring when
     DYN_DEBUG is used.

Reported-by: Joshua Covington <joshuacov@googlemail.com>
Tested-by: Joshua Covington <joshuacov@googlemail.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1371154622-8929-3-git-send-email-yinghai@kernel.org
Cc: <stable@vger.kernel.org> v3.9
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 kernel/range.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/range.c b/kernel/range.c
index eb911dbce267..322ea8e93e4b 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -4,7 +4,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/sort.h>
-
+#include <linux/string.h>
 #include <linux/range.h>
 
 int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
@@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
 	if (start >= end)
 		return nr_range;
 
-	/* Try to merge it with old one: */
+	/* get new start/end: */
 	for (i = 0; i < nr_range; i++) {
-		u64 final_start, final_end;
 		u64 common_start, common_end;
 
 		if (!range[i].end)
@@ -45,14 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
 		if (common_start > common_end)
 			continue;
 
-		final_start = min(range[i].start, start);
-		final_end = max(range[i].end, end);
+		/* new start/end, will add it back at last */
+		start = min(range[i].start, start);
+		end = max(range[i].end, end);
 
-		/* clear it and add it back for further merge */
-		range[i].start = 0;
-		range[i].end =  0;
-		return add_range_with_merge(range, az, nr_range,
-			final_start, final_end);
+		memmove(&range[i], &range[i + 1],
+			(nr_range - (i + 1)) * sizeof(range[i]));
+		range[nr_range - 1].start = 0;
+		range[nr_range - 1].end   = 0;
+		nr_range--;
+		i--;
 	}
 
 	/* Need to add it: */
-- 
cgit v1.2.3


From 9bb5d40cd93c9dd4be74834b1dcb1ba03629716b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 4 Jun 2013 10:44:21 +0200
Subject: perf: Fix mmap() accounting hole

Vince's fuzzer once again found holes. This time it spotted a leak in
the locked page accounting.

When an event had redirected output and its close() was the last
reference to the buffer we didn't have a vm context to undo accounting.

Change the code to destroy the buffer on the last munmap() and detach
all redirected events at that time. This provides us the right context
to undo the vm accounting.

Reported-and-tested-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20130604084421.GI8923@twins.programming.kicks-ass.net
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c     | 228 ++++++++++++++++++++++++++++++++---------------
 kernel/events/internal.h |   3 +-
 2 files changed, 159 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ae752cd4a086..b391907d5352 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-			       struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void)	{ }
 
 extern __weak const char *perf_pmu_name(void)
@@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event)
 		if (has_branch_stack(event)) {
 			static_key_slow_dec_deferred(&perf_sched_events);
 			/* is system-wide event */
-			if (!(event->attach_state & PERF_ATTACH_TASK))
+			if (!(event->attach_state & PERF_ATTACH_TASK)) {
 				atomic_dec(&per_cpu(perf_branch_stack_events,
 						    event->cpu));
+			}
 		}
 	}
 
 	if (event->rb) {
-		ring_buffer_put(event->rb);
-		event->rb = NULL;
+		struct ring_buffer *rb;
+
+		/*
+		 * Can happen when we close an event with re-directed output.
+		 *
+		 * Since we have a 0 refcount, perf_mmap_close() will skip
+		 * over us; possibly making our ring_buffer_put() the last.
+		 */
+		mutex_lock(&event->mmap_mutex);
+		rb = event->rb;
+		if (rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* could be last */
+		}
+		mutex_unlock(&event->mmap_mutex);
 	}
 
 	if (is_cgroup_event(event))
@@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	unsigned int events = POLL_HUP;
 
 	/*
-	 * Race between perf_event_set_output() and perf_poll(): perf_poll()
-	 * grabs the rb reference but perf_event_set_output() overrides it.
-	 * Here is the timeline for two threads T1, T2:
-	 * t0: T1, rb = rcu_dereference(event->rb)
-	 * t1: T2, old_rb = event->rb
-	 * t2: T2, event->rb = new rb
-	 * t3: T2, ring_buffer_detach(old_rb)
-	 * t4: T1, ring_buffer_attach(rb1)
-	 * t5: T1, poll_wait(event->waitq)
-	 *
-	 * To avoid this problem, we grab mmap_mutex in perf_poll()
-	 * thereby ensuring that the assignment of the new ring buffer
-	 * and the detachment of the old buffer appear atomic to perf_poll()
+	 * Pin the event->rb by taking event->mmap_mutex; otherwise
+	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
 	 */
 	mutex_lock(&event->mmap_mutex);
-
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (rb) {
-		ring_buffer_attach(event, rb);
+	rb = event->rb;
+	if (rb)
 		events = atomic_xchg(&rb->poll, 0);
-	}
-	rcu_read_unlock();
-
 	mutex_unlock(&event->mmap_mutex);
 
 	poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event,
 		return;
 
 	spin_lock_irqsave(&rb->event_lock, flags);
-	if (!list_empty(&event->rb_entry))
-		goto unlock;
-
-	list_add(&event->rb_entry, &rb->event_list);
-unlock:
+	if (list_empty(&event->rb_entry))
+		list_add(&event->rb_entry, &rb->event_list);
 	spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-			       struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
 	unsigned long flags;
 
@@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
 	rcu_read_lock();
 	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
-
-	list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-		wake_up_all(&event->waitq);
-
-unlock:
+	if (rb) {
+		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+			wake_up_all(&event->waitq);
+	}
 	rcu_read_unlock();
 }
 
@@ -3582,23 +3571,14 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb)
+static void ring_buffer_put(struct ring_buffer *rb)
 {
-	struct perf_event *event, *n;
-	unsigned long flags;
-
 	if (!atomic_dec_and_test(&rb->refcount))
-		return false;
+		return;
 
-	spin_lock_irqsave(&rb->event_lock, flags);
-	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-		list_del_init(&event->rb_entry);
-		wake_up_all(&event->waitq);
-	}
-	spin_unlock_irqrestore(&rb->event_lock, flags);
+	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
-	return true;
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
@@ -3606,28 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	atomic_inc(&event->mmap_count);
+	atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
-	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		struct ring_buffer *rb = event->rb;
-		struct user_struct *mmap_user = rb->mmap_user;
-		int mmap_locked = rb->mmap_locked;
-		unsigned long size = perf_data_size(rb);
+	struct ring_buffer *rb = event->rb;
+	struct user_struct *mmap_user = rb->mmap_user;
+	int mmap_locked = rb->mmap_locked;
+	unsigned long size = perf_data_size(rb);
 
-		rcu_assign_pointer(event->rb, NULL);
-		ring_buffer_detach(event, rb);
-		mutex_unlock(&event->mmap_mutex);
+	atomic_dec(&rb->mmap_count);
+
+	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+		return;
+
+	/* Detach current event from the buffer. */
+	rcu_assign_pointer(event->rb, NULL);
+	ring_buffer_detach(event, rb);
+	mutex_unlock(&event->mmap_mutex);
+
+	/* If there's still other mmap()s of this buffer, we're done. */
+	if (atomic_read(&rb->mmap_count)) {
+		ring_buffer_put(rb); /* can't be last */
+		return;
+	}
 
-		if (ring_buffer_put(rb)) {
-			atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-			vma->vm_mm->pinned_vm -= mmap_locked;
-			free_uid(mmap_user);
+	/*
+	 * No other mmap()s, detach from all other events that might redirect
+	 * into the now unreachable buffer. Somewhat complicated by the
+	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
+	 */
+again:
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+		if (!atomic_long_inc_not_zero(&event->refcount)) {
+			/*
+			 * This event is en-route to free_event() which will
+			 * detach it and remove it from the list.
+			 */
+			continue;
 		}
+		rcu_read_unlock();
+
+		mutex_lock(&event->mmap_mutex);
+		/*
+		 * Check we didn't race with perf_event_set_output() which can
+		 * swizzle the rb from under us while we were waiting to
+		 * acquire mmap_mutex.
+		 *
+		 * If we find a different rb; ignore this event, a next
+		 * iteration will no longer find it on the list. We have to
+		 * still restart the iteration to make sure we're not now
+		 * iterating the wrong list.
+		 */
+		if (event->rb == rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* can't be last, we still have one */
+		}
+		mutex_unlock(&event->mmap_mutex);
+		put_event(event);
+
+		/*
+		 * Restart the iteration; either we're on the wrong list or
+		 * destroyed its integrity by doing a deletion.
+		 */
+		goto again;
 	}
+	rcu_read_unlock();
+
+	/*
+	 * It could be there's still a few 0-ref events on the list; they'll
+	 * get cleaned up by free_event() -- they'll also still have their
+	 * ref on the rb and will free it whenever they are done with it.
+	 *
+	 * Aside from that, this buffer is 'fully' detached and unmapped,
+	 * undo the VM accounting.
+	 */
+
+	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+	vma->vm_mm->pinned_vm -= mmap_locked;
+	free_uid(mmap_user);
+
+	ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages != nr_pages)
+		if (event->rb->nr_pages != nr_pages) {
 			ret = -EINVAL;
+			goto unlock;
+		}
+
+		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+			/*
+			 * Raced against perf_mmap_close() through
+			 * perf_event_set_output(). Try again, hope for better
+			 * luck.
+			 */
+			mutex_unlock(&event->mmap_mutex);
+			goto again;
+		}
+
 		goto unlock;
 	}
 
@@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
+	atomic_set(&rb->mmap_count, 1);
 	rb->mmap_locked = extra;
 	rb->mmap_user = get_current_user();
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->pinned_vm += extra;
 
+	ring_buffer_attach(event, rb);
 	rcu_assign_pointer(event->rb, rb);
 
 	perf_event_update_userpage(event);
@@ -3737,6 +3805,10 @@ unlock:
 		atomic_inc(&event->mmap_count);
 	mutex_unlock(&event->mmap_mutex);
 
+	/*
+	 * Since pinned accounting is per vm we cannot allow fork() to copy our
+	 * vma.
+	 */
 	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &perf_mmap_vmops;
 
@@ -6415,6 +6487,8 @@ set:
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
 
+	old_rb = event->rb;
+
 	if (output_event) {
 		/* get the rb we want to redirect to */
 		rb = ring_buffer_get(output_event);
@@ -6422,16 +6496,28 @@ set:
 			goto unlock;
 	}
 
-	old_rb = event->rb;
-	rcu_assign_pointer(event->rb, rb);
 	if (old_rb)
 		ring_buffer_detach(event, old_rb);
+
+	if (rb)
+		ring_buffer_attach(event, rb);
+
+	rcu_assign_pointer(event->rb, rb);
+
+	if (old_rb) {
+		ring_buffer_put(old_rb);
+		/*
+		 * Since we detached before setting the new rb, so that we
+		 * could attach the new rb, we could have missed a wakeup.
+		 * Provide it now.
+		 */
+		wake_up_all(&event->waitq);
+	}
+
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
 
-	if (old_rb)
-		ring_buffer_put(old_rb);
 out:
 	return ret;
 }
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 5bc6c8e9b851..ca6599723be5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,7 +31,8 @@ struct ring_buffer {
 	spinlock_t			event_lock;
 	struct list_head		event_list;
 
-	int				mmap_locked;
+	atomic_t			mmap_count;
+	unsigned long			mmap_locked;
 	struct user_struct		*mmap_user;
 
 	struct perf_event_mmap_page	*user_page;
-- 
cgit v1.2.3


From 873b4c65b519fd769940eb281f77848227d4e5c1 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 5 Jun 2013 10:13:11 +0200
Subject: sched: Fix clear NOHZ_BALANCE_KICK

I have faced a sequence where the Idle Load Balance was sometime not
triggered for a while on my platform, in the following scenario:

 CPU 0 and CPU 1 are running tasks and CPU 2 is idle

 CPU 1 kicks the Idle Load Balance
 CPU 1 selects CPU 2 as the new Idle Load Balancer
 CPU 2 sets NOHZ_BALANCE_KICK for CPU 2
 CPU 2 sends a reschedule IPI to CPU 2

 While CPU 3 wakes up, CPU 0 or CPU 1 migrates a waking up task A on CPU 2

 CPU 2 finally wakes up, runs task A and discards the Idle Load Balance
       task A quickly goes back to sleep (before a tick occurs on CPU 2)
 CPU 2 goes back to idle with NOHZ_BALANCE_KICK set

Whenever CPU 2 will be selected as the ILB, no reschedule IPI will be sent
because NOHZ_BALANCE_KICK is already set and no Idle Load Balance will be
performed.

We must wait for the sched softirq to be raised on CPU 2 thanks to another
part the kernel to come back to clear NOHZ_BALANCE_KICK.

The proposed solution clears NOHZ_BALANCE_KICK in schedule_ipi if
we can't raise the sched_softirq for the Idle Load Balance.

Change since V1:

- move the clear of NOHZ_BALANCE_KICK in got_nohz_idle_kick if the ILB
  can't run on this CPU (as suggested by Peter)

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1370419991-13870-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 58453b8272fd..919bee68032b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu)
 static inline bool got_nohz_idle_kick(void)
 {
 	int cpu = smp_processor_id();
-	return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+
+	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+		return false;
+
+	if (idle_cpu(cpu) && !need_resched())
+		return true;
+
+	/*
+	 * We can't run Idle Load Balance on this CPU for this time so we
+	 * cancel it and clear NOHZ_BALANCE_KICK
+	 */
+	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+	return false;
 }
 
 #else /* CONFIG_NO_HZ_COMMON */
@@ -1393,8 +1405,9 @@ static void sched_ttwu_pending(void)
 
 void scheduler_ipi(void)
 {
-	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
-	    && !tick_nohz_full_cpu(smp_processor_id()))
+	if (llist_empty(&this_rq()->wake_list)
+			&& !tick_nohz_full_cpu(smp_processor_id())
+			&& !got_nohz_idle_kick())
 		return;
 
 	/*
@@ -1417,7 +1430,7 @@ void scheduler_ipi(void)
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
 	 */
-	if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+	if (unlikely(got_nohz_idle_kick())) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}
-- 
cgit v1.2.3


From 29bb9e5a75684106a37593ad75ec75ff8312731b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 24 May 2013 15:23:40 -0400
Subject: tracing/context-tracking: Add preempt_schedule_context() for tracing

Dave Jones hit the following bug report:

 ===============================
 [ INFO: suspicious RCU usage. ]
 3.10.0-rc2+ #1 Not tainted
 -------------------------------
 include/linux/rcupdate.h:771 rcu_read_lock() used illegally while idle!
 other info that might help us debug this:
 RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 0
 RCU used illegally from extended quiescent state!
 2 locks held by cc1/63645:
  #0:  (&rq->lock){-.-.-.}, at: [<ffffffff816b39fd>] __schedule+0xed/0x9b0
  #1:  (rcu_read_lock){.+.+..}, at: [<ffffffff8109d645>] cpuacct_charge+0x5/0x1f0

 CPU: 1 PID: 63645 Comm: cc1 Not tainted 3.10.0-rc2+ #1 [loadavg: 40.57 27.55 13.39 25/277 64369]
 Hardware name: Gigabyte Technology Co., Ltd. GA-MA78GM-S2H/GA-MA78GM-S2H, BIOS F12a 04/23/2010
  0000000000000000 ffff88010f78fcf8 ffffffff816ae383 ffff88010f78fd28
  ffffffff810b698d ffff88011c092548 000000000023d073 ffff88011c092500
  0000000000000001 ffff88010f78fd60 ffffffff8109d7c5 ffffffff8109d645
 Call Trace:
  [<ffffffff816ae383>] dump_stack+0x19/0x1b
  [<ffffffff810b698d>] lockdep_rcu_suspicious+0xfd/0x130
  [<ffffffff8109d7c5>] cpuacct_charge+0x185/0x1f0
  [<ffffffff8109d645>] ? cpuacct_charge+0x5/0x1f0
  [<ffffffff8108dffc>] update_curr+0xec/0x240
  [<ffffffff8108f528>] put_prev_task_fair+0x228/0x480
  [<ffffffff816b3a71>] __schedule+0x161/0x9b0
  [<ffffffff816b4721>] preempt_schedule+0x51/0x80
  [<ffffffff816b4800>] ? __cond_resched_softirq+0x60/0x60
  [<ffffffff816b6824>] ? retint_careful+0x12/0x2e
  [<ffffffff810ff3cc>] ftrace_ops_control_func+0x1dc/0x210
  [<ffffffff816be280>] ftrace_call+0x5/0x2f
  [<ffffffff816b681d>] ? retint_careful+0xb/0x2e
  [<ffffffff816b4805>] ? schedule_user+0x5/0x70
  [<ffffffff816b4805>] ? schedule_user+0x5/0x70
  [<ffffffff816b6824>] ? retint_careful+0x12/0x2e
 ------------[ cut here ]------------

What happened was that the function tracer traced the schedule_user() code
that tells RCU that the system is coming back from userspace, and to
add the CPU back to the RCU monitoring.

Because the function tracer does a preempt_disable/enable_notrace() calls
the preempt_enable_notrace() checks the NEED_RESCHED flag. If it is set,
then preempt_schedule() is called. But this is called before the user_exit()
function can inform the kernel that the CPU is no longer in user mode and
needs to be accounted for by RCU.

The fix is to create a new preempt_schedule_context() that checks if
the kernel is still in user mode and if so to switch it to kernel mode
before calling schedule. It also switches back to user mode coming back
from schedule in need be.

The only user of this currently is the preempt_enable_notrace(), which is
only used by the tracing subsystem.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1369423420.6828.226.camel@gandalf.local.home
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/context_tracking.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'kernel')

diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 65349f07b878..66677003e223 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -71,6 +71,46 @@ void user_enter(void)
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_PREEMPT
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+void __sched notrace preempt_schedule_context(void)
+{
+	struct thread_info *ti = current_thread_info();
+	enum ctx_state prev_ctx;
+
+	if (likely(ti->preempt_count || irqs_disabled()))
+		return;
+
+	/*
+	 * Need to disable preemption in case user_exit() is traced
+	 * and the tracer calls preempt_enable_notrace() causing
+	 * an infinite recursion.
+	 */
+	preempt_disable_notrace();
+	prev_ctx = exception_enter();
+	preempt_enable_no_resched_notrace();
+
+	preempt_schedule();
+
+	preempt_disable_notrace();
+	exception_exit(prev_ctx);
+	preempt_enable_notrace();
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_PREEMPT */
 
 /**
  * user_exit - Inform the context tracking that the CPU is
-- 
cgit v1.2.3


From 8b4d801b2b123b6c09742f861fe44a8527b84d47 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 20 Jun 2013 17:50:06 +0200
Subject: hw_breakpoint: Fix cpu check in task_bp_pinned(cpu)

trinity fuzzer triggered WARN_ONCE("Can't find any breakpoint
slot") in arch_install_hw_breakpoint() but the problem is not
arch-specific.

The problem is, task_bp_pinned(cpu) checks "cpu == iter->cpu"
but this doesn't account the "all cpus" events with iter->cpu <
0.

This means that, say, register_user_hw_breakpoint(tsk) can
happily create the arbitrary number > HBP_NUM of breakpoints
which can not be activated. toggle_bp_task_slot() is equally
wrong by the same reason and nr_task_bp_pinned[] can have
negative entries.

Simple test:

	# perl -e 'sleep 1 while 1' &
	# perf record -e mem:0x10,mem:0x10,mem:0x10,mem:0x10,mem:0x10 -p `pidof perl`

Before this patch this triggers the same problem/WARN_ON(),
after the patch it correctly fails with -ENOSPC.

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/20130620155006.GA6324@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/hw_breakpoint.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index a64f8aeb5c1f..a853deabe6cf 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -120,7 +120,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
 		if (iter->hw.bp_target == tsk &&
 		    find_slot_idx(iter) == type &&
-		    cpu == iter->cpu)
+		    (iter->cpu < 0 || cpu == iter->cpu))
 			count += hw_breakpoint_weight(iter);
 	}
 
-- 
cgit v1.2.3


From c790b0ad23f427c7522ffed264706238c57c007e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 20 Jun 2013 17:50:09 +0200
Subject: hw_breakpoint: Use cpu_possible_mask in {reserve,release}_bp_slot()

fetch_bp_busy_slots() and toggle_bp_slot() use
for_each_online_cpu(), this is obviously wrong wrt cpu_up() or
cpu_down(), we can over/under account the per-cpu numbers.

For example:

	# echo 0 >> /sys/devices/system/cpu/cpu1/online
	# perf record -e mem:0x10 -p 1 &
	# echo 1 >> /sys/devices/system/cpu/cpu1/online
	# perf record -e mem:0x10,mem:0x10,mem:0x10,mem:0x10 -C1 -a &
	# taskset -p 0x2 1

triggers the same WARN_ONCE("Can't find any breakpoint slot") in
arch_install_hw_breakpoint().

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/20130620155009.GA6327@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/hw_breakpoint.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index a853deabe6cf..20185ea64aa6 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -149,7 +149,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		return;
 	}
 
-	for_each_online_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		unsigned int nr;
 
 		nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -235,7 +235,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 	if (cpu >= 0) {
 		toggle_bp_task_slot(bp, cpu, enable, type, weight);
 	} else {
-		for_each_online_cpu(cpu)
+		for_each_possible_cpu(cpu)
 			toggle_bp_task_slot(bp, cpu, enable, type, weight);
 	}
 
-- 
cgit v1.2.3


From ea8deb8dfa6b0e8d1b3d1051585706739b46656c Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Mon, 17 Jun 2013 18:15:35 +0200
Subject: tick: Fix tick_broadcast_pending_mask not cleared

The recent modification in the cpuidle framework consolidated the
timer broadcast code across the different drivers by setting a new
flag in the idle state. It tells the cpuidle core code to enter/exit
the broadcast mode for the cpu when entering a deep idle state. The
broadcast timer enter/exit is no longer handled by the back-end
driver.

This change made the local interrupt to be enabled *before* calling
CLOCK_EVENT_NOTIFY_EXIT.

On a tegra114, a four cores system, when the flag has been introduced
in the driver, the following warning appeared:

WARNING: at kernel/time/tick-broadcast.c:578 tick_broadcast_oneshot_control
CPU: 2 PID: 0 Comm: swapper/2 Not tainted 3.10.0-rc3-next-20130529+ #15
[<c00667f8>] (tick_broadcast_oneshot_control+0x1a4/0x1d0) from [<c0065cd0>] (tick_notify+0x240/0x40c)
[<c0065cd0>] (tick_notify+0x240/0x40c) from [<c0044724>] (notifier_call_chain+0x44/0x84)
[<c0044724>] (notifier_call_chain+0x44/0x84) from [<c0044828>] (raw_notifier_call_chain+0x18/0x20)
[<c0044828>] (raw_notifier_call_chain+0x18/0x20) from [<c00650cc>] (clockevents_notify+0x28/0x170)
[<c00650cc>] (clockevents_notify+0x28/0x170) from [<c033f1f0>] (cpuidle_idle_call+0x11c/0x168)
[<c033f1f0>] (cpuidle_idle_call+0x11c/0x168) from [<c000ea94>] (arch_cpu_idle+0x8/0x38)
[<c000ea94>] (arch_cpu_idle+0x8/0x38) from [<c005ea80>] (cpu_startup_entry+0x60/0x134)
[<c005ea80>] (cpu_startup_entry+0x60/0x134) from [<804fe9a4>] (0x804fe9a4)

I don't have the hardware, so I wasn't able to reproduce the warning
but after looking a while at the code, I deduced the following:

 1. the CPU2 enters a deep idle state and sets the broadcast timer

 2. the timer expires, the tick_handle_oneshot_broadcast function is
    called, setting the tick_broadcast_pending_mask and waking up the
    idle cpu CPU2

 3. the CPU2 exits idle handles the interrupt and then invokes
    tick_broadcast_oneshot_control with CLOCK_EVENT_NOTIFY_EXIT which
    runs the following code:

    [...]
    if (dev->next_event.tv64 == KTIME_MAX)
            goto out;

    if (cpumask_test_and_clear_cpu(cpu,
                                 tick_broadcast_pending_mask))
            goto out;
    [...]

    So if there is no next event scheduled for CPU2, we fulfil the
    first condition and jump out without clearing the
    tick_broadcast_pending_mask.

 4. CPU2 goes to deep idle again and calls
    tick_broadcast_oneshot_control with CLOCK_NOTIFY_EVENT_ENTER but
    with the tick_broadcast_pending_mask set for CPU2, triggering the
    warning.

The issue only surfaced due to the modifications of the cpuidle
framework, which resulted in interrupts being enabled before the call
to the clockevents code. If the call happens before interrupts have
been enabled, the warning cannot trigger, because there is still the
event pending which caused the broadcast timer expiry.

Move the check for the next event below the check for the pending bit,
so the pending bit gets cleared whether an event is scheduled on the
cpu or not.

[ tglx: Massaged changelog ]

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reported-and-tested-by: Joseph Lo <josephl@nvidia.com>
Cc: Stephen Warren <swarren@nvidia.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linaro-kernel@lists.linaro.org
Link: http://lkml.kernel.org/r/1371485735-31249-1-git-send-email-daniel.lezcano@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b4c245580b79..20d6fba70652 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -599,8 +599,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 	} else {
 		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
 			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
-			if (dev->next_event.tv64 == KTIME_MAX)
-				goto out;
 			/*
 			 * The cpu which was handling the broadcast
 			 * timer marked this cpu in the broadcast
@@ -614,6 +612,11 @@ void tick_broadcast_oneshot_control(unsigned long reason)
 				       tick_broadcast_pending_mask))
 				goto out;
 
+			/*
+			 * Bail out if there is no next event.
+			 */
+			if (dev->next_event.tv64 == KTIME_MAX)
+				goto out;
 			/*
 			 * If the pending bit is not set, then we are
 			 * either the CPU handling the broadcast
-- 
cgit v1.2.3


From 706b23bde27a391f0974df2a8351661770fa2e07 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 28 Jun 2013 09:49:46 -0400
Subject: Fix: kernel/ptrace.c: ptrace_peek_siginfo() missing __put_user()
 validation

This __put_user() could be used by unprivileged processes to write into
kernel memory.  The issue here is that even if copy_siginfo_to_user()
fails, the error code is not checked before __put_user() is executed.

Luckily, ptrace_peek_siginfo() has been added within the 3.10-rc cycle,
so it has not hit a stable release yet.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andrey Vagin <avagin@openvz.org>
Cc: Roland McGrath <roland@redhat.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Pedro Alves <palves@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aed981a3f69c..335a7ae697f5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -665,20 +665,22 @@ static int ptrace_peek_siginfo(struct task_struct *child,
 		if (unlikely(is_compat_task())) {
 			compat_siginfo_t __user *uinfo = compat_ptr(data);
 
-			ret = copy_siginfo_to_user32(uinfo, &info);
-			ret |= __put_user(info.si_code, &uinfo->si_code);
+			if (copy_siginfo_to_user32(uinfo, &info) ||
+			    __put_user(info.si_code, &uinfo->si_code)) {
+				ret = -EFAULT;
+				break;
+			}
+
 		} else
 #endif
 		{
 			siginfo_t __user *uinfo = (siginfo_t __user *) data;
 
-			ret = copy_siginfo_to_user(uinfo, &info);
-			ret |= __put_user(info.si_code, &uinfo->si_code);
-		}
-
-		if (ret) {
-			ret = -EFAULT;
-			break;
+			if (copy_siginfo_to_user(uinfo, &info) ||
+			    __put_user(info.si_code, &uinfo->si_code)) {
+				ret = -EFAULT;
+				break;
+			}
 		}
 
 		data += sizeof(siginfo_t);
-- 
cgit v1.2.3


From 55ccb616a6e42052edb37e9c4f82cf8854a59429 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 28 Jun 2013 00:06:42 +0000
Subject: posix_cpu_timer: consolidate expiry time type

The posix cpu timer expiry time is stored in a union of two types: a 64
bits field if we rely on scheduler precise accounting, or a cputime_t if
we rely on jiffies.

This results in quite some duplicate code and special cases to handle the
two types.

Just unify this into a single 64 bits field.  cputime_t can always fit
into it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Olivier Langlois <olivier@trillion01.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/posix-cpu-timers.c | 266 ++++++++++++++++++----------------------------
 1 file changed, 106 insertions(+), 160 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 42670e9b44e0..c3c4ea1225a4 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock)
 	return error;
 }
 
-static inline union cpu_time_count
+static inline unsigned long long
 timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
 {
-	union cpu_time_count ret;
-	ret.sched = 0;		/* high half always zero when .cpu used */
+	unsigned long long ret;
+
+	ret = 0;		/* high half always zero when .cpu used */
 	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-		ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
+		ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
 	} else {
-		ret.cpu = timespec_to_cputime(tp);
+		ret = cputime_to_expires(timespec_to_cputime(tp));
 	}
 	return ret;
 }
 
 static void sample_to_timespec(const clockid_t which_clock,
-			       union cpu_time_count cpu,
+			       unsigned long long expires,
 			       struct timespec *tp)
 {
 	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
-		*tp = ns_to_timespec(cpu.sched);
+		*tp = ns_to_timespec(expires);
 	else
-		cputime_to_timespec(cpu.cpu, tp);
-}
-
-static inline int cpu_time_before(const clockid_t which_clock,
-				  union cpu_time_count now,
-				  union cpu_time_count then)
-{
-	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-		return now.sched < then.sched;
-	}  else {
-		return now.cpu < then.cpu;
-	}
-}
-static inline void cpu_time_add(const clockid_t which_clock,
-				union cpu_time_count *acc,
-			        union cpu_time_count val)
-{
-	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-		acc->sched += val.sched;
-	}  else {
-		acc->cpu += val.cpu;
-	}
-}
-static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
-						union cpu_time_count a,
-						union cpu_time_count b)
-{
-	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-		a.sched -= b.sched;
-	}  else {
-		a.cpu -= b.cpu;
-	}
-	return a;
+		cputime_to_timespec((__force cputime_t)expires, tp);
 }
 
 /*
@@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
  * given the current clock sample.
  */
 static void bump_cpu_timer(struct k_itimer *timer,
-				  union cpu_time_count now)
+			   unsigned long long now)
 {
 	int i;
+	unsigned long long delta, incr;
 
-	if (timer->it.cpu.incr.sched == 0)
+	if (timer->it.cpu.incr == 0)
 		return;
 
-	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
-		unsigned long long delta, incr;
+	if (now < timer->it.cpu.expires)
+		return;
 
-		if (now.sched < timer->it.cpu.expires.sched)
-			return;
-		incr = timer->it.cpu.incr.sched;
-		delta = now.sched + incr - timer->it.cpu.expires.sched;
-		/* Don't use (incr*2 < delta), incr*2 might overflow. */
-		for (i = 0; incr < delta - incr; i++)
-			incr = incr << 1;
-		for (; i >= 0; incr >>= 1, i--) {
-			if (delta < incr)
-				continue;
-			timer->it.cpu.expires.sched += incr;
-			timer->it_overrun += 1 << i;
-			delta -= incr;
-		}
-	} else {
-		cputime_t delta, incr;
+	incr = timer->it.cpu.incr;
+	delta = now + incr - timer->it.cpu.expires;
 
-		if (now.cpu < timer->it.cpu.expires.cpu)
-			return;
-		incr = timer->it.cpu.incr.cpu;
-		delta = now.cpu + incr - timer->it.cpu.expires.cpu;
-		/* Don't use (incr*2 < delta), incr*2 might overflow. */
-		for (i = 0; incr < delta - incr; i++)
-			     incr += incr;
-		for (; i >= 0; incr = incr >> 1, i--) {
-			if (delta < incr)
-				continue;
-			timer->it.cpu.expires.cpu += incr;
-			timer->it_overrun += 1 << i;
-			delta -= incr;
-		}
+	/* Don't use (incr*2 < delta), incr*2 might overflow. */
+	for (i = 0; incr < delta - incr; i++)
+		incr = incr << 1;
+
+	for (; i >= 0; incr >>= 1, i--) {
+		if (delta < incr)
+			continue;
+
+		timer->it.cpu.expires += incr;
+		timer->it_overrun += 1 << i;
+		delta -= incr;
 	}
 }
 
@@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime)
 	return 0;
 }
 
-static inline cputime_t prof_ticks(struct task_struct *p)
+static inline unsigned long long prof_ticks(struct task_struct *p)
 {
 	cputime_t utime, stime;
 
 	task_cputime(p, &utime, &stime);
 
-	return utime + stime;
+	return cputime_to_expires(utime + stime);
 }
-static inline cputime_t virt_ticks(struct task_struct *p)
+static inline unsigned long long virt_ticks(struct task_struct *p)
 {
 	cputime_t utime;
 
 	task_cputime(p, &utime, NULL);
 
-	return utime;
+	return cputime_to_expires(utime);
 }
 
 static int
@@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
  * Sample a per-thread clock for the given task.
  */
 static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
-			    union cpu_time_count *cpu)
+			    unsigned long long *sample)
 {
 	switch (CPUCLOCK_WHICH(which_clock)) {
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
-		cpu->cpu = prof_ticks(p);
+		*sample = prof_ticks(p);
 		break;
 	case CPUCLOCK_VIRT:
-		cpu->cpu = virt_ticks(p);
+		*sample = virt_ticks(p);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = task_sched_runtime(p);
+		*sample = task_sched_runtime(p);
 		break;
 	}
 	return 0;
@@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
  */
 static int cpu_clock_sample_group(const clockid_t which_clock,
 				  struct task_struct *p,
-				  union cpu_time_count *cpu)
+				  unsigned long long *sample)
 {
 	struct task_cputime cputime;
 
@@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 		return -EINVAL;
 	case CPUCLOCK_PROF:
 		thread_group_cputime(p, &cputime);
-		cpu->cpu = cputime.utime + cputime.stime;
+		*sample = cputime_to_expires(cputime.utime + cputime.stime);
 		break;
 	case CPUCLOCK_VIRT:
 		thread_group_cputime(p, &cputime);
-		cpu->cpu = cputime.utime;
+		*sample = cputime_to_expires(cputime.utime);
 		break;
 	case CPUCLOCK_SCHED:
 		thread_group_cputime(p, &cputime);
-		cpu->sched = cputime.sum_exec_runtime;
+		*sample = cputime.sum_exec_runtime;
 		break;
 	}
 	return 0;
@@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 {
 	const pid_t pid = CPUCLOCK_PID(which_clock);
 	int error = -EINVAL;
-	union cpu_time_count rtn;
+	unsigned long long rtn;
 
 	if (pid == 0) {
 		/*
@@ -461,30 +414,30 @@ static void cleanup_timers(struct list_head *head,
 
 	list_for_each_entry_safe(timer, next, head, entry) {
 		list_del_init(&timer->entry);
-		if (timer->expires.cpu < ptime) {
-			timer->expires.cpu = 0;
+		if (timer->expires < cputime_to_expires(ptime)) {
+			timer->expires = 0;
 		} else {
-			timer->expires.cpu -= ptime;
+			timer->expires -= cputime_to_expires(ptime);
 		}
 	}
 
 	++head;
 	list_for_each_entry_safe(timer, next, head, entry) {
 		list_del_init(&timer->entry);
-		if (timer->expires.cpu < utime) {
-			timer->expires.cpu = 0;
+		if (timer->expires < cputime_to_expires(utime)) {
+			timer->expires = 0;
 		} else {
-			timer->expires.cpu -= utime;
+			timer->expires -= cputime_to_expires(utime);
 		}
 	}
 
 	++head;
 	list_for_each_entry_safe(timer, next, head, entry) {
 		list_del_init(&timer->entry);
-		if (timer->expires.sched < sum_exec_runtime) {
-			timer->expires.sched = 0;
+		if (timer->expires < sum_exec_runtime) {
+			timer->expires = 0;
 		} else {
-			timer->expires.sched -= sum_exec_runtime;
+			timer->expires -= sum_exec_runtime;
 		}
 	}
 }
@@ -516,7 +469,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
 		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
 }
 
-static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
+static void clear_dead_task(struct k_itimer *timer, unsigned long long now)
 {
 	/*
 	 * That's all for this thread or process.
@@ -524,9 +477,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
 	 */
 	put_task_struct(timer->it.cpu.task);
 	timer->it.cpu.task = NULL;
-	timer->it.cpu.expires = cpu_time_sub(timer->it_clock,
-					     timer->it.cpu.expires,
-					     now);
+	timer->it.cpu.expires -= now;
 }
 
 static inline int expires_gt(cputime_t expires, cputime_t new_exp)
@@ -558,14 +509,14 @@ static void arm_timer(struct k_itimer *timer)
 
 	listpos = head;
 	list_for_each_entry(next, head, entry) {
-		if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
+		if (nt->expires < next->expires)
 			break;
 		listpos = &next->entry;
 	}
 	list_add(&nt->entry, listpos);
 
 	if (listpos == head) {
-		union cpu_time_count *exp = &nt->expires;
+		unsigned long long exp = nt->expires;
 
 		/*
 		 * We are the new earliest-expiring POSIX 1.b timer, hence
@@ -576,17 +527,17 @@ static void arm_timer(struct k_itimer *timer)
 
 		switch (CPUCLOCK_WHICH(timer->it_clock)) {
 		case CPUCLOCK_PROF:
-			if (expires_gt(cputime_expires->prof_exp, exp->cpu))
-				cputime_expires->prof_exp = exp->cpu;
+			if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
+				cputime_expires->prof_exp = expires_to_cputime(exp);
 			break;
 		case CPUCLOCK_VIRT:
-			if (expires_gt(cputime_expires->virt_exp, exp->cpu))
-				cputime_expires->virt_exp = exp->cpu;
+			if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
+				cputime_expires->virt_exp = expires_to_cputime(exp);
 			break;
 		case CPUCLOCK_SCHED:
 			if (cputime_expires->sched_exp == 0 ||
-			    cputime_expires->sched_exp > exp->sched)
-				cputime_expires->sched_exp = exp->sched;
+			    cputime_expires->sched_exp > exp)
+				cputime_expires->sched_exp = exp;
 			break;
 		}
 	}
@@ -601,20 +552,20 @@ static void cpu_timer_fire(struct k_itimer *timer)
 		/*
 		 * User don't want any signal.
 		 */
-		timer->it.cpu.expires.sched = 0;
+		timer->it.cpu.expires = 0;
 	} else if (unlikely(timer->sigq == NULL)) {
 		/*
 		 * This a special case for clock_nanosleep,
 		 * not a normal timer from sys_timer_create.
 		 */
 		wake_up_process(timer->it_process);
-		timer->it.cpu.expires.sched = 0;
-	} else if (timer->it.cpu.incr.sched == 0) {
+		timer->it.cpu.expires = 0;
+	} else if (timer->it.cpu.incr == 0) {
 		/*
 		 * One-shot timer.  Clear it as soon as it's fired.
 		 */
 		posix_timer_event(timer, 0);
-		timer->it.cpu.expires.sched = 0;
+		timer->it.cpu.expires = 0;
 	} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
 		/*
 		 * The signal did not get queued because the signal
@@ -632,7 +583,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
  */
 static int cpu_timer_sample_group(const clockid_t which_clock,
 				  struct task_struct *p,
-				  union cpu_time_count *cpu)
+				  unsigned long long *sample)
 {
 	struct task_cputime cputime;
 
@@ -641,13 +592,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
 	default:
 		return -EINVAL;
 	case CPUCLOCK_PROF:
-		cpu->cpu = cputime.utime + cputime.stime;
+		*sample = cputime_to_expires(cputime.utime + cputime.stime);
 		break;
 	case CPUCLOCK_VIRT:
-		cpu->cpu = cputime.utime;
+		*sample = cputime_to_expires(cputime.utime);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+		*sample = cputime.sum_exec_runtime + task_delta_exec(p);
 		break;
 	}
 	return 0;
@@ -694,7 +645,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 			       struct itimerspec *new, struct itimerspec *old)
 {
 	struct task_struct *p = timer->it.cpu.task;
-	union cpu_time_count old_expires, new_expires, old_incr, val;
+	unsigned long long old_expires, new_expires, old_incr, val;
 	int ret;
 
 	if (unlikely(p == NULL)) {
@@ -749,7 +700,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 	}
 
 	if (old) {
-		if (old_expires.sched == 0) {
+		if (old_expires == 0) {
 			old->it_value.tv_sec = 0;
 			old->it_value.tv_nsec = 0;
 		} else {
@@ -764,11 +715,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 			 * new setting.
 			 */
 			bump_cpu_timer(timer, val);
-			if (cpu_time_before(timer->it_clock, val,
-					    timer->it.cpu.expires)) {
-				old_expires = cpu_time_sub(
-					timer->it_clock,
-					timer->it.cpu.expires, val);
+			if (val < timer->it.cpu.expires) {
+				old_expires = timer->it.cpu.expires - val;
 				sample_to_timespec(timer->it_clock,
 						   old_expires,
 						   &old->it_value);
@@ -791,8 +739,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 		goto out;
 	}
 
-	if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) {
-		cpu_time_add(timer->it_clock, &new_expires, val);
+	if (new_expires != 0 && !(flags & TIMER_ABSTIME)) {
+		new_expires += val;
 	}
 
 	/*
@@ -801,8 +749,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 	 * arm the timer (we'll just fake it for timer_gettime).
 	 */
 	timer->it.cpu.expires = new_expires;
-	if (new_expires.sched != 0 &&
-	    cpu_time_before(timer->it_clock, val, new_expires)) {
+	if (new_expires != 0 && val < new_expires) {
 		arm_timer(timer);
 	}
 
@@ -826,8 +773,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 	timer->it_overrun_last = 0;
 	timer->it_overrun = -1;
 
-	if (new_expires.sched != 0 &&
-	    !cpu_time_before(timer->it_clock, val, new_expires)) {
+	if (new_expires != 0 && !(val < new_expires)) {
 		/*
 		 * The designated time already passed, so we notify
 		 * immediately, even if the thread never runs to
@@ -849,7 +795,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 
 static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 {
-	union cpu_time_count now;
+	unsigned long long now;
 	struct task_struct *p = timer->it.cpu.task;
 	int clear_dead;
 
@@ -859,7 +805,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 	sample_to_timespec(timer->it_clock,
 			   timer->it.cpu.incr, &itp->it_interval);
 
-	if (timer->it.cpu.expires.sched == 0) {	/* Timer not armed at all.  */
+	if (timer->it.cpu.expires == 0) {	/* Timer not armed at all.  */
 		itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
 		return;
 	}
@@ -891,7 +837,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 			 */
 			put_task_struct(p);
 			timer->it.cpu.task = NULL;
-			timer->it.cpu.expires.sched = 0;
+			timer->it.cpu.expires = 0;
 			read_unlock(&tasklist_lock);
 			goto dead;
 		} else {
@@ -912,10 +858,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 		goto dead;
 	}
 
-	if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) {
+	if (now < timer->it.cpu.expires) {
 		sample_to_timespec(timer->it_clock,
-				   cpu_time_sub(timer->it_clock,
-						timer->it.cpu.expires, now),
+				   timer->it.cpu.expires - now,
 				   &itp->it_value);
 	} else {
 		/*
@@ -946,8 +891,8 @@ static void check_thread_timers(struct task_struct *tsk,
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
-			tsk->cputime_expires.prof_exp = t->expires.cpu;
+		if (!--maxfire || prof_ticks(tsk) < t->expires) {
+			tsk->cputime_expires.prof_exp = expires_to_cputime(t->expires);
 			break;
 		}
 		t->firing = 1;
@@ -961,8 +906,8 @@ static void check_thread_timers(struct task_struct *tsk,
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
-			tsk->cputime_expires.virt_exp = t->expires.cpu;
+		if (!--maxfire || virt_ticks(tsk) < t->expires) {
+			tsk->cputime_expires.virt_exp = expires_to_cputime(t->expires);
 			break;
 		}
 		t->firing = 1;
@@ -976,8 +921,8 @@ static void check_thread_timers(struct task_struct *tsk,
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
-			tsk->cputime_expires.sched_exp = t->expires.sched;
+		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires) {
+			tsk->cputime_expires.sched_exp = t->expires;
 			break;
 		}
 		t->firing = 1;
@@ -1030,7 +975,8 @@ static void stop_process_timers(struct signal_struct *sig)
 static u32 onecputick;
 
 static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
-			     cputime_t *expires, cputime_t cur_time, int signo)
+			     unsigned long long *expires,
+			     unsigned long long cur_time, int signo)
 {
 	if (!it->expires)
 		return;
@@ -1068,7 +1014,7 @@ static void check_process_timers(struct task_struct *tsk,
 {
 	int maxfire;
 	struct signal_struct *const sig = tsk->signal;
-	cputime_t utime, ptime, virt_expires, prof_expires;
+	unsigned long long utime, ptime, virt_expires, prof_expires;
 	unsigned long long sum_sched_runtime, sched_expires;
 	struct list_head *timers = sig->cpu_timers;
 	struct task_cputime cputime;
@@ -1078,8 +1024,8 @@ static void check_process_timers(struct task_struct *tsk,
 	 * Collect the current process totals.
 	 */
 	thread_group_cputimer(tsk, &cputime);
-	utime = cputime.utime;
-	ptime = utime + cputime.stime;
+	utime = cputime_to_expires(cputime.utime);
+	ptime = utime + cputime_to_expires(cputime.stime);
 	sum_sched_runtime = cputime.sum_exec_runtime;
 	maxfire = 20;
 	prof_expires = 0;
@@ -1087,8 +1033,8 @@ static void check_process_timers(struct task_struct *tsk,
 		struct cpu_timer_list *tl = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || ptime < tl->expires.cpu) {
-			prof_expires = tl->expires.cpu;
+		if (!--maxfire || ptime < tl->expires) {
+			prof_expires = tl->expires;
 			break;
 		}
 		tl->firing = 1;
@@ -1102,8 +1048,8 @@ static void check_process_timers(struct task_struct *tsk,
 		struct cpu_timer_list *tl = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || utime < tl->expires.cpu) {
-			virt_expires = tl->expires.cpu;
+		if (!--maxfire || utime < tl->expires) {
+			virt_expires = tl->expires;
 			break;
 		}
 		tl->firing = 1;
@@ -1117,8 +1063,8 @@ static void check_process_timers(struct task_struct *tsk,
 		struct cpu_timer_list *tl = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
-			sched_expires = tl->expires.sched;
+		if (!--maxfire || sum_sched_runtime < tl->expires) {
+			sched_expires = tl->expires;
 			break;
 		}
 		tl->firing = 1;
@@ -1162,8 +1108,8 @@ static void check_process_timers(struct task_struct *tsk,
 		}
 	}
 
-	sig->cputime_expires.prof_exp = prof_expires;
-	sig->cputime_expires.virt_exp = virt_expires;
+	sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
+	sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
 	sig->cputime_expires.sched_exp = sched_expires;
 	if (task_cputime_zero(&sig->cputime_expires))
 		stop_process_timers(sig);
@@ -1176,7 +1122,7 @@ static void check_process_timers(struct task_struct *tsk,
 void posix_cpu_timer_schedule(struct k_itimer *timer)
 {
 	struct task_struct *p = timer->it.cpu.task;
-	union cpu_time_count now;
+	unsigned long long now;
 
 	if (unlikely(p == NULL))
 		/*
@@ -1205,7 +1151,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 			 */
 			put_task_struct(p);
 			timer->it.cpu.task = p = NULL;
-			timer->it.cpu.expires.sched = 0;
+			timer->it.cpu.expires = 0;
 			goto out_unlock;
 		} else if (unlikely(p->exit_state) && thread_group_empty(p)) {
 			/*
@@ -1387,7 +1333,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 			   cputime_t *newval, cputime_t *oldval)
 {
-	union cpu_time_count now;
+	unsigned long long now;
 
 	BUG_ON(clock_idx == CPUCLOCK_SCHED);
 	cpu_timer_sample_group(clock_idx, tsk, &now);
@@ -1399,17 +1345,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 		 * it to be absolute.
 		 */
 		if (*oldval) {
-			if (*oldval <= now.cpu) {
+			if (*oldval <= now) {
 				/* Just about to fire. */
 				*oldval = cputime_one_jiffy;
 			} else {
-				*oldval -= now.cpu;
+				*oldval -= now;
 			}
 		}
 
 		if (!*newval)
 			goto out;
-		*newval += now.cpu;
+		*newval += now;
 	}
 
 	/*
@@ -1459,7 +1405,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		}
 
 		while (!signal_pending(current)) {
-			if (timer.it.cpu.expires.sched == 0) {
+			if (timer.it.cpu.expires == 0) {
 				/*
 				 * Our timer fired and was reset, below
 				 * deletion can not fail.
-- 
cgit v1.2.3


From 1a7fa510b38e518d11365883934f1afa41625424 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 28 Jun 2013 00:06:42 +0000
Subject: posix_cpu_timers: consolidate timer list cleanups

Cleaning up the posix cpu timers on task exit shares some common code
among timer list types, most notably the list traversal and expiry time
update.

Unify this in a common helper.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Olivier Langlois <olivier@trillion01.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/posix-cpu-timers.c | 48 +++++++++++++++++++----------------------------
 1 file changed, 19 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c3c4ea1225a4..b1450cee6d6d 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -399,6 +399,21 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
 	return ret;
 }
 
+static void cleanup_timers_list(struct list_head *head,
+				unsigned long long curr)
+{
+	struct cpu_timer_list *timer, *next;
+
+	list_for_each_entry_safe(timer, next, head, entry) {
+		list_del_init(&timer->entry);
+		if (timer->expires < curr) {
+			timer->expires = 0;
+		} else {
+			timer->expires -= curr;
+		}
+	}
+}
+
 /*
  * Clean out CPU timers still ticking when a thread exited.  The task
  * pointer is cleared, and the expiry time is replaced with the residual
@@ -409,37 +424,12 @@ static void cleanup_timers(struct list_head *head,
 			   cputime_t utime, cputime_t stime,
 			   unsigned long long sum_exec_runtime)
 {
-	struct cpu_timer_list *timer, *next;
-	cputime_t ptime = utime + stime;
 
-	list_for_each_entry_safe(timer, next, head, entry) {
-		list_del_init(&timer->entry);
-		if (timer->expires < cputime_to_expires(ptime)) {
-			timer->expires = 0;
-		} else {
-			timer->expires -= cputime_to_expires(ptime);
-		}
-	}
-
-	++head;
-	list_for_each_entry_safe(timer, next, head, entry) {
-		list_del_init(&timer->entry);
-		if (timer->expires < cputime_to_expires(utime)) {
-			timer->expires = 0;
-		} else {
-			timer->expires -= cputime_to_expires(utime);
-		}
-	}
+	cputime_t ptime = utime + stime;
 
-	++head;
-	list_for_each_entry_safe(timer, next, head, entry) {
-		list_del_init(&timer->entry);
-		if (timer->expires < sum_exec_runtime) {
-			timer->expires = 0;
-		} else {
-			timer->expires -= sum_exec_runtime;
-		}
-	}
+	cleanup_timers_list(head, cputime_to_expires(ptime));
+	cleanup_timers_list(++head, cputime_to_expires(utime));
+	cleanup_timers_list(++head, sum_exec_runtime);
 }
 
 /*
-- 
cgit v1.2.3


From 2473f3e7a97ce8bc0fe7596cdb361b21221418eb Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 28 Jun 2013 00:06:43 +0000
Subject: posix_cpu_timers: consolidate expired timers check

Consolidate the common code amongst per thread and per process timers list
on tick time.

List traversal, expiry check and subsequent updates can be shared in a
common helper.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Olivier Langlois <olivier@trillion01.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/posix-cpu-timers.c | 118 +++++++++++++---------------------------------
 1 file changed, 33 insertions(+), 85 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index b1450cee6d6d..92a4fbf44f86 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -862,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 	}
 }
 
+static unsigned long long
+check_timers_list(struct list_head *timers,
+		  struct list_head *firing,
+		  unsigned long long curr)
+{
+	int maxfire = 20;
+
+	while (!list_empty(timers)) {
+		struct cpu_timer_list *t;
+
+		t = list_first_entry(timers, struct cpu_timer_list, entry);
+
+		if (!--maxfire || curr < t->expires)
+			return t->expires;
+
+		t->firing = 1;
+		list_move_tail(&t->entry, firing);
+	}
+
+	return 0;
+}
+
 /*
  * Check for any per-thread CPU timers that have fired and move them off
  * the tsk->cpu_timers[N] list onto the firing list.  Here we update the
@@ -870,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 static void check_thread_timers(struct task_struct *tsk,
 				struct list_head *firing)
 {
-	int maxfire;
 	struct list_head *timers = tsk->cpu_timers;
 	struct signal_struct *const sig = tsk->signal;
+	struct task_cputime *tsk_expires = &tsk->cputime_expires;
+	unsigned long long expires;
 	unsigned long soft;
 
-	maxfire = 20;
-	tsk->cputime_expires.prof_exp = 0;
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *t = list_first_entry(timers,
-						      struct cpu_timer_list,
-						      entry);
-		if (!--maxfire || prof_ticks(tsk) < t->expires) {
-			tsk->cputime_expires.prof_exp = expires_to_cputime(t->expires);
-			break;
-		}
-		t->firing = 1;
-		list_move_tail(&t->entry, firing);
-	}
+	expires = check_timers_list(timers, firing, prof_ticks(tsk));
+	tsk_expires->prof_exp = expires_to_cputime(expires);
 
-	++timers;
-	maxfire = 20;
-	tsk->cputime_expires.virt_exp = 0;
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *t = list_first_entry(timers,
-						      struct cpu_timer_list,
-						      entry);
-		if (!--maxfire || virt_ticks(tsk) < t->expires) {
-			tsk->cputime_expires.virt_exp = expires_to_cputime(t->expires);
-			break;
-		}
-		t->firing = 1;
-		list_move_tail(&t->entry, firing);
-	}
+	expires = check_timers_list(++timers, firing, virt_ticks(tsk));
+	tsk_expires->virt_exp = expires_to_cputime(expires);
 
-	++timers;
-	maxfire = 20;
-	tsk->cputime_expires.sched_exp = 0;
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *t = list_first_entry(timers,
-						      struct cpu_timer_list,
-						      entry);
-		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires) {
-			tsk->cputime_expires.sched_exp = t->expires;
-			break;
-		}
-		t->firing = 1;
-		list_move_tail(&t->entry, firing);
-	}
+	tsk_expires->sched_exp = check_timers_list(++timers, firing,
+						   tsk->se.sum_exec_runtime);
 
 	/*
 	 * Check for the special case thread timers.
@@ -1002,7 +990,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
 static void check_process_timers(struct task_struct *tsk,
 				 struct list_head *firing)
 {
-	int maxfire;
 	struct signal_struct *const sig = tsk->signal;
 	unsigned long long utime, ptime, virt_expires, prof_expires;
 	unsigned long long sum_sched_runtime, sched_expires;
@@ -1017,49 +1004,10 @@ static void check_process_timers(struct task_struct *tsk,
 	utime = cputime_to_expires(cputime.utime);
 	ptime = utime + cputime_to_expires(cputime.stime);
 	sum_sched_runtime = cputime.sum_exec_runtime;
-	maxfire = 20;
-	prof_expires = 0;
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *tl = list_first_entry(timers,
-						      struct cpu_timer_list,
-						      entry);
-		if (!--maxfire || ptime < tl->expires) {
-			prof_expires = tl->expires;
-			break;
-		}
-		tl->firing = 1;
-		list_move_tail(&tl->entry, firing);
-	}
 
-	++timers;
-	maxfire = 20;
-	virt_expires = 0;
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *tl = list_first_entry(timers,
-						      struct cpu_timer_list,
-						      entry);
-		if (!--maxfire || utime < tl->expires) {
-			virt_expires = tl->expires;
-			break;
-		}
-		tl->firing = 1;
-		list_move_tail(&tl->entry, firing);
-	}
-
-	++timers;
-	maxfire = 20;
-	sched_expires = 0;
-	while (!list_empty(timers)) {
-		struct cpu_timer_list *tl = list_first_entry(timers,
-						      struct cpu_timer_list,
-						      entry);
-		if (!--maxfire || sum_sched_runtime < tl->expires) {
-			sched_expires = tl->expires;
-			break;
-		}
-		tl->firing = 1;
-		list_move_tail(&tl->entry, firing);
-	}
+	prof_expires = check_timers_list(timers, firing, ptime);
+	virt_expires = check_timers_list(++timers, firing, utime);
+	sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
 
 	/*
 	 * Check for the special case process timers.
-- 
cgit v1.2.3


From 76cdcdd979ce00f5037804d73da583fb488ec1b2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 28 Jun 2013 00:06:43 +0000
Subject: posix-timers: correctly get dying task time sample in
 posix_cpu_timer_schedule()

In order to re-arm a timer after it fired, we take a sample of the current
process or thread cputime.

If the task is dying though, we don't arm anything but we cache the
remaining timer expiration delta for further reads.

Something similar is performed in posix_cpu_timer_get() but here we forget
to take the process wide cputime sample before caching it.

As a result we are storing random stack content, leading every further
reads of that timer to return junk values.

Fix this by taking the appropriate sample in the case of process wide
timers.

This probably doesn't matter much in practice because, at this stage, the
thread is the last one in the group and we reached exit_notify().  This
implies that we called exit_itimers() and there should be no more timers
to handle for that task.

So this is likely dead code anyway but let's fix the current logic
and the warning that came along:

    kernel/posix-cpu-timers.c: In function 'posix_cpu_timer_schedule':
    kernel/posix-cpu-timers.c:1127: warning: 'now' may be used uninitialized in this function

Then we can start to think further about cleaning up that code.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Chen Gang <gang.chen@asianux.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Chen Gang <gang.chen@asianux.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Olivier Langlois <olivier@trillion01.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/posix-cpu-timers.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 92a4fbf44f86..4ebd8ad07c66 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1097,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 			 * not yet reaped.  Take this opportunity to
 			 * drop our task ref.
 			 */
+			cpu_timer_sample_group(timer->it_clock, p, &now);
 			clear_dead_task(timer, now);
 			goto out_unlock;
 		}
-- 
cgit v1.2.3


From a0b2062b0904ef07944c4a6e4d0f88ee44f1e9f2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 28 Jun 2013 00:06:43 +0000
Subject: posix_timers: fix racy timer delta caching on task exit

When a task exits, we perform a caching of the remaining cputime delta
before expiring of its timers.

This is done from the following places:

* When the task is reaped. We iterate through its list of
  posix cpu timers and store the remaining timer delta to
  the timer struct instead of the absolute value.
  (See posix_cpu_timers_exit() / posix_cpu_timers_exit_group() )

* When we call posix_cpu_timer_get() or posix_cpu_timer_schedule().
  If the timer's task is considered dying when watched from these
  places, the same conversion from absolute to relative expiry time
  is performed. Then the given task's reference is released.
  (See clear_dead_task() ).

The relevance of this caching is questionable but this is another
and deeper debate.

The big issue here is that these two sources of caching don't mix
up very well together.

More specifically, the caching can easily be done twice, resulting
in a wrong delta as it gets spuriously substracted a second time by
the elapsed clock. This can happen in the following scenario:

1) The task exits and gets reaped: we call posix_cpu_timers_exit()
   and the absolute timer expiry values are converted to a relative
   delta.

2) timer_gettime() -> posix_cpu_timer_get() is called and relies on
   clear_dead_task() because  tsk->exit_state == EXIT_DEAD.
   The delta gets substracted again by the elapsed clock and we return
   a wrong result.

To fix this, just remove the caching done on task reaping time.  It
doesn't bring much value on its own.  The caching done from
posix_cpu_timer_get/schedule is enough.

And it would also be hard to get it really right: we could make it put and
clear the target task in the timer struct so that readers know if they are
dealing with a relative cached of absolute value.  But it would be racy.
The only safe way to do it would be to lock the itimer->it_lock so that we
know nobody reads the cputime expiry value while we modify it and its
target task reference.  Doing so would involve some funny workarounds to
avoid circular lock against the sighand lock.  There is just no reason to
maintain this.

The user visible effect of this patch can be observed by running the
following code: it creates a subthread that launches a posix cputimer
which expires after 10 seconds. But then the subthread only busy loops for 2
seconds and exits. The parent reaps the subthread and read the timer value.
Its expected value should the be the initial timer's expiration value
minus the cputime elapsed in the subthread. Roughly 10 - 2 = 8 seconds:

	#include <sys/time.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <time.h>
	#include <pthread.h>

	static timer_t id;
	static struct itimerspec val = { .it_value.tv_sec = 10, }, new;

	static void *thread(void *unused)
	{
		int err;
		struct timeval start, end, diff;

		timer_create(CLOCK_THREAD_CPUTIME_ID, NULL, &id);
		if (err < 0) {
			perror("Can't create timer\n");
			return NULL;
		}

		/* Arm 10 sec timer */
		err = timer_settime(id, 0, &val, NULL);
		if (err < 0) {
			perror("Can't set timer\n");
			return NULL;
		}

		/* Exit after 2 seconds of execution */
		gettimeofday(&start, NULL);
	        do {
			gettimeofday(&end, NULL);
			timersub(&end, &start, &diff);
		} while (diff.tv_sec < 2);

		return NULL;
	}

	int main(int argc, char **argv)
	{
		pthread_t pthread;
		int err;

		err = pthread_create(&pthread, NULL, thread, NULL);
		if (err) {
			perror("Can't create thread\n");
			return -1;
		}
		pthread_join(pthread, NULL);
		/* Just wait a little bit to make sure the child got reaped */
		sleep(1);
		err = timer_gettime(id, &new);
		if (err)
			perror("Can't get timer value\n");
		printf("%d %ld\n", new.it_value.tv_sec, new.it_value.tv_nsec);

		return 0;
	}

Before the patch:

       $ ./posix_cpu_timers
       6 2278074

After the patch:

      $ ./posix_cpu_timers
      8 1158766

Before the patch, the elapsed time got two more seconds spuriously accounted.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Olivier Langlois <olivier@trillion01.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/posix-cpu-timers.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 4ebd8ad07c66..c7f31aa272f7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -404,14 +404,8 @@ static void cleanup_timers_list(struct list_head *head,
 {
 	struct cpu_timer_list *timer, *next;
 
-	list_for_each_entry_safe(timer, next, head, entry) {
+	list_for_each_entry_safe(timer, next, head, entry)
 		list_del_init(&timer->entry);
-		if (timer->expires < curr) {
-			timer->expires = 0;
-		} else {
-			timer->expires -= curr;
-		}
-	}
 }
 
 /*
@@ -459,15 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
 		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
 }
 
-static void clear_dead_task(struct k_itimer *timer, unsigned long long now)
+static void clear_dead_task(struct k_itimer *itimer, unsigned long long now)
 {
+	struct cpu_timer_list *timer = &itimer->it.cpu;
+
 	/*
 	 * That's all for this thread or process.
 	 * We leave our residual in expires to be reported.
 	 */
-	put_task_struct(timer->it.cpu.task);
-	timer->it.cpu.task = NULL;
-	timer->it.cpu.expires -= now;
+	put_task_struct(timer->task);
+	timer->task = NULL;
+	if (timer->expires < now) {
+		timer->expires = 0;
+	} else {
+		timer->expires -= now;
+	}
 }
 
 static inline int expires_gt(cputime_t expires, cputime_t new_exp)
-- 
cgit v1.2.3


From fa18f7bde3ad4568d1d343b60d963bfbd8dc3991 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Sun, 26 May 2013 17:35:41 -0400
Subject: posix-cpu-timers: don't account cpu timer after stopped thread
 runtime accounting

When tsk->signal->cputimer->running is 1, signal->cputimer (i.e. per process
timer account) and tsk->sum_sched_runtime (i.e. per thread timer account)
increase at the same pace because update_curr() increases both accounting.

However, there is one exception. When thread exiting, __exit_signal() turns
over task's sum_shced_runtime to sig->sum_sched_runtime, but it doesn't stop
signal->cputimer accounting.

This inconsistency makes POSIX timer wake up too early. This patch fixes it.

Original-patch-by: Olivier Langlois <olivier@trillion01.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Olivier Langlois <olivier@trillion01.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/sched/stats.h | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 2ef90a51ec5e..71bac979d5ee 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -161,6 +161,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
  * on CONFIG_SCHEDSTATS.
  */
 
+/**
+ * cputimer_running - return true if cputimer is running
+ *
+ * @tsk:	Pointer to target task.
+ */
+static inline bool cputimer_running(struct task_struct *tsk)
+
+{
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+	if (!cputimer->running)
+		return false;
+
+	/*
+	 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
+	 * in __exit_signal(), we won't account to the signal struct further
+	 * cputime consumed by that task, even though the task can still be
+	 * ticking after __exit_signal().
+	 *
+	 * In order to keep a consistent behaviour between thread group cputime
+	 * and thread group cputimer accounting, lets also ignore the cputime
+	 * elapsing after __exit_signal() in any thread group timer running.
+	 *
+	 * This makes sure that POSIX CPU clocks and timers are synchronized, so
+	 * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
+	 * clock delta is behind the expiring timer value.
+	 */
+	if (unlikely(!tsk->sighand))
+		return false;
+
+	return true;
+}
+
 /**
  * account_group_user_time - Maintain utime for a thread group.
  *
@@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
-	if (!cputimer->running)
+	if (!cputimer_running(tsk))
 		return;
 
 	raw_spin_lock(&cputimer->lock);
@@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
-	if (!cputimer->running)
+	if (!cputimer_running(tsk))
 		return;
 
 	raw_spin_lock(&cputimer->lock);
@@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
-	if (!cputimer->running)
+	if (!cputimer_running(tsk))
 		return;
 
 	raw_spin_lock(&cputimer->lock);
-- 
cgit v1.2.3