From d6cbf35dac8a3dadb9103379820c96d7c85df3d9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 May 2013 19:44:20 +0800 Subject: cgroup: initialize xattr before calling d_instantiate() cgroup_create_file() calls d_instantiate(), which may decide to look at the xattrs on the file. Smack always does this and SELinux can be configured to do so. But cgroup_add_file() didn't initialize xattrs before calling cgroup_create_file(), which finally leads to dereferencing NULL dentry->d_fsdata. This bug has been there since cgroup xattr was introduced. Cc: # 3.8.x Reported-by: Ivan Bulatovic Reported-by: Casey Schaufler Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a9926275f80..38b136553044 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2699,13 +2699,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, goto out; } + cfe->type = (void *)cft; + cfe->dentry = dentry; + dentry->d_fsdata = cfe; + simple_xattrs_init(&cfe->xattrs); + mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { - cfe->type = (void *)cft; - cfe->dentry = dentry; - dentry->d_fsdata = cfe; - simple_xattrs_init(&cfe->xattrs); list_add_tail(&cfe->node, &parent->files); cfe = NULL; } -- cgit v1.2.3 From 6faf72834d9d0c0dc6632604eaeffb621e87fcf9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 May 2013 06:53:37 -0700 Subject: rcu: Fix comparison sense in rcu_needs_cpu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c0f4dfd4f (rcu: Make RCU_FAST_NO_HZ take advantage of numbered callbacks) introduced a bug that can result in excessively long grace periods. This bug reverse the senes of the "if" statement checking for lazy callbacks, so that RCU takes a lazy approach when there are in fact non-lazy callbacks. This can result in excessive boot, suspend, and resume times. This commit therefore fixes the sense of this "if" statement. Reported-by: Borislav Petkov Reported-by: Bjørn Mork Reported-by: Joerg Roedel Signed-off-by: Paul E. McKenney Tested-by: Bjørn Mork Tested-by: Joerg Roedel --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 170814dc418f..6d939a645da1 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1667,7 +1667,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) rdtp->last_accelerate = jiffies; /* Request timer delay depending on laziness, and round. */ - if (rdtp->all_lazy) { + if (!rdtp->all_lazy) { *dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; } else { -- cgit v1.2.3 From 8f174b1175a10903ade40f36eb6c896412877ca0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 1 May 2013 00:07:00 +0900 Subject: workqueue: correct handling of the pool spin_lock When we fail to mutex_trylock(), we release the pool spin_lock and do mutex_lock(). After that, we should regrab the pool spin_lock, but, regrabbing is missed in current code. So correct it. Cc: Lai Jiangshan Signed-off-by: Joonsoo Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1ae602809efb..286847b90225 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2059,6 +2059,7 @@ static bool manage_workers(struct worker *worker) if (unlikely(!mutex_trylock(&pool->manager_mutex))) { spin_unlock_irq(&pool->lock); mutex_lock(&pool->manager_mutex); + spin_lock_irq(&pool->lock); ret = true; } -- cgit v1.2.3 From ad7b1f841f8a54c6d61ff181451f55b68175e15a Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Mon, 6 May 2013 17:44:55 -0400 Subject: workqueue: Make schedule_work() available again to non GPL modules Commit 8425e3d5bdbe ("workqueue: inline trivial wrappers") changed schedule_work() and schedule_delayed_work() to inline wrappers, but these rely on some symbols that are EXPORT_SYMBOL_GPL, while the original functions were EXPORT_SYMBOL. This has the effect of changing the licensing requirement for these functions and making them unavailable to non GPL modules. Make them available again by removing the restriction on the required symbols. Signed-off-by: Marc Dionne Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 286847b90225..02916f421385 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -296,7 +296,7 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; struct workqueue_struct *system_wq __read_mostly; -EXPORT_SYMBOL_GPL(system_wq); +EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; EXPORT_SYMBOL_GPL(system_highpri_wq); struct workqueue_struct *system_long_wq __read_mostly; @@ -1411,7 +1411,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(queue_work_on); +EXPORT_SYMBOL(queue_work_on); void delayed_work_timer_fn(unsigned long __data) { @@ -1485,7 +1485,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(queue_delayed_work_on); +EXPORT_SYMBOL(queue_delayed_work_on); /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU -- cgit v1.2.3 From 615ee5443ff9bedd356dc6865f3e9c276ce434ea Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 26 Mar 2013 11:35:16 -0400 Subject: rcu: Don't allocate bootmem from rcu_init() When rcu_init() is called we already have slab working, allocating bootmem at that point results in warnings and an allocation from slab. This commit therefore changes alloc_bootmem_cpumask_var() to alloc_cpumask_var() in rcu_bootup_announce_oddness(), which is called from rcu_init(). Signed-off-by: Sasha Levin Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Tested-by: Robin Holt [paulmck: convert to zalloc_cpumask_var(), as suggested by Yinghai Lu.] --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6d939a645da1..3db5a375d8dd 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -88,7 +88,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_RCU_NOCB_CPU #ifndef CONFIG_RCU_NOCB_CPU_NONE if (!have_rcu_nocb_mask) { - alloc_bootmem_cpumask_var(&rcu_nocb_mask); + zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); have_rcu_nocb_mask = true; } #ifdef CONFIG_RCU_NOCB_CPU_ZERO -- cgit v1.2.3 From 1be0c25da56e860992af972a60321563ca2cfcd1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 May 2013 14:24:24 -0700 Subject: workqueue: don't perform NUMA-aware allocations on offline nodes in wq_numa_init() wq_numa_init() builds per-node cpumasks which are later used to make unbound workqueues NUMA-aware. The cpumasks are allocated using alloc_cpumask_var_node() for all possible nodes. Unfortunately, on machines with off-line nodes, this leads to NUMA-aware allocations on existing bug offline nodes, which in turn triggers BUG in the memory allocation code. Fix it by using NUMA_NO_NODE for cpumask allocations for offline nodes. kernel BUG at include/linux/gfp.h:323! invalid opcode: 0000 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.9.0+ #1 Hardware name: ProLiant BL465c G7, BIOS A19 12/10/2011 task: ffff880234608000 ti: ffff880234602000 task.ti: ffff880234602000 RIP: 0010:[] [] new_slab+0x2ad/0x340 RSP: 0000:ffff880234603bf8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff880237404b40 RCX: 00000000000000d0 RDX: 0000000000000001 RSI: 0000000000000003 RDI: 00000000002052d0 RBP: ffff880234603c28 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000001 R11: ffffffff812e3aa8 R12: 0000000000000001 R13: ffff8802378161c0 R14: 0000000000030027 R15: 00000000000040d0 FS: 0000000000000000(0000) GS:ffff880237800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff88043fdff000 CR3: 00000000018d5000 CR4: 00000000000007f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff880234603c28 0000000000000001 00000000000000d0 ffff8802378161c0 ffff880237404b40 ffff880237404b40 ffff880234603d28 ffffffff815edba1 ffff880237816140 0000000000000000 ffff88023740e1c0 Call Trace: [] __slab_alloc+0x330/0x4f2 [] kmem_cache_alloc_node_trace+0xa5/0x200 [] alloc_cpumask_var_node+0x28/0x90 [] wq_numa_init+0x10d/0x1be [] init_workqueues+0x64/0x341 [] do_one_initcall+0xea/0x1a0 [] kernel_init_freeable+0xb7/0x1ec [] kernel_init+0xe/0xf0 [] ret_from_fork+0x7c/0xb0 Code: 45 84 ac 00 00 00 f0 41 80 4d 00 40 e9 f6 fe ff ff 66 0f 1f 84 00 00 00 00 00 e8 eb 4b ff ff 49 89 c5 e9 05 fe ff ff <0f> 0b 4c 8b 73 38 44 89 ff 81 cf 00 00 20 00 4c 89 f6 48 c1 ee Signed-off-by: Tejun Heo Reported-and-Tested-by: Lingzhu Xiang --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 02916f421385..ee8e29a2320c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4905,7 +4905,8 @@ static void __init wq_numa_init(void) BUG_ON(!tbl); for_each_node(node) - BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node)); + BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, + node_online(node) ? node : NUMA_NO_NODE)); for_each_possible_cpu(cpu) { node = cpu_to_node(cpu); -- cgit v1.2.3 From 6ed0106667d76589cb648c27edb4f4ffbf9d59ca Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 16 May 2013 20:48:49 +0900 Subject: tracing: Return -EBUSY when event_enable_func() fails to get module Since try_module_get() returns false( = 0) when it fails to pindown a module, event_enable_func() returns 0 which means "succeed". This can cause a kernel panic when the entry is removed, because the event is already released. This fixes the bug by returning -EBUSY, because the reason why it fails is that the module is being removed at that time. Link: http://lkml.kernel.org/r/20130516114848.13508.97899.stgit@mhiramat-M0-7522 Cc: Srikar Dronamraju Cc: Oleg Nesterov Cc: Tom Zanussi Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7a0cf68027cc..27963e2bf4bf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2072,8 +2072,10 @@ event_enable_func(struct ftrace_hash *hash, out_reg: /* Don't let event modules unload while probe registered */ ret = try_module_get(file->event_call->mod); - if (!ret) + if (!ret) { + ret = -EBUSY; goto out_free; + } ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) -- cgit v1.2.3 From 264b83c07a84223f0efd0d1db9ccc66d6f88288f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 16 May 2013 17:43:55 +0200 Subject: usermodehelper: check subprocess_info->path != NULL argv_split(empty_or_all_spaces) happily succeeds, it simply returns argc == 0 and argv[0] == NULL. Change call_usermodehelper_exec() to check sub_info->path != NULL to avoid the crash. This is the minimal fix, todo: - perhaps we should change argv_split() to return NULL or change the callers. - kill or justify ->path[0] check - narrow the scope of helper_lock() Signed-off-by: Oleg Nesterov Acked-By: Lucas De Marchi Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/kmod.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 1296e72e4161..8241906c4b61 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -569,6 +569,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) int retval = 0; helper_lock(); + if (!sub_info->path) { + retval = -EINVAL; + goto out; + } + if (sub_info->path[0] == '\0') goto out; -- cgit v1.2.3 From 06c9494c0e9bdfcaa14d6d2b096a0ff7abe8494f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 May 2013 20:33:01 +0100 Subject: kmemleak: Scan all allocated, writeable and not executable module sections Instead of just picking data sections by name (names that start with .data, .bss or .ref.data), use the section flags and scan all sections that are allocated, writable and not executable. Which should cover all sections of a module that might reference data. Signed-off-by: Steven Rostedt [catalin.marinas@arm.com: removed unused 'name' variable] [catalin.marinas@arm.com: collapsed 'if' blocks] Signed-off-by: Catalin Marinas Acked-by: Rusty Russell --- kernel/module.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b049939177f6..06f496a5d182 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2431,10 +2431,10 @@ static void kmemleak_load_module(const struct module *mod, kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); for (i = 1; i < info->hdr->e_shnum; i++) { - const char *name = info->secstrings + info->sechdrs[i].sh_name; - if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) - continue; - if (!strstarts(name, ".data") && !strstarts(name, ".bss")) + /* Scan all writable sections that's not executable */ + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) || + !(info->sechdrs[i].sh_flags & SHF_WRITE) || + (info->sechdrs[i].sh_flags & SHF_EXECINSTR)) continue; kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, -- cgit v1.2.3 From 89c837351db0b9b52fd572ec8b0445a42e59b75c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 May 2013 20:46:23 +0100 Subject: kmemleak: No need for scanning specific module sections As kmemleak now scans all module sections that are allocated, writable and non executable, there's no need to scan individual sections that might reference data. Signed-off-by: Steven Rostedt Signed-off-by: Catalin Marinas Acked-by: Rusty Russell --- kernel/module.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 06f496a5d182..cab4bce49c23 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2769,24 +2769,11 @@ static void find_module_sections(struct module *mod, struct load_info *info) mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * - mod->num_trace_events, GFP_KERNEL); #endif #ifdef CONFIG_TRACING mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", sizeof(*mod->trace_bprintk_fmt_start), &mod->num_trace_bprintk_fmt); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_bprintk_fmt_start, - sizeof(*mod->trace_bprintk_fmt_start) * - mod->num_trace_bprintk_fmt, GFP_KERNEL); #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ -- cgit v1.2.3 From fbe06b7bae7c9cf6ab05168fce5ee93b2f4bae7c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 17 May 2013 11:49:10 -0700 Subject: x86, range: fix missing merge during add range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Christian found v3.9 does not work with E350 with EFI is enabled. [ 1.658832] Trying to unpack rootfs image as initramfs... [ 1.679935] BUG: unable to handle kernel paging request at ffff88006e3fd000 [ 1.686940] IP: [] memset+0x1f/0xb0 [ 1.692010] PGD 1f77067 PUD 1f7a067 PMD 61420067 PTE 0 but early memtest report all memory could be accessed without problem. early page table is set in following sequence: [ 0.000000] init_memory_mapping: [mem 0x00000000-0x000fffff] [ 0.000000] init_memory_mapping: [mem 0x6e600000-0x6e7fffff] [ 0.000000] init_memory_mapping: [mem 0x6c000000-0x6e5fffff] [ 0.000000] init_memory_mapping: [mem 0x00100000-0x6bffffff] [ 0.000000] init_memory_mapping: [mem 0x6e800000-0x6ea07fff] but later efi_enter_virtual_mode try set mapping again wrongly. [ 0.010644] pid_max: default: 32768 minimum: 301 [ 0.015302] init_memory_mapping: [mem 0x640c5000-0x6e3fcfff] that means it fails with pfn_range_is_mapped. It turns out that we have a bug in add_range_with_merge and it does not merge range properly when new add one fill the hole between two exsiting ranges. In the case when [mem 0x00100000-0x6bffffff] is the hole between [mem 0x00000000-0x000fffff] and [mem 0x6c000000-0x6e7fffff]. Fix the add_range_with_merge by calling itself recursively. Reported-by: "Christian König" Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQVofGoSk7q5-0irjkBxemqK729cND4hov-1QCBJDhxpgQ@mail.gmail.com Cc: v3.9 Signed-off-by: H. Peter Anvin --- kernel/range.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index 071b0ab455cb..eb911dbce267 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -48,9 +48,11 @@ int add_range_with_merge(struct range *range, int az, int nr_range, final_start = min(range[i].start, start); final_end = max(range[i].end, end); - range[i].start = final_start; - range[i].end = final_end; - return nr_range; + /* clear it and add it back for further merge */ + range[i].start = 0; + range[i].end = 0; + return add_range_with_merge(range, az, nr_range, + final_start, final_end); } /* Need to add it: */ -- cgit v1.2.3 From ca1643186d3dce6171d8f171e516b02496360a9e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 23 May 2013 11:51:10 -0400 Subject: tracing: Fix crash when ftrace=nop on the kernel command line If ftrace= is on the kernel command line, when that tracer is registered, it will be initiated by tracing_set_tracer() to execute that tracer. The nop tracer is just a stub tracer that is used to have no tracer enabled. It is assigned at early bootup as it is the default tracer. But if ftrace=nop is on the kernel command line, the registering of the nop tracer will call tracing_set_tracer() which will try to execute the nop tracer. But it expects tr->current_trace to be assigned something as it usually is assigned to the nop tracer. As it hasn't been assigned to anything yet, it causes the system to crash. The simple fix is to move the tr->current_trace = nop before registering the nop tracer. The functionality is still the same as the nop tracer doesn't do anything anyway. Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ae6fa2d1cdf7..4d79485b3237 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6216,10 +6216,15 @@ __init static int tracer_alloc_buffers(void) trace_init_cmdlines(); - register_tracer(&nop_trace); - + /* + * register_tracer() might reference current_trace, so it + * needs to be set before we register anything. This is + * just a bootstrap of current_trace anyway. + */ global_trace.current_trace = &nop_trace; + register_tracer(&nop_trace); + /* All seems OK, enable tracing */ tracing_disabled = 0; -- cgit v1.2.3 From 7805d000db30a3787a4c969bab6ae4d8a5fd8ce6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:50:24 +0900 Subject: cgroup: fix a subtle bug in descendant pre-order walk When cgroup_next_descendant_pre() initiates a walk, it checks whether the subtree root doesn't have any children and if not returns NULL. Later code assumes that the subtree isn't empty. This is broken because the subtree may become empty inbetween, which can lead to the traversal escaping the subtree by walking to the sibling of the subtree root. There's no reason to have the early exit path. Remove it along with the later assumption that the subtree isn't empty. This simplifies the code a bit and fixes the subtle bug. While at it, fix the comment of cgroup_for_each_descendant_pre() which was incorrectly referring to ->css_offline() instead of ->css_online(). Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Cc: stable@vger.kernel.org --- kernel/cgroup.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 38b136553044..31e9ef319070 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2954,11 +2954,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, WARN_ON_ONCE(!rcu_read_lock_held()); /* if first iteration, pretend we just visited @cgroup */ - if (!pos) { - if (list_empty(&cgroup->children)) - return NULL; + if (!pos) pos = cgroup; - } /* visit the first child if exists */ next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); @@ -2966,14 +2963,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, return next; /* no child, visit my or the closest ancestor's next sibling */ - do { + while (pos != cgroup) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); if (&next->sibling != &pos->parent->children) return next; pos = pos->parent; - } while (pos != cgroup); + } return NULL; } -- cgit v1.2.3 From 387b8b3e37cb1c257fb607787f73815c30d22859 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 24 May 2013 15:55:25 -0700 Subject: auditfilter.c: fix kernel-doc warnings Fix kernel-doc warnings in kernel/auditfilter.c: Warning(kernel/auditfilter.c:1029): Excess function parameter 'loginuid' description in 'audit_receive_filter' Warning(kernel/auditfilter.c:1029): Excess function parameter 'sessionid' description in 'audit_receive_filter' Warning(kernel/auditfilter.c:1029): Excess function parameter 'sid' description in 'audit_receive_filter' Signed-off-by: Randy Dunlap Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 83a2970295d1..6bd4a90d1991 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1021,9 +1021,6 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data - * @loginuid: loginuid of sender - * @sessionid: sessionid for netlink audit message - * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) { -- cgit v1.2.3 From 2938d2757fc99c26aa678ce4eba910c4a77c3a55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 May 2013 09:33:01 +0200 Subject: tick: Cure broadcast false positive pending bit warning commit 26517f3e (tick: Avoid programming the local cpu timer if broadcast pending) added a warning if the cpu enters broadcast mode again while the pending bit is still set. Meelis reported that the warning triggers. There are two corner cases which have been not considered: 1) cpuidle calls clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER) twice. That can result in the following scenario CPU0 CPU1 cpuidle_idle_call() clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER) set cpu in tick_broadcast_oneshot_mask broadcast interrupt event expired for cpu1 set pending bit acpi_idle_enter_simple() clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER) WARN_ON(pending bit) Move the WARN_ON into the section where we enter broadcast mode so it wont provide false positives on the second call. 2) safe_halt() enables interrupts, so a broadcast interrupt can be delivered befor the broadcast mode is disabled. That sets the pending bit for the CPU which receives the broadcast interrupt. Though the interrupt is delivered right away from the broadcast handler and leaves the pending bit stale. Clear the pending bit for the current cpu in the broadcast handler. Reported-and-tested-by: Meelis Roos Cc: Len Brown Cc: Frederic Weisbecker Cc: Borislav Petkov Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1305271841130.4220@ionos Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 24938d577669..0c739423b0f9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -511,6 +511,12 @@ again: } } + /* + * Remove the current cpu from the pending mask. The event is + * delivered immediately in tick_do_broadcast() ! + */ + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); + /* Take care of enforced broadcast requests */ cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); cpumask_clear(tick_broadcast_force_mask); @@ -575,8 +581,8 @@ void tick_broadcast_oneshot_control(unsigned long reason) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); /* * We only reprogram the broadcast timer if we -- cgit v1.2.3 From 7b959fc582741227a1c4cba710d6aff8fb183128 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 22 May 2013 18:34:09 +0900 Subject: kprobes: Fix to free gone and unused optprobes Fix to free gone and unused optprobes. This bug will cause a kernel panic if the user reuses the killed and unused probe. Reported at: http://sourceware.org/ml/systemtap/2013-q2/msg00142.html In the normal path, an optprobe on an init function is unregistered when a module goes live. unregister_kprobe(kp) -> __unregister_kprobe_top ->__disable_kprobe ->disarm_kprobe(ap == op) ->__disarm_kprobe ->unoptimize_kprobe : the op is queued on unoptimizing_list and do nothing in __unregister_kprobe_bottom After a while (usually wait 5 jiffies), kprobe_optimizer runs to unoptimize and free optprobe. kprobe_optimizer ->do_unoptimize_kprobes ->arch_unoptimize_kprobes : moved to free_list ->do_free_cleaned_kprobes ->hlist_del: the op is removed ->free_aggr_kprobe ->arch_remove_optimized_kprobe ->arch_remove_kprobe ->kfree: the op is freed Here, if kprobes_module_callback is called and the delayed unoptimizing probe is picked BEFORE kprobe_optimizer runs, kprobes_module_callback ->kill_kprobe ->kill_optimized_kprobe : dequeued from unoptimizing_list <=!!! ->arch_remove_optimized_kprobe ->arch_remove_kprobe (but op is not freed, and on the kprobe hash table) This doesn't happen if the probe unregistration is done AFTER kprobes_module_callback is called (because at that time the op is gone), and kprobe-tracer does it. To fix this bug, this patch changes kprobes_module_callback to enqueue the op to freeing_list at kill_optimized_kprobe only if the op is unused. The unused probes on freeing_list will be freed in do_free_cleaned_kprobes. Note that this calls arch_remove_*kprobe twice on the same probe. Thus those functions have to check the double free. Fortunately, most of arch codes already checked that except for mips. This will be fixed in the next patch. Signed-off-by: Masami Hiramatsu Cc: Timo Juhani Lindfors Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Frank Ch. Eigler Cc: systemtap@sourceware.org Cc: yrl.pp-manager.tt@hitachi.com Cc: David S. Miller Cc: "David S. Miller" Link: http://lkml.kernel.org/r/20130522093409.9084.63554.stgit@mhiramat-M0-7522 [ Minor edits. ] Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3fed7f0cbcdf..bddf3b201a48 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) /* Optimization staging list, protected by kprobe_mutex */ static LIST_HEAD(optimizing_list); static LIST_HEAD(unoptimizing_list); +static LIST_HEAD(freeing_list); static void kprobe_optimizer(struct work_struct *work); static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); @@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void) * Unoptimize (replace a jump with a breakpoint and remove the breakpoint * if need) kprobes listed on unoptimizing_list. */ -static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) +static __kprobes void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) /* Ditto to do_optimize_kprobes */ get_online_cpus(); mutex_lock(&text_mutex); - arch_unoptimize_kprobes(&unoptimizing_list, free_list); + arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ - list_for_each_entry_safe(op, tmp, free_list, list) { + list_for_each_entry_safe(op, tmp, &freeing_list, list) { /* Disarm probes if marked disabled */ if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); @@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) } /* Reclaim all kprobes on the free_list */ -static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) +static __kprobes void do_free_cleaned_kprobes(void) { struct optimized_kprobe *op, *tmp; - list_for_each_entry_safe(op, tmp, free_list, list) { + list_for_each_entry_safe(op, tmp, &freeing_list, list) { BUG_ON(!kprobe_unused(&op->kp)); list_del_init(&op->list); free_aggr_kprobe(&op->kp); @@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void) /* Kprobe jump optimizer */ static __kprobes void kprobe_optimizer(struct work_struct *work) { - LIST_HEAD(free_list); - mutex_lock(&kprobe_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) * kprobes before waiting for quiesence period. */ - do_unoptimize_kprobes(&free_list); + do_unoptimize_kprobes(); /* * Step 2: Wait for quiesence period to ensure all running interrupts @@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) do_optimize_kprobes(); /* Step 4: Free cleaned kprobes after quiesence period */ - do_free_cleaned_kprobes(&free_list); + do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); mutex_unlock(&kprobe_mutex); @@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p) if (!list_empty(&op->list)) /* Dequeue from the (un)optimization queue */ list_del_init(&op->list); - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + + if (kprobe_unused(p)) { + /* Enqueue if it is unused */ + list_add(&op->list, &freeing_list); + /* + * Remove unused probes from the hash list. After waiting + * for synchronization, this probe is reclaimed. + * (reclaiming is done by do_free_cleaned_kprobes().) + */ + hlist_del_rcu(&op->kp.hlist); + } + /* Don't touch the code, because it is already freed. */ arch_remove_optimized_kprobe(op); } -- cgit v1.2.3 From 26cb63ad11e04047a64309362674bcbbd6a6f246 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 28 May 2013 10:55:48 +0200 Subject: perf: Fix perf mmap bugs Vince reported a problem found by his perf specific trinity fuzzer. Al noticed 2 problems with perf's mmap(): - it has issues against fork() since we use vma->vm_mm for accounting. - it has an rb refcount leak on double mmap(). We fix the issues against fork() by using VM_DONTCOPY; I don't think there's code out there that uses this; we didn't hear about weird accounting problems/crashes. If we do need this to work, the previously proposed VM_PINNED could make this work. Aside from the rb reference leak spotted by Al, Vince's example prog was indeed doing a double mmap() through the use of perf_event_set_output(). This exposes another problem, since we now have 2 events with one buffer, the accounting gets screwy because we account per event. Fix this by making the buffer responsible for its own accounting. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Al Viro Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20130528085548.GA12193@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 37 ++++++++++++++++++++----------------- kernel/events/internal.h | 3 +++ 2 files changed, 23 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9dc297faf7c0..ae752cd4a086 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2917,7 +2917,7 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void ring_buffer_put(struct ring_buffer *rb); +static bool ring_buffer_put(struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -3582,13 +3582,13 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -static void ring_buffer_put(struct ring_buffer *rb) +static bool ring_buffer_put(struct ring_buffer *rb) { struct perf_event *event, *n; unsigned long flags; if (!atomic_dec_and_test(&rb->refcount)) - return; + return false; spin_lock_irqsave(&rb->event_lock, flags); list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { @@ -3598,6 +3598,7 @@ static void ring_buffer_put(struct ring_buffer *rb) spin_unlock_irqrestore(&rb->event_lock, flags); call_rcu(&rb->rcu_head, rb_free_rcu); + return true; } static void perf_mmap_open(struct vm_area_struct *vma) @@ -3612,18 +3613,20 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->rb); - struct user_struct *user = event->mmap_user; struct ring_buffer *rb = event->rb; + struct user_struct *mmap_user = rb->mmap_user; + int mmap_locked = rb->mmap_locked; + unsigned long size = perf_data_size(rb); - atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->pinned_vm -= event->mmap_locked; rcu_assign_pointer(event->rb, NULL); ring_buffer_detach(event, rb); mutex_unlock(&event->mmap_mutex); - ring_buffer_put(rb); - free_uid(user); + if (ring_buffer_put(rb)) { + atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= mmap_locked; + free_uid(mmap_user); + } } } @@ -3676,9 +3679,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages == nr_pages) - atomic_inc(&event->rb->refcount); - else + if (event->rb->nr_pages != nr_pages) ret = -EINVAL; goto unlock; } @@ -3720,12 +3721,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; goto unlock; } - rcu_assign_pointer(event->rb, rb); + + rb->mmap_locked = extra; + rb->mmap_user = get_current_user(); atomic_long_add(user_extra, &user->locked_vm); - event->mmap_locked = extra; - event->mmap_user = get_current_user(); - vma->vm_mm->pinned_vm += event->mmap_locked; + vma->vm_mm->pinned_vm += extra; + + rcu_assign_pointer(event->rb, rb); perf_event_update_userpage(event); @@ -3734,7 +3737,7 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index eb675c4d59df..5bc6c8e9b851 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -31,6 +31,9 @@ struct ring_buffer { spinlock_t event_lock; struct list_head event_list; + int mmap_locked; + struct user_struct *mmap_user; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; -- cgit v1.2.3 From 6721cb60022629ae76365551f05d9658b8d14c55 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 23 May 2013 14:21:36 -0400 Subject: ring-buffer: Do not poll non allocated cpu buffers The tracing infrastructure sets up for possible CPUs, but it uses the ring buffer polling, it is possible to call the ring buffer polling code with a CPU that hasn't been allocated. This will cause a kernel oops when it access a ring buffer cpu buffer that is part of the possible cpus but hasn't been allocated yet as the CPU has never been online. Reported-by: Mauro Carvalho Chehab Tested-by: Mauro Carvalho Chehab Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b59aea2c48c2..e444ff88f0a4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -620,6 +620,9 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, if (cpu == RING_BUFFER_ALL_CPUS) work = &buffer->irq_work; else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + cpu_buffer = buffer->buffers[cpu]; work = &cpu_buffer->irq_work; } -- cgit v1.2.3 From aa848233f740abbabfa7669daca0ab94aaa37bcd Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 3 May 2013 23:27:07 +0200 Subject: ntp: Remove unused variable flags in __hardpps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel/time/ntp.c: In function ‘__hardpps’: kernel/time/ntp.c:877: warning: unused variable ‘flags’ commit a076b2146fabb0894cae5e0189a8ba3f1502d737 ("ntp: Remove ntp_lock, using the timekeeping locks to protect ntp state") removed its users, but not the actual variable. Signed-off-by: Geert Uytterhoeven Signed-off-by: John Stultz --- kernel/time/ntp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 12ff13a838c6..8f5b3b98577b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -874,7 +874,6 @@ static void hardpps_update_phase(long error) void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) { struct pps_normtime pts_norm, freq_norm; - unsigned long flags; pts_norm = pps_normalize_ts(*phase_ts); -- cgit v1.2.3 From 0d6bd9953f739dad96d9a0de65383e479ab4e10d Mon Sep 17 00:00:00 2001 From: Zoran Markovic Date: Fri, 17 May 2013 11:24:05 -0700 Subject: timekeeping: Correct run-time detection of persistent_clock. Since commit 31ade30692dc9680bfc95700d794818fa3f754ac, timekeeping_init() checks for presence of persistent clock by attempting to read a non-zero time value. This is an issue on platforms where persistent_clock (instead is implemented as a free-running counter (instead of an RTC) starting from zero on each boot and running during suspend. Examples are some ARM platforms (e.g. PandaBoard). An attempt to read such a clock during timekeeping_init() may return zero value and falsely declare persistent clock as missing. Additionally, in the above case suspend times may be accounted twice (once from timekeeping_resume() and once from rtc_resume()), resulting in a gradual drift of system time. This patch does a run-time correction of the issue by doing the same check during timekeeping_suspend(). A better long-term solution would have to return error when trying to read non-existing clock and zero when trying to read an uninitialized clock, but that would require changing all persistent_clock implementations. This patch addresses the immediate breakage, for now. Cc: John Stultz Cc: Thomas Gleixner Cc: Feng Tang Cc: stable@vger.kernel.org Signed-off-by: Zoran Markovic [jstultz: Tweaked commit message and subject] Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 98cd470bbe49..baeeb5c87cf1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -975,6 +975,14 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); + /* + * On some systems the persistent_clock can not be detected at + * timekeeping_init by its return value, so if we see a valid + * value returned, update the persistent_clock_exists flag. + */ + if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) + persistent_clock_exist = true; + raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); -- cgit v1.2.3 From 2a0ff3fbe39bc93f719ff857e5a359d9780579ff Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Sun, 26 May 2013 21:33:09 +0800 Subject: cgroup: warn about mismatching options of a new mount of an existing hierarchy With the new __DEVEL__sane_behavior mount option was introduced, if the root cgroup is alive with no xattr function, to mount a new cgroup with xattr will be rejected in terms of design which just fine. However, if the root cgroup does not mounted with __DEVEL__sane_hehavior, to create a new cgroup with xattr option will succeed although after that the EA function does not works as expected but will get ENOTSUPP for setting up attributes under either cgroup. e.g. setfattr: /cgroup2/test: Operation not supported Instead of keeping silence in this case, it's better to drop a log entry in warning level. That would be helpful to understand the reason behind the scene from the user's perspective, and this is essentially an improvement does not break the backward compatibilities. With this fix, above mount attemption will keep up works as usual but the following line cound be found at the system log: [ ...] cgroup: new mount options do not match the existing superblock tj: minor formatting / message updates. Signed-off-by: Jie Liu Reported-by: Alexey Kodanev Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 31e9ef319070..a7c9e6ddb979 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1686,11 +1686,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cgroup_drop_root(opts.new_root); - if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && - root->flags != opts.flags) { - pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); - ret = -EINVAL; - goto drop_new_super; + if (root->flags != opts.flags) { + if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { + pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + ret = -EINVAL; + goto drop_new_super; + } else { + pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + } } /* no subsys rebinding, so refcounts don't change */ -- cgit v1.2.3 From 1bb539ca36e21c2f4fce0865e11df384bc7b7656 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 May 2013 14:38:43 -0400 Subject: ftrace: Use the rcu _notrace variants for rcu_dereference_raw() and friends As rcu_dereference_raw() under RCU debug config options can add quite a bit of checks, and that tracing uses rcu_dereference_raw(), these checks happen with the function tracer. The function tracer also happens to trace these debug checks too. This added overhead can livelock the system. Have the function tracer use the new RCU _notrace equivalents that do not do the debug checks for RCU. Link: http://lkml.kernel.org/r/20130528184209.467603904@goodmis.org Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b549b0f5b977..6c508ff33c62 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -120,22 +120,22 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); /* * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw() is that elements removed from this list + * can use rcu_dereference_raw_notrace() is that elements removed from this list * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw() calls are needed to handle + * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle * concurrent insertions into the ftrace_global_list. * * Silly Alpha and silly pointer-speculation compiler optimizations! */ #define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw(list); \ + op = rcu_dereference_raw_notrace(list); \ do /* * Optimized for just a single item in the list (as that is the normal case). */ #define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw((op)->next)) && \ + while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ unlikely((op) != &ftrace_list_end)) static inline void ftrace_ops_init(struct ftrace_ops *ops) @@ -779,7 +779,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) if (hlist_empty(hhd)) return NULL; - hlist_for_each_entry_rcu(rec, hhd, node) { + hlist_for_each_entry_rcu_notrace(rec, hhd, node) { if (rec->ip == ip) return rec; } @@ -1165,7 +1165,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) hhd = &hash->buckets[key]; - hlist_for_each_entry_rcu(entry, hhd, hlist) { + hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { if (entry->ip == ip) return entry; } @@ -1422,8 +1422,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) struct ftrace_hash *notrace_hash; int ret; - filter_hash = rcu_dereference_raw(ops->filter_hash); - notrace_hash = rcu_dereference_raw(ops->notrace_hash); + filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); + notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && @@ -2920,7 +2920,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. */ preempt_disable_notrace(); - hlist_for_each_entry_rcu(entry, hhd, node) { + hlist_for_each_entry_rcu_notrace(entry, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } -- cgit v1.2.3 From 0184d50f9fd17658c232d6ee6d465a87f989d706 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 29 May 2013 15:56:49 -0400 Subject: tracing: Fix bad parameter passed in branch selftest The branch selftest calls trace_test_buffer(), but with the new code it expects the first parameter to be a pointer to a struct trace_buffer. All self tests were changed but the branch selftest was missed. This caused either a crash or failed test when the branch selftest was enabled. Link: http://lkml.kernel.org/r/20130529141333.GA24064@localhost Reported-by: Fengguang Wu Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 55e2cf66967b..2901e3b88590 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1159,7 +1159,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); -- cgit v1.2.3 From 45eacc692771bd2b1ea3d384e6345cab3da10861 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 May 2013 22:16:32 +0200 Subject: vtime: Use consistent clocks among nohz accounting While computing the cputime delta of dynticks CPUs, we are mixing up clocks of differents natures: * local_clock() which takes care of unstable clock sources and fix these if needed. * sched_clock() which is the weaker version of local_clock(). It doesn't compute any fixup in case of unstable source. If the clock source is stable, those two clocks are the same and we can safely compute the difference against two random points. Otherwise it results in random deltas as sched_clock() can randomly drift away, back or forward, from local_clock(). As a consequence, some strange behaviour with unstable tsc has been observed such as non progressing constant zero cputime. (The 'top' command showing no load). Fix this by only using local_clock(), or its irq safe/remote equivalent, in vtime code. Reported-by: Mike Galbraith Suggested-by: Mike Galbraith Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/cputime.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272fd..e1a27f918723 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4745,7 +4745,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); - vtime_init_idle(idle); + vtime_init_idle(idle, cpu); #if defined(CONFIG_SMP) sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cc2dc3eea8a3..b5ccba22603b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -747,17 +747,17 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqlock(¤t->vtime_seqlock); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock(); + current->vtime_snap = sched_clock_cpu(smp_processor_id()); write_sequnlock(¤t->vtime_seqlock); } -void vtime_init_idle(struct task_struct *t) +void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; write_seqlock_irqsave(&t->vtime_seqlock, flags); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock(); + t->vtime_snap = sched_clock_cpu(cpu); write_sequnlock_irqrestore(&t->vtime_seqlock, flags); } -- cgit v1.2.3 From 521921bad1192fb1b8f9b6a5aa673635848b8b5f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 May 2013 01:21:38 +0200 Subject: kvm: Move guest entry/exit APIs to context_tracking The kvm_host.h header file doesn't handle well inclusion when archs don't support KVM. This results in build crashes for such archs when they want to implement context tracking because this subsystem includes kvm_host.h in order to implement the guest_enter/exit APIs but it doesn't handle KVM off case. To fix this, move the guest_enter()/guest_exit() declarations and generic implementation to the context tracking headers. These generic APIs actually belong to this subsystem, besides other domains boundary tracking like user_enter() et al. KVM now properly becomes a user of this library, not the other buggy way around. Reported-by: Kevin Hilman Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Kevin Hilman Cc: Marcelo Tosatti Cc: Gleb Natapov Signed-off-by: Ingo Molnar --- kernel/context_tracking.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b878..85bdde1137eb 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -15,7 +15,6 @@ */ #include -#include #include #include #include -- cgit v1.2.3 From 1a7f829f094dd7951e7d46c571a18080e455a436 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 17 May 2013 16:44:04 +0800 Subject: nohz: Fix notifier return val that enforce timekeeping In tick_nohz_cpu_down_callback() if the cpu is the one handling timekeeping, we must return something that stops the CPU_DOWN_PREPARE notifiers and then start notify CPU_DOWN_FAILED on the already called notifier call backs. However traditional errno values are not handled by the notifier unless these are encapsulated using errno_to_notifier(). Hence the current -EINVAL is misinterpreted and converted to junk after notifier_to_errno(), leaving the notifier subsystem to random behaviour such as eventually allowing the cpu to go down. Fix this by using the standard NOTIFY_BAD instead. Signed-off-by: Li Zhong Reviewed-by: Srivatsa S. Bhat Acked-by: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4208138fbf4..0cf1c1453181 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, * we can't safely shutdown that CPU. */ if (have_nohz_full_mask && tick_do_timer_cpu == cpu) - return -EINVAL; + return NOTIFY_BAD; break; } return NOTIFY_OK; -- cgit v1.2.3 From f5d00c1f9adb350c24c5301600f7bf2da99b66de Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Tue, 28 May 2013 15:29:03 +0200 Subject: tick: Remove useless timekeeping duty attribution to broadcast source Since 7300711e ("clockevents: broadcast fixup possible waiters"), the timekeeping duty is assigned to the CPU that handles the tick broadcast clock device by the time it is set in one shot mode. This is an issue in full dynticks mode where the timekeeping duty must stay handled by the boot CPU for now. Otherwise it prevents secondary CPUs from offlining and this breaks suspend/shutdown/reboot/... As it appears there is no reason for this timekeeping duty to be moved to the broadcast CPU, besides nothing prevent it from being later re-assigned to another target, let's simply remove it. Signed-off-by: Jiri Bohac Reported-by: Steven Rostedt Acked-by: Thomas Gleixner Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Borislav Petkov Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0c739423b0f9..b4c245580b79 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -698,10 +698,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) bc->event_handler = tick_handle_oneshot_broadcast; - /* Take the do_timer update */ - if (!tick_nohz_full_cpu(cpu)) - tick_do_timer_cpu = cpu; - /* * We must be careful here. There might be other CPUs * waiting for periodic broadcast. We need to set the -- cgit v1.2.3 From f17a5194859a82afe4164e938b92035b86c55794 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 30 May 2013 21:10:37 -0400 Subject: tracing: Use current_uid() for critical time tracing The irqsoff tracer records the max time that interrupts are disabled. There are hooks in the assembly code that calls back into the tracer when interrupts are disabled or enabled. When they are enabled, the tracer checks if the amount of time they were disabled is larger than the previous recorded max interrupts off time. If it is, it creates a snapshot of the currently running trace to store where the last largest interrupts off time was held and how it happened. During testing, this RCU lockdep dump appeared: [ 1257.829021] =============================== [ 1257.829021] [ INFO: suspicious RCU usage. ] [ 1257.829021] 3.10.0-rc1-test+ #171 Tainted: G W [ 1257.829021] ------------------------------- [ 1257.829021] /home/rostedt/work/git/linux-trace.git/include/linux/rcupdate.h:780 rcu_read_lock() used illegally while idle! [ 1257.829021] [ 1257.829021] other info that might help us debug this: [ 1257.829021] [ 1257.829021] [ 1257.829021] RCU used illegally from idle CPU! [ 1257.829021] rcu_scheduler_active = 1, debug_locks = 0 [ 1257.829021] RCU used illegally from extended quiescent state! [ 1257.829021] 2 locks held by trace-cmd/4831: [ 1257.829021] #0: (max_trace_lock){......}, at: [] stop_critical_timing+0x1a3/0x209 [ 1257.829021] #1: (rcu_read_lock){.+.+..}, at: [] __update_max_tr+0x88/0x1ee [ 1257.829021] [ 1257.829021] stack backtrace: [ 1257.829021] CPU: 3 PID: 4831 Comm: trace-cmd Tainted: G W 3.10.0-rc1-test+ #171 [ 1257.829021] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 [ 1257.829021] 0000000000000001 ffff880065f49da8 ffffffff8153dd2b ffff880065f49dd8 [ 1257.829021] ffffffff81092a00 ffff88006bd78680 ffff88007add7500 0000000000000003 [ 1257.829021] ffff88006bd78680 ffff880065f49e18 ffffffff810daebf ffffffff810dae5a [ 1257.829021] Call Trace: [ 1257.829021] [] dump_stack+0x19/0x1b [ 1257.829021] [] lockdep_rcu_suspicious+0x109/0x112 [ 1257.829021] [] __update_max_tr+0xed/0x1ee [ 1257.829021] [] ? __update_max_tr+0x88/0x1ee [ 1257.829021] [] ? user_enter+0xfd/0x107 [ 1257.829021] [] update_max_tr_single+0x11d/0x12d [ 1257.829021] [] ? user_enter+0xfd/0x107 [ 1257.829021] [] stop_critical_timing+0x141/0x209 [ 1257.829021] [] ? trace_hardirqs_on+0xd/0xf [ 1257.829021] [] ? user_enter+0xfd/0x107 [ 1257.829021] [] time_hardirqs_on+0x2a/0x2f [ 1257.829021] [] ? user_enter+0xfd/0x107 [ 1257.829021] [] trace_hardirqs_on_caller+0x16/0x197 [ 1257.829021] [] trace_hardirqs_on+0xd/0xf [ 1257.829021] [] user_enter+0xfd/0x107 [ 1257.829021] [] do_notify_resume+0x92/0x97 [ 1257.829021] [] int_signal+0x12/0x17 What happened was entering into the user code, the interrupts were enabled and a max interrupts off was recorded. The trace buffer was saved along with various information about the task: comm, pid, uid, priority, etc. The uid is recorded with task_uid(tsk). But this is a macro that uses rcu_read_lock() to retrieve the data, and this happened to happen where RCU is blind (user_enter). As only the preempt and irqs off tracers can have this happen, and they both only have the tsk == current, if tsk == current, use current_uid() instead of task_uid(), as current_uid() does not use RCU as only current can change its uid. This fixes the RCU suspicious splat. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4d79485b3237..1a41023a1f88 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -843,7 +843,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; - max_data->uid = task_uid(tsk); + /* + * If tsk == current, then use current_uid(), as that does not use + * RCU. The irq tracer can be called out of RCU scope. + */ + if (tsk == current) + max_data->uid = current_uid(); + else + max_data->uid = task_uid(tsk); + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; max_data->policy = tsk->policy; max_data->rt_priority = tsk->rt_priority; -- cgit v1.2.3 From 346dbb79ea0118ebb0df372b35cab9d5805216cd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Apr 2013 19:28:54 +0200 Subject: irqdomain: export irq_domain_add_simple All other irq_domain_add_* functions are exported already, and apparently this one got left out by mistake, which causes build errors for ARM allmodconfig kernels: ERROR: "irq_domain_add_simple" [drivers/gpio/gpio-rcar.ko] undefined! ERROR: "irq_domain_add_simple" [drivers/gpio/gpio-em.ko] undefined! Signed-off-by: Arnd Bergmann Acked-by: Simon Horman Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5a83dde8ca0c..d1adaedb435f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -191,6 +191,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, /* A linear domain is the default */ return irq_domain_add_linear(of_node, size, ops, host_data); } +EXPORT_SYMBOL_GPL(irq_domain_add_simple); /** * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. -- cgit v1.2.3 From 275e31b10ce20613aedceaa5160129c64b260a98 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 14 May 2013 19:02:45 +0800 Subject: kernel/irq/irqdomain.c: before use 'irq_data', need check it whether valid. Since irq_data may be NULL, if so, we WARN_ON(), and continue, 'hwirq' which related with 'irq_data' has to initialize later, or it will cause issue. Signed-off-by: Chen Gang Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d1adaedb435f..8c4c8ea6a205 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -398,11 +398,12 @@ static void irq_domain_disassociate_many(struct irq_domain *domain, while (count--) { int irq = irq_base + count; struct irq_data *irq_data = irq_get_irq_data(irq); - irq_hw_number_t hwirq = irq_data->hwirq; + irq_hw_number_t hwirq; if (WARN_ON(!irq_data || irq_data->domain != domain)) continue; + hwirq = irq_data->hwirq; irq_set_status_flags(irq, IRQ_NOREQUEST); /* remove chip and handler */ -- cgit v1.2.3 From 94a63da0ac1a67bfb8b30aec1086523c5031ea5a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 6 Jun 2013 12:10:23 +0100 Subject: irqdomain: document the simple domain first_irq The first_irq needs to be zero to get a linear domain and that comes with special semantics. We want to simplify this going forward but some documentation never hurts. Signed-off-by: Linus Walleij Signed-off-by: Grant Likely --- kernel/irq/irqdomain.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8c4c8ea6a205..54a4d5223238 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -143,7 +143,10 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, * irq_domain_add_simple() - Allocate and register a simple irq_domain. * @of_node: pointer to interrupt controller's device tree node. * @size: total number of irqs in mapping - * @first_irq: first number of irq block assigned to the domain + * @first_irq: first number of irq block assigned to the domain, + * pass zero to assign irqs on-the-fly. This will result in a + * linear IRQ domain so it is important to use irq_create_mapping() + * for each used IRQ, especially when SPARSE_IRQ is enabled. * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * -- cgit v1.2.3 From 016a8d5be6ddcc72ef0432d82d9f6fa34f61b907 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 May 2013 17:32:53 -0400 Subject: rcu: Don't call wakeup() with rcu_node structure ->lock held This commit fixes a lockdep-detected deadlock by moving a wake_up() call out from a rnp->lock critical section. Please see below for the long version of this story. On Tue, 2013-05-28 at 16:13 -0400, Dave Jones wrote: > [12572.705832] ====================================================== > [12572.750317] [ INFO: possible circular locking dependency detected ] > [12572.796978] 3.10.0-rc3+ #39 Not tainted > [12572.833381] ------------------------------------------------------- > [12572.862233] trinity-child17/31341 is trying to acquire lock: > [12572.870390] (rcu_node_0){..-.-.}, at: [] rcu_read_unlock_special+0x9f/0x4c0 > [12572.878859] > but task is already holding lock: > [12572.894894] (&ctx->lock){-.-...}, at: [] perf_lock_task_context+0x7d/0x2d0 > [12572.903381] > which lock already depends on the new lock. > > [12572.927541] > the existing dependency chain (in reverse order) is: > [12572.943736] > -> #4 (&ctx->lock){-.-...}: > [12572.960032] [] lock_acquire+0x91/0x1f0 > [12572.968337] [] _raw_spin_lock+0x40/0x80 > [12572.976633] [] __perf_event_task_sched_out+0x2e7/0x5e0 > [12572.984969] [] perf_event_task_sched_out+0x93/0xa0 > [12572.993326] [] __schedule+0x2cf/0x9c0 > [12573.001652] [] schedule_user+0x2e/0x70 > [12573.009998] [] retint_careful+0x12/0x2e > [12573.018321] > -> #3 (&rq->lock){-.-.-.}: > [12573.034628] [] lock_acquire+0x91/0x1f0 > [12573.042930] [] _raw_spin_lock+0x40/0x80 > [12573.051248] [] wake_up_new_task+0xb7/0x260 > [12573.059579] [] do_fork+0x105/0x470 > [12573.067880] [] kernel_thread+0x26/0x30 > [12573.076202] [] rest_init+0x23/0x140 > [12573.084508] [] start_kernel+0x3f1/0x3fe > [12573.092852] [] x86_64_start_reservations+0x2a/0x2c > [12573.101233] [] x86_64_start_kernel+0xcc/0xcf > [12573.109528] > -> #2 (&p->pi_lock){-.-.-.}: > [12573.125675] [] lock_acquire+0x91/0x1f0 > [12573.133829] [] _raw_spin_lock_irqsave+0x4b/0x90 > [12573.141964] [] try_to_wake_up+0x31/0x320 > [12573.150065] [] default_wake_function+0x12/0x20 > [12573.158151] [] autoremove_wake_function+0x18/0x40 > [12573.166195] [] __wake_up_common+0x58/0x90 > [12573.174215] [] __wake_up+0x39/0x50 > [12573.182146] [] rcu_start_gp_advanced.isra.11+0x4a/0x50 > [12573.190119] [] rcu_start_future_gp+0x1c9/0x1f0 > [12573.198023] [] rcu_nocb_kthread+0x114/0x930 > [12573.205860] [] kthread+0xed/0x100 > [12573.213656] [] ret_from_fork+0x7c/0xb0 > [12573.221379] > -> #1 (&rsp->gp_wq){..-.-.}: > [12573.236329] [] lock_acquire+0x91/0x1f0 > [12573.243783] [] _raw_spin_lock_irqsave+0x4b/0x90 > [12573.251178] [] __wake_up+0x23/0x50 > [12573.258505] [] rcu_start_gp_advanced.isra.11+0x4a/0x50 > [12573.265891] [] rcu_start_future_gp+0x1c9/0x1f0 > [12573.273248] [] rcu_nocb_kthread+0x114/0x930 > [12573.280564] [] kthread+0xed/0x100 > [12573.287807] [] ret_from_fork+0x7c/0xb0 Notice the above call chain. rcu_start_future_gp() is called with the rnp->lock held. Then it calls rcu_start_gp_advance, which does a wakeup. You can't do wakeups while holding the rnp->lock, as that would mean that you could not do a rcu_read_unlock() while holding the rq lock, or any lock that was taken while holding the rq lock. This is because... (See below). > [12573.295067] > -> #0 (rcu_node_0){..-.-.}: > [12573.309293] [] __lock_acquire+0x1786/0x1af0 > [12573.316568] [] lock_acquire+0x91/0x1f0 > [12573.323825] [] _raw_spin_lock+0x40/0x80 > [12573.331081] [] rcu_read_unlock_special+0x9f/0x4c0 > [12573.338377] [] __rcu_read_unlock+0x96/0xa0 > [12573.345648] [] perf_lock_task_context+0x143/0x2d0 > [12573.352942] [] find_get_context+0x4e/0x1f0 > [12573.360211] [] SYSC_perf_event_open+0x514/0xbd0 > [12573.367514] [] SyS_perf_event_open+0x9/0x10 > [12573.374816] [] tracesys+0xdd/0xe2 Notice the above trace. perf took its own ctx->lock, which can be taken while holding the rq lock. While holding this lock, it did a rcu_read_unlock(). The perf_lock_task_context() basically looks like: rcu_read_lock(); raw_spin_lock(ctx->lock); rcu_read_unlock(); Now, what looks to have happened, is that we scheduled after taking that first rcu_read_lock() but before taking the spin lock. When we scheduled back in and took the ctx->lock, the following rcu_read_unlock() triggered the "special" code. The rcu_read_unlock_special() takes the rnp->lock, which gives us a possible deadlock scenario. CPU0 CPU1 CPU2 ---- ---- ---- rcu_nocb_kthread() lock(rq->lock); lock(ctx->lock); lock(rnp->lock); wake_up(); lock(rq->lock); rcu_read_unlock(); rcu_read_unlock_special(); lock(rnp->lock); lock(ctx->lock); **** DEADLOCK **** > [12573.382068] > other info that might help us debug this: > > [12573.403229] Chain exists of: > rcu_node_0 --> &rq->lock --> &ctx->lock > > [12573.424471] Possible unsafe locking scenario: > > [12573.438499] CPU0 CPU1 > [12573.445599] ---- ---- > [12573.452691] lock(&ctx->lock); > [12573.459799] lock(&rq->lock); > [12573.467010] lock(&ctx->lock); > [12573.474192] lock(rcu_node_0); > [12573.481262] > *** DEADLOCK *** > > [12573.501931] 1 lock held by trinity-child17/31341: > [12573.508990] #0: (&ctx->lock){-.-...}, at: [] perf_lock_task_context+0x7d/0x2d0 > [12573.516475] > stack backtrace: > [12573.530395] CPU: 1 PID: 31341 Comm: trinity-child17 Not tainted 3.10.0-rc3+ #39 > [12573.545357] ffffffff825b4f90 ffff880219f1dbc0 ffffffff816e375b ffff880219f1dc00 > [12573.552868] ffffffff816dfa5d ffff880219f1dc50 ffff88023ce4d1f8 ffff88023ce4ca40 > [12573.560353] 0000000000000001 0000000000000001 ffff88023ce4d1f8 ffff880219f1dcc0 > [12573.567856] Call Trace: > [12573.575011] [] dump_stack+0x19/0x1b > [12573.582284] [] print_circular_bug+0x200/0x20f > [12573.589637] [] __lock_acquire+0x1786/0x1af0 > [12573.596982] [] ? sched_clock_cpu+0xb5/0x100 > [12573.604344] [] lock_acquire+0x91/0x1f0 > [12573.611652] [] ? rcu_read_unlock_special+0x9f/0x4c0 > [12573.619030] [] _raw_spin_lock+0x40/0x80 > [12573.626331] [] ? rcu_read_unlock_special+0x9f/0x4c0 > [12573.633671] [] rcu_read_unlock_special+0x9f/0x4c0 > [12573.640992] [] ? perf_lock_task_context+0x7d/0x2d0 > [12573.648330] [] ? put_lock_stats.isra.29+0xe/0x40 > [12573.655662] [] ? delay_tsc+0x90/0xe0 > [12573.662964] [] __rcu_read_unlock+0x96/0xa0 > [12573.670276] [] perf_lock_task_context+0x143/0x2d0 > [12573.677622] [] ? __perf_event_enable+0x370/0x370 > [12573.684981] [] find_get_context+0x4e/0x1f0 > [12573.692358] [] SYSC_perf_event_open+0x514/0xbd0 > [12573.699753] [] ? get_parent_ip+0xd/0x50 > [12573.707135] [] ? trace_hardirqs_on_caller+0xfd/0x1c0 > [12573.714599] [] SyS_perf_event_open+0x9/0x10 > [12573.721996] [] tracesys+0xdd/0xe2 This commit delays the wakeup via irq_work(), which is what perf and ftrace use to perform wakeups in critical sections. Reported-by: Dave Jones Signed-off-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 17 +++++++++++++++-- kernel/rcutree.h | 2 ++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 16ea67925015..b61d20c5ee7b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1613,6 +1613,14 @@ static int __noreturn rcu_gp_kthread(void *arg) } } +static void rsp_wakeup(struct irq_work *work) +{ + struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); + + /* Wake up rcu_gp_kthread() to start the grace period. */ + wake_up(&rsp->gp_wq); +} + /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold @@ -1637,8 +1645,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, } rsp->gp_flags = RCU_GP_FLAG_INIT; - /* Wake up rcu_gp_kthread() to start the grace period. */ - wake_up(&rsp->gp_wq); + /* + * We can't do wakeups while holding the rnp->lock, as that + * could cause possible deadlocks with the rq->lock. Deter + * the wakeup to interrupt context. + */ + irq_work_queue(&rsp->wakeup_work); } /* @@ -3235,6 +3247,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, rsp->rda = rda; init_waitqueue_head(&rsp->gp_wq); + init_irq_work(&rsp->wakeup_work, rsp_wakeup); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index da77a8f57ff9..4df503470e42 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -27,6 +27,7 @@ #include #include #include +#include /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -442,6 +443,7 @@ struct rcu_state { char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ + struct irq_work wakeup_work; /* Postponed wakeups */ }; /* Values for rcu_state structure's gp_flags field. */ -- cgit v1.2.3 From 971394f389992f8462c4e5ae0e3b49a10a9534a3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 2 Jun 2013 07:13:57 -0700 Subject: rcu: Fix deadlock with CPU hotplug, RCU GP init, and timer migration In Steven Rostedt's words: > I've been debugging the last couple of days why my tests have been > locking up. One of my tracing tests, runs all available tracers. The > lockup always happened with the mmiotrace, which is used to trace > interactions between priority drivers and the kernel. But to do this > easily, when the tracer gets registered, it disables all but the boot > CPUs. The lockup always happened after it got done disabling the CPUs. > > Then I decided to try this: > > while :; do > for i in 1 2 3; do > echo 0 > /sys/devices/system/cpu/cpu$i/online > done > for i in 1 2 3; do > echo 1 > /sys/devices/system/cpu/cpu$i/online > done > done > > Well, sure enough, that locked up too, with the same users. Doing a > sysrq-w (showing all blocked tasks): > > [ 2991.344562] task PC stack pid father > [ 2991.344562] rcu_preempt D ffff88007986fdf8 0 10 2 0x00000000 > [ 2991.344562] ffff88007986fc98 0000000000000002 ffff88007986fc48 0000000000000908 > [ 2991.344562] ffff88007986c280 ffff88007986ffd8 ffff88007986ffd8 00000000001d3c80 > [ 2991.344562] ffff880079248a40 ffff88007986c280 0000000000000000 00000000fffd4295 > [ 2991.344562] Call Trace: > [ 2991.344562] [] schedule+0x64/0x66 > [ 2991.344562] [] schedule_timeout+0xbc/0xf9 > [ 2991.344562] [] ? ftrace_call+0x5/0x2f > [ 2991.344562] [] ? cascade+0xa8/0xa8 > [ 2991.344562] [] schedule_timeout_uninterruptible+0x1e/0x20 > [ 2991.344562] [] rcu_gp_kthread+0x502/0x94b > [ 2991.344562] [] ? __init_waitqueue_head+0x50/0x50 > [ 2991.344562] [] ? rcu_gp_fqs+0x64/0x64 > [ 2991.344562] [] kthread+0xb1/0xb9 > [ 2991.344562] [] ? lock_release_holdtime.part.23+0x4e/0x55 > [ 2991.344562] [] ? __init_kthread_worker+0x58/0x58 > [ 2991.344562] [] ret_from_fork+0x7c/0xb0 > [ 2991.344562] [] ? __init_kthread_worker+0x58/0x58 > [ 2991.344562] kworker/0:1 D ffffffff81a30680 0 47 2 0x00000000 > [ 2991.344562] Workqueue: events cpuset_hotplug_workfn > [ 2991.344562] ffff880078dbbb58 0000000000000002 0000000000000006 00000000000000d8 > [ 2991.344562] ffff880078db8100 ffff880078dbbfd8 ffff880078dbbfd8 00000000001d3c80 > [ 2991.344562] ffff8800779ca5c0 ffff880078db8100 ffffffff81541fcf 0000000000000000 > [ 2991.344562] Call Trace: > [ 2991.344562] [] ? __mutex_lock_common+0x3d4/0x609 > [ 2991.344562] [] schedule+0x64/0x66 > [ 2991.344562] [] schedule_preempt_disabled+0x18/0x24 > [ 2991.344562] [] __mutex_lock_common+0x3d4/0x609 > [ 2991.344562] [] ? get_online_cpus+0x3c/0x50 > [ 2991.344562] [] ? get_online_cpus+0x3c/0x50 > [ 2991.344562] [] mutex_lock_nested+0x3b/0x40 > [ 2991.344562] [] get_online_cpus+0x3c/0x50 > [ 2991.344562] [] rebuild_sched_domains_locked+0x6e/0x3a8 > [ 2991.344562] [] rebuild_sched_domains+0x1c/0x2a > [ 2991.344562] [] cpuset_hotplug_workfn+0x1c7/0x1d3 > [ 2991.344562] [] ? cpuset_hotplug_workfn+0x5/0x1d3 > [ 2991.344562] [] process_one_work+0x2d4/0x4d1 > [ 2991.344562] [] ? process_one_work+0x207/0x4d1 > [ 2991.344562] [] worker_thread+0x2e7/0x3b5 > [ 2991.344562] [] ? rescuer_thread+0x332/0x332 > [ 2991.344562] [] kthread+0xb1/0xb9 > [ 2991.344562] [] ? __init_kthread_worker+0x58/0x58 > [ 2991.344562] [] ret_from_fork+0x7c/0xb0 > [ 2991.344562] [] ? __init_kthread_worker+0x58/0x58 > [ 2991.344562] bash D ffffffff81a4aa80 0 2618 2612 0x10000000 > [ 2991.344562] ffff8800379abb58 0000000000000002 0000000000000006 0000000000000c2c > [ 2991.344562] ffff880077fea140 ffff8800379abfd8 ffff8800379abfd8 00000000001d3c80 > [ 2991.344562] ffff8800779ca5c0 ffff880077fea140 ffffffff81541fcf 0000000000000000 > [ 2991.344562] Call Trace: > [ 2991.344562] [] ? __mutex_lock_common+0x3d4/0x609 > [ 2991.344562] [] schedule+0x64/0x66 > [ 2991.344562] [] schedule_preempt_disabled+0x18/0x24 > [ 2991.344562] [] __mutex_lock_common+0x3d4/0x609 > [ 2991.344562] [] ? rcu_cpu_notify+0x2f5/0x86e > [ 2991.344562] [] ? rcu_cpu_notify+0x2f5/0x86e > [ 2991.344562] [] mutex_lock_nested+0x3b/0x40 > [ 2991.344562] [] rcu_cpu_notify+0x2f5/0x86e > [ 2991.344562] [] ? __lock_is_held+0x32/0x53 > [ 2991.344562] [] notifier_call_chain+0x6b/0x98 > [ 2991.344562] [] __raw_notifier_call_chain+0xe/0x10 > [ 2991.344562] [] __cpu_notify+0x20/0x32 > [ 2991.344562] [] cpu_notify_nofail+0x17/0x36 > [ 2991.344562] [] _cpu_down+0x154/0x259 > [ 2991.344562] [] cpu_down+0x2d/0x3a > [ 2991.344562] [] store_online+0x4e/0xe7 > [ 2991.344562] [] dev_attr_store+0x20/0x22 > [ 2991.344562] [] sysfs_write_file+0x108/0x144 > [ 2991.344562] [] vfs_write+0xfd/0x158 > [ 2991.344562] [] SyS_write+0x5c/0x83 > [ 2991.344562] [] tracesys+0xdd/0xe2 > > As well as held locks: > > [ 3034.728033] Showing all locks held in the system: > [ 3034.728033] 1 lock held by rcu_preempt/10: > [ 3034.728033] #0: (rcu_preempt_state.onoff_mutex){+.+...}, at: [] rcu_gp_kthread+0x167/0x94b > [ 3034.728033] 4 locks held by kworker/0:1/47: > [ 3034.728033] #0: (events){.+.+.+}, at: [] process_one_work+0x207/0x4d1 > [ 3034.728033] #1: (cpuset_hotplug_work){+.+.+.}, at: [] process_one_work+0x207/0x4d1 > [ 3034.728033] #2: (cpuset_mutex){+.+.+.}, at: [] rebuild_sched_domains+0x17/0x2a > [ 3034.728033] #3: (cpu_hotplug.lock){+.+.+.}, at: [] get_online_cpus+0x3c/0x50 > [ 3034.728033] 1 lock held by mingetty/2563: > [ 3034.728033] #0: (&ldata->atomic_read_lock){+.+...}, at: [] n_tty_read+0x252/0x7e8 > [ 3034.728033] 1 lock held by mingetty/2565: > [ 3034.728033] #0: (&ldata->atomic_read_lock){+.+...}, at: [] n_tty_read+0x252/0x7e8 > [ 3034.728033] 1 lock held by mingetty/2569: > [ 3034.728033] #0: (&ldata->atomic_read_lock){+.+...}, at: [] n_tty_read+0x252/0x7e8 > [ 3034.728033] 1 lock held by mingetty/2572: > [ 3034.728033] #0: (&ldata->atomic_read_lock){+.+...}, at: [] n_tty_read+0x252/0x7e8 > [ 3034.728033] 1 lock held by mingetty/2575: > [ 3034.728033] #0: (&ldata->atomic_read_lock){+.+...}, at: [] n_tty_read+0x252/0x7e8 > [ 3034.728033] 7 locks held by bash/2618: > [ 3034.728033] #0: (sb_writers#5){.+.+.+}, at: [] file_start_write+0x2a/0x2c > [ 3034.728033] #1: (&buffer->mutex#2){+.+.+.}, at: [] sysfs_write_file+0x3c/0x144 > [ 3034.728033] #2: (s_active#54){.+.+.+}, at: [] sysfs_write_file+0xe7/0x144 > [ 3034.728033] #3: (x86_cpu_hotplug_driver_mutex){+.+.+.}, at: [] cpu_hotplug_driver_lock+0x17/0x19 > [ 3034.728033] #4: (cpu_add_remove_lock){+.+.+.}, at: [] cpu_maps_update_begin+0x17/0x19 > [ 3034.728033] #5: (cpu_hotplug.lock){+.+.+.}, at: [] cpu_hotplug_begin+0x2c/0x6d > [ 3034.728033] #6: (rcu_preempt_state.onoff_mutex){+.+...}, at: [] rcu_cpu_notify+0x2f5/0x86e > [ 3034.728033] 1 lock held by bash/2980: > [ 3034.728033] #0: (&ldata->atomic_read_lock){+.+...}, at: [] n_tty_read+0x252/0x7e8 > > Things looked a little weird. Also, this is a deadlock that lockdep did > not catch. But what we have here does not look like a circular lock > issue: > > Bash is blocked in rcu_cpu_notify(): > > 1961 /* Exclude any attempts to start a new grace period. */ > 1962 mutex_lock(&rsp->onoff_mutex); > > > kworker is blocked in get_online_cpus(), which makes sense as we are > currently taking down a CPU. > > But rcu_preempt is not blocked on anything. It is simply sleeping in > rcu_gp_kthread (really rcu_gp_init) here: > > 1453 #ifdef CONFIG_PROVE_RCU_DELAY > 1454 if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && > 1455 system_state == SYSTEM_RUNNING) > 1456 schedule_timeout_uninterruptible(2); > 1457 #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ > > And it does this while holding the onoff_mutex that bash is waiting for. > > Doing a function trace, it showed me where it happened: > > [ 125.940066] rcu_pree-10 3.... 28384115273: schedule_timeout_uninterruptible <-rcu_gp_kthread > [...] > [ 125.940066] rcu_pree-10 3d..3 28384202439: sched_switch: prev_comm=rcu_preempt prev_pid=10 prev_prio=120 prev_state=D ==> next_comm=watchdog/3 next_pid=38 next_prio=120 > > The watchdog ran, and then: > > [ 125.940066] watchdog-38 3d..3 28384692863: sched_switch: prev_comm=watchdog/3 prev_pid=38 prev_prio=120 prev_state=P ==> next_comm=modprobe next_pid=2848 next_prio=118 > > Not sure what modprobe was doing, but shortly after that: > > [ 125.940066] modprobe-2848 3d..3 28385041749: sched_switch: prev_comm=modprobe prev_pid=2848 prev_prio=118 prev_state=R+ ==> next_comm=migration/3 next_pid=40 next_prio=0 > > Where the migration thread took down the CPU: > > [ 125.940066] migratio-40 3d..3 28389148276: sched_switch: prev_comm=migration/3 prev_pid=40 prev_prio=0 prev_state=P ==> next_comm=swapper/3 next_pid=0 next_prio=120 > > which finally did: > > [ 125.940066] -0 3...1 28389282142: arch_cpu_idle_dead <-cpu_startup_entry > [ 125.940066] -0 3...1 28389282548: native_play_dead <-arch_cpu_idle_dead > [ 125.940066] -0 3...1 28389282924: play_dead_common <-native_play_dead > [ 125.940066] -0 3...1 28389283468: idle_task_exit <-play_dead_common > [ 125.940066] -0 3...1 28389284644: amd_e400_remove_cpu <-play_dead_common > > > CPU 3 is now offline, the rcu_preempt thread that ran on CPU 3 is still > doing a schedule_timeout_uninterruptible() and it registered it's > timeout to the timer base for CPU 3. You would think that it would get > migrated right? The issue here is that the timer migration happens at > the CPU notifier for CPU_DEAD. The problem is that the rcu notifier for > CPU_DOWN is blocked waiting for the onoff_mutex to be released, which is > held by the thread that just put itself into a uninterruptible sleep, > that wont wake up until the CPU_DEAD notifier of the timer > infrastructure is called, which wont happen until the rcu notifier > finishes. Here's our deadlock! This commit breaks this deadlock cycle by substituting a shorter udelay() for the previous schedule_timeout_uninterruptible(), while at the same time increasing the probability of the delay. This maintains the intensity of the testing. Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Tested-by: Steven Rostedt --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b61d20c5ee7b..35380019f0fc 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1451,9 +1451,9 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); #ifdef CONFIG_PROVE_RCU_DELAY - if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && + if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && system_state == SYSTEM_RUNNING) - schedule_timeout_uninterruptible(2); + udelay(200); #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); } -- cgit v1.2.3 From 34376a50fb1fa095b9d0636fa41ed2e73125f214 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 6 Jun 2013 14:29:49 -0700 Subject: Fix lockup related to stop_machine being stuck in __do_softirq. The stop machine logic can lock up if all but one of the migration threads make it through the disable-irq step and the one remaining thread gets stuck in __do_softirq. The reason __do_softirq can hang is that it has a bail-out based on jiffies timeout, but in the lockup case, jiffies itself is not incremented. To work around this, re-add the max_restart counter in __do_irq and stop processing irqs after 10 restarts. Thanks to Tejun Heo and Rusty Russell and others for helping me track this down. This was introduced in 3.9 by commit c10d73671ad3 ("softirq: reduce latencies"). It may be worth looking into ath9k to see if it has issues with its irq handler at a later date. The hang stack traces look something like this: ------------[ cut here ]------------ WARNING: at kernel/watchdog.c:245 watchdog_overflow_callback+0x9c/0xa7() Watchdog detected hard LOCKUP on cpu 2 Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen lockd sunrpc] Pid: 23, comm: migration/2 Tainted: G C 3.9.4+ #11 Call Trace: warn_slowpath_common+0x85/0x9f warn_slowpath_fmt+0x46/0x48 watchdog_overflow_callback+0x9c/0xa7 __perf_event_overflow+0x137/0x1cb perf_event_overflow+0x14/0x16 intel_pmu_handle_irq+0x2dc/0x359 perf_event_nmi_handler+0x19/0x1b nmi_handle+0x7f/0xc2 do_nmi+0xbc/0x304 end_repeat_nmi+0x1e/0x2e <> cpu_stopper_thread+0xae/0x162 smpboot_thread_fn+0x258/0x260 kthread+0xc7/0xcf ret_from_fork+0x7c/0xb0 ---[ end trace 4947dfa9b0a4cec3 ]--- BUG: soft lockup - CPU#1 stuck for 22s! [migration/1:17] Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen lockd sunrpc] irq event stamp: 835637905 hardirqs last enabled at (835637904): __do_softirq+0x9f/0x257 hardirqs last disabled at (835637905): apic_timer_interrupt+0x6d/0x80 softirqs last enabled at (5654720): __do_softirq+0x1ff/0x257 softirqs last disabled at (5654725): irq_exit+0x5f/0xbb CPU 1 Pid: 17, comm: migration/1 Tainted: G WC 3.9.4+ #11 To be filled by O.E.M. To be filled by O.E.M./To be filled by O.E.M. RIP: tasklet_hi_action+0xf0/0xf0 Process migration/1 Call Trace: __do_softirq+0x117/0x257 irq_exit+0x5f/0xbb smp_apic_timer_interrupt+0x8a/0x98 apic_timer_interrupt+0x72/0x80 printk+0x4d/0x4f stop_machine_cpu_stop+0x22c/0x274 cpu_stopper_thread+0xae/0x162 smpboot_thread_fn+0x258/0x260 kthread+0xc7/0xcf ret_from_fork+0x7c/0xb0 Signed-off-by: Ben Greear Acked-by: Tejun Heo Acked-by: Pekka Riikonen Cc: Eric Dumazet Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- kernel/softirq.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index b5197dcb0dad..3d6833f125d3 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -195,8 +195,12 @@ void local_bh_enable_ip(unsigned long ip) EXPORT_SYMBOL(local_bh_enable_ip); /* - * We restart softirq processing for at most 2 ms, - * and if need_resched() is not set. + * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, + * but break the loop if need_resched() is set or after 2 ms. + * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in + * certain cases, such as stop_machine(), jiffies may cease to + * increment and so we need the MAX_SOFTIRQ_RESTART limit as + * well to make sure we eventually return from this method. * * These limits have been established via experimentation. * The two things to balance is latency against fairness - @@ -204,6 +208,7 @@ EXPORT_SYMBOL(local_bh_enable_ip); * should not be able to lock up the box. */ #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) +#define MAX_SOFTIRQ_RESTART 10 asmlinkage void __do_softirq(void) { @@ -212,6 +217,7 @@ asmlinkage void __do_softirq(void) unsigned long end = jiffies + MAX_SOFTIRQ_TIME; int cpu; unsigned long old_flags = current->flags; + int max_restart = MAX_SOFTIRQ_RESTART; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -265,7 +271,8 @@ restart: pending = local_softirq_pending(); if (pending) { - if (time_before(jiffies, end) && !need_resched()) + if (time_before(jiffies, end) && !need_resched() && + --max_restart) goto restart; wakeup_softirqd(); -- cgit v1.2.3 From 58e8eedf18577c7eac722d5d1f190507ea263d1b Mon Sep 17 00:00:00 2001 From: Yoshihiro YUNOMAE Date: Tue, 23 Apr 2013 10:32:39 +0900 Subject: tracing: Fix outputting formats of x86-tsc and counter when use trace_clock Outputting formats of x86-tsc and counter should be a raw format, but after applying the patch(2b6080f28c7cc3efc8625ab71495aae89aeb63a0), the format was changed to nanosec. This is because the global variable trace_clock_id was used. When we use multiple buffers, clock_id of each sub-buffer should be used. Then, this patch uses tr->clock_id instead of the global variable trace_clock_id. [ Basically, this fixes a regression where the multibuffer code changed the trace_clock file to update tr->clock_id but the traces still use the old global trace_clock_id variable, negating the file's effect. The global trace_clock_id variable is obsolete and removed. - SR ] Link: http://lkml.kernel.org/r/20130423013239.22334.7394.stgit@yunodevel Signed-off-by: Yoshihiro YUNOMAE Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +++----- kernel/trace/trace.h | 2 -- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a41023a1f88..e71a8be4a6ee 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -652,8 +652,6 @@ static struct { ARCH_TRACE_CLOCKS }; -int trace_clock_id; - /* * trace_parser_get_init - gets the buffer for trace parser */ @@ -2826,7 +2824,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ - if (trace_clocks[trace_clock_id].in_ns) + if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; /* stop the trace while dumping if we are not opening "snapshot" */ @@ -3825,7 +3823,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->iter_flags |= TRACE_FILE_LAT_FMT; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ - if (trace_clocks[trace_clock_id].in_ns) + if (trace_clocks[tr->clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; iter->cpu_file = tc->cpu; @@ -5095,7 +5093,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); - if (trace_clocks[trace_clock_id].in_ns) { + if (trace_clocks[tr->clock_id].in_ns) { /* local or global for trace_clock */ t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 711ca7d3e7f1..20572ed88c5c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -700,8 +700,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; -extern int trace_clock_id; - /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -- cgit v1.2.3 From d7880812b3594d3c6dcbe3cfd71dabb17347d082 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2013 16:52:03 +0200 Subject: idle: Add the stack canary init to cpu_startup_entry() Moving x86 to the generic idle implementation (commit 7d1a9417 "x86: Use generic idle loop") wreckaged the stack protector. I stupidly missed that boot_init_stack_canary() must be inlined from a function which never returns, but I put that call into arch_cpu_idle_prepare() which of course returns. I pondered to play tricks with arch_cpu_idle_prepare() first, but then I noticed, that the other archs which have implemented the stackprotector (ARM and SH) do not initialize the canary for the non-boot cpus. So I decided to move the boot_init_stack_canary() call into cpu_startup_entry() ifdeffed with an CONFIG_X86 for now. This #ifdef is just a temporary measure as I don't want to inflict the boot_init_stack_canary() call on ARM and SH that late in the cycle. I'll queue a patch for 3.11 which removes the #ifdef if the ARM/SH maintainers have no objection. Reported-by: Wouter van Kesteren Cc: x86@kernel.org Cc: Russell King Cc: Paul Mundt Signed-off-by: Thomas Gleixner --- kernel/cpu/idle.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index d5585f5e038e..bf2ee1aafa0e 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -112,6 +113,21 @@ static void cpu_idle_loop(void) void cpu_startup_entry(enum cpuhp_state state) { + /* + * This #ifdef needs to die, but it's too late in the cycle to + * make this generic (arm and sh have never invoked the canary + * init for the non boot cpus!). Will be fixed in 3.11 + */ +#ifdef CONFIG_X86 + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. The boot CPU already has it initialized but no harm + * in doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); +#endif current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); -- cgit v1.2.3 From 16e53dbf10a2d7e228709a7286310e629ede5e45 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 12 Jun 2013 14:04:36 -0700 Subject: CPU hotplug: provide a generic helper to disable/enable CPU hotplug There are instances in the kernel where we would like to disable CPU hotplug (from sysfs) during some important operation. Today the freezer code depends on this and the code to do it was kinda tailor-made for that. Restructure the code and make it generic enough to be useful for other usecases too. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Shawn Guo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 55 +++++++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index b5e4ab2d427e..198a38883e64 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -133,6 +133,27 @@ static void cpu_hotplug_done(void) mutex_unlock(&cpu_hotplug.lock); } +/* + * Wait for currently running CPU hotplug operations to complete (if any) and + * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects + * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the + * hotplug path before performing hotplug operations. So acquiring that lock + * guarantees mutual exclusion from any currently running hotplug operations. + */ +void cpu_hotplug_disable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + +void cpu_hotplug_enable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} @@ -540,36 +561,6 @@ static int __init alloc_frozen_cpus(void) } core_initcall(alloc_frozen_cpus); -/* - * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU - * hotplug when tasks are about to be frozen. Also, don't allow the freezer - * to continue until any currently running CPU hotplug operation gets - * completed. - * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the - * 'cpu_add_remove_lock'. And this same lock is also taken by the regular - * CPU hotplug path and released only after it is complete. Thus, we - * (and hence the freezer) will block here until any currently running CPU - * hotplug operation gets completed. - */ -void cpu_hotplug_disable_before_freeze(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; - cpu_maps_update_done(); -} - - -/* - * When tasks have been thawed, re-enable regular CPU hotplug (which had been - * disabled while beginning to freeze tasks). - */ -void cpu_hotplug_enable_after_thaw(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - cpu_maps_update_done(); -} - /* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen @@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - cpu_hotplug_disable_before_freeze(); + cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - cpu_hotplug_enable_after_thaw(); + cpu_hotplug_enable(); break; default: -- cgit v1.2.3 From cf7df378aa4ff7da3a44769b7ff6e9eef1a9f3db Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Wed, 12 Jun 2013 14:04:37 -0700 Subject: reboot: rigrate shutdown/reboot to boot cpu We recently noticed that reboot of a 1024 cpu machine takes approx 16 minutes of just stopping the cpus. The slowdown was tracked to commit f96972f2dc63 ("kernel/sys.c: call disable_nonboot_cpus() in kernel_restart()"). The current implementation does all the work of hot removing the cpus before halting the system. We are switching to just migrating to the boot cpu and then continuing with shutdown/reboot. This also has the effect of not breaking x86's command line parameter for specifying the reboot cpu. Note, this code was shamelessly copied from arch/x86/kernel/reboot.c with bits removed pertaining to the reboot_cpu command line parameter. Signed-off-by: Robin Holt Tested-by: Shawn Guo Cc: "Srivatsa S. Bhat" Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index b95d3c72ba21..2bbd9a73b54c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +/* Add backwards compatibility for stable trees. */ +#ifndef PF_NO_SETAFFINITY +#define PF_NO_SETAFFINITY PF_THREAD_BOUND +#endif + +static void migrate_to_reboot_cpu(void) +{ + /* The boot cpu is always logical cpu 0 */ + int cpu = 0; + + cpu_hotplug_disable(); + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_online(cpu)) + cpu = cpumask_first(cpu_online_mask); + + /* Prevent races with other tasks migrating this task */ + current->flags |= PF_NO_SETAFFINITY; + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + /** * kernel_restart - reboot the system * @cmd: pointer to buffer containing command to execute for restart @@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); @@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); @@ -419,7 +442,7 @@ void kernel_power_off(void) kernel_shutdown_prepare(SYSTEM_POWER_OFF); if (pm_power_off_prepare) pm_power_off_prepare(); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); -- cgit v1.2.3 From 637241a900cbd982f744d44646b48a273d609b34 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jun 2013 14:04:39 -0700 Subject: kmsg: honor dmesg_restrict sysctl on /dev/kmsg The dmesg_restrict sysctl currently covers the syslog method for access dmesg, however /dev/kmsg isn't covered by the same protections. Most people haven't noticed because util-linux dmesg(1) defaults to using the syslog method for access in older versions. With util-linux dmesg(1) defaults to reading directly from /dev/kmsg. To fix /dev/kmsg, let's compare the existing interfaces and what they allow: - /proc/kmsg allows: - open (SYSLOG_ACTION_OPEN) if CAP_SYSLOG since it uses a destructive single-reader interface (SYSLOG_ACTION_READ). - everything, after an open. - syslog syscall allows: - anything, if CAP_SYSLOG. - SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER, if dmesg_restrict==0. - nothing else (EPERM). The use-cases were: - dmesg(1) needs to do non-destructive SYSLOG_ACTION_READ_ALLs. - sysklog(1) needs to open /proc/kmsg, drop privs, and still issue the destructive SYSLOG_ACTION_READs. AIUI, dmesg(1) is moving to /dev/kmsg, and systemd-journald doesn't clear the ring buffer. Based on the comments in devkmsg_llseek, it sounds like actions besides reading aren't going to be supported by /dev/kmsg (i.e. SYSLOG_ACTION_CLEAR), so we have a strict subset of the non-destructive syslog syscall actions. To this end, move the check as Josh had done, but also rename the constants to reflect their new uses (SYSLOG_FROM_CALL becomes SYSLOG_FROM_READER, and SYSLOG_FROM_FILE becomes SYSLOG_FROM_PROC). SYSLOG_FROM_READER allows non-destructive actions, and SYSLOG_FROM_PROC allows destructive actions after a capabilities-constrained SYSLOG_ACTION_OPEN check. - /dev/kmsg allows: - open if CAP_SYSLOG or dmesg_restrict==0 - reading/polling, after open Addresses https://bugzilla.redhat.com/show_bug.cgi?id=903192 [akpm@linux-foundation.org: use pr_warn_once()] Signed-off-by: Kees Cook Reported-by: Christian Kujau Tested-by: Josh Boyer Cc: Kay Sievers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 91 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index fa36e1494420..8212c1aef125 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -363,6 +363,53 @@ static void log_store(int facility, int level, log_next_seq++; } +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + +static int syslog_action_restricted(int type) +{ + if (dmesg_restrict) + return 1; + /* + * Unless restricted, we allow "read all" and "get buffer size" + * for everybody. + */ + return type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER; +} + +static int check_syslog_permissions(int type, bool from_file) +{ + /* + * If this is from /proc/kmsg and we've already opened it, then we've + * already done the capabilities checks at open time. + */ + if (from_file && type != SYSLOG_ACTION_OPEN) + return 0; + + if (syslog_action_restricted(type)) { + if (capable(CAP_SYSLOG)) + return 0; + /* + * For historical reasons, accept CAP_SYS_ADMIN too, with + * a warning. + */ + if (capable(CAP_SYS_ADMIN)) { + pr_warn_once("%s (%d): Attempt to access syslog with " + "CAP_SYS_ADMIN but no CAP_SYSLOG " + "(deprecated).\n", + current->comm, task_pid_nr(current)); + return 0; + } + return -EPERM; + } + return security_syslog(type); +} + + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; @@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file) if ((file->f_flags & O_ACCMODE) == O_WRONLY) return 0; - err = security_syslog(SYSLOG_ACTION_READ_ALL); + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); if (err) return err; @@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level) } #endif -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif - -static int syslog_action_restricted(int type) -{ - if (dmesg_restrict) - return 1; - /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ - return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; -} - -static int check_syslog_permissions(int type, bool from_file) -{ - /* - * If this is from /proc/kmsg and we've already opened it, then we've - * already done the capabilities checks at open time. - */ - if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; - - if (syslog_action_restricted(type)) { - if (capable(CAP_SYSLOG)) - return 0; - /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ - if (capable(CAP_SYS_ADMIN)) { - printk_once(KERN_WARNING "%s (%d): " - "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n", - current->comm, task_pid_nr(current)); - return 0; - } - return -EPERM; - } - return 0; -} - #if defined(CONFIG_PRINTK_TIME) static bool printk_time = 1; #else @@ -1249,7 +1258,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len, SYSLOG_FROM_CALL); + return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* -- cgit v1.2.3 From f000cfdde5de4fc15dead5ccf524359c07eadf2b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 12 Jun 2013 14:04:46 -0700 Subject: audit: wait_for_auditd() should use TASK_UNINTERRUPTIBLE audit_log_start() does wait_for_auditd() in a loop until audit_backlog_wait_time passes or audit_skb_queue has a room. If signal_pending() is true this becomes a busy-wait loop, schedule() in TASK_INTERRUPTIBLE won't block. Thanks to Guy for fully investigating and explaining the problem. (akpm: that'll cause the system to lock up on a non-preemptible uniprocessor kernel) (Guy: "Our customer was in fact running a uniprocessor machine, and they reported a system hang.") Signed-off-by: Oleg Nesterov Reported-by: Guy Streeter Cc: Eric Paris Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 21c7fa615bd3..91e53d04b6a9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, static void wait_for_auditd(unsigned long sleep_time) { DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&audit_backlog_wait, &wait); if (audit_backlog_limit && -- cgit v1.2.3 From 736f3203a06eafd0944103775a98584082744c6b Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 12 Jun 2013 14:05:07 -0700 Subject: kernel/audit_tree.c:audit_add_tree_rule(): protect `rule' from kill_rules() audit_add_tree_rule() must set 'rule->tree = NULL;' firstly, to protect the rule itself freed in kill_rules(). The reason is when it is killed, the 'rule' itself may have already released, we should not access it. one example: we add a rule to an inode, just at the same time the other task is deleting this inode. The work flow for adding a rule: audit_receive() -> (need audit_cmd_mutex lock) audit_receive_skb() -> audit_receive_msg() -> audit_receive_filter() -> audit_add_rule() -> audit_add_tree_rule() -> (need audit_filter_mutex lock) ... unlock audit_filter_mutex get_tree() ... iterate_mounts() -> (iterate all related inodes) tag_mount() -> tag_trunk() -> create_trunk() -> (assume it is 1st rule) fsnotify_add_mark() -> fsnotify_add_inode_mark() -> (add mark to inode->i_fsnotify_marks) ... get_tree(); (each inode will get one) ... lock audit_filter_mutex The work flow for deleting an inode: __destroy_inode() -> fsnotify_inode_delete() -> __fsnotify_inode_delete() -> fsnotify_clear_marks_by_inode() -> (get mark from inode->i_fsnotify_marks) fsnotify_destroy_mark() -> fsnotify_destroy_mark_locked() -> audit_tree_freeing_mark() -> evict_chunk() -> ... tree->goner = 1 ... kill_rules() -> (assume current->audit_context == NULL) call_rcu() -> (rule->tree != NULL) audit_free_rule_rcu() -> audit_free_rule() ... audit_schedule_prune() -> (assume current->audit_context == NULL) kthread_run() -> (need audit_cmd_mutex and audit_filter_mutex lock) prune_one() -> (delete it from prue_list) put_tree(); (match the original get_tree above) Signed-off-by: Chen Gang Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index a291aa23fb3f..43c307dc9453 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct vfsmount *mnt; int err; + rule->tree = NULL; list_for_each_entry(tree, &tree_list, list) { if (!strcmp(seed->pathname, tree->pathname)) { put_tree(seed); -- cgit v1.2.3 From 29ce3785b22da47c49f4ef6e14b9014fa5dee261 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 8 May 2013 14:05:34 -0700 Subject: idle: Enable interrupts in the weak arch_cpu_idle() implementation PARISC bootup triggers the warning at kernel/cpu/idle.c:96. That's caused by the weak arch_cpu_idle() implementation, which is provided to avoid that architectures implement idle_poll over and over. The switchover to polling mode happens in the first call of the weak arch_cpu_idle() implementation, but that code fails to reenable interrupts and therefor triggers the warning. Fix this by enabling interrupts in the weak arch_cpu_idle() code. [ tglx: Made the changelog match the patch ] Signed-off-by: James Bottomley Reviewed-by: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/1371236142.2726.43.camel@dabdike Signed-off-by: Thomas Gleixner --- kernel/cpu/idle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index bf2ee1aafa0e..e695c0a0bcb5 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -59,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; + local_irq_enable(); } /* -- cgit v1.2.3 From 8aac62706adaaf0fab02c4327761561c8bda9448 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 14 Jun 2013 21:09:49 +0200 Subject: move exit_task_namespaces() outside of exit_notify() exit_notify() does exit_task_namespaces() after forget_original_parent(). This was needed to ensure that ->nsproxy can't be cleared prematurely, an exiting child we are going to reparent can do do_notify_parent() and use the parent's (ours) pid_ns. However, after 32084504 "pidns: use task_active_pid_ns in do_notify_parent" ->nsproxy != NULL is no longer needed, we rely on task_active_pid_ns(). Move exit_task_namespaces() from exit_notify() to do_exit(), after exit_fs() and before exit_task_work(). This solves the problem reported by Andrey, free_ipc_ns()->shm_destroy() does fput() which needs task_work_add(). Note: this particular problem can be fixed if we change fput(), and that change makes sense anyway. But there is another reason to move the callsite. The original reason for exit_task_namespaces() from the middle of exit_notify() was subtle and it has already gone away, now this looks confusing. And this allows us do simplify exit_notify(), we can avoid unlock/lock(tasklist) and we can use ->exit_state instead of PF_EXITING in forget_original_parent(). Reported-by: Andrey Vagin Signed-off-by: Oleg Nesterov Acked-by: "Eric W. Biederman" Acked-by: Andrey Vagin Signed-off-by: Al Viro --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index af2eb3cbd499..7bb73f9d09db 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -649,7 +649,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ forget_original_parent(tsk); - exit_task_namespaces(tsk); write_lock_irq(&tasklist_lock); if (group_dead) @@ -795,6 +794,7 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + exit_task_namespaces(tsk); exit_task_work(tsk); check_stack_usage(); exit_thread(); -- cgit v1.2.3 From 0541881502a1276149889fe468662ff6a8fc8f6d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 13 Jun 2013 13:17:02 -0700 Subject: range: Do not add new blank slot with add_range_with_merge Joshua reported: Commit cd7b304dfaf1 (x86, range: fix missing merge during add range) broke mtrr cleanup on his setup in 3.9.5. corresponding commit in upstream is fbe06b7bae7c. The reason is add_range_with_merge could generate blank spot. We could avoid that by searching new expanded start/end, that new range should include all connected ranges in range array. At last add the new expanded start/end to the range array. Also move up left array so do not add new blank slot in the range array. -v2: move left array to avoid enhance add_range() -v3: include fix from Joshua about memmove declaring when DYN_DEBUG is used. Reported-by: Joshua Covington Tested-by: Joshua Covington Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1371154622-8929-3-git-send-email-yinghai@kernel.org Cc: v3.9 Signed-off-by: H. Peter Anvin --- kernel/range.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index eb911dbce267..322ea8e93e4b 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -4,7 +4,7 @@ #include #include #include - +#include #include int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) @@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range, if (start >= end) return nr_range; - /* Try to merge it with old one: */ + /* get new start/end: */ for (i = 0; i < nr_range; i++) { - u64 final_start, final_end; u64 common_start, common_end; if (!range[i].end) @@ -45,14 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range, if (common_start > common_end) continue; - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); + /* new start/end, will add it back at last */ + start = min(range[i].start, start); + end = max(range[i].end, end); - /* clear it and add it back for further merge */ - range[i].start = 0; - range[i].end = 0; - return add_range_with_merge(range, az, nr_range, - final_start, final_end); + memmove(&range[i], &range[i + 1], + (nr_range - (i + 1)) * sizeof(range[i])); + range[nr_range - 1].start = 0; + range[nr_range - 1].end = 0; + nr_range--; + i--; } /* Need to add it: */ -- cgit v1.2.3 From 9bb5d40cd93c9dd4be74834b1dcb1ba03629716b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Jun 2013 10:44:21 +0200 Subject: perf: Fix mmap() accounting hole Vince's fuzzer once again found holes. This time it spotted a leak in the locked page accounting. When an event had redirected output and its close() was the last reference to the buffer we didn't have a vm context to undo accounting. Change the code to destroy the buffer on the last munmap() and detach all redirected events at that time. This provides us the right context to undo the vm accounting. Reported-and-tested-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130604084421.GI8923@twins.programming.kicks-ass.net Cc: Signed-off-by: Ingo Molnar --- kernel/events/core.c | 228 ++++++++++++++++++++++++++++++++--------------- kernel/events/internal.h | 3 +- 2 files changed, 159 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ae752cd4a086..b391907d5352 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); -static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb); - void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static bool ring_buffer_put(struct ring_buffer *rb); +static void ring_buffer_put(struct ring_buffer *rb); +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event) if (has_branch_stack(event)) { static_key_slow_dec_deferred(&perf_sched_events); /* is system-wide event */ - if (!(event->attach_state & PERF_ATTACH_TASK)) + if (!(event->attach_state & PERF_ATTACH_TASK)) { atomic_dec(&per_cpu(perf_branch_stack_events, event->cpu)); + } } } if (event->rb) { - ring_buffer_put(event->rb); - event->rb = NULL; + struct ring_buffer *rb; + + /* + * Can happen when we close an event with re-directed output. + * + * Since we have a 0 refcount, perf_mmap_close() will skip + * over us; possibly making our ring_buffer_put() the last. + */ + mutex_lock(&event->mmap_mutex); + rb = event->rb; + if (rb) { + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + ring_buffer_put(rb); /* could be last */ + } + mutex_unlock(&event->mmap_mutex); } if (is_cgroup_event(event)) @@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) unsigned int events = POLL_HUP; /* - * Race between perf_event_set_output() and perf_poll(): perf_poll() - * grabs the rb reference but perf_event_set_output() overrides it. - * Here is the timeline for two threads T1, T2: - * t0: T1, rb = rcu_dereference(event->rb) - * t1: T2, old_rb = event->rb - * t2: T2, event->rb = new rb - * t3: T2, ring_buffer_detach(old_rb) - * t4: T1, ring_buffer_attach(rb1) - * t5: T1, poll_wait(event->waitq) - * - * To avoid this problem, we grab mmap_mutex in perf_poll() - * thereby ensuring that the assignment of the new ring buffer - * and the detachment of the old buffer appear atomic to perf_poll() + * Pin the event->rb by taking event->mmap_mutex; otherwise + * perf_event_set_output() can swizzle our rb and make us miss wakeups. */ mutex_lock(&event->mmap_mutex); - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (rb) { - ring_buffer_attach(event, rb); + rb = event->rb; + if (rb) events = atomic_xchg(&rb->poll, 0); - } - rcu_read_unlock(); - mutex_unlock(&event->mmap_mutex); poll_wait(file, &event->waitq, wait); @@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event, return; spin_lock_irqsave(&rb->event_lock, flags); - if (!list_empty(&event->rb_entry)) - goto unlock; - - list_add(&event->rb_entry, &rb->event_list); -unlock: + if (list_empty(&event->rb_entry)) + list_add(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); } -static void ring_buffer_detach(struct perf_event *event, - struct ring_buffer *rb) +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) { unsigned long flags; @@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_lock(); rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - list_for_each_entry_rcu(event, &rb->event_list, rb_entry) - wake_up_all(&event->waitq); - -unlock: + if (rb) { + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) + wake_up_all(&event->waitq); + } rcu_read_unlock(); } @@ -3582,23 +3571,14 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -static bool ring_buffer_put(struct ring_buffer *rb) +static void ring_buffer_put(struct ring_buffer *rb) { - struct perf_event *event, *n; - unsigned long flags; - if (!atomic_dec_and_test(&rb->refcount)) - return false; + return; - spin_lock_irqsave(&rb->event_lock, flags); - list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - } - spin_unlock_irqrestore(&rb->event_lock, flags); + WARN_ON_ONCE(!list_empty(&rb->event_list)); call_rcu(&rb->rcu_head, rb_free_rcu); - return true; } static void perf_mmap_open(struct vm_area_struct *vma) @@ -3606,28 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; atomic_inc(&event->mmap_count); + atomic_inc(&event->rb->mmap_count); } +/* + * A buffer can be mmap()ed multiple times; either directly through the same + * event, or through other events by use of perf_event_set_output(). + * + * In order to undo the VM accounting done by perf_mmap() we need to destroy + * the buffer here, where we still have a VM context. This means we need + * to detach all events redirecting to us. + */ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - struct ring_buffer *rb = event->rb; - struct user_struct *mmap_user = rb->mmap_user; - int mmap_locked = rb->mmap_locked; - unsigned long size = perf_data_size(rb); + struct ring_buffer *rb = event->rb; + struct user_struct *mmap_user = rb->mmap_user; + int mmap_locked = rb->mmap_locked; + unsigned long size = perf_data_size(rb); - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - mutex_unlock(&event->mmap_mutex); + atomic_dec(&rb->mmap_count); + + if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + return; + + /* Detach current event from the buffer. */ + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + mutex_unlock(&event->mmap_mutex); + + /* If there's still other mmap()s of this buffer, we're done. */ + if (atomic_read(&rb->mmap_count)) { + ring_buffer_put(rb); /* can't be last */ + return; + } - if (ring_buffer_put(rb)) { - atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= mmap_locked; - free_uid(mmap_user); + /* + * No other mmap()s, detach from all other events that might redirect + * into the now unreachable buffer. Somewhat complicated by the + * fact that rb::event_lock otherwise nests inside mmap_mutex. + */ +again: + rcu_read_lock(); + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { + if (!atomic_long_inc_not_zero(&event->refcount)) { + /* + * This event is en-route to free_event() which will + * detach it and remove it from the list. + */ + continue; } + rcu_read_unlock(); + + mutex_lock(&event->mmap_mutex); + /* + * Check we didn't race with perf_event_set_output() which can + * swizzle the rb from under us while we were waiting to + * acquire mmap_mutex. + * + * If we find a different rb; ignore this event, a next + * iteration will no longer find it on the list. We have to + * still restart the iteration to make sure we're not now + * iterating the wrong list. + */ + if (event->rb == rb) { + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + ring_buffer_put(rb); /* can't be last, we still have one */ + } + mutex_unlock(&event->mmap_mutex); + put_event(event); + + /* + * Restart the iteration; either we're on the wrong list or + * destroyed its integrity by doing a deletion. + */ + goto again; } + rcu_read_unlock(); + + /* + * It could be there's still a few 0-ref events on the list; they'll + * get cleaned up by free_event() -- they'll also still have their + * ref on the rb and will free it whenever they are done with it. + * + * Aside from that, this buffer is 'fully' detached and unmapped, + * undo the VM accounting. + */ + + atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= mmap_locked; + free_uid(mmap_user); + + ring_buffer_put(rb); /* could be last */ } static const struct vm_operations_struct perf_mmap_vmops = { @@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); +again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) + if (event->rb->nr_pages != nr_pages) { ret = -EINVAL; + goto unlock; + } + + if (!atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Raced against perf_mmap_close() through + * perf_event_set_output(). Try again, hope for better + * luck. + */ + mutex_unlock(&event->mmap_mutex); + goto again; + } + goto unlock; } @@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } + atomic_set(&rb->mmap_count, 1); rb->mmap_locked = extra; rb->mmap_user = get_current_user(); atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->pinned_vm += extra; + ring_buffer_attach(event, rb); rcu_assign_pointer(event->rb, rb); perf_event_update_userpage(event); @@ -3737,6 +3805,10 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); + /* + * Since pinned accounting is per vm we cannot allow fork() to copy our + * vma. + */ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; @@ -6415,6 +6487,8 @@ set: if (atomic_read(&event->mmap_count)) goto unlock; + old_rb = event->rb; + if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); @@ -6422,16 +6496,28 @@ set: goto unlock; } - old_rb = event->rb; - rcu_assign_pointer(event->rb, rb); if (old_rb) ring_buffer_detach(event, old_rb); + + if (rb) + ring_buffer_attach(event, rb); + + rcu_assign_pointer(event->rb, rb); + + if (old_rb) { + ring_buffer_put(old_rb); + /* + * Since we detached before setting the new rb, so that we + * could attach the new rb, we could have missed a wakeup. + * Provide it now. + */ + wake_up_all(&event->waitq); + } + ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_rb) - ring_buffer_put(old_rb); out: return ret; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 5bc6c8e9b851..ca6599723be5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -31,7 +31,8 @@ struct ring_buffer { spinlock_t event_lock; struct list_head event_list; - int mmap_locked; + atomic_t mmap_count; + unsigned long mmap_locked; struct user_struct *mmap_user; struct perf_event_mmap_page *user_page; -- cgit v1.2.3 From 873b4c65b519fd769940eb281f77848227d4e5c1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 5 Jun 2013 10:13:11 +0200 Subject: sched: Fix clear NOHZ_BALANCE_KICK I have faced a sequence where the Idle Load Balance was sometime not triggered for a while on my platform, in the following scenario: CPU 0 and CPU 1 are running tasks and CPU 2 is idle CPU 1 kicks the Idle Load Balance CPU 1 selects CPU 2 as the new Idle Load Balancer CPU 2 sets NOHZ_BALANCE_KICK for CPU 2 CPU 2 sends a reschedule IPI to CPU 2 While CPU 3 wakes up, CPU 0 or CPU 1 migrates a waking up task A on CPU 2 CPU 2 finally wakes up, runs task A and discards the Idle Load Balance task A quickly goes back to sleep (before a tick occurs on CPU 2) CPU 2 goes back to idle with NOHZ_BALANCE_KICK set Whenever CPU 2 will be selected as the ILB, no reschedule IPI will be sent because NOHZ_BALANCE_KICK is already set and no Idle Load Balance will be performed. We must wait for the sched softirq to be raised on CPU 2 thanks to another part the kernel to come back to clear NOHZ_BALANCE_KICK. The proposed solution clears NOHZ_BALANCE_KICK in schedule_ipi if we can't raise the sched_softirq for the Idle Load Balance. Change since V1: - move the clear of NOHZ_BALANCE_KICK in got_nohz_idle_kick if the ILB can't run on this CPU (as suggested by Peter) Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1370419991-13870-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272fd..919bee68032b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu) static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); - return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + + if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + return false; + + if (idle_cpu(cpu) && !need_resched()) + return true; + + /* + * We can't run Idle Load Balance on this CPU for this time so we + * cancel it and clear NOHZ_BALANCE_KICK + */ + clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + return false; } #else /* CONFIG_NO_HZ_COMMON */ @@ -1393,8 +1405,9 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() - && !tick_nohz_full_cpu(smp_processor_id())) + if (llist_empty(&this_rq()->wake_list) + && !tick_nohz_full_cpu(smp_processor_id()) + && !got_nohz_idle_kick()) return; /* @@ -1417,7 +1430,7 @@ void scheduler_ipi(void) /* * Check if someone kicked us for doing the nohz idle load balance. */ - if (unlikely(got_nohz_idle_kick() && !need_resched())) { + if (unlikely(got_nohz_idle_kick())) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } -- cgit v1.2.3 From 29bb9e5a75684106a37593ad75ec75ff8312731b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 May 2013 15:23:40 -0400 Subject: tracing/context-tracking: Add preempt_schedule_context() for tracing Dave Jones hit the following bug report: =============================== [ INFO: suspicious RCU usage. ] 3.10.0-rc2+ #1 Not tainted ------------------------------- include/linux/rcupdate.h:771 rcu_read_lock() used illegally while idle! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 0 RCU used illegally from extended quiescent state! 2 locks held by cc1/63645: #0: (&rq->lock){-.-.-.}, at: [] __schedule+0xed/0x9b0 #1: (rcu_read_lock){.+.+..}, at: [] cpuacct_charge+0x5/0x1f0 CPU: 1 PID: 63645 Comm: cc1 Not tainted 3.10.0-rc2+ #1 [loadavg: 40.57 27.55 13.39 25/277 64369] Hardware name: Gigabyte Technology Co., Ltd. GA-MA78GM-S2H/GA-MA78GM-S2H, BIOS F12a 04/23/2010 0000000000000000 ffff88010f78fcf8 ffffffff816ae383 ffff88010f78fd28 ffffffff810b698d ffff88011c092548 000000000023d073 ffff88011c092500 0000000000000001 ffff88010f78fd60 ffffffff8109d7c5 ffffffff8109d645 Call Trace: [] dump_stack+0x19/0x1b [] lockdep_rcu_suspicious+0xfd/0x130 [] cpuacct_charge+0x185/0x1f0 [] ? cpuacct_charge+0x5/0x1f0 [] update_curr+0xec/0x240 [] put_prev_task_fair+0x228/0x480 [] __schedule+0x161/0x9b0 [] preempt_schedule+0x51/0x80 [] ? __cond_resched_softirq+0x60/0x60 [] ? retint_careful+0x12/0x2e [] ftrace_ops_control_func+0x1dc/0x210 [] ftrace_call+0x5/0x2f [] ? retint_careful+0xb/0x2e [] ? schedule_user+0x5/0x70 [] ? schedule_user+0x5/0x70 [] ? retint_careful+0x12/0x2e ------------[ cut here ]------------ What happened was that the function tracer traced the schedule_user() code that tells RCU that the system is coming back from userspace, and to add the CPU back to the RCU monitoring. Because the function tracer does a preempt_disable/enable_notrace() calls the preempt_enable_notrace() checks the NEED_RESCHED flag. If it is set, then preempt_schedule() is called. But this is called before the user_exit() function can inform the kernel that the CPU is no longer in user mode and needs to be accounted for by RCU. The fix is to create a new preempt_schedule_context() that checks if the kernel is still in user mode and if so to switch it to kernel mode before calling schedule. It also switches back to user mode coming back from schedule in need be. The only user of this currently is the preempt_enable_notrace(), which is only used by the tracing subsystem. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1369423420.6828.226.camel@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/context_tracking.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b878..66677003e223 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -71,6 +71,46 @@ void user_enter(void) local_irq_restore(flags); } +#ifdef CONFIG_PREEMPT +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +void __sched notrace preempt_schedule_context(void) +{ + struct thread_info *ti = current_thread_info(); + enum ctx_state prev_ctx; + + if (likely(ti->preempt_count || irqs_disabled())) + return; + + /* + * Need to disable preemption in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + preempt_disable_notrace(); + prev_ctx = exception_enter(); + preempt_enable_no_resched_notrace(); + + preempt_schedule(); + + preempt_disable_notrace(); + exception_exit(prev_ctx); + preempt_enable_notrace(); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_PREEMPT */ /** * user_exit - Inform the context tracking that the CPU is -- cgit v1.2.3 From 8b4d801b2b123b6c09742f861fe44a8527b84d47 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 17:50:06 +0200 Subject: hw_breakpoint: Fix cpu check in task_bp_pinned(cpu) trinity fuzzer triggered WARN_ONCE("Can't find any breakpoint slot") in arch_install_hw_breakpoint() but the problem is not arch-specific. The problem is, task_bp_pinned(cpu) checks "cpu == iter->cpu" but this doesn't account the "all cpus" events with iter->cpu < 0. This means that, say, register_user_hw_breakpoint(tsk) can happily create the arbitrary number > HBP_NUM of breakpoints which can not be activated. toggle_bp_task_slot() is equally wrong by the same reason and nr_task_bp_pinned[] can have negative entries. Simple test: # perl -e 'sleep 1 while 1' & # perf record -e mem:0x10,mem:0x10,mem:0x10,mem:0x10,mem:0x10 -p `pidof perl` Before this patch this triggers the same problem/WARN_ON(), after the patch it correctly fails with -ENOSPC. Reported-by: Vince Weaver Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Link: http://lkml.kernel.org/r/20130620155006.GA6324@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a64f8aeb5c1f..a853deabe6cf 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -120,7 +120,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) list_for_each_entry(iter, &bp_task_head, hw.bp_list) { if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type && - cpu == iter->cpu) + (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); } -- cgit v1.2.3 From c790b0ad23f427c7522ffed264706238c57c007e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 17:50:09 +0200 Subject: hw_breakpoint: Use cpu_possible_mask in {reserve,release}_bp_slot() fetch_bp_busy_slots() and toggle_bp_slot() use for_each_online_cpu(), this is obviously wrong wrt cpu_up() or cpu_down(), we can over/under account the per-cpu numbers. For example: # echo 0 >> /sys/devices/system/cpu/cpu1/online # perf record -e mem:0x10 -p 1 & # echo 1 >> /sys/devices/system/cpu/cpu1/online # perf record -e mem:0x10,mem:0x10,mem:0x10,mem:0x10 -C1 -a & # taskset -p 0x2 1 triggers the same WARN_ONCE("Can't find any breakpoint slot") in arch_install_hw_breakpoint(). Reported-by: Vince Weaver Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Link: http://lkml.kernel.org/r/20130620155009.GA6327@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a853deabe6cf..20185ea64aa6 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -149,7 +149,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, return; } - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { unsigned int nr; nr = per_cpu(nr_cpu_bp_pinned[type], cpu); @@ -235,7 +235,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, if (cpu >= 0) { toggle_bp_task_slot(bp, cpu, enable, type, weight); } else { - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) toggle_bp_task_slot(bp, cpu, enable, type, weight); } -- cgit v1.2.3 From ea8deb8dfa6b0e8d1b3d1051585706739b46656c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 17 Jun 2013 18:15:35 +0200 Subject: tick: Fix tick_broadcast_pending_mask not cleared The recent modification in the cpuidle framework consolidated the timer broadcast code across the different drivers by setting a new flag in the idle state. It tells the cpuidle core code to enter/exit the broadcast mode for the cpu when entering a deep idle state. The broadcast timer enter/exit is no longer handled by the back-end driver. This change made the local interrupt to be enabled *before* calling CLOCK_EVENT_NOTIFY_EXIT. On a tegra114, a four cores system, when the flag has been introduced in the driver, the following warning appeared: WARNING: at kernel/time/tick-broadcast.c:578 tick_broadcast_oneshot_control CPU: 2 PID: 0 Comm: swapper/2 Not tainted 3.10.0-rc3-next-20130529+ #15 [] (tick_broadcast_oneshot_control+0x1a4/0x1d0) from [] (tick_notify+0x240/0x40c) [] (tick_notify+0x240/0x40c) from [] (notifier_call_chain+0x44/0x84) [] (notifier_call_chain+0x44/0x84) from [] (raw_notifier_call_chain+0x18/0x20) [] (raw_notifier_call_chain+0x18/0x20) from [] (clockevents_notify+0x28/0x170) [] (clockevents_notify+0x28/0x170) from [] (cpuidle_idle_call+0x11c/0x168) [] (cpuidle_idle_call+0x11c/0x168) from [] (arch_cpu_idle+0x8/0x38) [] (arch_cpu_idle+0x8/0x38) from [] (cpu_startup_entry+0x60/0x134) [] (cpu_startup_entry+0x60/0x134) from [<804fe9a4>] (0x804fe9a4) I don't have the hardware, so I wasn't able to reproduce the warning but after looking a while at the code, I deduced the following: 1. the CPU2 enters a deep idle state and sets the broadcast timer 2. the timer expires, the tick_handle_oneshot_broadcast function is called, setting the tick_broadcast_pending_mask and waking up the idle cpu CPU2 3. the CPU2 exits idle handles the interrupt and then invokes tick_broadcast_oneshot_control with CLOCK_EVENT_NOTIFY_EXIT which runs the following code: [...] if (dev->next_event.tv64 == KTIME_MAX) goto out; if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_pending_mask)) goto out; [...] So if there is no next event scheduled for CPU2, we fulfil the first condition and jump out without clearing the tick_broadcast_pending_mask. 4. CPU2 goes to deep idle again and calls tick_broadcast_oneshot_control with CLOCK_NOTIFY_EVENT_ENTER but with the tick_broadcast_pending_mask set for CPU2, triggering the warning. The issue only surfaced due to the modifications of the cpuidle framework, which resulted in interrupts being enabled before the call to the clockevents code. If the call happens before interrupts have been enabled, the warning cannot trigger, because there is still the event pending which caused the broadcast timer expiry. Move the check for the next event below the check for the pending bit, so the pending bit gets cleared whether an event is scheduled on the cpu or not. [ tglx: Massaged changelog ] Signed-off-by: Daniel Lezcano Reported-and-tested-by: Joseph Lo Cc: Stephen Warren Cc: linux-arm-kernel@lists.infradead.org Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/1371485735-31249-1-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b4c245580b79..20d6fba70652 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -599,8 +599,6 @@ void tick_broadcast_oneshot_control(unsigned long reason) } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - if (dev->next_event.tv64 == KTIME_MAX) - goto out; /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast @@ -614,6 +612,11 @@ void tick_broadcast_oneshot_control(unsigned long reason) tick_broadcast_pending_mask)) goto out; + /* + * Bail out if there is no next event. + */ + if (dev->next_event.tv64 == KTIME_MAX) + goto out; /* * If the pending bit is not set, then we are * either the CPU handling the broadcast -- cgit v1.2.3 From 706b23bde27a391f0974df2a8351661770fa2e07 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 28 Jun 2013 09:49:46 -0400 Subject: Fix: kernel/ptrace.c: ptrace_peek_siginfo() missing __put_user() validation This __put_user() could be used by unprivileged processes to write into kernel memory. The issue here is that even if copy_siginfo_to_user() fails, the error code is not checked before __put_user() is executed. Luckily, ptrace_peek_siginfo() has been added within the 3.10-rc cycle, so it has not hit a stable release yet. Signed-off-by: Mathieu Desnoyers Acked-by: Oleg Nesterov Cc: Andrey Vagin Cc: Roland McGrath Cc: Paul McKenney Cc: David Howells Cc: Dave Jones Cc: Pavel Emelyanov Cc: Pedro Alves Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aed981a3f69c..335a7ae697f5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -665,20 +665,22 @@ static int ptrace_peek_siginfo(struct task_struct *child, if (unlikely(is_compat_task())) { compat_siginfo_t __user *uinfo = compat_ptr(data); - ret = copy_siginfo_to_user32(uinfo, &info); - ret |= __put_user(info.si_code, &uinfo->si_code); + if (copy_siginfo_to_user32(uinfo, &info) || + __put_user(info.si_code, &uinfo->si_code)) { + ret = -EFAULT; + break; + } + } else #endif { siginfo_t __user *uinfo = (siginfo_t __user *) data; - ret = copy_siginfo_to_user(uinfo, &info); - ret |= __put_user(info.si_code, &uinfo->si_code); - } - - if (ret) { - ret = -EFAULT; - break; + if (copy_siginfo_to_user(uinfo, &info) || + __put_user(info.si_code, &uinfo->si_code)) { + ret = -EFAULT; + break; + } } data += sizeof(siginfo_t); -- cgit v1.2.3 From 55ccb616a6e42052edb37e9c4f82cf8854a59429 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:42 +0000 Subject: posix_cpu_timer: consolidate expiry time type The posix cpu timer expiry time is stored in a union of two types: a 64 bits field if we rely on scheduler precise accounting, or a cputime_t if we rely on jiffies. This results in quite some duplicate code and special cases to handle the two types. Just unify this into a single 64 bits field. cputime_t can always fit into it. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 266 ++++++++++++++++++---------------------------- 1 file changed, 106 insertions(+), 160 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 42670e9b44e0..c3c4ea1225a4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock) return error; } -static inline union cpu_time_count +static inline unsigned long long timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) { - union cpu_time_count ret; - ret.sched = 0; /* high half always zero when .cpu used */ + unsigned long long ret; + + ret = 0; /* high half always zero when .cpu used */ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; } else { - ret.cpu = timespec_to_cputime(tp); + ret = cputime_to_expires(timespec_to_cputime(tp)); } return ret; } static void sample_to_timespec(const clockid_t which_clock, - union cpu_time_count cpu, + unsigned long long expires, struct timespec *tp) { if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) - *tp = ns_to_timespec(cpu.sched); + *tp = ns_to_timespec(expires); else - cputime_to_timespec(cpu.cpu, tp); -} - -static inline int cpu_time_before(const clockid_t which_clock, - union cpu_time_count now, - union cpu_time_count then) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - return now.sched < then.sched; - } else { - return now.cpu < then.cpu; - } -} -static inline void cpu_time_add(const clockid_t which_clock, - union cpu_time_count *acc, - union cpu_time_count val) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - acc->sched += val.sched; - } else { - acc->cpu += val.cpu; - } -} -static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, - union cpu_time_count a, - union cpu_time_count b) -{ - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - a.sched -= b.sched; - } else { - a.cpu -= b.cpu; - } - return a; + cputime_to_timespec((__force cputime_t)expires, tp); } /* @@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, * given the current clock sample. */ static void bump_cpu_timer(struct k_itimer *timer, - union cpu_time_count now) + unsigned long long now) { int i; + unsigned long long delta, incr; - if (timer->it.cpu.incr.sched == 0) + if (timer->it.cpu.incr == 0) return; - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - unsigned long long delta, incr; + if (now < timer->it.cpu.expires) + return; - if (now.sched < timer->it.cpu.expires.sched) - return; - incr = timer->it.cpu.incr.sched; - delta = now.sched + incr - timer->it.cpu.expires.sched; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr = incr << 1; - for (; i >= 0; incr >>= 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.sched += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } - } else { - cputime_t delta, incr; + incr = timer->it.cpu.incr; + delta = now + incr - timer->it.cpu.expires; - if (now.cpu < timer->it.cpu.expires.cpu) - return; - incr = timer->it.cpu.incr.cpu; - delta = now.cpu + incr - timer->it.cpu.expires.cpu; - /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; incr < delta - incr; i++) - incr += incr; - for (; i >= 0; incr = incr >> 1, i--) { - if (delta < incr) - continue; - timer->it.cpu.expires.cpu += incr; - timer->it_overrun += 1 << i; - delta -= incr; - } + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + + for (; i >= 0; incr >>= 1, i--) { + if (delta < incr) + continue; + + timer->it.cpu.expires += incr; + timer->it_overrun += 1 << i; + delta -= incr; } } @@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime) return 0; } -static inline cputime_t prof_ticks(struct task_struct *p) +static inline unsigned long long prof_ticks(struct task_struct *p) { cputime_t utime, stime; task_cputime(p, &utime, &stime); - return utime + stime; + return cputime_to_expires(utime + stime); } -static inline cputime_t virt_ticks(struct task_struct *p) +static inline unsigned long long virt_ticks(struct task_struct *p) { cputime_t utime; task_cputime(p, &utime, NULL); - return utime; + return cputime_to_expires(utime); } static int @@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) * Sample a per-thread clock for the given task. */ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = prof_ticks(p); + *sample = prof_ticks(p); break; case CPUCLOCK_VIRT: - cpu->cpu = virt_ticks(p); + *sample = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = task_sched_runtime(p); + *sample = task_sched_runtime(p); break; } return 0; @@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return -EINVAL; case CPUCLOCK_PROF: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: thread_group_cputime(p, &cputime); - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: thread_group_cputime(p, &cputime); - cpu->sched = cputime.sum_exec_runtime; + *sample = cputime.sum_exec_runtime; break; } return 0; @@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; - union cpu_time_count rtn; + unsigned long long rtn; if (pid == 0) { /* @@ -461,30 +414,30 @@ static void cleanup_timers(struct list_head *head, list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.cpu < ptime) { - timer->expires.cpu = 0; + if (timer->expires < cputime_to_expires(ptime)) { + timer->expires = 0; } else { - timer->expires.cpu -= ptime; + timer->expires -= cputime_to_expires(ptime); } } ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.cpu < utime) { - timer->expires.cpu = 0; + if (timer->expires < cputime_to_expires(utime)) { + timer->expires = 0; } else { - timer->expires.cpu -= utime; + timer->expires -= cputime_to_expires(utime); } } ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (timer->expires.sched < sum_exec_runtime) { - timer->expires.sched = 0; + if (timer->expires < sum_exec_runtime) { + timer->expires = 0; } else { - timer->expires.sched -= sum_exec_runtime; + timer->expires -= sum_exec_runtime; } } } @@ -516,7 +469,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } -static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) +static void clear_dead_task(struct k_itimer *timer, unsigned long long now) { /* * That's all for this thread or process. @@ -524,9 +477,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) */ put_task_struct(timer->it.cpu.task); timer->it.cpu.task = NULL; - timer->it.cpu.expires = cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, - now); + timer->it.cpu.expires -= now; } static inline int expires_gt(cputime_t expires, cputime_t new_exp) @@ -558,14 +509,14 @@ static void arm_timer(struct k_itimer *timer) listpos = head; list_for_each_entry(next, head, entry) { - if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) + if (nt->expires < next->expires) break; listpos = &next->entry; } list_add(&nt->entry, listpos); if (listpos == head) { - union cpu_time_count *exp = &nt->expires; + unsigned long long exp = nt->expires; /* * We are the new earliest-expiring POSIX 1.b timer, hence @@ -576,17 +527,17 @@ static void arm_timer(struct k_itimer *timer) switch (CPUCLOCK_WHICH(timer->it_clock)) { case CPUCLOCK_PROF: - if (expires_gt(cputime_expires->prof_exp, exp->cpu)) - cputime_expires->prof_exp = exp->cpu; + if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) + cputime_expires->prof_exp = expires_to_cputime(exp); break; case CPUCLOCK_VIRT: - if (expires_gt(cputime_expires->virt_exp, exp->cpu)) - cputime_expires->virt_exp = exp->cpu; + if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) + cputime_expires->virt_exp = expires_to_cputime(exp); break; case CPUCLOCK_SCHED: if (cputime_expires->sched_exp == 0 || - cputime_expires->sched_exp > exp->sched) - cputime_expires->sched_exp = exp->sched; + cputime_expires->sched_exp > exp) + cputime_expires->sched_exp = exp; break; } } @@ -601,20 +552,20 @@ static void cpu_timer_fire(struct k_itimer *timer) /* * User don't want any signal. */ - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. */ wake_up_process(timer->it_process); - timer->it.cpu.expires.sched = 0; - } else if (timer->it.cpu.incr.sched == 0) { + timer->it.cpu.expires = 0; + } else if (timer->it.cpu.incr == 0) { /* * One-shot timer. Clear it as soon as it's fired. */ posix_timer_event(timer, 0); - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { /* * The signal did not get queued because the signal @@ -632,7 +583,7 @@ static void cpu_timer_fire(struct k_itimer *timer) */ static int cpu_timer_sample_group(const clockid_t which_clock, struct task_struct *p, - union cpu_time_count *cpu) + unsigned long long *sample) { struct task_cputime cputime; @@ -641,13 +592,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock, default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime.utime + cputime.stime; + *sample = cputime_to_expires(cputime.utime + cputime.stime); break; case CPUCLOCK_VIRT: - cpu->cpu = cputime.utime; + *sample = cputime_to_expires(cputime.utime); break; case CPUCLOCK_SCHED: - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + *sample = cputime.sum_exec_runtime + task_delta_exec(p); break; } return 0; @@ -694,7 +645,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, old_incr, val; + unsigned long long old_expires, new_expires, old_incr, val; int ret; if (unlikely(p == NULL)) { @@ -749,7 +700,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, } if (old) { - if (old_expires.sched == 0) { + if (old_expires == 0) { old->it_value.tv_sec = 0; old->it_value.tv_nsec = 0; } else { @@ -764,11 +715,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * new setting. */ bump_cpu_timer(timer, val); - if (cpu_time_before(timer->it_clock, val, - timer->it.cpu.expires)) { - old_expires = cpu_time_sub( - timer->it_clock, - timer->it.cpu.expires, val); + if (val < timer->it.cpu.expires) { + old_expires = timer->it.cpu.expires - val; sample_to_timespec(timer->it_clock, old_expires, &old->it_value); @@ -791,8 +739,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, goto out; } - if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { - cpu_time_add(timer->it_clock, &new_expires, val); + if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { + new_expires += val; } /* @@ -801,8 +749,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * arm the timer (we'll just fake it for timer_gettime). */ timer->it.cpu.expires = new_expires; - if (new_expires.sched != 0 && - cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && val < new_expires) { arm_timer(timer); } @@ -826,8 +773,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, timer->it_overrun_last = 0; timer->it_overrun = -1; - if (new_expires.sched != 0 && - !cpu_time_before(timer->it_clock, val, new_expires)) { + if (new_expires != 0 && !(val < new_expires)) { /* * The designated time already passed, so we notify * immediately, even if the thread never runs to @@ -849,7 +795,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { - union cpu_time_count now; + unsigned long long now; struct task_struct *p = timer->it.cpu.task; int clear_dead; @@ -859,7 +805,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) sample_to_timespec(timer->it_clock, timer->it.cpu.incr, &itp->it_interval); - if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ + if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; return; } @@ -891,7 +837,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) */ put_task_struct(p); timer->it.cpu.task = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; read_unlock(&tasklist_lock); goto dead; } else { @@ -912,10 +858,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) goto dead; } - if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { + if (now < timer->it.cpu.expires) { sample_to_timespec(timer->it_clock, - cpu_time_sub(timer->it_clock, - timer->it.cpu.expires, now), + timer->it.cpu.expires - now, &itp->it_value); } else { /* @@ -946,8 +891,8 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.prof_exp = t->expires.cpu; + if (!--maxfire || prof_ticks(tsk) < t->expires) { + tsk->cputime_expires.prof_exp = expires_to_cputime(t->expires); break; } t->firing = 1; @@ -961,8 +906,8 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { - tsk->cputime_expires.virt_exp = t->expires.cpu; + if (!--maxfire || virt_ticks(tsk) < t->expires) { + tsk->cputime_expires.virt_exp = expires_to_cputime(t->expires); break; } t->firing = 1; @@ -976,8 +921,8 @@ static void check_thread_timers(struct task_struct *tsk, struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->cputime_expires.sched_exp = t->expires.sched; + if (!--maxfire || tsk->se.sum_exec_runtime < t->expires) { + tsk->cputime_expires.sched_exp = t->expires; break; } t->firing = 1; @@ -1030,7 +975,8 @@ static void stop_process_timers(struct signal_struct *sig) static u32 onecputick; static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, - cputime_t *expires, cputime_t cur_time, int signo) + unsigned long long *expires, + unsigned long long cur_time, int signo) { if (!it->expires) return; @@ -1068,7 +1014,7 @@ static void check_process_timers(struct task_struct *tsk, { int maxfire; struct signal_struct *const sig = tsk->signal; - cputime_t utime, ptime, virt_expires, prof_expires; + unsigned long long utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; @@ -1078,8 +1024,8 @@ static void check_process_timers(struct task_struct *tsk, * Collect the current process totals. */ thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - ptime = utime + cputime.stime; + utime = cputime_to_expires(cputime.utime); + ptime = utime + cputime_to_expires(cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; maxfire = 20; prof_expires = 0; @@ -1087,8 +1033,8 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || ptime < tl->expires.cpu) { - prof_expires = tl->expires.cpu; + if (!--maxfire || ptime < tl->expires) { + prof_expires = tl->expires; break; } tl->firing = 1; @@ -1102,8 +1048,8 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || utime < tl->expires.cpu) { - virt_expires = tl->expires.cpu; + if (!--maxfire || utime < tl->expires) { + virt_expires = tl->expires; break; } tl->firing = 1; @@ -1117,8 +1063,8 @@ static void check_process_timers(struct task_struct *tsk, struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || sum_sched_runtime < tl->expires.sched) { - sched_expires = tl->expires.sched; + if (!--maxfire || sum_sched_runtime < tl->expires) { + sched_expires = tl->expires; break; } tl->firing = 1; @@ -1162,8 +1108,8 @@ static void check_process_timers(struct task_struct *tsk, } } - sig->cputime_expires.prof_exp = prof_expires; - sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); + sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); sig->cputime_expires.sched_exp = sched_expires; if (task_cputime_zero(&sig->cputime_expires)) stop_process_timers(sig); @@ -1176,7 +1122,7 @@ static void check_process_timers(struct task_struct *tsk, void posix_cpu_timer_schedule(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count now; + unsigned long long now; if (unlikely(p == NULL)) /* @@ -1205,7 +1151,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ put_task_struct(p); timer->it.cpu.task = p = NULL; - timer->it.cpu.expires.sched = 0; + timer->it.cpu.expires = 0; goto out_unlock; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* @@ -1387,7 +1333,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { - union cpu_time_count now; + unsigned long long now; BUG_ON(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); @@ -1399,17 +1345,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, * it to be absolute. */ if (*oldval) { - if (*oldval <= now.cpu) { + if (*oldval <= now) { /* Just about to fire. */ *oldval = cputime_one_jiffy; } else { - *oldval -= now.cpu; + *oldval -= now; } } if (!*newval) goto out; - *newval += now.cpu; + *newval += now; } /* @@ -1459,7 +1405,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, } while (!signal_pending(current)) { - if (timer.it.cpu.expires.sched == 0) { + if (timer.it.cpu.expires == 0) { /* * Our timer fired and was reset, below * deletion can not fail. -- cgit v1.2.3 From 1a7fa510b38e518d11365883934f1afa41625424 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:42 +0000 Subject: posix_cpu_timers: consolidate timer list cleanups Cleaning up the posix cpu timers on task exit shares some common code among timer list types, most notably the list traversal and expiry time update. Unify this in a common helper. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 48 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c3c4ea1225a4..b1450cee6d6d 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -399,6 +399,21 @@ static int posix_cpu_timer_del(struct k_itimer *timer) return ret; } +static void cleanup_timers_list(struct list_head *head, + unsigned long long curr) +{ + struct cpu_timer_list *timer, *next; + + list_for_each_entry_safe(timer, next, head, entry) { + list_del_init(&timer->entry); + if (timer->expires < curr) { + timer->expires = 0; + } else { + timer->expires -= curr; + } + } +} + /* * Clean out CPU timers still ticking when a thread exited. The task * pointer is cleared, and the expiry time is replaced with the residual @@ -409,37 +424,12 @@ static void cleanup_timers(struct list_head *head, cputime_t utime, cputime_t stime, unsigned long long sum_exec_runtime) { - struct cpu_timer_list *timer, *next; - cputime_t ptime = utime + stime; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires < cputime_to_expires(ptime)) { - timer->expires = 0; - } else { - timer->expires -= cputime_to_expires(ptime); - } - } - - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires < cputime_to_expires(utime)) { - timer->expires = 0; - } else { - timer->expires -= cputime_to_expires(utime); - } - } + cputime_t ptime = utime + stime; - ++head; - list_for_each_entry_safe(timer, next, head, entry) { - list_del_init(&timer->entry); - if (timer->expires < sum_exec_runtime) { - timer->expires = 0; - } else { - timer->expires -= sum_exec_runtime; - } - } + cleanup_timers_list(head, cputime_to_expires(ptime)); + cleanup_timers_list(++head, cputime_to_expires(utime)); + cleanup_timers_list(++head, sum_exec_runtime); } /* -- cgit v1.2.3 From 2473f3e7a97ce8bc0fe7596cdb361b21221418eb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:43 +0000 Subject: posix_cpu_timers: consolidate expired timers check Consolidate the common code amongst per thread and per process timers list on tick time. List traversal, expiry check and subsequent updates can be shared in a common helper. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 118 +++++++++++++--------------------------------- 1 file changed, 33 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index b1450cee6d6d..92a4fbf44f86 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -862,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } } +static unsigned long long +check_timers_list(struct list_head *timers, + struct list_head *firing, + unsigned long long curr) +{ + int maxfire = 20; + + while (!list_empty(timers)) { + struct cpu_timer_list *t; + + t = list_first_entry(timers, struct cpu_timer_list, entry); + + if (!--maxfire || curr < t->expires) + return t->expires; + + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + return 0; +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. Here we update the @@ -870,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) static void check_thread_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; + struct task_cputime *tsk_expires = &tsk->cputime_expires; + unsigned long long expires; unsigned long soft; - maxfire = 20; - tsk->cputime_expires.prof_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || prof_ticks(tsk) < t->expires) { - tsk->cputime_expires.prof_exp = expires_to_cputime(t->expires); - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(timers, firing, prof_ticks(tsk)); + tsk_expires->prof_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.virt_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || virt_ticks(tsk) < t->expires) { - tsk->cputime_expires.virt_exp = expires_to_cputime(t->expires); - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + expires = check_timers_list(++timers, firing, virt_ticks(tsk)); + tsk_expires->virt_exp = expires_to_cputime(expires); - ++timers; - maxfire = 20; - tsk->cputime_expires.sched_exp = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || tsk->se.sum_exec_runtime < t->expires) { - tsk->cputime_expires.sched_exp = t->expires; - break; - } - t->firing = 1; - list_move_tail(&t->entry, firing); - } + tsk_expires->sched_exp = check_timers_list(++timers, firing, + tsk->se.sum_exec_runtime); /* * Check for the special case thread timers. @@ -1002,7 +990,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, static void check_process_timers(struct task_struct *tsk, struct list_head *firing) { - int maxfire; struct signal_struct *const sig = tsk->signal; unsigned long long utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; @@ -1017,49 +1004,10 @@ static void check_process_timers(struct task_struct *tsk, utime = cputime_to_expires(cputime.utime); ptime = utime + cputime_to_expires(cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; - maxfire = 20; - prof_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || ptime < tl->expires) { - prof_expires = tl->expires; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - ++timers; - maxfire = 20; - virt_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || utime < tl->expires) { - virt_expires = tl->expires; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } - - ++timers; - maxfire = 20; - sched_expires = 0; - while (!list_empty(timers)) { - struct cpu_timer_list *tl = list_first_entry(timers, - struct cpu_timer_list, - entry); - if (!--maxfire || sum_sched_runtime < tl->expires) { - sched_expires = tl->expires; - break; - } - tl->firing = 1; - list_move_tail(&tl->entry, firing); - } + prof_expires = check_timers_list(timers, firing, ptime); + virt_expires = check_timers_list(++timers, firing, utime); + sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); /* * Check for the special case process timers. -- cgit v1.2.3 From 76cdcdd979ce00f5037804d73da583fb488ec1b2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:43 +0000 Subject: posix-timers: correctly get dying task time sample in posix_cpu_timer_schedule() In order to re-arm a timer after it fired, we take a sample of the current process or thread cputime. If the task is dying though, we don't arm anything but we cache the remaining timer expiration delta for further reads. Something similar is performed in posix_cpu_timer_get() but here we forget to take the process wide cputime sample before caching it. As a result we are storing random stack content, leading every further reads of that timer to return junk values. Fix this by taking the appropriate sample in the case of process wide timers. This probably doesn't matter much in practice because, at this stage, the thread is the last one in the group and we reached exit_notify(). This implies that we called exit_itimers() and there should be no more timers to handle for that task. So this is likely dead code anyway but let's fix the current logic and the warning that came along: kernel/posix-cpu-timers.c: In function 'posix_cpu_timer_schedule': kernel/posix-cpu-timers.c:1127: warning: 'now' may be used uninitialized in this function Then we can start to think further about cleaning up that code. Reported-by: Andrew Morton Reported-by: Chen Gang Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Chen Gang Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 92a4fbf44f86..4ebd8ad07c66 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1097,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * not yet reaped. Take this opportunity to * drop our task ref. */ + cpu_timer_sample_group(timer->it_clock, p, &now); clear_dead_task(timer, now); goto out_unlock; } -- cgit v1.2.3 From a0b2062b0904ef07944c4a6e4d0f88ee44f1e9f2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 28 Jun 2013 00:06:43 +0000 Subject: posix_timers: fix racy timer delta caching on task exit When a task exits, we perform a caching of the remaining cputime delta before expiring of its timers. This is done from the following places: * When the task is reaped. We iterate through its list of posix cpu timers and store the remaining timer delta to the timer struct instead of the absolute value. (See posix_cpu_timers_exit() / posix_cpu_timers_exit_group() ) * When we call posix_cpu_timer_get() or posix_cpu_timer_schedule(). If the timer's task is considered dying when watched from these places, the same conversion from absolute to relative expiry time is performed. Then the given task's reference is released. (See clear_dead_task() ). The relevance of this caching is questionable but this is another and deeper debate. The big issue here is that these two sources of caching don't mix up very well together. More specifically, the caching can easily be done twice, resulting in a wrong delta as it gets spuriously substracted a second time by the elapsed clock. This can happen in the following scenario: 1) The task exits and gets reaped: we call posix_cpu_timers_exit() and the absolute timer expiry values are converted to a relative delta. 2) timer_gettime() -> posix_cpu_timer_get() is called and relies on clear_dead_task() because tsk->exit_state == EXIT_DEAD. The delta gets substracted again by the elapsed clock and we return a wrong result. To fix this, just remove the caching done on task reaping time. It doesn't bring much value on its own. The caching done from posix_cpu_timer_get/schedule is enough. And it would also be hard to get it really right: we could make it put and clear the target task in the timer struct so that readers know if they are dealing with a relative cached of absolute value. But it would be racy. The only safe way to do it would be to lock the itimer->it_lock so that we know nobody reads the cputime expiry value while we modify it and its target task reference. Doing so would involve some funny workarounds to avoid circular lock against the sighand lock. There is just no reason to maintain this. The user visible effect of this patch can be observed by running the following code: it creates a subthread that launches a posix cputimer which expires after 10 seconds. But then the subthread only busy loops for 2 seconds and exits. The parent reaps the subthread and read the timer value. Its expected value should the be the initial timer's expiration value minus the cputime elapsed in the subthread. Roughly 10 - 2 = 8 seconds: #include #include #include #include #include static timer_t id; static struct itimerspec val = { .it_value.tv_sec = 10, }, new; static void *thread(void *unused) { int err; struct timeval start, end, diff; timer_create(CLOCK_THREAD_CPUTIME_ID, NULL, &id); if (err < 0) { perror("Can't create timer\n"); return NULL; } /* Arm 10 sec timer */ err = timer_settime(id, 0, &val, NULL); if (err < 0) { perror("Can't set timer\n"); return NULL; } /* Exit after 2 seconds of execution */ gettimeofday(&start, NULL); do { gettimeofday(&end, NULL); timersub(&end, &start, &diff); } while (diff.tv_sec < 2); return NULL; } int main(int argc, char **argv) { pthread_t pthread; int err; err = pthread_create(&pthread, NULL, thread, NULL); if (err) { perror("Can't create thread\n"); return -1; } pthread_join(pthread, NULL); /* Just wait a little bit to make sure the child got reaped */ sleep(1); err = timer_gettime(id, &new); if (err) perror("Can't get timer value\n"); printf("%d %ld\n", new.it_value.tv_sec, new.it_value.tv_nsec); return 0; } Before the patch: $ ./posix_cpu_timers 6 2278074 After the patch: $ ./posix_cpu_timers 8 1158766 Before the patch, the elapsed time got two more seconds spuriously accounted. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Olivier Langlois Signed-off-by: Andrew Morton --- kernel/posix-cpu-timers.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 4ebd8ad07c66..c7f31aa272f7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -404,14 +404,8 @@ static void cleanup_timers_list(struct list_head *head, { struct cpu_timer_list *timer, *next; - list_for_each_entry_safe(timer, next, head, entry) { + list_for_each_entry_safe(timer, next, head, entry) list_del_init(&timer->entry); - if (timer->expires < curr) { - timer->expires = 0; - } else { - timer->expires -= curr; - } - } } /* @@ -459,15 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } -static void clear_dead_task(struct k_itimer *timer, unsigned long long now) +static void clear_dead_task(struct k_itimer *itimer, unsigned long long now) { + struct cpu_timer_list *timer = &itimer->it.cpu; + /* * That's all for this thread or process. * We leave our residual in expires to be reported. */ - put_task_struct(timer->it.cpu.task); - timer->it.cpu.task = NULL; - timer->it.cpu.expires -= now; + put_task_struct(timer->task); + timer->task = NULL; + if (timer->expires < now) { + timer->expires = 0; + } else { + timer->expires -= now; + } } static inline int expires_gt(cputime_t expires, cputime_t new_exp) -- cgit v1.2.3 From fa18f7bde3ad4568d1d343b60d963bfbd8dc3991 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Sun, 26 May 2013 17:35:41 -0400 Subject: posix-cpu-timers: don't account cpu timer after stopped thread runtime accounting When tsk->signal->cputimer->running is 1, signal->cputimer (i.e. per process timer account) and tsk->sum_sched_runtime (i.e. per thread timer account) increase at the same pace because update_curr() increases both accounting. However, there is one exception. When thread exiting, __exit_signal() turns over task's sum_shced_runtime to sig->sum_sched_runtime, but it doesn't stop signal->cputimer accounting. This inconsistency makes POSIX timer wake up too early. This patch fixes it. Original-patch-by: Olivier Langlois Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Acked-by: Peter Zijlstra Signed-off-by: Olivier Langlois Signed-off-by: KOSAKI Motohiro Signed-off-by: Frederic Weisbecker --- kernel/sched/stats.h | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 2ef90a51ec5e..71bac979d5ee 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -161,6 +161,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) * on CONFIG_SCHEDSTATS. */ +/** + * cputimer_running - return true if cputimer is running + * + * @tsk: Pointer to target task. + */ +static inline bool cputimer_running(struct task_struct *tsk) + +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return false; + + /* + * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime + * in __exit_signal(), we won't account to the signal struct further + * cputime consumed by that task, even though the task can still be + * ticking after __exit_signal(). + * + * In order to keep a consistent behaviour between thread group cputime + * and thread group cputimer accounting, lets also ignore the cputime + * elapsing after __exit_signal() in any thread group timer running. + * + * This makes sure that POSIX CPU clocks and timers are synchronized, so + * that a POSIX CPU timer won't expire while the corresponding POSIX CPU + * clock delta is behind the expiring timer value. + */ + if (unlikely(!tsk->sighand)) + return false; + + return true; +} + /** * account_group_user_time - Maintain utime for a thread group. * @@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); @@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); @@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, { struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; - if (!cputimer->running) + if (!cputimer_running(tsk)) return; raw_spin_lock(&cputimer->lock); -- cgit v1.2.3