From 44b7f4b98d8877e2a4427f2a2f2e42ae8227a58f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 13 Dec 2011 20:40:45 +0100 Subject: perf events: Fix ring_buffer_wakeup() brown paperbag bug Commit 10c6db11 ("perf: Fix loss of notification with multi-event") seems to unconditionally dereference event->rb in the wakeup handler, this is wrong, there might not be a buffer attached. Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111213152651.GP20297@mudshark.cambridge.arm.com [ minor edits ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d3b9df5962c2..58690af323e4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3558,9 +3558,13 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_lock(); rb = rcu_dereference(event->rb); - list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { + if (!rb) + goto unlock; + + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) wake_up_all(&event->waitq); - } + +unlock: rcu_read_unlock(); } -- cgit v1.2.3 From ab2789213d224202237292d78aaa0c386c7b28b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Dec 2011 11:47:00 -0800 Subject: sched: Fix select_idle_sibling() regression in selecting an idle SMT sibling Mike Galbraith reported that this recent commit: commit 4dcfe1025b513c2c1da5bf5586adb0e80148f612 Author: Peter Zijlstra Date: Thu Nov 10 13:01:10 2011 +0100 sched: Avoid SMT siblings in select_idle_sibling() if possible stopped selecting an idle SMT sibling when there are no idle cores in a single socket system. Intent of the select_idle_sibling() was to fallback to an idle SMT sibling, if it fails to identify an idle core. But this fallback was not happening on systems where all the scheduler domains had `SD_SHARE_PKG_RESOURCES' flag set. Fix it. Slightly bigger patch of cleaning all these goto's etc is queued up for the next release. Reported-by: Mike Galbraith Reported-by: Alex Shi Signed-off-by: Peter Zijlstra Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1323978421.1984.244.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a78ed2736ba7..8a39fa3e3c6c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2352,13 +2352,11 @@ again: if (!smt && (sd->flags & SD_SHARE_CPUPOWER)) continue; - if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) { - if (!smt) { - smt = 1; - goto again; - } + if (smt && !(sd->flags & SD_SHARE_CPUPOWER)) + break; + + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; - } sg = sd->groups; do { @@ -2378,6 +2376,10 @@ next: sg = sg->next; } while (sg != sd->groups); } + if (!smt) { + smt = 1; + goto again; + } done: rcu_read_unlock(); -- cgit v1.2.3 From b1b73d095084e754562961c443aa8f6587a55f8e Mon Sep 17 00:00:00 2001 From: Kusanagi Kouichi Date: Mon, 19 Dec 2011 18:13:19 +0900 Subject: time/clocksource: Fix kernel-doc warnings Fix various KernelDoc build warnings. Signed-off-by: Kusanagi Kouichi Cc: John Stultz Link: http://lkml.kernel.org/r/20111219091320.0D5AF6FC03D@msa105.auone-net.jp Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index da2f760e780c..d3ad022136e5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -647,7 +647,7 @@ static void clocksource_enqueue(struct clocksource *cs) /** * __clocksource_updatefreq_scale - Used update clocksource with new freq - * @t: clocksource to be registered + * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale * @@ -699,7 +699,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); /** * __clocksource_register_scale - Used to install new clocksources - * @t: clocksource to be registered + * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale * @@ -727,7 +727,7 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale); /** * clocksource_register - Used to install new clocksources - * @t: clocksource to be registered + * @cs: clocksource to be registered * * Returns -EBUSY if registration fails, zero otherwise. */ @@ -761,6 +761,8 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) /** * clocksource_change_rating - Change the rating of a registered clocksource + * @cs: clocksource to be changed + * @rating: new rating */ void clocksource_change_rating(struct clocksource *cs, int rating) { @@ -772,6 +774,7 @@ EXPORT_SYMBOL(clocksource_change_rating); /** * clocksource_unregister - remove a registered clocksource + * @cs: clocksource to be unregistered */ void clocksource_unregister(struct clocksource *cs) { @@ -787,6 +790,7 @@ EXPORT_SYMBOL(clocksource_unregister); /** * sysfs_show_current_clocksources - sysfs interface for current clocksource * @dev: unused + * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing current clocksource. @@ -807,6 +811,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, /** * sysfs_override_clocksource - interface for manually overriding clocksource * @dev: unused + * @attr: unused * @buf: name of override clocksource * @count: length of buffer * @@ -842,6 +847,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, /** * sysfs_show_available_clocksources - sysfs interface for listing clocksource * @dev: unused + * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing registered clocksources -- cgit v1.2.3 From e0197aae59e55c06db172bfbe1a1cdb8c0e1cab3 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 15 Dec 2011 11:36:43 -0800 Subject: cgroups: fix a css_set not found bug in cgroup_attach_proc There is a BUG when migrating a PF_EXITING proc. Since css_set_prefetch() is not called for the PF_EXITING case, find_existing_css_set() will return NULL inside cgroup_task_migrate() causing a BUG. This bug is easy to reproduce. Create a zombie and echo its pid to cgroup.procs. $ cat zombie.c \#include int main() { if (fork()) pause(); return 0; } $ We are hitting this bug pretty regularly on ChromeOS. This bug is already fixed by Tejun Heo's cgroup patchset which is targetted for the next merge window: https://lkml.org/lkml/2011/11/1/356 I've create a smaller patch here which just fixes this bug so that a fix can be merged into the current release and stable. Signed-off-by: Mandeep Singh Baines Downstream-Bug-Report: http://crosbug.com/23953 Reviewed-by: Li Zefan Signed-off-by: Tejun Heo Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: stable@kernel.org Cc: KAMEZAWA Hiroyuki Cc: Frederic Weisbecker Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage Cc: Olof Johansson --- kernel/cgroup.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d9d5648f3cdc..a184470cf9b5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2098,11 +2098,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) continue; /* get old css_set pointer */ task_lock(tsk); - if (tsk->flags & PF_EXITING) { - /* ignore this task if it's going away */ - task_unlock(tsk); - continue; - } oldcg = tsk->cgroups; get_css_set(oldcg); task_unlock(tsk); -- cgit v1.2.3 From b246272ecc5ac68c743b15c9e41a2275f7ce70e2 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 19 Dec 2011 17:11:52 -0800 Subject: cpusets: stall when updating mems_allowed for mempolicy or disjoint nodemask Kernels where MAX_NUMNODES > BITS_PER_LONG may temporarily see an empty nodemask in a tsk's mempolicy if its previous nodemask is remapped onto a new set of allowed cpuset nodes where the two nodemasks, as a result of the remap, are now disjoint. c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") adds get_mems_allowed() to prevent the set of allowed nodes from changing for a thread. This causes any update to a set of allowed nodes to stall until put_mems_allowed() is called. This stall is unncessary, however, if at least one node remains unchanged in the update to the set of allowed nodes. This was addressed by 89e8a244b97e ("cpusets: avoid looping when storing to mems_allowed if one node remains set"), but it's still possible that an empty nodemask may be read from a mempolicy because the old nodemask may be remapped to the new nodemask during rebind. To prevent this, only avoid the stall if there is no mempolicy for the thread being changed. This is a temporary solution until all reads from mempolicy nodemasks can be guaranteed to not be empty without the get_mems_allowed() synchronization. Also moves the check for nodemask intersection inside task_lock() so that tsk->mems_allowed cannot change. This ensures that nothing can set this tsk's mems_allowed out from under us and also protects tsk->mempolicy. Reported-by: Miao Xie Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Cc: Paul Menage Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9fe58c46a426..0b1712dba587 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } +#ifdef CONFIG_NUMA +static inline bool task_has_mempolicy(struct task_struct *task) +{ + return task->mempolicy; +} +#else +static inline bool task_has_mempolicy(struct task_struct *task) +{ + return false; +} +#endif + + /* bits in struct cpuset flags field */ typedef enum { CS_CPU_EXCLUSIVE, @@ -949,7 +962,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { - bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed); + bool need_loop; repeat: /* @@ -962,6 +975,14 @@ repeat: return; task_lock(tsk); + /* + * Determine if a loop is necessary if another thread is doing + * get_mems_allowed(). If at least one node remains unchanged and + * tsk does not have a mempolicy, then an empty nodemask will not be + * possible when mems_allowed is larger than a word. + */ + need_loop = task_has_mempolicy(tsk) || + !nodes_intersects(*newmems, tsk->mems_allowed); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); @@ -981,11 +1002,9 @@ repeat: /* * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. No wait is necessary, however, if at least one - * node remains unchanged. + * for the read-side. */ - while (masks_disjoint && - ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { task_unlock(tsk); if (!task_curr(tsk)) yield(); -- cgit v1.2.3 From 3d3c8f93a237b64580c5c5e138edeb1377e98230 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 19 Dec 2011 17:12:06 -0800 Subject: binary_sysctl(): fix memory leak binary_sysctl() calls sysctl_getname() which allocates from names_cache slab usin __getname() The matching function to free the name is __putname(), and not putname() which should be used only to match getname() allocations. This is because when auditing is enabled, putname() calls audit_putname *instead* (not in addition) to __putname(). Then, if a syscall is in progress, audit_putname does not release the name - instead, it expects the name to get released when the syscall completes, but that will happen only if audit_getname() was called previously, i.e. if the name was allocated with getname() rather than the naked __getname(). So, __getname() followed by putname() ends up leaking memory. Signed-off-by: Michel Lespinasse Acked-by: Al Viro Cc: Christoph Hellwig Cc: Eric Paris Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 6318b511afa1..a650694883a1 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, fput(file); out_putname: - putname(pathname); + __putname(pathname); out: return result; } -- cgit v1.2.3 From c87fb57346fc7653ace98769f148e0dcd88ac1ee Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Wed, 14 Dec 2011 23:43:16 +0100 Subject: ARM: 7235/1: irqdomain: export irq_domain_simple_ops for !CONFIG_OF irqdomain support is used in interrupt controller drivers that may not have device tree support but only need the basic HW->Linux irq translation. Rather than having each of these implement their own IRQ domain, allow them to use the simple ops. Acked-by: Thomas Gleixner Acked-by: Rob Herring Cc: Grant Likely Signed-off-by: Jamie Iles Signed-off-by: Russell King --- kernel/irq/irqdomain.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 200ce832c585..7ca523b249ef 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -143,11 +143,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d, return 0; } -struct irq_domain_ops irq_domain_simple_ops = { - .dt_translate = irq_domain_simple_dt_translate, -}; -EXPORT_SYMBOL_GPL(irq_domain_simple_ops); - /** * irq_domain_create_simple() - Set up a 'simple' translation range */ @@ -182,3 +177,10 @@ void irq_domain_generate_simple(const struct of_device_id *match, } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); #endif /* CONFIG_OF_IRQ */ + +struct irq_domain_ops irq_domain_simple_ops = { +#ifdef CONFIG_OF_IRQ + .dt_translate = irq_domain_simple_dt_translate, +#endif /* CONFIG_OF_IRQ */ +}; +EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -- cgit v1.2.3 From 3b87487ac5008072f138953b07505a7e3493327f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 30 Dec 2011 13:24:40 -0800 Subject: Revert "clockevents: Set noop handler in clockevents_exchange_device()" This reverts commit de28f25e8244c7353abed8de0c7792f5f883588c. It results in resume problems for various people. See for example http://thread.gmane.org/gmane.linux.kernel/1233033 http://thread.gmane.org/gmane.linux.kernel/1233389 http://thread.gmane.org/gmane.linux.kernel/1233159 http://thread.gmane.org/gmane.linux.kernel/1227868/focus=1230877 and the fedora and ubuntu bug reports https://bugzilla.redhat.com/show_bug.cgi?id=767248 https://bugs.launchpad.net/ubuntu/+source/linux/+bug/904569 which got bisected down to the stable version of this commit. Reported-by: Jonathan Nieder Reported-by: Phil Miller Reported-by: Philip Langdale Reported-by: Tim Gardner Cc: Thomas Gleixner Cc: Greg KH Cc: stable@kernel.org # for stable kernels that applied the original Signed-off-by: Linus Torvalds --- kernel/time/clockevents.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c4eb71c8b2ea..1ecd6ba36d6c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -387,7 +387,6 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { - old->event_handler = clockevents_handle_noop; clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); -- cgit v1.2.3 From e6780f7243eddb133cc20ec37fa69317c218b709 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 31 Dec 2011 11:44:01 -0800 Subject: futex: Fix uninterruptible loop due to gate_area It was found (by Sasha) that if you use a futex located in the gate area we get stuck in an uninterruptible infinite loop, much like the ZERO_PAGE issue. While looking at this problem, PeterZ realized you'll get into similar trouble when hitting any install_special_pages() mapping. And are there still drivers setting up their own special mmaps without page->mapping, and without special VM or pte flags to make get_user_pages fail? In most cases, if page->mapping is NULL, we do not need to retry at all: Linus points out that even /proc/sys/vm/drop_caches poses no problem, because it ends up using remove_mapping(), which takes care not to interfere when the page reference count is raised. But there is still one case which does need a retry: if memory pressure called shmem_writepage in between get_user_pages_fast dropping page table lock and our acquiring page lock, then the page gets switched from filecache to swapcache (and ->mapping set to NULL) whatever the refcount. Fault it back in to get the page->mapping needed for key->shared.inode. Reported-by: Sasha Levin Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/futex.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index ea87f4d2f455..1614be20173d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -314,17 +314,29 @@ again: #endif lock_page(page_head); + + /* + * If page_head->mapping is NULL, then it cannot be a PageAnon + * page; but it might be the ZERO_PAGE or in the gate area or + * in a special mapping (all cases which we are happy to fail); + * or it may have been a good file page when get_user_pages_fast + * found it, but truncated or holepunched or subjected to + * invalidate_complete_page2 before we got the page lock (also + * cases which we are happy to fail). And we hold a reference, + * so refcount care in invalidate_complete_page's remove_mapping + * prevents drop_caches from setting mapping to NULL beneath us. + * + * The case we do have to guard against is when memory pressure made + * shmem_writepage move it from filecache to swapcache beneath us: + * an unlikely race, but we do need to retry for page_head->mapping. + */ if (!page_head->mapping) { + int shmem_swizzled = PageSwapCache(page_head); unlock_page(page_head); put_page(page_head); - /* - * ZERO_PAGE pages don't have a mapping. Avoid a busy loop - * trying to find one. RW mapping would have COW'd (and thus - * have a mapping) so this page is RO and won't ever change. - */ - if ((page_head == ZERO_PAGE(address))) - return -EFAULT; - goto again; + if (shmem_swizzled) + goto again; + return -EFAULT; } /* -- cgit v1.2.3