From a6c066b0703eeafc61eafdd5addf157ee671bd68 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 19 Dec 2011 17:12:06 -0800 Subject: binary_sysctl(): fix memory leak commit 3d3c8f93a237b64580c5c5e138edeb1377e98230 upstream. binary_sysctl() calls sysctl_getname() which allocates from names_cache slab usin __getname() The matching function to free the name is __putname(), and not putname() which should be used only to match getname() allocations. This is because when auditing is enabled, putname() calls audit_putname *instead* (not in addition) to __putname(). Then, if a syscall is in progress, audit_putname does not release the name - instead, it expects the name to get released when the syscall completes, but that will happen only if audit_getname() was called previously, i.e. if the name was allocated with getname() rather than the naked __getname(). So, __getname() followed by putname() ends up leaking memory. Signed-off-by: Michel Lespinasse Acked-by: Al Viro Cc: Christoph Hellwig Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl_binary.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e8bffbe2ba4b..2ce1b3086726 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, fput(file); out_putname: - putname(pathname); + __putname(pathname); out: return result; } -- cgit v1.2.3 From 61bf2d48574d6ce5418b988e9547937c2efdd084 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 15 Dec 2011 11:36:43 -0800 Subject: cgroups: fix a css_set not found bug in cgroup_attach_proc commit e0197aae59e55c06db172bfbe1a1cdb8c0e1cab3 upstream. There is a BUG when migrating a PF_EXITING proc. Since css_set_prefetch() is not called for the PF_EXITING case, find_existing_css_set() will return NULL inside cgroup_task_migrate() causing a BUG. This bug is easy to reproduce. Create a zombie and echo its pid to cgroup.procs. $ cat zombie.c \#include int main() { if (fork()) pause(); return 0; } $ We are hitting this bug pretty regularly on ChromeOS. This bug is already fixed by Tejun Heo's cgroup patchset which is targetted for the next merge window: https://lkml.org/lkml/2011/11/1/356 I've create a smaller patch here which just fixes this bug so that a fix can be merged into the current release and stable. Signed-off-by: Mandeep Singh Baines Downstream-Bug-Report: http://crosbug.com/23953 Reviewed-by: Li Zefan Signed-off-by: Tejun Heo Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: KAMEZAWA Hiroyuki Cc: Frederic Weisbecker Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage Cc: Olof Johansson Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1d2b6ceea95d..b7ab0b86a966 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2098,11 +2098,6 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) continue; /* get old css_set pointer */ task_lock(tsk); - if (tsk->flags & PF_EXITING) { - /* ignore this task if it's going away */ - task_unlock(tsk); - continue; - } oldcg = tsk->cgroups; get_css_set(oldcg); task_unlock(tsk); -- cgit v1.2.3 From 04a763aed3c611460ef4888d14a1f5101e8373bc Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 31 Dec 2011 11:44:01 -0800 Subject: futex: Fix uninterruptible loop due to gate_area commit e6780f7243eddb133cc20ec37fa69317c218b709 upstream. It was found (by Sasha) that if you use a futex located in the gate area we get stuck in an uninterruptible infinite loop, much like the ZERO_PAGE issue. While looking at this problem, PeterZ realized you'll get into similar trouble when hitting any install_special_pages() mapping. And are there still drivers setting up their own special mmaps without page->mapping, and without special VM or pte flags to make get_user_pages fail? In most cases, if page->mapping is NULL, we do not need to retry at all: Linus points out that even /proc/sys/vm/drop_caches poses no problem, because it ends up using remove_mapping(), which takes care not to interfere when the page reference count is raised. But there is still one case which does need a retry: if memory pressure called shmem_writepage in between get_user_pages_fast dropping page table lock and our acquiring page lock, then the page gets switched from filecache to swapcache (and ->mapping set to NULL) whatever the refcount. Fault it back in to get the page->mapping needed for key->shared.inode. Reported-by: Sasha Levin Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 11cbe052b2e8..e6160fa842e4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -314,17 +314,29 @@ again: #endif lock_page(page_head); + + /* + * If page_head->mapping is NULL, then it cannot be a PageAnon + * page; but it might be the ZERO_PAGE or in the gate area or + * in a special mapping (all cases which we are happy to fail); + * or it may have been a good file page when get_user_pages_fast + * found it, but truncated or holepunched or subjected to + * invalidate_complete_page2 before we got the page lock (also + * cases which we are happy to fail). And we hold a reference, + * so refcount care in invalidate_complete_page's remove_mapping + * prevents drop_caches from setting mapping to NULL beneath us. + * + * The case we do have to guard against is when memory pressure made + * shmem_writepage move it from filecache to swapcache beneath us: + * an unlikely race, but we do need to retry for page_head->mapping. + */ if (!page_head->mapping) { + int shmem_swizzled = PageSwapCache(page_head); unlock_page(page_head); put_page(page_head); - /* - * ZERO_PAGE pages don't have a mapping. Avoid a busy loop - * trying to find one. RW mapping would have COW'd (and thus - * have a mapping) so this page is RO and won't ever change. - */ - if ((page_head == ZERO_PAGE(address))) - return -EFAULT; - goto again; + if (shmem_swizzled) + goto again; + return -EFAULT; } /* -- cgit v1.2.3 From 12ed227da1fb7099978c73b65a756caf51a95553 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Tue, 3 Jan 2012 14:41:13 -0800 Subject: hung_task: fix false positive during vfork commit f9fab10bbd768b0e5254e53a4a8477a94bfc4b96 upstream. vfork parent uninterruptibly and unkillably waits for its child to exec/exit. This wait is of unbounded length. Ignore such waits in the hung_task detector. Signed-off-by: Mandeep Singh Baines Reported-by: Sasha Levin LKML-Reference: <1325344394.28904.43.camel@lappy> Cc: Linus Torvalds Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andrew Morton Cc: John Kacur Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/hung_task.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ea640120ab86..e972276f12ff 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -74,11 +74,17 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) /* * Ensure the task is not frozen. - * Also, when a freshly created task is scheduled once, changes - * its state to TASK_UNINTERRUPTIBLE without having ever been - * switched out once, it musn't be checked. + * Also, skip vfork and any other user process that freezer should skip. */ - if (unlikely(t->flags & PF_FROZEN || !switch_count)) + if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) + return; + + /* + * When a freshly created task is scheduled once, changes its state to + * TASK_UNINTERRUPTIBLE without having ever been switched out once, it + * musn't be checked. + */ + if (unlikely(!switch_count)) return; if (switch_count != t->last_switch_count) { -- cgit v1.2.3 From 9e83ca1fcff0eee2ea7ef2508de95691a0cdeb0c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jan 2012 17:29:02 +0100 Subject: ptrace: partially fix the do_wait(WEXITED) vs EXIT_DEAD->EXIT_ZOMBIE race commit 50b8d257486a45cba7b65ca978986ed216bbcc10 upstream. Test-case: int main(void) { int pid, status; pid = fork(); if (!pid) { for (;;) { if (!fork()) return 0; if (waitpid(-1, &status, 0) < 0) { printf("ERR!! wait: %m\n"); return 0; } } } assert(ptrace(PTRACE_ATTACH, pid, 0,0) == 0); assert(waitpid(-1, NULL, 0) == pid); assert(ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACEFORK) == 0); do { ptrace(PTRACE_CONT, pid, 0, 0); pid = waitpid(-1, NULL, 0); } while (pid > 0); return 1; } It fails because ->real_parent sees its child in EXIT_DEAD state while the tracer is going to change the state back to EXIT_ZOMBIE in wait_task_zombie(). The offending commit is 823b018e which moved the EXIT_DEAD check, but in fact we should not blame it. The original code was not correct as well because it didn't take ptrace_reparented() into account and because we can't really trust ->ptrace. This patch adds the additional check to close this particular race but it doesn't solve the whole problem. We simply can't rely on ->ptrace in this case, it can be cleared if the tracer is multithreaded by the exiting ->parent. I think we should kill EXIT_DEAD altogether, we should always remove the soon-to-be-reaped child from ->children or at least we should never do the DEAD->ZOMBIE transition. But this is too complex for 3.2. Reported-and-tested-by: Denys Vlasenko Tested-by: Lukasz Michalik Acked-by: Tejun Heo Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/exit.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2913b3509d42..9e316ae4984b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1542,8 +1542,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* dead body doesn't have much to contribute */ - if (p->exit_state == EXIT_DEAD) + if (unlikely(p->exit_state == EXIT_DEAD)) { + /* + * But do not ignore this task until the tracer does + * wait_task_zombie()->do_notify_parent(). + */ + if (likely(!ptrace) && unlikely(ptrace_reparented(p))) + wo->notask_error = 0; return 0; + } /* slay zombie? */ if (p->exit_state == EXIT_ZOMBIE) { -- cgit v1.2.3 From 2c60e18b07e8e08c7e3b6cc8288b0e04e18844f7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jan 2012 17:29:20 +0100 Subject: ptrace: ensure JOBCTL_STOP_SIGMASK is not zero after detach commit 8a88951b5878dc475dcd841cefc767e36397d14e upstream. This is the temporary simple fix for 3.2, we need more changes in this area. 1. do_signal_stop() assumes that the running untraced thread in the stopped thread group is not possible. This was our goal but it is not yet achieved: a stopped-but-resumed tracee can clone the running thread which can initiate another group-stop. Remove WARN_ON_ONCE(!current->ptrace). 2. A new thread always starts with ->jobctl = 0. If it is auto-attached and this group is stopped, __ptrace_unlink() sets JOBCTL_STOP_PENDING but JOBCTL_STOP_SIGMASK part is zero, this triggers WANR_ON(!signr) in do_jobctl_trap() if another debugger attaches. Change __ptrace_unlink() to set the artificial SIGSTOP for report. Alternatively we could change ptrace_init_task() to copy signr from current, but this means we can copy it for no reason and hide the possible similar problems. Acked-by: Tejun Heo Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/ptrace.c | 13 ++++++++++++- kernel/signal.c | 2 -- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a70d2a5d8c7b..67d1fdd3c553 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -96,9 +96,20 @@ void __ptrace_unlink(struct task_struct *child) */ if (!(child->flags & PF_EXITING) && (child->signal->flags & SIGNAL_STOP_STOPPED || - child->signal->group_stop_count)) + child->signal->group_stop_count)) { child->jobctl |= JOBCTL_STOP_PENDING; + /* + * This is only possible if this thread was cloned by the + * traced task running in the stopped group, set the signal + * for the future reports. + * FIXME: we should change ptrace_init_task() to handle this + * case. + */ + if (!(child->jobctl & JOBCTL_STOP_SIGMASK)) + child->jobctl |= SIGSTOP; + } + /* * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick * @child in the butt. Note that @resume should be used iff @child diff --git a/kernel/signal.c b/kernel/signal.c index 291c9700be75..195331c56ad3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1986,8 +1986,6 @@ static bool do_signal_stop(int signr) */ if (!(sig->flags & SIGNAL_STOP_STOPPED)) sig->group_exit_code = signr; - else - WARN_ON_ONCE(!current->ptrace); sig->group_stop_count = 0; -- cgit v1.2.3 From b5c66ace2892b73dc2a55c1d3360224f388841cc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 27 Dec 2011 14:25:55 +0800 Subject: cgroup: fix to allow mounting a hierarchy by name commit 0d19ea866562e46989412a0676412fa0983c9ce7 upstream. If we mount a hierarchy with a specified name, the name is unique, and we can use it to mount the hierarchy without specifying its set of subsystem names. This feature is documented is Documentation/cgroups/cgroups.txt section 2.3 Here's an example: # mount -t cgroup -o cpuset,name=myhier xxx /cgroup1 # mount -t cgroup -o name=myhier xxx /cgroup2 But it was broken by commit 32a8cf235e2f192eb002755076994525cdbaa35a (cgroup: make the mount options parsing more accurate) This fixes the regression. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b7ab0b86a966..e4cbdfbc2b14 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1175,10 +1175,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* * If the 'all' option was specified select all the subsystems, - * otherwise 'all, 'none' and a subsystem name options were not - * specified, let's default to 'all' + * otherwise if 'none', 'name=' and a subsystem name options + * were not specified, let's default to 'all' */ - if (all_ss || (!all_ss && !one_ss && !opts->none)) { + if (all_ss || (!one_ss && !opts->none && !opts->name)) { for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss == NULL) -- cgit v1.2.3 From d12b918be6b7af2824d55940a5fbf9cfe7116052 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 3 Nov 2011 00:59:25 +0100 Subject: PM / Sleep: Fix race between CPU hotplug and freezer commit 79cfbdfa87e84992d509e6c1648a18e1d7e68c20 upstream. The CPU hotplug notifications sent out by the _cpu_up() and _cpu_down() functions depend on the value of the 'tasks_frozen' argument passed to them (which indicates whether tasks have been frozen or not). (Examples for such CPU hotplug notifications: CPU_ONLINE, CPU_ONLINE_FROZEN, CPU_DEAD, CPU_DEAD_FROZEN). Thus, it is essential that while the callbacks for those notifications are running, the state of the system with respect to the tasks being frozen or not remains unchanged, *throughout that duration*. Hence there is a need for synchronizing the CPU hotplug code with the freezer subsystem. Since the freezer is involved only in the Suspend/Hibernate call paths, this patch hooks the CPU hotplug code to the suspend/hibernate notifiers PM_[SUSPEND|HIBERNATE]_PREPARE and PM_POST_[SUSPEND|HIBERNATE] to prevent the race between CPU hotplug and freezer, thus ensuring that CPU hotplug notifications will always be run with the state of the system really being what the notifications indicate, _throughout_ their execution time. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- kernel/cpu.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 12b7458f23b1..aa39dd7a3846 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ @@ -476,6 +477,79 @@ static int alloc_frozen_cpus(void) return 0; } core_initcall(alloc_frozen_cpus); + +/* + * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU + * hotplug when tasks are about to be frozen. Also, don't allow the freezer + * to continue until any currently running CPU hotplug operation gets + * completed. + * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the + * 'cpu_add_remove_lock'. And this same lock is also taken by the regular + * CPU hotplug path and released only after it is complete. Thus, we + * (and hence the freezer) will block here until any currently running CPU + * hotplug operation gets completed. + */ +void cpu_hotplug_disable_before_freeze(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + + +/* + * When tasks have been thawed, re-enable regular CPU hotplug (which had been + * disabled while beginning to freeze tasks). + */ +void cpu_hotplug_enable_after_thaw(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + +/* + * When callbacks for CPU hotplug notifications are being executed, we must + * ensure that the state of the system with respect to the tasks being frozen + * or not, as reported by the notification, remains unchanged *throughout the + * duration* of the execution of the callbacks. + * Hence we need to prevent the freezer from racing with regular CPU hotplug. + * + * This synchronization is implemented by mutually excluding regular CPU + * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/ + * Hibernate notifications. + */ +static int +cpu_hotplug_pm_callback(struct notifier_block *nb, + unsigned long action, void *ptr) +{ + switch (action) { + + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + cpu_hotplug_disable_before_freeze(); + break; + + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + cpu_hotplug_enable_after_thaw(); + break; + + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + + +int cpu_hotplug_pm_sync_init(void) +{ + pm_notifier(cpu_hotplug_pm_callback, 0); + return 0; +} +core_initcall(cpu_hotplug_pm_sync_init); + #endif /* CONFIG_PM_SLEEP_SMP */ /** -- cgit v1.2.3