From c0ff7453bb5c7c98e0885fb94279f2571946f280 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 24 May 2010 14:32:08 -0700 Subject: cpuset,mm: fix no node to alloc memory when changing cpuset's mems Before applying this patch, cpuset updates task->mems_allowed and mempolicy by setting all new bits in the nodemask first, and clearing all old unallowed bits later. But in the way, the allocator may find that there is no node to alloc memory. The reason is that cpuset rebinds the task's mempolicy, it cleans the nodes which the allocater can alloc pages on, for example: (mpol: mempolicy) task1 task1's mpol task2 alloc page 1 alloc on node0? NO 1 1 change mems from 1 to 0 1 rebind task1's mpol 0-1 set new bits 0 clear disallowed bits alloc on node1? NO 0 ... can't alloc page goto oom This patch fixes this problem by expanding the nodes range first(set newly allowed bits) and shrink it lazily(clear newly disallowed bits). So we use a variable to tell the write-side task that read-side task is reading nodemask, and the write-side task clears newly disallowed nodes after read-side task ends the current memory allocation. [akpm@linux-foundation.org: fix spello] Signed-off-by: Miao Xie Cc: David Rientjes Cc: Nick Piggin Cc: Paul Menage Cc: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index eabca5a73a85..019a2843bf95 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1002,8 +1002,10 @@ NORET_TYPE void do_exit(long code) exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA + task_lock(tsk); mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; + task_unlock(tsk); #endif #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) -- cgit v1.2.3 From 9c3391684415c9dca239130d9e433a60a4edf04b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:10 -0700 Subject: exit: exit_notify() can trust signal->notify_count < 0 signal_struct->count in its current form must die. - it has no reasons to be atomic_t - it looks like a reference counter, but it is not - otoh, we really need to make task->signal refcountable, just look at the extremely ugly task_rq_unlock_wait() called from __exit_signals(). - we should change the lifetime rules for task->signal, it should be pinned to task_struct. We have a lot of code which can be simplified after that. - it is not needed! while the code is correct, any usage of this counter is artificial, except fs/proc uses it correctly to show the number of threads. This series removes the usage of sig->count from exit pathes. This patch: Now that Veaceslav changed copy_signal() to use zalloc(), exit_notify() can just check notify_count < 0 to ensure the execing sub-threads needs the notification from us. No need to do other checks, notify_count != 0 must always mean ->group_exit_task != NULL is waiting for us. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Veaceslav Falico Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 019a2843bf95..59a104c673f7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -856,12 +856,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; - /* mt-exec, de_thread() is waiting for us */ - if (thread_group_leader(tsk) && - tsk->signal->group_exit_task && - tsk->signal->notify_count < 0) + /* mt-exec, de_thread() is waiting for group leader */ + if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); - write_unlock_irq(&tasklist_lock); tracehook_report_death(tsk, signal, cookie, group_dead); -- cgit v1.2.3 From d344193a05da89c97e965da2c5cbf687d7385eae Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:11 -0700 Subject: exit: avoid sig->count in de_thread/__exit_signal synchronization de_thread() and __exit_signal() use signal_struct->count/notify_count for synchronization. We can simplify the code and use ->notify_count only. Instead of comparing these two counters, we can change de_thread() to set ->notify_count = nr_of_sub_threads, then change __exit_signal() to dec-and-test this counter and notify group_exit_task. Note that __exit_signal() checks "notify_count > 0" just for symmetry with exit_notify(), we could just check it is != 0. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Veaceslav Falico Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 59a104c673f7..9220967f4256 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -97,7 +97,7 @@ static void __exit_signal(struct task_struct *tsk) * If there is any task waiting for the group exit * then notify it: */ - if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) + if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exit_task); if (tsk == sig->curr_target) -- cgit v1.2.3 From 4a5999429739844367d0f77a65efdd7db8202779 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:12 -0700 Subject: exit: avoid sig->count in __exit_signal() to detect the group-dead case Change __exit_signal() to check thread_group_leader() instead of atomic_dec_and_test(&sig->count). This must be equivalent, the group leader must be released only after all other threads have exited and passed __exit_signal(). Henceforth sig->count is not actually used, except in fs/proc for get_nr_threads/etc. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Veaceslav Falico Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 9220967f4256..4c70c377d21f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -88,11 +88,12 @@ static void __exit_signal(struct task_struct *tsk) rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); + atomic_dec(&sig->count); posix_cpu_timers_exit(tsk); - if (atomic_dec_and_test(&sig->count)) + if (thread_group_leader(tsk)) { posix_cpu_timers_exit_group(tsk); - else { + } else { /* * If there is any task waiting for the group exit * then notify it: -- cgit v1.2.3 From 4dec2a91fd7e8815d730afbfdcf085cbf53433ac Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:15 -0700 Subject: fork/exit: move tty_kref_put() outside of __cleanup_signal() tty_kref_put() has two callsites in copy_process() paths, 1. if copy_process() suceeds it is called before we copy signal->tty from parent 2. otherwise it is called from __cleanup_signal() under bad_fork_cleanup_signal: label In both cases tty_kref_put() is not right and unneeded because we don't have the balancing tty_kref_get(). Fortunately, this is harmless because this can only happen without CLONE_THREAD, and in this case signal->tty must be NULL. Remove tty_kref_put() from copy_process() and __cleanup_signal(), and change another caller of __cleanup_signal(), __exit_signal(), to call tty_kref_put() by hand. I hope this change makes sense by itself, but it is also needed to make ->signal refcountable. Signed-off-by: Oleg Nesterov Acked-by: Alan Cox Acked-by: Roland McGrath Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 4c70c377d21f..4a72f1753edb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -149,6 +149,7 @@ static void __exit_signal(struct task_struct *tsk) * see account_group_exec_runtime(). */ task_rq_unlock_wait(tsk); + tty_kref_put(sig->tty); __cleanup_signal(sig); } } -- cgit v1.2.3 From ea6d290ca34c4fd91b7348338c0cc7bdeff94a35 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:16 -0700 Subject: signals: make task_struct->signal immutable/refcountable We have a lot of problems with accessing task_struct->signal, it can "disappear" at any moment. Even current can't use its ->signal safely after exit_notify(). ->siglock helps, but it is not convenient, not always possible, and sometimes it makes sense to use task->signal even after this task has already dead. This patch adds the reference counter, sigcnt, into signal_struct. This reference is owned by task_struct and it is dropped in __put_task_struct(). Perhaps it makes sense to export get/put_signal_struct() later, but currently I don't see the immediate reason. Rename __cleanup_signal() to free_signal_struct() and unexport it. With the previous changes it does nothing except kmem_cache_free(). Change __exit_signal() to not clear/free ->signal, it will be freed when the last reference to any thread in the thread group goes away. Note: - when the last thead exits signal->tty can point to nowhere, see the next patch. - with or without this patch signal_struct->count should go away, or at least it should be "int nr_threads" for fs/proc. This will be addressed later. Signed-off-by: Oleg Nesterov Cc: Alan Cox Cc: Ingo Molnar Cc: Peter Zijlstra Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 4a72f1753edb..92af5cde9bbe 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -134,8 +134,6 @@ static void __exit_signal(struct task_struct *tsk) * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ flush_sigqueue(&tsk->pending); - - tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); @@ -150,7 +148,6 @@ static void __exit_signal(struct task_struct *tsk) */ task_rq_unlock_wait(tsk); tty_kref_put(sig->tty); - __cleanup_signal(sig); } } -- cgit v1.2.3 From 4ada856fb0ee62f6fe3aac3de726deac0640d929 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:17 -0700 Subject: signals: clear signal->tty when the last thread exits When the last thread exits signal->tty is freed, but the pointer is not cleared and points to nowhere. This is OK. Nobody should use signal->tty lockless, and it is no longer possible to take ->siglock. However this looks wrong even if correct, and the nice OOPS is better than subtle and hard to find bugs. Change __exit_signal() to clear signal->tty under ->siglock. Note: __exit_signal() needs more cleanups. It should not check "sig != NULL" to detect the all-dead case and we have the same issues with signal->stats. Signed-off-by: Oleg Nesterov Cc: Alan Cox Cc: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 92af5cde9bbe..356d91fa095f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -80,6 +80,7 @@ static void __exit_signal(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct sighand_struct *sighand; + struct tty_struct *uninitialized_var(tty); BUG_ON(!sig); BUG_ON(!atomic_read(&sig->count)); @@ -93,6 +94,8 @@ static void __exit_signal(struct task_struct *tsk) posix_cpu_timers_exit(tsk); if (thread_group_leader(tsk)) { posix_cpu_timers_exit_group(tsk); + tty = sig->tty; + sig->tty = NULL; } else { /* * If there is any task waiting for the group exit @@ -147,7 +150,7 @@ static void __exit_signal(struct task_struct *tsk) * see account_group_exec_runtime(). */ task_rq_unlock_wait(tsk); - tty_kref_put(sig->tty); + tty_kref_put(tty); } } -- cgit v1.2.3 From b7b8ff6373d4b910af081f76888395e6df53249d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:18 -0700 Subject: signals: kill the awful task_rq_unlock_wait() hack Now that task->signal can't go away we can revert the horrible hack added by ad474caca3e2a0550b7ce0706527ad5ab389a4d4 ("fix for account_group_exec_runtime(), make sure ->signal can't be freed under rq->lock"). And we can do more cleanups sched_stats.h/posix-cpu-timers.c later. Signed-off-by: Oleg Nesterov Cc: Alan Cox Cc: Ingo Molnar Cc: Peter Zijlstra Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 356d91fa095f..bbc790646502 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -145,11 +145,6 @@ static void __exit_signal(struct task_struct *tsk) if (sig) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); - /* - * Make sure ->signal can't go away under rq->lock, - * see account_group_exec_runtime(). - */ - task_rq_unlock_wait(tsk); tty_kref_put(tty); } } -- cgit v1.2.3 From d40e48e02f3785b9342ee4eb3d7cc9f12981b7f5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:19 -0700 Subject: exit: __exit_signal: use thread_group_leader() consistently Cleanup: - Add the boolean, group_dead = thread_group_leader(), for clarity. - Do not test/set sig == NULL to detect the all-dead case, use this boolean. - Pass this boolen to __unhash_process() and use it instead of another thread_group_leader() call which needs ->group_leader. This can be considered as microoptimization, but hopefully this also allows us do do other cleanups later. Signed-off-by: Oleg Nesterov Cc: Balbir Singh Cc: Roland McGrath Cc: Veaceslav Falico Cc: Stanislaw Gruszka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index bbc790646502..3602f468e3a0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -58,11 +58,11 @@ static void exit_mm(struct task_struct * tsk); -static void __unhash_process(struct task_struct *p) +static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; detach_pid(p, PIDTYPE_PID); - if (thread_group_leader(p)) { + if (group_dead) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); @@ -79,6 +79,7 @@ static void __unhash_process(struct task_struct *p) static void __exit_signal(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; + bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *uninitialized_var(tty); @@ -92,7 +93,7 @@ static void __exit_signal(struct task_struct *tsk) atomic_dec(&sig->count); posix_cpu_timers_exit(tsk); - if (thread_group_leader(tsk)) { + if (group_dead) { posix_cpu_timers_exit_group(tsk); tty = sig->tty; sig->tty = NULL; @@ -127,10 +128,9 @@ static void __exit_signal(struct task_struct *tsk) sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; - sig = NULL; /* Marker for below. */ } - __unhash_process(tsk); + __unhash_process(tsk, group_dead); /* * Do this under ->siglock, we can race with another thread @@ -142,7 +142,7 @@ static void __exit_signal(struct task_struct *tsk) __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - if (sig) { + if (group_dead) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); tty_kref_put(tty); -- cgit v1.2.3 From 97101eb41d0d3c97543878ce40e0b8a8b2747ed7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:20 -0700 Subject: exit: move taskstats_tgid_free() from __exit_signal() to free_signal_struct() Move taskstats_tgid_free() from __exit_signal() to free_signal_struct(). This way signal->stats never points to nowhere and we can read ->stats lockless. Signed-off-by: Oleg Nesterov Cc: Balbir Singh Cc: Roland McGrath Cc: Veaceslav Falico Cc: Stanislaw Gruszka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 3602f468e3a0..357d443d5a00 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -144,7 +144,6 @@ static void __exit_signal(struct task_struct *tsk) clear_tsk_thread_flag(tsk,TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); - taskstats_tgid_free(sig); tty_kref_put(tty); } } -- cgit v1.2.3 From b3ac022cb9dc5883505a88b159d1b240ad1ef405 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 26 May 2010 14:43:24 -0700 Subject: proc: turn signal_struct->count into "int nr_threads" No functional changes, just s/atomic_t count/int nr_threads/. With the recent changes this counter has a single user, get_nr_threads() And, none of its callers need the really accurate number of threads, not to mention each caller obviously races with fork/exit. It is only used to report this value to the user-space, except first_tid() uses it to avoid the unnecessary while_each_thread() loop in the unlikely case. It is a bit sad we need a word in struct signal_struct for this, perhaps we can change get_nr_threads() to approximate the number of threads using signal->live and kill ->nr_threads later. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Oleg Nesterov Cc: Alexey Dobriyan Cc: "Eric W. Biederman" Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 357d443d5a00..ceffc67b564a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -83,14 +83,10 @@ static void __exit_signal(struct task_struct *tsk) struct sighand_struct *sighand; struct tty_struct *uninitialized_var(tty); - BUG_ON(!sig); - BUG_ON(!atomic_read(&sig->count)); - sighand = rcu_dereference_check(tsk->sighand, rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); - atomic_dec(&sig->count); posix_cpu_timers_exit(tsk); if (group_dead) { @@ -130,6 +126,7 @@ static void __exit_signal(struct task_struct *tsk) sig->sum_sched_runtime += tsk->se.sum_exec_runtime; } + sig->nr_threads--; __unhash_process(tsk, group_dead); /* -- cgit v1.2.3