From b72c186999e689cb0b055ab1c7b3cd8fffbeb5ed Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 16 Apr 2015 12:47:29 -0700
Subject: ptrace: fix race between ptrace_resume() and wait_task_stopped()

ptrace_resume() is called when the tracee is still __TASK_TRACED.  We set
tracee->exit_code and then wake_up_state() changes tracee->state.  If the
tracer's sub-thread does wait() in between, task_stopped_code(ptrace => T)
wrongly looks like another report from the tracee.

This confuses the debugger, and since wait_task_stopped() clears ->exit_code
the tracee can miss a signal.

Test-case:

	#include <stdio.h>
	#include <unistd.h>
	#include <signal.h>
	#include <sys/wait.h>
	#include <sys/ptrace.h>
	#include <pthread.h>
	#include <assert.h>

	int pid;

	void *waiter(void *arg)
	{
		int stat;

		for (;;) {
			assert(pid == wait(&stat));
			assert(WIFSTOPPED(stat));
			if (WSTOPSIG(stat) == SIGHUP)
				continue;
			assert(WSTOPSIG(stat) == SIGCONT);
			printf("ERR! extra/wrong report:%x\n", stat);
		}
	}

	int main(void)
	{
		pthread_t thread;

		pid = fork();
		if (!pid) {
			assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0);
			for (;;)
				kill(getpid(), SIGHUP);
		}

		assert(pthread_create(&thread, NULL, waiter, NULL) == 0);

		for (;;)
			ptrace(PTRACE_CONT, pid, 0, SIGCONT);

		return 0;
	}

Note for stable: the bug is very old, but without 9899d11f6544 "ptrace:
ensure arch_ptrace/ptrace_request can never race with SIGKILL" the fix
should use lock_task_sighand(child).

Signed-off-by: Oleg Nesterov
Reported-by: Pavel Labath
Tested-by: Pavel Labath
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/ptrace.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 227fec36b12a..9a34bd80a745 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -697,6 +697,8 @@ static int ptrace_peek_siginfo(struct task_struct *child,
 static int ptrace_resume(struct task_struct *child, long request,
			 unsigned long data)
 {
+	bool need_siglock;
+
 	if (!valid_signal(data))
		return -EIO;
 
@@ -724,8 +726,26 @@ static int ptrace_resume(struct task_struct *child, long request,
		user_disable_single_step(child);
	}
 
+	/*
+	 * Change ->exit_code and ->state under siglock to avoid the race
+	 * with wait_task_stopped() in between; a non-zero ->exit_code will
+	 * wrongly look like another report from tracee.
+	 *
+	 * Note that we need siglock even if ->exit_code == data and/or this
+	 * status was not reported yet, the new status must not be cleared by
+	 * wait_task_stopped() after resume.
+	 *
+	 * If data == 0 we do not care if wait_task_stopped() reports the old
+	 * status and clears the code too; this can't race with the tracee, it
+	 * takes siglock after resume.
+	 */
+	need_siglock = data && !thread_group_empty(current);
+	if (need_siglock)
+		spin_lock_irq(&child->sighand->siglock);
	child->exit_code = data;
	wake_up_state(child, __TASK_TRACED);
+	if (need_siglock)
+		spin_unlock_irq(&child->sighand->siglock);
 
	return 0;
 }
-- cgit v1.2.3


From 64a4096c5cdab377b6e1f44008ee8b2636db579d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 16 Apr 2015 12:47:32 -0700
Subject: ptrace: ptrace_detach() can no longer race with SIGKILL

ptrace_detach() re-checks ->ptrace under tasklist lock and calls
release_task() if __ptrace_detach() returns true.  This was needed because
the __TASK_TRACED tracee could be killed/untraced, and it could even pass
exit_notify() before we take tasklist_lock.

But this is no longer possible after 9899d11f6544 "ptrace: ensure
arch_ptrace/ptrace_request can never race with SIGKILL".  We can turn these
checks into WARN_ON() and remove release_task().
While at it, document the setting of child->exit_code.

Signed-off-by: Oleg Nesterov
Cc: Pavel Labath
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/ptrace.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 9a34bd80a745..c8e0e050a36a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -456,8 +456,6 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 
 static int ptrace_detach(struct task_struct *child, unsigned int data)
 {
-	bool dead = false;
-
	if (!valid_signal(data))
		return -EIO;
 
@@ -467,18 +465,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
 
	write_lock_irq(&tasklist_lock);
	/*
-	 * This child can be already killed. Make sure de_thread() or
-	 * our sub-thread doing do_wait() didn't do release_task() yet.
+	 * We rely on ptrace_freeze_traced(). It can't be killed and
+	 * untraced by another thread, it can't be a zombie.
	 */
-	if (child->ptrace) {
-		child->exit_code = data;
-		dead = __ptrace_detach(current, child);
-	}
+	WARN_ON(!child->ptrace || child->exit_state);
+	/*
+	 * tasklist_lock avoids the race with wait_task_stopped(), see
+	 * the comment in ptrace_resume().
+	 */
+	child->exit_code = data;
+	__ptrace_detach(current, child);
	write_unlock_irq(&tasklist_lock);
 
	proc_ptrace_connector(child, PTRACE_DETACH);
-	if (unlikely(dead))
-		release_task(child);
 
	return 0;
 }
-- cgit v1.2.3


From 69828dce7af2cb6d08ef5a03de687d422fb7ec1f Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Thu, 16 Apr 2015 12:47:35 -0700
Subject: signal: remove warning about using SI_TKILL in rt_[tg]sigqueueinfo

Sending SI_TKILL from rt_[tg]sigqueueinfo was deprecated, so now we issue a
warning on the first attempt to do it.  We use WARN_ON_ONCE, which is not
informative and, what is worse, taints the kernel, making the trinity
syscall fuzzer complain false-positively from time to time.

It does not look like we need this warning at all, because the behaviour
changed quite a long time ago (2.6.39), and if an application relies on the
old API, it gets EPERM anyway and can issue a warning by itself.

So let us zap the warning in the kernel.

Signed-off-by: Vladimir Davydov
Acked-by: Oleg Nesterov
Cc: Richard Weinberger
Cc: "Paul E. McKenney"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/signal.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index a390499943e4..d51c5ddd855c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2992,11 +2992,9 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
	 * Nor can they impersonate a kill()/tgkill(), which adds source info.
	 */
	if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
-	    (task_pid_vnr(current) != pid)) {
-		/* We used to allow any < 0 si_code */
-		WARN_ON_ONCE(info->si_code < 0);
+	    (task_pid_vnr(current) != pid))
		return -EPERM;
-	}
+
	info->si_signo = sig;
 
	/* POSIX.1b doesn't mention process groups.  */
@@ -3041,12 +3039,10 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
	/* Not even root can pretend to send signals from the kernel.
	 * Nor can they impersonate a kill()/tgkill(), which adds source info.
	 */
-	if (((info->si_code >= 0 || info->si_code == SI_TKILL)) &&
-	    (task_pid_vnr(current) != pid)) {
-		/* We used to allow any < 0 si_code */
-		WARN_ON_ONCE(info->si_code < 0);
+	if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
+	    (task_pid_vnr(current) != pid))
		return -EPERM;
-	}
+
	info->si_signo = sig;
 
	return do_send_specific(tgid, pid, sig, info);
-- cgit v1.2.3


From 35f71bc0a09a45924bed268d8ccd0d3407bc476f Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Thu, 16 Apr 2015 12:47:38 -0700
Subject: fork: report pid reservation failure properly

copy_process currently reports any failure in alloc_pid as ENOMEM, which is
misleading because the pid allocation might fail not only when the memory is
short but also when the pid space is consumed already.  The current man page
even mentions this case:

: EAGAIN
:
: A system-imposed limit on the number of threads was encountered.
: There are a number of limits that may trigger this error: the
: RLIMIT_NPROC soft resource limit (set via setrlimit(2)), which
: limits the number of processes and threads for a real user ID, was
: reached; the kernel's system-wide limit on the number of processes
: and threads, /proc/sys/kernel/threads-max, was reached (see
: proc(5)); or the maximum number of PIDs, /proc/sys/kernel/pid_max,
: was reached (see proc(5)).

so the current behavior is also incorrect with respect to the documentation.
The POSIX man page also suggests returning EAGAIN when the process count
limit is reached.

This patch simply propagates the error code from alloc_pid and makes sure we
return -EAGAIN due to reservation failure.  This makes the behavior of fork
closer to both our documentation and POSIX.

alloc_pid might also fail when the reaper in the pid namespace is dead (the
namespace basically disallows all new processes) and there is no good error
code which would match documented ones.  We have traditionally returned
ENOMEM for this case, which is misleading as well, but as per Eric W.
Biederman this behavior is documented in man pid_namespaces(7):

: If the "init" process of a PID namespace terminates, the kernel
: terminates all of the processes in the namespace via a SIGKILL signal.
: This behavior reflects the fact that the "init" process is essential for
: the correct operation of a PID namespace. In this case, a subsequent
: fork(2) into this PID namespace will fail with the error ENOMEM; it is
: not possible to create a new processes in a PID namespace whose "init"
: process has terminated.

and introducing a new error code would be too risky, so let's stick to
ENOMEM for this case.

Signed-off-by: Michal Hocko
Cc: Oleg Nesterov
Cc: "Eric W. Biederman"
Cc: Michael Kerrisk
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c |  5 +++--
 kernel/pid.c  | 15 ++++++++-------
 2 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index f2c1e7352298..d778016ac1e3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1403,10 +1403,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
		goto bad_fork_cleanup_io;
 
	if (pid != &init_struct_pid) {
-		retval = -ENOMEM;
		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
-		if (!pid)
+		if (IS_ERR(pid)) {
+			retval = PTR_ERR(pid);
			goto bad_fork_cleanup_io;
+		}
	}
 
	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
diff --git a/kernel/pid.c b/kernel/pid.c
index cd36a5e0d173..4fd07d5b7baf 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -182,7 +182,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
			spin_unlock_irq(&pidmap_lock);
			kfree(page);
			if (unlikely(!map->page))
-				break;
+				return -ENOMEM;
		}
		if (likely(atomic_read(&map->nr_free))) {
			for ( ; ; ) {
@@ -210,7 +210,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
		}
		pid = mk_pid(pid_ns, map, offset);
	}
-	return -1;
+	return -EAGAIN;
 }
 
 int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
@@ -301,17 +301,20 @@ struct pid *alloc_pid(struct pid_namespace *ns)
	int i, nr;
	struct pid_namespace *tmp;
	struct upid *upid;
+	int retval = -ENOMEM;
 
	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
	if (!pid)
-		goto out;
+		return ERR_PTR(retval);
 
	tmp = ns;
	pid->level = ns->level;
	for (i = ns->level; i >= 0; i--) {
		nr = alloc_pidmap(tmp);
-		if (nr < 0)
+		if (IS_ERR_VALUE(nr)) {
+			retval = nr;
			goto out_free;
+		}
 
		pid->numbers[i].nr = nr;
		pid->numbers[i].ns = tmp;
@@ -339,7 +342,6 @@ struct pid *alloc_pid(struct pid_namespace *ns)
	}
	spin_unlock_irq(&pidmap_lock);
 
-out:
	return pid;
 
 out_unlock:
@@ -351,8 +353,7 @@ out_free:
		free_pidmap(pid->numbers + i);
 
	kmem_cache_free(ns->pid_cachep, pid);
-	pid = NULL;
-	goto out;
+	return ERR_PTR(retval);
 }
 
 void disable_pid_allocation(struct pid_namespace *ns)
-- cgit v1.2.3


From 3ea7f5e25ec271909451b7dc17be37581b888de6 Mon Sep 17 00:00:00 2001
From: Jean Delvare
Date: Thu, 16 Apr 2015 12:47:41 -0700
Subject: fork_init: update max_threads comment

The comment explaining what value max_threads is set to is outdated.  The
maximum memory consumption ratio for thread structures was 1/2 until
February 2002, then it was briefly changed to 1/16 before being set to 1/8,
which we still use today.  The comment was never updated to reflect that
change; it's about time.

Signed-off-by: Jean Delvare
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index d778016ac1e3..c507e29bcb01 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -270,8 +270,8 @@ void __init fork_init(unsigned long mempages)
 
	/*
	 * The default maximum number of threads is set to a safe
-	 * value: the thread structures can take up at most half
-	 * of memory.
+	 * value: the thread structures can take up at most one
+	 * eighth of the memory.
	 */
	max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
 
-- cgit v1.2.3


From ff691f6e03815dc8f99461ea509df863a879fc3a Mon Sep 17 00:00:00 2001
From: Heinrich Schuchardt
Date: Thu, 16 Apr 2015 12:47:44 -0700
Subject: kernel/fork.c: new function for max_threads

PAGE_SIZE is not guaranteed to be equal to or less than 8 times the
THREAD_SIZE.  E.g. the hexagon architecture may have a page size of 1M and a
thread size of 4096.  This would lead to a division by zero in the
calculation of max_threads.

With this patch the buggy code is moved to a separate function,
set_max_threads().  The error itself is not fixed here.  After fixing the
problem in a separate patch, the new function can be reused to adjust
max_threads after adding or removing memory.

The mempages argument of fork_init() is removed, as totalram_pages is an
exported symbol.

The creation of separate patches for the refactoring to a new function and
for fixing the logic was suggested by Ingo Molnar.
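
For illustration only (not part of the patch): a minimal user-space sketch of
the division-by-zero hazard described above, using hypothetical hexagon-like
constants (1 MiB pages, 4 KiB thread stacks) and showing how 64-bit math, as
the follow-up patch uses, sidesteps it.

	#include <stdio.h>
	#include <stdint.h>

	/* Hypothetical values, chosen only to reproduce the failure mode. */
	#define PAGE_SIZE_X   (1UL << 20)
	#define THREAD_SIZE_X 4096UL

	int main(void)
	{
		unsigned long mempages = 4096;	/* 4 GiB worth of 1 MiB pages */
		unsigned long divisor = 8 * THREAD_SIZE_X / PAGE_SIZE_X;

		printf("8 * THREAD_SIZE / PAGE_SIZE = %lu\n", divisor);	/* 0 */

		if (divisor == 0) {
			/* mempages / divisor would trap; 64-bit math does not */
			uint64_t threads = (uint64_t)mempages * PAGE_SIZE_X /
					   (THREAD_SIZE_X * 8);
			printf("64-bit calculation: %llu threads\n",
			       (unsigned long long)threads);
			return 0;
		}
		printf("max_threads = %lu\n", mempages / divisor);
		return 0;
	}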
Signed-off-by: Heinrich Schuchardt
Cc: Oleg Nesterov
Cc: Ingo Molnar
Cc: Guenter Roeck
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index c507e29bcb01..01038e6f51a8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -253,7 +253,26 @@ EXPORT_SYMBOL_GPL(__put_task_struct);
 
 void __init __weak arch_task_cache_init(void) { }
 
-void __init fork_init(unsigned long mempages)
+/*
+ * set_max_threads
+ */
+static void set_max_threads(void)
+{
+	/*
+	 * The default maximum number of threads is set to a safe
+	 * value: the thread structures can take up at most one
+	 * eighth of the memory.
+	 */
+	max_threads = totalram_pages / (8 * THREAD_SIZE / PAGE_SIZE);
+
+	/*
+	 * we need to allow at least 20 threads to boot a system
+	 */
+	if (max_threads < 20)
+		max_threads = 20;
+}
+
+void __init fork_init(void)
 {
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
@@ -268,18 +287,7 @@ void __init fork_init(unsigned long mempages)
	/* do the arch specific task caches init */
	arch_task_cache_init();
 
-	/*
-	 * The default maximum number of threads is set to a safe
-	 * value: the thread structures can take up at most one
-	 * eighth of the memory.
-	 */
-	max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
-
-	/*
-	 * we need to allow at least 20 threads to boot a system
-	 */
-	if (max_threads < 20)
-		max_threads = 20;
+	set_max_threads();
 
	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
-- cgit v1.2.3


From ac1b398de1ef94aeee8ba87b0120763526572a6e Mon Sep 17 00:00:00 2001
From: Heinrich Schuchardt
Date: Thu, 16 Apr 2015 12:47:47 -0700
Subject: kernel/fork.c: avoid division by zero

PAGE_SIZE is not guaranteed to be equal to or less than 8 times the
THREAD_SIZE.  E.g. the hexagon architecture may have a page size of 1M and a
thread size of 4096.  This would lead to a division by zero in the
calculation of max_threads.

With a 32-bit calculation there is no solution that delivers valid results
for all possible combinations of the parameters.  The code is only called
once, hence a 64-bit calculation can be used as the solution.

[akpm@linux-foundation.org: use clamp_t(), per Oleg]
Signed-off-by: Heinrich Schuchardt
Cc: Oleg Nesterov
Cc: Ingo Molnar
Cc: Guenter Roeck
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 01038e6f51a8..c7f2e1a4187a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -87,6 +87,16 @@
 #define CREATE_TRACE_POINTS
 #include
 
+/*
+ * Minimum number of threads to boot the kernel
+ */
+#define MIN_THREADS 20
+
+/*
+ * Maximum number of threads
+ */
+#define MAX_THREADS FUTEX_TID_MASK
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -258,18 +268,19 @@ void __init __weak arch_task_cache_init(void) { }
  */
 static void set_max_threads(void)
 {
-	/*
-	 * The default maximum number of threads is set to a safe
-	 * value: the thread structures can take up at most one
-	 * eighth of the memory.
-	 */
-	max_threads = totalram_pages / (8 * THREAD_SIZE / PAGE_SIZE);
+	u64 threads;
 
	/*
-	 * we need to allow at least 20 threads to boot a system
+	 * The number of threads shall be limited such that the thread
+	 * structures may only consume a small part of the available memory.
	 */
-	if (max_threads < 20)
-		max_threads = 20;
+	if (fls64(totalram_pages) + fls64(PAGE_SIZE) > 64)
+		threads = MAX_THREADS;
+	else
+		threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+				    (u64) THREAD_SIZE * 8UL);
+
+	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
 }
 
 void __init fork_init(void)
-- cgit v1.2.3


From 16db3d3f1170fb0efca652c9378ce7c5f5cb4232 Mon Sep 17 00:00:00 2001
From: Heinrich Schuchardt
Date: Thu, 16 Apr 2015 12:47:50 -0700
Subject: kernel/sysctl.c: threads-max observe limits

Users can change the maximum number of threads by writing to
/proc/sys/kernel/threads-max.

With this patch the value entered is checked against the same limits that
apply when fork_init() is called.

Signed-off-by: Heinrich Schuchardt
Cc: Oleg Nesterov
Cc: Ingo Molnar
Cc: Guenter Roeck
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c   | 31 +++++++++++++++++++++++++++++--
 kernel/sysctl.c |  6 ++----
 2 files changed, 31 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index c7f2e1a4187a..8807a129711b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -74,6 +74,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -266,7 +267,7 @@ void __init __weak arch_task_cache_init(void) { }
 /*
  * set_max_threads
  */
-static void set_max_threads(void)
+static void set_max_threads(unsigned int max_threads_suggested)
 {
	u64 threads;
 
@@ -280,6 +281,9 @@ static void set_max_threads(void)
		threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
				    (u64) THREAD_SIZE * 8UL);
 
+	if (threads > max_threads_suggested)
+		threads = max_threads_suggested;
+
	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
 }
 
@@ -298,7 +302,7 @@ void __init fork_init(void)
	/* do the arch specific task caches init */
	arch_task_cache_init();
 
-	set_max_threads();
+	set_max_threads(MAX_THREADS);
 
	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
@@ -2020,3 +2024,26 @@ int unshare_files(struct files_struct **displaced)
	task_unlock(task);
	return 0;
 }
+
+int sysctl_max_threads(struct ctl_table *table, int write,
+		       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int ret;
+	int threads = max_threads;
+	int min = MIN_THREADS;
+	int max = MAX_THREADS;
+
+	t = *table;
+	t.data = &threads;
+	t.extra1 = &min;
+	t.extra2 = &max;
+
+	ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	set_max_threads(threads);
+
+	return 0;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 42b7fc2860c1..3c0998426b57 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -93,11 +93,9 @@
 #include
 #endif
 
-
 #if defined(CONFIG_SYSCTL)
 
 /* External variables not in a header file. */
-extern int max_threads;
 extern int suid_dumpable;
 #ifdef CONFIG_COREDUMP
 extern int core_uses_pid;
@@ -710,10 +708,10 @@ static struct ctl_table kern_table[] = {
 #endif
	{
		.procname	= "threads-max",
-		.data		= &max_threads,
+		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= sysctl_max_threads,
	},
	{
		.procname	= "random",
-- cgit v1.2.3


From 90f31d0ea88880f780574f3d0bb1a227c4c66ca3 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Thu, 16 Apr 2015 12:47:56 -0700
Subject: mm: rcu-protected get_mm_exe_file()

This patch removes mm->mmap_sem from the mm->exe_file read side.  It also
kills dup_mm_exe_file() and moves the exe_file duplication into dup_mmap(),
where both mmap_sems are locked.
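
For illustration only (not part of the patch): the lockless read side added
below succeeds only if the file's reference count is still non-zero (the
get_file_rcu() check visible in the hunk further down).  A user-space sketch
of that "increment unless zero" idiom, with illustrative names rather than
the kernel API:

	#include <stdatomic.h>
	#include <stdio.h>

	struct obj {
		atomic_long count;
	};

	/* Take a reference only if the object has not already dropped to 0. */
	static int obj_get_unless_zero(struct obj *o)
	{
		long c = atomic_load(&o->count);

		while (c != 0) {
			if (atomic_compare_exchange_weak(&o->count, &c, c + 1))
				return 1;	/* got a reference */
		}
		return 0;	/* object is already on its way to being freed */
	}

	int main(void)
	{
		struct obj live = { 1 }, dying = { 0 };

		printf("live: %d, dying: %d\n",
		       obj_get_unless_zero(&live), obj_get_unless_zero(&dying));
		return 0;
	}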
[akpm@linux-foundation.org: fix comment typo]
Signed-off-by: Konstantin Khlebnikov
Cc: Davidlohr Bueso
Cc: Al Viro
Cc: Oleg Nesterov
Cc: "Paul E. McKenney"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 56 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 37 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8807a129711b..259202637531 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -403,6 +403,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
	 */
	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
 
+	/* No ordering required: file already has been exposed. */
+	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+
	mm->total_vm = oldmm->total_vm;
	mm->shared_vm = oldmm->shared_vm;
	mm->exec_vm = oldmm->exec_vm;
@@ -528,7 +531,13 @@ static inline void mm_free_pgd(struct mm_struct *mm)
	pgd_free(mm, mm->pgd);
 }
 #else
-#define dup_mmap(mm, oldmm)	(0)
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+	down_write(&oldmm->mmap_sem);
+	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+	up_write(&oldmm->mmap_sem);
+	return 0;
+}
 #define mm_alloc_pgd(mm)	(0)
 #define mm_free_pgd(mm)
 #endif /* CONFIG_MMU */
@@ -697,35 +706,46 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+/**
+ * set_mm_exe_file - change a reference to the mm's executable file
+ *
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
+ *
+ * Main users are mmput(), sys_execve() and sys_prctl(PR_SET_MM_MAP/EXE_FILE).
+ * Callers prevent concurrent invocations: in mmput() nobody alive left,
+ * in execve task is single-threaded, prctl holds mmap_sem exclusively.
+ */
 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 {
+	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
+		!atomic_read(&mm->mm_users) || current->in_execve ||
+		lockdep_is_held(&mm->mmap_sem));
+
	if (new_exe_file)
		get_file(new_exe_file);
-	if (mm->exe_file)
-		fput(mm->exe_file);
-	mm->exe_file = new_exe_file;
+	rcu_assign_pointer(mm->exe_file, new_exe_file);
+	if (old_exe_file)
+		fput(old_exe_file);
 }
 
+/**
+ * get_mm_exe_file - acquire a reference to the mm's executable file
+ *
+ * Returns %NULL if mm has no associated executable file.
+ * User must release file via fput().
+ */
 struct file *get_mm_exe_file(struct mm_struct *mm)
 {
	struct file *exe_file;
 
-	/* We need mmap_sem to protect against races with removal of exe_file */
-	down_read(&mm->mmap_sem);
-	exe_file = mm->exe_file;
-	if (exe_file)
-		get_file(exe_file);
-	up_read(&mm->mmap_sem);
+	rcu_read_lock();
+	exe_file = rcu_dereference(mm->exe_file);
+	if (exe_file && !get_file_rcu(exe_file))
+		exe_file = NULL;
+	rcu_read_unlock();
	return exe_file;
 }
 
-static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
-{
-	/* It's safe to write the exe_file pointer without exe_file_lock because
-	 * this is called during fork when the task is not yet in /proc */
-	newmm->exe_file = get_mm_exe_file(oldmm);
-}
-
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
@@ -887,8 +907,6 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
	if (!mm_init(mm, tsk))
		goto fail_nomem;
 
-	dup_mm_exe_file(oldmm, mm);
-
	err = dup_mmap(mm, oldmm);
	if (err)
		goto free_pt;
-- cgit v1.2.3


From 6e399cd144d8500ffb5d40fa6848890e2580a80a Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Thu, 16 Apr 2015 12:47:59 -0700
Subject: prctl: avoid using mmap_sem for exe_file serialization

Oleg cleverly suggested using xchg() to set the new mm->exe_file instead of
calling set_mm_exe_file(), which requires some form of serialization --
mmap_sem in this case.  For archs that do not have atomic rmw instructions
we still fall back to a spinlock alternative, so this should always be safe.
As such, we only need the mmap_sem for looking up the backing vm_file, which
can be done sharing the lock.

Naturally, this means we need to manually deal with both the new and old
file reference counting, and we need not worry about the
MMF_EXE_FILE_CHANGED bits, which can probably be deleted in the future
anyway.

Signed-off-by: Davidlohr Bueso
Suggested-by: Oleg Nesterov
Acked-by: Oleg Nesterov
Reviewed-by: Konstantin Khlebnikov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 19 +++++++++++++------
 kernel/sys.c  | 47 ++++++++++++++++++++++++++++-------------------
 2 files changed, 41 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 259202637531..0d23e76a0c61 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -711,15 +711,22 @@ EXPORT_SYMBOL_GPL(mmput);
  *
  * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
  *
- * Main users are mmput(), sys_execve() and sys_prctl(PR_SET_MM_MAP/EXE_FILE).
- * Callers prevent concurrent invocations: in mmput() nobody alive left,
- * in execve task is single-threaded, prctl holds mmap_sem exclusively.
+ * Main users are mmput() and sys_execve(). Callers prevent concurrent
+ * invocations: in mmput() nobody alive left, in execve task is single
+ * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
+ * mm->exe_file, but does so without using set_mm_exe_file() in order
+ * to do avoid the need for any locks.
  */
 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
 {
-	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
-		!atomic_read(&mm->mm_users) || current->in_execve ||
-		lockdep_is_held(&mm->mmap_sem));
+	struct file *old_exe_file;
+
+	/*
+	 * It is safe to dereference the exe_file without RCU as
+	 * this function is only called if nobody else can access
+	 * this mm -- see comment above for justification.
+	 */
+	old_exe_file = rcu_dereference_raw(mm->exe_file);
 
	if (new_exe_file)
		get_file(new_exe_file);
diff --git a/kernel/sys.c b/kernel/sys.c
index 3be344902316..a4e372b798a5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1649,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask)
	return mask;
 }
 
-static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
+static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 {
	struct fd exe;
+	struct file *old_exe, *exe_file;
	struct inode *inode;
	int err;
 
-	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
-
	exe = fdget(fd);
	if (!exe.file)
		return -EBADF;
@@ -1680,15 +1679,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
	/*
	 * Forbid mm->exe_file change if old file still mapped.
	 */
+	exe_file = get_mm_exe_file(mm);
	err = -EBUSY;
-	if (mm->exe_file) {
+	if (exe_file) {
		struct vm_area_struct *vma;
 
-		for (vma = mm->mmap; vma; vma = vma->vm_next)
-			if (vma->vm_file &&
-			    path_equal(&vma->vm_file->f_path,
-				       &mm->exe_file->f_path))
-				goto exit;
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma->vm_file)
+				continue;
+			if (path_equal(&vma->vm_file->f_path,
+				       &exe_file->f_path))
+				goto exit_err;
+		}
+
+		up_read(&mm->mmap_sem);
+		fput(exe_file);
	}
 
	/*
@@ -1702,10 +1708,18 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
		goto exit;
 
	err = 0;
-	set_mm_exe_file(mm, exe.file);	/* this grabs a reference to exe.file */
+	/* set the new file, lockless */
+	get_file(exe.file);
+	old_exe = xchg(&mm->exe_file, exe.file);
+	if (old_exe)
+		fput(old_exe);
 exit:
	fdput(exe);
	return err;
+exit_err:
+	up_read(&mm->mmap_sem);
+	fput(exe_file);
+	goto exit;
 }
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1840,10 +1854,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
		user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
	}
 
-	down_write(&mm->mmap_sem);
	if (prctl_map.exe_fd != (u32)-1)
-		error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
-	downgrade_write(&mm->mmap_sem);
+		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
+	down_read(&mm->mmap_sem);
 
	if (error)
		goto out;
@@ -1909,12 +1922,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
	if (!capable(CAP_SYS_RESOURCE))
		return -EPERM;
 
-	if (opt == PR_SET_MM_EXE_FILE) {
-		down_write(&mm->mmap_sem);
-		error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
-		up_write(&mm->mmap_sem);
-		return error;
-	}
+	if (opt == PR_SET_MM_EXE_FILE)
+		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
 
	if (addr >= TASK_SIZE || addr < mmap_min_addr)
		return -EINVAL;
-- cgit v1.2.3


From 230633d109e35b0a24277498e773edeb79b4a331 Mon Sep 17 00:00:00 2001
From: Heinrich Schuchardt
Date: Thu, 16 Apr 2015 12:48:07 -0700
Subject: kernel/sysctl.c: detect overflows when converting to int

When converting unsigned long to int, overflows may occur.  These are
currently not detected when writing to the sysctl file system.

E.g. on a system where int has 32 bits and long has 64 bits,

	echo 0x800001234 > /proc/sys/kernel/threads-max

has the same effect as

	echo 0x1234 > /proc/sys/kernel/threads-max

The patch adds the missing check in do_proc_dointvec_conv.  With the patch
an overflow will result in an error EINVAL when writing to the sysctl file
system.
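
For illustration only (not part of the patch): a minimal user-space program
showing the silent truncation described above on an LP64 system, i.e. what
an int ends up holding when a value larger than INT_MAX is written.

	#include <stdio.h>
	#include <limits.h>

	int main(void)
	{
		unsigned long lval = 0x800001234UL;	/* what the user wrote */
		int val = (int)lval;	/* implementation-defined truncation */

		printf("written 0x%lx, stored 0x%x (INT_MAX is 0x%x)\n",
		       lval, (unsigned int)val, INT_MAX);
		return 0;
	}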
Signed-off-by: Heinrich Schuchardt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sysctl.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3c0998426b57..2082b1a88fb9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1981,7 +1981,15 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
				 int write, void *data)
 {
	if (write) {
-		*valp = *negp ? -*lvalp : *lvalp;
+		if (*negp) {
+			if (*lvalp > (unsigned long) INT_MAX + 1)
+				return -EINVAL;
+			*valp = -*lvalp;
+		} else {
+			if (*lvalp > (unsigned long) INT_MAX)
+				return -EINVAL;
+			*valp = *lvalp;
+		}
	} else {
		int val = *valp;
		if (val < 0) {
-- cgit v1.2.3


From 9d796e66230205cd3366f5660387bd9ecca9d336 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin
Date: Thu, 16 Apr 2015 12:48:10 -0700
Subject: gcov: fix softlockups

gcov profiling, if enabled together with other heavy compile-time
instrumentation like KASan, could trigger the following softlockups:

  NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [swapper/0:1]
  Modules linked in:
  irq event stamp: 22823276
  hardirqs last enabled at (22823275): [] mutex_lock_nested+0x7d9/0x930
  hardirqs last disabled at (22823276): [] apic_timer_interrupt+0x6d/0x80
  softirqs last enabled at (22823172): [] __do_softirq+0x4db/0x729
  softirqs last disabled at (22823167): [] irq_exit+0x7d/0x15b
  CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 3.19.0-05245-gbb33326-dirty #3
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org 04/01/2014
  task: ffff88006cba8000 ti: ffff88006cbb0000 task.ti: ffff88006cbb0000
  RIP: kasan_mem_to_shadow+0x1e/0x1f
  Call Trace:
    strcmp+0x28/0x70
    get_node_by_name+0x66/0x99
    gcov_event+0x4f/0x69e
    gcov_enable_events+0x54/0x7b
    gcov_fs_init+0xf8/0x134
    do_one_initcall+0x1b2/0x288
    kernel_init_freeable+0x467/0x580
    kernel_init+0x15/0x18b
    ret_from_fork+0x7c/0xb0
  Kernel panic - not syncing: softlockup: hung tasks

Fix this by sticking cond_resched() in gcov_enable_events().

Signed-off-by: Andrey Ryabinin
Reported-by: Fengguang Wu
Cc: Peter Oberparleiter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/gcov/base.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index b358a802fd18..a744098e4eb7 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include "gcov.h"
 
 static int gcov_events_enabled;
@@ -107,8 +108,10 @@ void gcov_enable_events(void)
	gcov_events_enabled = 1;
 
	/* Perform event callback for previously registered entries. */
-	while ((info = gcov_info_next(info)))
+	while ((info = gcov_info_next(info))) {
		gcov_event(GCOV_ADD, info);
+		cond_resched();
+	}
 
	mutex_unlock(&gcov_lock);
 }
-- cgit v1.2.3


From 11163348a23cdbcdca5fb42485418e75f8566a5c Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Thu, 16 Apr 2015 12:49:12 -0700
Subject: oprofile: reduce mmap_sem hold for mm->exe_file

sync_buffer() needs the mmap_sem for two distinct operations, both only
occurring upon user context switch handling:

 1) Dealing with the exe_file.

 2) Adding the dcookie data, as we need to look up the vma that backs it.
    This is done via add_sample() and add_data().

This patch isolates 1), for it will no longer need the mmap_sem for
serialization.
However, for now, make use of the more standard get_mm_exe_file(), requiring
only holding the mmap_sem to read the value, and relying on reference
counting to make sure that the exe file won't disappear underneath us while
doing the get dcookie.

As a consequence, for 2) we move the mmap_sem locking into where we really
need it, in lookup_dcookie().  The benefits are twofold: reduced mmap_sem
hold times, and cleaner code.

[akpm@linux-foundation.org: export get_mm_exe_file for arch/x86/oprofile/oprofile.ko]
Signed-off-by: Davidlohr Bueso
Cc: Robert Richter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 0d23e76a0c61..03c1eaaa6ef5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -752,6 +752,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
	rcu_read_unlock();
	return exe_file;
 }
+EXPORT_SYMBOL(get_mm_exe_file);
 
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
-- cgit v1.2.3
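
For illustration only (not part of the series): the mm->exe_file reworked in
the commits above is what backs the /proc/[pid]/exe symlink, so a quick
user-space check of the value it currently points to looks like this.

	#include <stdio.h>
	#include <unistd.h>
	#include <limits.h>

	int main(void)
	{
		char buf[PATH_MAX];
		ssize_t n = readlink("/proc/self/exe", buf, sizeof(buf) - 1);

		if (n < 0) {
			perror("readlink");
			return 1;
		}
		buf[n] = '\0';
		printf("/proc/self/exe -> %s\n", buf);
		return 0;
	}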