From cf475ad28ac35cc9ba612d67158f29b73b38b05d Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 29 Apr 2008 01:00:16 -0700 Subject: cgroups: add an owner to the mm_struct Remove the mem_cgroup member from mm_struct and instead adds an owner. This approach was suggested by Paul Menage. The advantage of this approach is that, once the mm->owner is known, using the subsystem id, the cgroup can be determined. It also allows several control groups that are virtually grouped by mm_struct, to exist independent of the memory controller i.e., without adding mem_cgroup's for each controller, to mm_struct. A new config option CONFIG_MM_OWNER is added and the memory resource controller selects this config option. This patch also adds cgroup callbacks to notify subsystems when mm->owner changes. The mm_cgroup_changed callback is called with the task_lock() of the new task held and is called just prior to changing the mm->owner. I am indebted to Paul Menage for the several reviews of this patchset and helping me make it lighter and simpler. This patch was tested on a powerpc box, it was compiled with both the MM_OWNER config turned on and off. After the thread group leader exits, it's moved to init_css_state by cgroup_exit(), thus all future charges from runnings threads would be redirected to the init_css_set's subsystem. Signed-off-by: Balbir Singh Cc: Pavel Emelianov Cc: Hugh Dickins Cc: Sudhir Kumar Cc: YAMAMOTO Takashi Cc: Hirokazu Takahashi Cc: David Rientjes , Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Acked-by: Pekka Enberg Reviewed-by: Paul Menage Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 6067e429f281..156db96ff754 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -381,14 +381,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->ioctx_list = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; - mm_init_cgroup(mm, p); + mm_init_owner(mm, p); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; return mm; } - mm_free_cgroup(mm); free_mm(mm); return NULL; } @@ -438,7 +437,6 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); - mm_free_cgroup(mm); mmdrop(mm); } } @@ -982,6 +980,13 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } +#ifdef CONFIG_MM_OWNER +void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ + mm->owner = p; +} +#endif /* CONFIG_MM_OWNER */ + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. -- cgit v1.2.3 From 9edff4ab1f8d82675277a04e359d0ed8bf14a7b7 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Tue, 29 Apr 2008 01:00:57 -0700 Subject: ipc: sysvsem: implement sys_unshare(CLONE_SYSVSEM) sys_unshare(CLONE_NEWIPC) doesn't handle the undo lists properly, this can cause a kernel memory corruption. CLONE_NEWIPC must detach from the existing undo lists. Fix, part 1: add support for sys_unshare(CLONE_SYSVSEM) The original reason to not support it was the potential (inevitable?) confusion due to the fact that sys_unshare(CLONE_SYSVSEM) has the inverse meaning of clone(CLONE_SYSVSEM). Our two most reasonable options then appear to be (1) fully support CLONE_SYSVSEM, or (2) continue to refuse explicit CLONE_SYSVSEM, but always do it anyway on unshare(CLONE_SYSVSEM). This patch does (1). Changelog: Apr 16: SEH: switch to Manfred's alternative patch which removes the unshare_semundo() function which always refused CLONE_SYSVSEM. Signed-off-by: Manfred Spraul Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Michael Kerrisk Cc: Pierre Peiffer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 156db96ff754..01666979beac 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1668,18 +1668,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp return 0; } -/* - * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not - * supported yet - */ -static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp) -{ - if (unshare_flags & CLONE_SYSVSEM) - return -EINVAL; - - return 0; -} - /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* @@ -1695,8 +1683,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; - struct sem_undo_list *new_ulist = NULL; struct nsproxy *new_nsproxy = NULL; + int do_sysvsem = 0; check_unshare_flags(&unshare_flags); @@ -1708,6 +1696,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) CLONE_NEWNET)) goto bad_unshare_out; + if (unshare_flags & CLONE_SYSVSEM) + do_sysvsem = 1; if ((err = unshare_thread(unshare_flags))) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) @@ -1718,13 +1708,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_cleanup_sigh; if ((err = unshare_fd(unshare_flags, &new_fd))) goto bad_unshare_cleanup_vm; - if ((err = unshare_semundo(unshare_flags, &new_ulist))) - goto bad_unshare_cleanup_fd; if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) - goto bad_unshare_cleanup_semundo; + goto bad_unshare_cleanup_fd; - if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { + if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (do_sysvsem) { + /* + * CLONE_SYSVSEM is equivalent to sys_exit(). + */ + exit_sem(current); + } if (new_nsproxy) { switch_task_namespaces(current, new_nsproxy); @@ -1760,7 +1754,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) if (new_nsproxy) put_nsproxy(new_nsproxy); -bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); -- cgit v1.2.3 From 6013f67fc1a4c7fa5bcab2d39c1eaa3e260c7ac1 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Tue, 29 Apr 2008 01:00:59 -0700 Subject: ipc: sysvsem: force unshare(CLONE_SYSVSEM) when CLONE_NEWIPC sys_unshare(CLONE_NEWIPC) doesn't handle the undo lists properly, this can cause a kernel memory corruption. CLONE_NEWIPC must detach from the existing undo lists. Fix, part 2: perform an implicit CLONE_SYSVSEM in CLONE_NEWIPC. CLONE_NEWIPC creates a new IPC namespace, the task cannot access the existing semaphore arrays after the unshare syscall. Thus the task can/must detach from the existing undo list entries, too. This fixes the kernel corruption, because it makes it impossible that undo records from two different namespaces are in sysvsem.undo_list. Signed-off-by: Manfred Spraul Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Michael Kerrisk Cc: Pierre Peiffer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 01666979beac..de5c16c6b6ec 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1696,7 +1696,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) CLONE_NEWNET)) goto bad_unshare_out; - if (unshare_flags & CLONE_SYSVSEM) + /* + * CLONE_NEWIPC must also detach from the undolist: after switching + * to a new ipc namespace, the semaphore arrays from the old + * namespace are unreachable. + */ + if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; if ((err = unshare_thread(unshare_flags))) goto bad_unshare_out; -- cgit v1.2.3 From 925d1c401fa6cfd0df5d2e37da8981494ccdec07 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Tue, 29 Apr 2008 01:01:36 -0700 Subject: procfs task exe symlink The kernel implements readlink of /proc/pid/exe by getting the file from the first executable VMA. Then the path to the file is reconstructed and reported as the result. Because of the VMA walk the code is slightly different on nommu systems. This patch avoids separate /proc/pid/exe code on nommu systems. Instead of walking the VMAs to find the first executable file-backed VMA we store a reference to the exec'd file in the mm_struct. That reference would prevent the filesystem holding the executable file from being unmounted even after unmapping the VMAs. So we track the number of VM_EXECUTABLE VMAs and drop the new reference when the last one is unmapped. This avoids pinning the mounted filesystem. [akpm@linux-foundation.org: improve comments] [yamamoto@valinux.co.jp: fix dup_mmap] Signed-off-by: Matt Helsley Cc: Oleg Nesterov Cc: David Howells Cc:"Eric W. Biederman" Cc: Christoph Hellwig Cc: Al Viro Cc: Hugh Dickins Signed-off-by: YAMAMOTO Takashi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index de5c16c6b6ec..068ffe007529 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -431,6 +431,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); exit_mmap(mm); + set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); list_del(&mm->mmlist); @@ -543,6 +544,8 @@ struct mm_struct *dup_mm(struct task_struct *tsk) if (init_new_context(tsk, mm)) goto fail_nocontext; + dup_mm_exe_file(oldmm, mm); + err = dup_mmap(mm, oldmm); if (err) goto free_pt; -- cgit v1.2.3 From db51aeccd7097ce19a522a4c5ff91c320f870e2b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2008 00:52:52 -0700 Subject: signals: microoptimize the usage of ->curr_target Suggested by Roland McGrath. Initialize signal->curr_target in copy_signal(). This way ->curr_target is never == NULL, we can kill the check in __group_complete_signal's hot path. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 068ffe007529..2bb675af4de3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -892,7 +892,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->group_exit_code = 0; sig->group_exit_task = NULL; sig->group_stop_count = 0; - sig->curr_target = NULL; + sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); -- cgit v1.2.3 From 9f3acc3140444a900ab280de942291959f0f615d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Apr 2008 07:44:08 -0400 Subject: [PATCH] split linux/file.h Initial splitoff of the low-level stuff; taken to fdtable.h Signed-off-by: Al Viro --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 2bb675af4de3..933e60ebccae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3