diff options
author | Balbir Singh <balbir@linux.vnet.ibm.com> | 2008-04-29 01:00:16 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-29 08:06:10 -0700 |
commit | cf475ad28ac35cc9ba612d67158f29b73b38b05d (patch) | |
tree | 2c7cd568d00357bd42643ea602884e731cc24f26 /kernel | |
parent | 29486df325e1fe6e1764afcb19e3370804c2b002 (diff) |
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead adds an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 30 | ||||
-rw-r--r-- | kernel/exit.c | 83 | ||||
-rw-r--r-- | kernel/fork.c | 11 |
3 files changed, 121 insertions, 3 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index abc433772e5a..b9d467d83fc1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -119,6 +119,7 @@ static int root_count; * be called. */ static int need_forkexit_callback; +static int need_mm_owner_callback __read_mostly; /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) @@ -2498,6 +2499,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; need_forkexit_callback |= ss->fork || ss->exit; + need_mm_owner_callback |= !!ss->mm_owner_changed; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't @@ -2748,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child) } } +#ifdef CONFIG_MM_OWNER +/** + * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes + * @p: the new owner + * + * Called on every change to mm->owner. mm_init_owner() does not + * invoke this routine, since it assigns the mm->owner the first time + * and does not change it. + */ +void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) +{ + struct cgroup *oldcgrp, *newcgrp; + + if (need_mm_owner_callback) { + int i; + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + oldcgrp = task_cgroup(old, ss->subsys_id); + newcgrp = task_cgroup(new, ss->subsys_id); + if (oldcgrp == newcgrp) + continue; + if (ss->mm_owner_changed) + ss->mm_owner_changed(ss, oldcgrp, newcgrp); + } + } +} +#endif /* CONFIG_MM_OWNER */ + /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question diff --git a/kernel/exit.c b/kernel/exit.c index 2a9d98c641ac..ae0f2c4e452b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -557,6 +557,88 @@ void exit_fs(struct task_struct *tsk) EXPORT_SYMBOL_GPL(exit_fs); +#ifdef CONFIG_MM_OWNER +/* + * Task p is exiting and it owned mm, lets find a new owner for it + */ +static inline int +mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) +{ + /* + * If there are other users of the mm and the owner (us) is exiting + * we need to find a new owner to take on the responsibility. + */ + if (!mm) + return 0; + if (atomic_read(&mm->mm_users) <= 1) + return 0; + if (mm->owner != p) + return 0; + return 1; +} + +void mm_update_next_owner(struct mm_struct *mm) +{ + struct task_struct *c, *g, *p = current; + +retry: + if (!mm_need_new_owner(mm, p)) + return; + + read_lock(&tasklist_lock); + /* + * Search in the children + */ + list_for_each_entry(c, &p->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search in the siblings + */ + list_for_each_entry(c, &p->parent->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search through everything else. We should not get + * here often + */ + do_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + } while_each_thread(g, c); + + read_unlock(&tasklist_lock); + return; + +assign_new_owner: + BUG_ON(c == p); + get_task_struct(c); + /* + * The task_lock protects c->mm from changing. + * We always want mm->owner->mm == mm + */ + task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); + if (c->mm != mm) { + task_unlock(c); + put_task_struct(c); + goto retry; + } + cgroup_mm_owner_callbacks(mm->owner, c); + mm->owner = c; + task_unlock(c); + put_task_struct(c); +} +#endif /* CONFIG_MM_OWNER */ + /* * Turn us into a lazy TLB process if we * aren't already.. @@ -596,6 +678,7 @@ static void exit_mm(struct task_struct * tsk) /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); task_unlock(tsk); + mm_update_next_owner(mm); mmput(mm); } diff --git a/kernel/fork.c b/kernel/fork.c index 6067e429f281..156db96ff754 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -381,14 +381,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->ioctx_list = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; - mm_init_cgroup(mm, p); + mm_init_owner(mm, p); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; return mm; } - mm_free_cgroup(mm); free_mm(mm); return NULL; } @@ -438,7 +437,6 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); - mm_free_cgroup(mm); mmdrop(mm); } } @@ -982,6 +980,13 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } +#ifdef CONFIG_MM_OWNER +void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ + mm->owner = p; +} +#endif /* CONFIG_MM_OWNER */ + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. |