From 21acb9caa2e30b100e9a1943d995bb99d40f4035 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 4 Feb 2009 10:12:08 +0100 Subject: trivial: fix where cgroup documentation is not correctly referred to cgroup documentation was moved to Documentation/cgroups/. There are some places that still refer to Documentation/controllers/, Documentation/cgroups.txt and Documentation/cpusets.txt. Fix those. Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Jiri Kosina --- include/linux/cgroup.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 499900d0cee7..b837631fe499 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -342,7 +342,10 @@ int cgroup_task_count(const struct cgroup *cgrp); /* Return true if the cgroup is a descendant of the current cgroup */ int cgroup_is_descendant(const struct cgroup *cgrp); -/* Control Group subsystem type. See Documentation/cgroups.txt for details */ +/* + * Control Group subsystem type. + * See Documentation/cgroups/cgroups.txt for details + */ struct cgroup_subsys { struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, -- cgit v1.2.3 From d20a390a0ee2bf2f692c539c6ce1c829e1080bb5 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Thu, 2 Apr 2009 16:57:22 -0700 Subject: cgroups: fix cgroup.h comments Fix the style of some multi-line comments in cgroup.h to match Documentation/CodingStyle Signed-off-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 71 +++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 30 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 499900d0cee7..bb8feb9feccd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -47,14 +47,18 @@ enum cgroup_subsys_id { /* Per-subsystem/per-cgroup state maintained by the system. */ struct cgroup_subsys_state { - /* The cgroup that this subsystem is attached to. Useful + /* + * The cgroup that this subsystem is attached to. Useful * for subsystems that want to know about the cgroup - * hierarchy structure */ + * hierarchy structure + */ struct cgroup *cgroup; - /* State maintained by the cgroup system to allow subsystems + /* + * State maintained by the cgroup system to allow subsystems * to be "busy". Should be accessed via css_get(), - * css_tryget() and and css_put(). */ + * css_tryget() and and css_put(). + */ atomic_t refcnt; @@ -120,8 +124,10 @@ static inline void css_put(struct cgroup_subsys_state *css) enum { /* Control Group is dead */ CGRP_REMOVED, - /* Control Group has previously had a child cgroup or a task, - * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */ + /* + * Control Group has previously had a child cgroup or a task, + * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) + */ CGRP_RELEASABLE, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, @@ -130,9 +136,10 @@ enum { struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - /* count users of this cgroup. >0 means busy, but doesn't - * necessarily indicate the number of tasks in the - * cgroup */ + /* + * count users of this cgroup. >0 means busy, but doesn't + * necessarily indicate the number of tasks in the cgroup + */ atomic_t count; /* @@ -142,7 +149,7 @@ struct cgroup { struct list_head sibling; /* my parent's children */ struct list_head children; /* my children */ - struct cgroup *parent; /* my parent */ + struct cgroup *parent; /* my parent */ struct dentry *dentry; /* cgroup fs entry, RCU protected */ /* Private pointers for each registered subsystem */ @@ -177,11 +184,12 @@ struct cgroup { struct rcu_head rcu_head; }; -/* A css_set is a structure holding pointers to a set of +/* + * A css_set is a structure holding pointers to a set of * cgroup_subsys_state objects. This saves space in the task struct * object and speeds up fork()/exit(), since a single inc/dec and a - * list_add()/del() can bump the reference count on the entire - * cgroup set for a task. + * list_add()/del() can bump the reference count on the entire cgroup + * set for a task. */ struct css_set { @@ -226,13 +234,8 @@ struct cgroup_map_cb { void *state; }; -/* struct cftype: - * - * The files in the cgroup filesystem mostly have a very simple read/write - * handling, some common function will take care of it. Nevertheless some cases - * (read tasks) are special and therefore I define this structure for every - * kind of file. - * +/* + * struct cftype: handler definitions for cgroup control files * * When reading/writing to a file: * - the cgroup to use is file->f_dentry->d_parent->d_fsdata @@ -241,8 +244,10 @@ struct cgroup_map_cb { #define MAX_CFTYPE_NAME 64 struct cftype { - /* By convention, the name should begin with the name of the - * subsystem, followed by a period */ + /* + * By convention, the name should begin with the name of the + * subsystem, followed by a period + */ char name[MAX_CFTYPE_NAME]; int private; @@ -321,13 +326,17 @@ struct cgroup_scanner { struct ptr_heap *heap; }; -/* Add a new file to the given cgroup directory. Should only be - * called by subsystems from within a populate() method */ +/* + * Add a new file to the given cgroup directory. Should only be + * called by subsystems from within a populate() method + */ int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype *cft); -/* Add a set of new files to the given cgroup directory. Should - * only be called by subsystems from within a populate() method */ +/* + * Add a set of new files to the given cgroup directory. Should + * only be called by subsystems from within a populate() method + */ int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype cft[], @@ -419,7 +428,8 @@ struct cgroup_iter { struct list_head *task; }; -/* To iterate across the tasks in a cgroup: +/* + * To iterate across the tasks in a cgroup: * * 1) call cgroup_iter_start to intialize an iterator * @@ -428,9 +438,10 @@ struct cgroup_iter { * * 3) call cgroup_iter_end() to destroy the iterator. * - * Or, call cgroup_scan_tasks() to iterate through every task in a cpuset. - * - cgroup_scan_tasks() holds the css_set_lock when calling the test_task() - * callback, but not while calling the process_task() callback. + * Or, call cgroup_scan_tasks() to iterate through every task in a + * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling + * the test_task() callback, but not while calling the process_task() + * callback. */ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); struct task_struct *cgroup_iter_next(struct cgroup *cgrp, -- cgit v1.2.3 From 313e924c0852943e67335fad9d2608701f0dfe8e Mon Sep 17 00:00:00 2001 From: Grzegorz Nosek Date: Thu, 2 Apr 2009 16:57:23 -0700 Subject: cgroups: relax ns_can_attach checks to allow attaching to grandchild cgroups The ns_proxy cgroup allows moving processes to child cgroups only one level deep at a time. This commit relaxes this restriction and makes it possible to attach tasks directly to grandchild cgroups, e.g.: ($pid is in the root cgroup) echo $pid > /cgroup/CG1/CG2/tasks Previously this operation would fail with -EPERM and would have to be performed as two steps: echo $pid > /cgroup/CG1/tasks echo $pid > /cgroup/CG1/CG2/tasks Also, the target cgroup no longer needs to be empty to move a task there. Signed-off-by: Grzegorz Nosek Acked-by: Serge Hallyn Reviewed-by: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bb8feb9feccd..788c4964c142 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -348,8 +348,8 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); int cgroup_task_count(const struct cgroup *cgrp); -/* Return true if the cgroup is a descendant of the current cgroup */ -int cgroup_is_descendant(const struct cgroup *cgrp); +/* Return true if cgrp is a descendant of the task's cgroup */ +int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); /* Control Group subsystem type. See Documentation/cgroups.txt for details */ -- cgit v1.2.3 From 38460b48d06440de46b34cb778bd6c4855030754 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:25 -0700 Subject: cgroup: CSS ID support Patch for Per-CSS(Cgroup Subsys State) ID and private hierarchy code. This patch attaches unique ID to each css and provides following. - css_lookup(subsys, id) returns pointer to struct cgroup_subysys_state of id. - css_get_next(subsys, id, rootid, depth, foundid) returns the next css under "root" by scanning When cgroup_subsys->use_id is set, an id for css is maintained. The cgroup framework only parepares - css_id of root css for subsys - id is automatically attached at creation of css. - id is *not* freed automatically. Because the cgroup framework don't know lifetime of cgroup_subsys_state. free_css_id() function is provided. This must be called by subsys. There are several reasons to develop this. - Saving space .... For example, memcg's swap_cgroup is array of pointers to cgroup. But it is not necessary to be very fast. By replacing pointers(8bytes per ent) to ID (2byes per ent), we can reduce much amount of memory usage. - Scanning without lock. CSS_ID provides "scan id under this ROOT" function. By this, scanning css under root can be written without locks. ex) do { rcu_read_lock(); next = cgroup_get_next(subsys, id, root, &found); /* check sanity of next here */ css_tryget(); rcu_read_unlock(); id = found + 1 } while(...) Characteristics: - Each css has unique ID under subsys. - Lifetime of ID is controlled by subsys. - css ID contains "ID" and "Depth in hierarchy" and stack of hierarchy - Allowed ID is 1-65535, ID 0 is UNUSED ID. Design Choices: - scan-by-ID v.s. scan-by-tree-walk. As /proc's pid scan does, scan-by-ID is robust when scanning is done by following kind of routine. scan -> rest a while(release a lock) -> conitunue from interrupted memcg's hierarchical reclaim does this. - When subsys->use_id is set, # of css in the system is limited to 65535. [bharata@linux.vnet.ibm.com: remove rcu_read_lock() from css_get_next()] Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Bharata B Rao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 788c4964c142..9a23bb098205 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -22,6 +23,7 @@ struct cgroupfs_root; struct cgroup_subsys; struct inode; struct cgroup; +struct css_id; extern int cgroup_init_early(void); extern int cgroup_init(void); @@ -63,6 +65,8 @@ struct cgroup_subsys_state { atomic_t refcnt; unsigned long flags; + /* ID for this css, if possible */ + struct css_id *id; }; /* bits in struct cgroup_subsys_state flags field */ @@ -373,6 +377,11 @@ struct cgroup_subsys { int active; int disabled; int early_init; + /* + * True if this subsys uses ID. ID is not available before cgroup_init() + * (not available in early_init time.) + */ + bool use_id; #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; @@ -395,6 +404,9 @@ struct cgroup_subsys { */ struct cgroupfs_root *root; struct list_head sibling; + /* used when use_id == true */ + struct idr idr; + spinlock_t id_lock; }; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; @@ -450,6 +462,44 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task(struct cgroup *, struct task_struct *); +/* + * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works + * if cgroup_subsys.use_id == true. It can be used for looking up and scanning. + * CSS ID is assigned at cgroup allocation (create) automatically + * and removed when subsys calls free_css_id() function. This is because + * the lifetime of cgroup_subsys_state is subsys's matter. + * + * Looking up and scanning function should be called under rcu_read_lock(). + * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls. + * But the css returned by this routine can be "not populated yet" or "being + * destroyed". The caller should check css and cgroup's status. + */ + +/* + * Typically Called at ->destroy(), or somewhere the subsys frees + * cgroup_subsys_state. + */ +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); + +/* Find a cgroup_subsys_state which has given ID */ + +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); + +/* + * Get a cgroup whose id is greater than or equal to id under tree of root. + * Returning a cgroup_subsys_state or NULL. + */ +struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, + struct cgroup_subsys_state *root, int *foundid); + +/* Returns true if root is ancestor of cg */ +bool css_is_ancestor(struct cgroup_subsys_state *cg, + struct cgroup_subsys_state *root); + +/* Get id and depth of css */ +unsigned short css_id(struct cgroup_subsys_state *css); +unsigned short css_depth(struct cgroup_subsys_state *css); + #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } -- cgit v1.2.3 From ec64f51545fffbc4cb968f0cea56341a4b07e85a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:26 -0700 Subject: cgroup: fix frequent -EBUSY at rmdir In following situation, with memory subsystem, /groupA use_hierarchy==1 /01 some tasks /02 some tasks /03 some tasks /04 empty When tasks under 01/02/03 hit limit on /groupA, hierarchical reclaim is triggered and the kernel walks tree under groupA. In this case, rmdir /groupA/04 fails with -EBUSY frequently because of temporal refcnt from the kernel. In general. cgroup can be rmdir'd if there are no children groups and no tasks. Frequent fails of rmdir() is not useful to users. (And the reason for -EBUSY is unknown to users.....in most cases) This patch tries to modify above behavior, by - retries if css_refcnt is got by someone. - add "return value" to pre_destroy() and allows subsystem to say "we're really busy!" Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9a23bb098205..7d824b80b3d7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -135,6 +135,10 @@ enum { CGRP_RELEASABLE, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, + /* + * A thread in rmdir() is wating for this cgroup. + */ + CGRP_WAIT_ON_RMDIR, }; struct cgroup { @@ -360,7 +364,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); struct cgroup_subsys { struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, struct cgroup *cgrp); - void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); + int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk); -- cgit v1.2.3 From 099fca3225b39f7a3ed853036038054172b55581 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:29 -0700 Subject: cgroups: show correct file mode We have some read-only files and write-only files, but currently they are all set to 0644, which is counter-intuitive and cause trouble for some cgroup tools like libcgroup. This patch adds 'mode' to struct cftype to allow cgroup subsys to set it's own files' file mode, and for the most cases cft->mode can be default to 0 and cgroup will figure out proper mode. Acked-by: Paul Menage Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7d824b80b3d7..b2816fba5306 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -258,6 +258,11 @@ struct cftype { */ char name[MAX_CFTYPE_NAME]; int private; + /* + * If not 0, file mode is set to this value, otherwise it will + * be figured out automatically + */ + mode_t mode; /* * If non-zero, defines the maximum length of string that can -- cgit v1.2.3 From 0b7f569e45bb6be142d87017030669a6a7d327a1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:38 -0700 Subject: memcg: fix OOM killer under memcg This patch tries to fix OOM Killer problems caused by hierarchy. Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to kill a task in memcg. But, when hierarchy is used, it's broken and correct task cannot be killed. For example, in following cgroup /groupA/ hierarchy=1, limit=1G, 01 nolimit 02 nolimit All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to groupA's 1Gbytes but OOM Killer just kills tasks in groupA. This patch provides makes the bad process be selected from all tasks under hierarchy. BTW, currently, oom_jiffies is updated against groupA in above case. oom_jiffies of tree should be updated. To see how oom_jiffies is used, please check mem_cgroup_oom_called() callers. [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: const fix] Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b2816fba5306..43763bd772b9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -503,7 +503,7 @@ struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, /* Returns true if root is ancestor of cg */ bool css_is_ancestor(struct cgroup_subsys_state *cg, - struct cgroup_subsys_state *root); + const struct cgroup_subsys_state *root); /* Get id and depth of css */ unsigned short css_id(struct cgroup_subsys_state *css); -- cgit v1.2.3 From bd1a8ab73edd449fecda633449cc277b856ad4f5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:50 -0700 Subject: cgroups: add 'data' field to struct cgroup_scanner We need to pass some data to test_task() or process_task() in some cases. Will be used later. Signed-off-by: Li Zefan Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 43763bd772b9..4316a546beb5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -337,6 +337,7 @@ struct cgroup_scanner { void (*process_task)(struct task_struct *p, struct cgroup_scanner *scan); struct ptr_heap *heap; + void *data; }; /* -- cgit v1.2.3