From f392e51cd6ae6f6ee5b9b6d611cdc282b4c1711e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: update cgroup->subsys_mask to ->child_subsys_mask and restore cgroup_root->subsys_mask 944196278d3d ("cgroup: move ->subsys_mask from cgroupfs_root to cgroup") moved ->subsys_mask from cgroup_root to cgroup to prepare for the unified hierarhcy; however, it turns out that carrying the subsys_mask of the children in the parent, instead of itself, is a lot more natural. This patch restores cgroup_root->subsys_mask and morphs cgroup->subsys_mask into cgroup->child_subsys_mask. * Uses of root->cgrp.subsys_mask are restored to root->subsys_mask. * Remove automatic setting and clearing of cgrp->subsys_mask and instead just inherit ->child_subsys_mask from the parent during cgroup creation. Note that this doesn't affect any current behaviors. * Undo __kill_css() separation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c2515851c1aa..1b5b2fe1b228 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -173,8 +173,8 @@ struct cgroup { */ u64 serial_nr; - /* The bitmask of subsystems attached to this cgroup */ - unsigned long subsys_mask; + /* the bitmask of subsystems enabled on the child cgroups */ + unsigned long child_subsys_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; @@ -282,6 +282,9 @@ enum { struct cgroup_root { struct kernfs_root *kf_root; + /* The bitmask of subsystems attached to this hierarchy */ + unsigned long subsys_mask; + /* Unique id for this hierarchy. */ int hierarchy_id; -- cgit v1.2.3 From 2d8f243a5e6efa57fb7c46fe83fafa45b33d0ec2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: implement cgroup->e_csets[] On the default unified hierarchy, a cgroup may be associated with csses of its ancestors, which means that a css of a given cgroup may be associated with css_sets of descendant cgroups. This means that we can't walk all tasks associated with a css by iterating the css_sets associated with the cgroup as there are css_sets which are pointing to the css but linked on the descendants. This patch adds per-subsystem list heads cgroup->e_csets[]. Any css_set which is pointing to a css is linked to css->cgroup->e_csets[$SUBSYS_ID] through css_set->e_cset_node[$SUBSYS_ID]. The lists are protected by css_set_rwsem and will allow us to walk all css_sets associated with a given css so that we can find out all associated tasks. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1b5b2fe1b228..33a0043ef454 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -187,6 +187,15 @@ struct cgroup { */ struct list_head cset_links; + /* + * On the default hierarchy, a css_set for a cgroup with some + * susbsys disabled will point to css's which are associated with + * the closest ancestor which has the subsys enabled. The + * following lists all css_sets which point to this cgroup's css + * for the given subsystem. + */ + struct list_head e_csets[CGROUP_SUBSYS_COUNT]; + /* * Linked list running through all cgroups that can * potentially be reaped by the release agent. Protected by @@ -369,6 +378,15 @@ struct css_set { struct cgroup *mg_src_cgrp; struct css_set *mg_dst_cset; + /* + * On the default hierarhcy, ->subsys[ssid] may point to a css + * attached to an ancestor instead of the cgroup this css_set is + * associated with. The following node is anchored at + * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to + * iterate through all css's attached to a given cgroup. + */ + struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; + /* For RCU-protected deletion */ struct rcu_head rcu_head; }; -- cgit v1.2.3 From 0f0a2b4fa6210147131082999f1f16d7fb79abf8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: reorganize css_task_iter This patch reorganizes css_task_iter so that adding effective css support is easier. * s/->cset_link/->cset_pos/ and s/->task/->task_pos/ for consistency * ->origin_css is used to determine whether the iteration reached the last css_set. Replace it with explicit ->cset_head so that css_advance_task_iter() doesn't have to know the termination condition directly. * css_task_iter_next() currently assumes that it's walking list of cgrp_cset_link and reaches into the current cset through the current link to determine the termination conditions for task walking. As this won't always be true for effective css walking, add ->tasks_head and ->mg_tasks_head and use them to control task walking so that css_task_iter_next() doesn't have to know how css_sets are being walked. This patch doesn't make any behavior changes. The iteration logic stays unchanged after the patch. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 33a0043ef454..bee390586120 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -842,9 +842,12 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, /* A css_task_iter should be treated as an opaque object */ struct css_task_iter { - struct cgroup_subsys_state *origin_css; - struct list_head *cset_link; - struct list_head *task; + struct list_head *cset_pos; + struct list_head *cset_head; + + struct list_head *task_pos; + struct list_head *tasks_head; + struct list_head *mg_tasks_head; }; void css_task_iter_start(struct cgroup_subsys_state *css, -- cgit v1.2.3 From 3ebb2b6ef38875b866ec0118bfae7bc52afd0166 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: teach css_task_iter about effective csses Currently, css_task_iter iterates tasks associated with a css by visiting each css_set associated with the owning cgroup and walking tasks of each of them. This works fine for !unified hierarchies as each cgroup has its own css for each associated subsystem on the hierarchy; however, on the planned unified hierarchy, a cgroup may not have csses associated and its tasks would be considered associated with the matching css of the nearest ancestor which has the subsystem enabled. This means that on the default unified hierarchy, just walking all tasks associated with a cgroup isn't enough to walk all tasks which are associated with the specified css. If any of its children doesn't have the matching css enabled, task iteration should also include all tasks from the subtree. We already added cgroup->e_csets[] to list all css_sets effectively associated with a given css and walk css_sets on that list instead to achieve such iteration. This patch updates css_task_iter iteration such that it walks css_sets on cgroup->e_csets[] instead of cgroup->cset_links if iteration is requested on an non-dummy css. Thanks to the previous iteration update, this change can be achieved with the addition of css_task_iter->ss and minimal updates to css_advance_task_iter() and css_task_iter_start(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bee390586120..18fcae39e63e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -842,6 +842,8 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, /* A css_task_iter should be treated as an opaque object */ struct css_task_iter { + struct cgroup_subsys *ss; + struct list_head *cset_pos; struct list_head *cset_head; -- cgit v1.2.3 From 6803c006282768ec850760766a6e4eb1a6ff87df Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: add css_set->dfl_cgrp To implement the unified hierarchy behavior, we'll need to be able to determine the associated cgroup on the default hierarchy from css_set. Let's add css_set->dfl_cgrp so that it can be accessed conveniently and efficiently. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 18fcae39e63e..c49d161a71cd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -354,6 +354,9 @@ struct css_set { */ struct list_head cgrp_links; + /* the default cgroup associated with this css_set */ + struct cgroup *dfl_cgrp; + /* * Set of subsystem states, one for each subsystem. This array is * immutable after creation apart from the init_css_set during -- cgit v1.2.3 From f8f22e53a262ebee37fc98004f16b066cf5bc125 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: implement dynamic subtree controller enable/disable on the default hierarchy cgroup is switching away from multiple hierarchies and will use one unified default hierarchy where controllers can be dynamically enabled and disabled per subtree. The default hierarchy will serve as the unified hierarchy to which all controllers are attached and a css on the default hierarchy would need to also serve the tasks of descendant cgroups which don't have the controller enabled - ie. the tree may be collapsed from leaf towards root when viewed from specific controllers. This has been implemented through effective css in the previous patches. This patch finally implements dynamic subtree controller enable/disable on the default hierarchy via a new knob - "cgroup.subtree_control" which controls which controllers are enabled on the child cgroups. Let's assume a hierarchy like the following. root - A - B - C \ D root's "cgroup.subtree_control" determines which controllers are enabled on A. A's on B. B's on C and D. This coincides with the fact that controllers on the immediate sub-level are used to distribute the resources of the parent. In fact, it's natural to assume that resource control knobs of a child belong to its parent. Enabling a controller in "cgroup.subtree_control" declares that distribution of the respective resources of the cgroup will be controlled. Note that this means that controller enable states are shared among siblings. The default hierarchy has an extra restriction - only cgroups which don't contain any task may have controllers enabled in "cgroup.subtree_control". Combined with the other properties of the default hierarchy, this guarantees that, from the view point of controllers, tasks are only on the leaf cgroups. In other words, only leaf csses may contain tasks. This rules out situations where child cgroups compete against internal tasks of the parent, which is a competition between two different types of entities without any clear way to determine resource distribution between the two. Different controllers handle it differently and all the implemented behaviors are ambiguous, ad-hoc, cumbersome and/or just wrong. Having this structural constraints imposed from cgroup core removes the burden from controller implementations and enables showing one consistent behavior across all controllers. When a controller is enabled or disabled, css associations for the controller in the subtrees of each child should be updated. After enabling, the whole subtree of a child should point to the new css of the child. After disabling, the whole subtree of a child should point to the cgroup's css. This is implemented by first updating cgroup states such that cgroup_e_css() result points to the appropriate css and then invoking cgroup_update_dfl_csses() which migrates all tasks in the affected subtrees to the self cgroup on the default hierarchy. * When read, "cgroup.subtree_control" lists all the currently enabled controllers on the children of the cgroup. * White-space separated list of controller names prefixed with either '+' or '-' can be written to "cgroup.subtree_control". The ones prefixed with '+' are enabled on the controller and '-' disabled. * A controller can be enabled iff the parent's "cgroup.subtree_control" enables it and disabled iff no child's "cgroup.subtree_control" has it enabled. * If a cgroup has tasks, no controller can be enabled via "cgroup.subtree_control". Likewise, if "cgroup.subtree_control" has some controllers enabled, tasks can't be migrated into the cgroup. * All controllers which aren't bound on other hierarchies are automatically associated with the root cgroup of the default hierarchy. All the controllers which are bound to the default hierarchy are listed in the read-only file "cgroup.controllers" in the root directory. * "cgroup.controllers" in all non-root cgroups is read-only file whose content is equal to that of "cgroup.subtree_control" of the parent. This indicates which controllers can be used in the cgroup's "cgroup.subtree_control". This is still experimental and there are some holes, one of which is that ->can_attach() failure during cgroup_update_dfl_csses() may leave the cgroups in an undefined state. The issues will be addressed by future patches. v2: Non-root cgroups now also have "cgroup.controllers". Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c49d161a71cd..ada239253ec7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -21,6 +21,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -164,6 +165,7 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct kernfs_node *kn; /* cgroup kernfs entry */ + struct kernfs_node *control_kn; /* kn for "cgroup.subtree_control" */ /* * Monotonically increasing unique serial number which defines a @@ -216,6 +218,9 @@ struct cgroup { /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; + + /* used to wait for offlining of csses */ + wait_queue_head_t offline_waitq; }; #define MAX_CGROUP_ROOT_NAMELEN 64 -- cgit v1.2.3 From 842b597ee0a7e1aa5a3148164ffdba00ec17f614 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 25 Apr 2014 18:28:02 -0400 Subject: cgroup: implement cgroup.populated for the default hierarchy cgroup users often need a way to determine when a cgroup's subhierarchy becomes empty so that it can be cleaned up. cgroup currently provides release_agent for it; unfortunately, this mechanism is riddled with issues. * It delivers events by forking and execing a userland binary specified as the release_agent. This is a long deprecated method of notification delivery. It's extremely heavy, slow and cumbersome to integrate with larger infrastructure. * There is single monitoring point at the root. There's no way to delegate management of a subtree. * The event isn't recursive. It triggers when a cgroup doesn't have any tasks or child cgroups. Events for internal nodes trigger only after all children are removed. This again makes it impossible to delegate management of a subtree. * Events are filtered from the kernel side. "notify_on_release" file is used to subscribe to or suppress release event. This is unnecessarily complicated and probably done this way because event delivery itself was expensive. This patch implements interface file "cgroup.populated" which can be used to monitor whether the cgroup's subhierarchy has tasks in it or not. Its value is 0 if there is no task in the cgroup and its descendants; otherwise, 1, and kernfs_notify() notificaiton is triggers when the value changes, which can be monitored through poll and [di]notify. This is a lot ligther and simpler and trivially allows delegating management of subhierarchy - subhierarchy monitoring can block further propgation simply by putting itself or another process in the root of the subhierarchy and monitor events that it's interested in from there without interfering with monitoring higher in the tree. v2: Patch description updated as per Serge. v3: "cgroup.subtree_populated" renamed to "cgroup.populated". The subtree_ prefix was a bit confusing because "cgroup.subtree_control" uses it to denote the tree rooted at the cgroup sans the cgroup itself while the populated state includes the cgroup itself. Signed-off-by: Tejun Heo Acked-by: Serge Hallyn Acked-by: Li Zefan Cc: Lennart Poettering --- include/linux/cgroup.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ada239253ec7..4b38e2d6110d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -154,6 +154,14 @@ struct cgroup { /* the number of attached css's */ int nr_css; + /* + * If this cgroup contains any tasks, it contributes one to + * populated_cnt. All children with non-zero popuplated_cnt of + * their own contribute one. The count is zero iff there's no task + * in this cgroup or its subtree. + */ + int populated_cnt; + atomic_t refcnt; /* @@ -166,6 +174,7 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *control_kn; /* kn for "cgroup.subtree_control" */ + struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ /* * Monotonically increasing unique serial number which defines a @@ -264,6 +273,12 @@ enum { * * - "cgroup.clone_children" is removed. * + * - "cgroup.subtree_populated" is available. Its value is 0 if + * the cgroup and its descendants contain no task; otherwise, 1. + * The file also generates kernfs notification which can be + * monitored through poll and [di]notify when the value of the + * file changes. + * * - If mount is requested with sane_behavior but without any * subsystem, the default unified hierarchy is mounted. * -- cgit v1.2.3 From 69dfa00ccb72a37f3810687ca110e5a8154c6eed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup: make flags and subsys_masks unsigned int There's no reason to use atomic bitops for cgroup_subsys_state->flags, cgroup_root->flags and various subsys_masks. This patch updates those to use bitwise and/or operations instead and converts them form unsigned long to unsigned int. This makes the fields occupy (marginally) smaller space and makes it clear that they don't require atomicity. This patch doesn't cause any behavior difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4b38e2d6110d..c6c703f2486b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -62,7 +62,7 @@ struct cgroup_subsys_state { /* the parent css */ struct cgroup_subsys_state *parent; - unsigned long flags; + unsigned int flags; /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; @@ -185,7 +185,7 @@ struct cgroup { u64 serial_nr; /* the bitmask of subsystems enabled on the child cgroups */ - unsigned long child_subsys_mask; + unsigned int child_subsys_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; @@ -312,7 +312,7 @@ struct cgroup_root { struct kernfs_root *kf_root; /* The bitmask of subsystems attached to this hierarchy */ - unsigned long subsys_mask; + unsigned int subsys_mask; /* Unique id for this hierarchy. */ int hierarchy_id; @@ -327,7 +327,7 @@ struct cgroup_root { struct list_head root_list; /* Hierarchy-specific flags */ - unsigned long flags; + unsigned int flags; /* IDs for cgroups in this hierarchy */ struct idr cgroup_idr; -- cgit v1.2.3 From 7d699ddb2b181a2c76e5ea18b1bdf102c4bebe4b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:13 -0400 Subject: cgroup, memcg: allocate cgroup ID from 1 Currently, cgroup->id is allocated from 0, which is always assigned to the root cgroup; unfortunately, memcg wants to use ID 0 to indicate invalid IDs and ends up incrementing all IDs by one. It's reasonable to reserve 0 for special purposes. This patch updates cgroup core so that ID 0 is not used and the root cgroups get ID 1. The ID incrementing is removed form memcg. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Acked-by: Li Zefan --- include/linux/cgroup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c6c703f2486b..793f70a48820 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -144,8 +144,8 @@ struct cgroup { /* * idr allocated in-hierarchy ID. * - * The ID of the root cgroup is always 0, and a new cgroup - * will be assigned with a smallest available ID. + * ID 0 is not used, the ID of the root cgroup is always 1, and a + * new cgroup will be assigned with a smallest available ID. * * Allocating/Removing ID must be protected by cgroup_mutex. */ -- cgit v1.2.3 From 15a4c835e4ed3e60dd68727cd1907e3dd89563f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 4 May 2014 15:09:14 -0400 Subject: cgroup, memcg: implement css->id and convert css_from_id() to use it Until now, cgroup->id has been used to identify all the associated csses and css_from_id() takes cgroup ID and returns the matching css by looking up the cgroup and then dereferencing the css associated with it; however, now that the lifetimes of cgroup and css are separate, this is incorrect and breaks on the unified hierarchy when a controller is disabled and enabled back again before the previous instance is released. This patch adds css->id which is a subsystem-unique ID and converts css_from_id() to look up by the new css->id instead. memcg is the only user of css_from_id() and also converted to use css->id instead. For traditional hierarchies, this shouldn't make any functional difference. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jianyu Zhan Acked-by: Li Zefan --- include/linux/cgroup.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 793f70a48820..2dfabb3b749a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -62,6 +62,12 @@ struct cgroup_subsys_state { /* the parent css */ struct cgroup_subsys_state *parent; + /* + * Subsys-unique ID. 0 is unused and root is always 1. The + * matching css can be looked up using css_from_id(). + */ + int id; + unsigned int flags; /* percpu_ref killing and RCU release */ @@ -655,6 +661,9 @@ struct cgroup_subsys { /* link to parent, protected by cgroup_lock() */ struct cgroup_root *root; + /* idr for css->id */ + struct idr css_idr; + /* * List of cftypes. Each entry is the first entry of an array * terminated by zero length name. -- cgit v1.2.3 From 2b53f41fa8604845f4f7c538723694a453088b15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 7 May 2014 09:21:56 -0400 Subject: cgroup: remove unused CGRP_SANE_BEHAVIOR This cgroup flag has never been used. Only CGRP_ROOT_SANE_BEHAVIOR is used. Remove it. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2dfabb3b749a..f482f95c2c72 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -140,8 +140,6 @@ enum { * specified at mount time and thus is implemented here. */ CGRP_CPUSET_CLONE_CHILDREN, - /* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */ - CGRP_SANE_BEHAVIOR, }; struct cgroup { -- cgit v1.2.3 From 2070d50e1cbe3d7f157cbf8e63279c893f375d7f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 May 2014 15:11:53 -0400 Subject: percpu-refcount: rename percpu_ref_tryget() to percpu_ref_tryget_live() percpu_ref_tryget() is different from the usual tryget semantics in that it fails if the refcnt is in its dying stage even if the refcnt hasn't reached zero yet. We're about to introduce the more conventional tryget and the current one has only one user. Let's rename it to percpu_ref_tryget_live() so that it explicitly signifies the peculiarities of its semantics. This is pure rename. Signed-off-by: Tejun Heo Acked-by: Kent Overstreet --- include/linux/cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c2515851c1aa..549aed8de32b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -101,7 +101,7 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) { if (css->flags & CSS_ROOT) return true; - return percpu_ref_tryget(&css->refcnt); + return percpu_ref_tryget_live(&css->refcnt); } /** -- cgit v1.2.3 From ec903c0c858e4963a9e0724bdcadfa837253341c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:11:01 -0400 Subject: cgroup: rename css_tryget*() to css_tryget_online*() Unlike the more usual refcnting, what css_tryget() provides is the distinction between online and offline csses instead of protection against upping a refcnt which already reached zero. cgroup is planning to provide actual tryget which fails if the refcnt already reached zero. Let's rename the existing trygets so that they clearly indicate that they're onliness. I thought about keeping the existing names as-are and introducing new names for the planned actual tryget; however, given that each controller participates in the synchronization of the online state, it seems worthwhile to make it explicit that these functions are about on/offline state. Rename css_tryget() to css_tryget_online() and css_tryget_from_dir() to css_tryget_online_from_dir(). This is pure rename. v2: cgroup_freezer grew new usages of css_tryget(). Update accordingly. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Vivek Goyal Cc: Jens Axboe Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- include/linux/cgroup.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bde44618d8c2..c5f3684ef557 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -95,16 +95,16 @@ static inline void css_get(struct cgroup_subsys_state *css) } /** - * css_tryget - try to obtain a reference on the specified css + * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css * - * Obtain a reference on @css if it's alive. The caller naturally needs to - * ensure that @css is accessible but doesn't have to be holding a + * Obtain a reference on @css if it's online. The caller naturally needs + * to ensure that @css is accessible but doesn't have to be holding a * reference on it - IOW, RCU protected access is good enough for this * function. Returns %true if a reference count was successfully obtained; * %false otherwise. */ -static inline bool css_tryget(struct cgroup_subsys_state *css) +static inline bool css_tryget_online(struct cgroup_subsys_state *css) { if (css->flags & CSS_ROOT) return true; @@ -115,7 +115,7 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) * css_put - put a css reference * @css: target css * - * Put a reference obtained via css_get() and css_tryget(). + * Put a reference obtained via css_get() and css_tryget_online(). */ static inline void css_put(struct cgroup_subsys_state *css) { @@ -905,8 +905,8 @@ void css_task_iter_end(struct css_task_iter *it); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); -struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss); +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss); #else /* !CONFIG_CGROUPS */ -- cgit v1.2.3 From b41686401e501430ffe93b575ef7959d2ecc6f2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: implement cftype->write() During the recent conversion to kernfs, cftype's seq_file operations are updated so that they are directly mapped to kernfs operations and thus can fully access the associated kernfs and cgroup contexts; however, write path hasn't seen similar updates and none of the existing write operations has access to, for example, the associated kernfs_open_file. Let's introduce a new operation cftype->write() which maps directly to the kernfs write operation and has access to all the arguments and contexts. This will replace ->write_string() and ->trigger() and ease manipulation of kernfs active protection from cgroup file operations. Two accessors - of_cft() and of_css() - are introduced to enable accessing the associated cgroup context from cftype->write() which only takes kernfs_open_file for the context information. The accessors for seq_file operations - seq_cft() and seq_css() - are rewritten to wrap the of_ accessors. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c5f3684ef557..c5a170ca4a48 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -515,6 +515,15 @@ struct cftype { */ int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. Use + * of_css/cft() to access the associated css and cft. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif @@ -552,14 +561,24 @@ static inline ino_t cgroup_ino(struct cgroup *cgrp) return 0; } -static inline struct cftype *seq_cft(struct seq_file *seq) +/* cft/css accessors for cftype->write() operation */ +static inline struct cftype *of_cft(struct kernfs_open_file *of) { - struct kernfs_open_file *of = seq->private; - return of->kn->priv; } -struct cgroup_subsys_state *seq_css(struct seq_file *seq); +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); + +/* cft/css accessors for cftype->seq_*() operations */ +static inline struct cftype *seq_cft(struct seq_file *seq) +{ + return of_cft(seq->private); +} + +static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + return of_css(seq->private); +} /* * Name / path handling functions. All are thin wrappers around the kernfs -- cgit v1.2.3 From 451af504df0c62f695a69b83c250486e77c66378 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: replace cftype->write_string() with cftype->write() Convert all cftype->write_string() users to the new cftype->write() which maps directly to kernfs write operation and has full access to kernfs and cgroup contexts. The conversions are mostly mechanical. * @css and @cft are accessed using of_css() and of_cft() accessors respectively instead of being specified as arguments. * Should return @nbytes on success instead of 0. * @buf is not trimmed automatically. Trim if necessary. Note that blkcg and netprio don't need this as the parsers already handle whitespaces. cftype->write_string() has no user left after the conversions and removed. While at it, remove unnecessary local variable @p in cgroup_subtree_control_write() and stale comment about CGROUP_LOCAL_BUFFER_SIZE in cgroup_freezer.c. This patch doesn't introduce any visible behavior changes. v2: netprio was missing from conversion. Converted. Signed-off-by: Tejun Heo Acked-by: Aristeu Rozanski Acked-by: Vivek Goyal Acked-by: Li Zefan Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Neil Horman Cc: "David S. Miller" --- include/linux/cgroup.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c5a170ca4a48..aecdc84fe128 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -453,8 +453,7 @@ struct cftype { /* * The maximum length of string, excluding trailing nul, that can - * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is - * assumed. + * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. */ size_t max_write_len; @@ -500,13 +499,6 @@ struct cftype { int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, s64 val); - /* - * write_string() is passed a nul-terminated kernelspace - * buffer of maximum length determined by max_write_len. - * Returns 0 or -ve error code. - */ - int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer); /* * trigger() callback can be used to get some kick from the * userspace, when the actual string written is not important -- cgit v1.2.3 From 6770c64e5c8da4705d1f0973bdeb5c2bf4f3a404 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:21 -0400 Subject: cgroup: replace cftype->trigger() with cftype->write() cftype->trigger() is pointless. It's trivial to ignore the input buffer from a regular ->write() operation. Convert all ->trigger() users to ->write() and remove ->trigger(). This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner Cc: Michal Hocko --- include/linux/cgroup.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index aecdc84fe128..08eb71ee600b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -499,14 +499,6 @@ struct cftype { int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, s64 val); - /* - * trigger() callback can be used to get some kick from the - * userspace, when the actual string written is not important - * at all. The private field can be used to determine the - * kick type for multiplexing. - */ - int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); - /* * write() is the generic write callback which maps directly to * kernfs write operation and overrides all other operations. -- cgit v1.2.3 From b7fc5ad235936379fae67a9f7b50bb53487a1a3a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 May 2014 12:16:22 -0400 Subject: cgroup: remove cgroup->control_kn Now that cgroup_subtree_control_write() has access to the associated kernfs_open_file and thus the kernfs_node, there's no need to cache it in cgroup->control_kn on creation. Remove cgroup->control_kn and use @of->kn directly. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 08eb71ee600b..aa7353deaaf3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -177,7 +177,6 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct kernfs_node *kn; /* cgroup kernfs entry */ - struct kernfs_node *control_kn; /* kn for "cgroup.subtree_control" */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ /* -- cgit v1.2.3 From 9d800df12d31734a6853915e9d2deb5d6747985f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:00 -0400 Subject: cgroup: rename cgroup->dummy_css to ->self and move it to the top cgroup->dummy_css is used as the placeholder css when performing css oriended operations on the cgroup. We're gonna shift more cgroup management to this css. Let's rename it to ->self and move it to the top. This is pure rename and field relocation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index aa7353deaaf3..164851e388e7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -143,6 +143,9 @@ enum { }; struct cgroup { + /* self css with NULL ->ss, points back to this cgroup */ + struct cgroup_subsys_state self; + unsigned long flags; /* "unsigned long" so bitops work */ /* @@ -224,9 +227,6 @@ struct cgroup { struct list_head pidlists; struct mutex pidlist_mutex; - /* dummy css with NULL ->ss, points back to this cgroup */ - struct cgroup_subsys_state dummy_css; - /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; -- cgit v1.2.3 From 249f3468a282dcbad53484c821bebb447f14ee03 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:01 -0400 Subject: cgroup: remove cgroup_destory_css_killed() cgroup_destroy_css_killed() is cgroup destruction stage which happens after all csses are offlined. After the recent updates, it no longer does anything other than putting the base reference. This patch removes the function and makes cgroup_destroy_locked() put the base ref at the end isntead. This also makes cgroup->nr_css unnecessary. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 164851e388e7..160fcc69149e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -158,9 +158,6 @@ struct cgroup { */ int id; - /* the number of attached css's */ - int nr_css; - /* * If this cgroup contains any tasks, it contributes one to * populated_cnt. All children with non-zero popuplated_cnt of -- cgit v1.2.3 From 9395a4500404e05173eda9a2d198b6fa500e90c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: enable refcnting for root csses Currently, css_get(), css_tryget() and css_tryget_online() are noops for root csses as an optimization; however, we're planning to use css refcnts to track of cgroup lifetime too and root cgroups also need to be reference counted. Since css has been converted to percpu_refcnt, the overhead of refcnting is miniscule and this optimization isn't too meaningful anymore. Furthermore, controllers which optimize the root cgroup often never even invoke these functions in their hot paths. This patch enables refcnting for root csses too. This makes CSS_ROOT flag unused and removes it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 160fcc69149e..286e39e4e9bf 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -77,7 +77,6 @@ struct cgroup_subsys_state { /* bits in struct cgroup_subsys_state flags field */ enum { - CSS_ROOT = (1 << 0), /* this CSS is the root of the subsystem */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ }; @@ -89,9 +88,7 @@ enum { */ static inline void css_get(struct cgroup_subsys_state *css) { - /* We don't need to reference count the root state */ - if (!(css->flags & CSS_ROOT)) - percpu_ref_get(&css->refcnt); + percpu_ref_get(&css->refcnt); } /** @@ -106,8 +103,6 @@ static inline void css_get(struct cgroup_subsys_state *css) */ static inline bool css_tryget_online(struct cgroup_subsys_state *css) { - if (css->flags & CSS_ROOT) - return true; return percpu_ref_tryget_live(&css->refcnt); } @@ -119,8 +114,7 @@ static inline bool css_tryget_online(struct cgroup_subsys_state *css) */ static inline void css_put(struct cgroup_subsys_state *css) { - if (!(css->flags & CSS_ROOT)) - percpu_ref_put(&css->refcnt); + percpu_ref_put(&css->refcnt); } /* bits in struct cgroup flags field */ -- cgit v1.2.3 From 9d755d33f0db8c9b49438f71b38a56e375b34360 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 09:15:02 -0400 Subject: cgroup: use cgroup->self.refcnt for cgroup refcnting Currently cgroup implements refcnting separately using atomic_t cgroup->refcnt. The destruction paths of cgroup and css are rather complex and bear a lot of similiarities including the use of RCU and bouncing to a work item. This patch makes cgroup use the refcnt of self css for refcnting instead of using its own. This makes cgroup refcnting use css's percpu refcnt and share the destruction mechanism. * css_release_work_fn() and css_free_work_fn() are updated to handle both csses and cgroups. This is a bit messy but should do until we can make cgroup->self a full css, which currently can't be done thanks to multiple hierarchies. * cgroup_destroy_locked() now performs percpu_ref_kill(&cgrp->self.refcnt) instead of cgroup_put(cgrp). * Negative refcnt sanity check in cgroup_get() is no longer necessary as percpu_ref already handles it. * Similarly, as a cgroup which hasn't been killed will never be released regardless of its refcnt value and percpu_ref has sanity check on kill, cgroup_is_dead() sanity check in cgroup_put() is no longer necessary. * As whether a refcnt reached zero or not can only be decided after the reference count is killed, cgroup_root->cgrp's refcnting can no longer be used to decide whether to kill the root or not. Let's make cgroup_kill_sb() explicitly initiate destruction if the root doesn't have any children. This makes sense anyway as unmounted cgroup hierarchy without any children should be destroyed. While this is a bit messy, this will allow pushing more bookkeeping towards cgroup->self and thus handling cgroups and csses in more uniform way. In the very long term, it should be possible to introduce a base subsystem and convert the self css to a proper one making things whole lot simpler and unified. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 286e39e4e9bf..76dadd77a120 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -160,8 +160,6 @@ struct cgroup { */ int populated_cnt; - atomic_t refcnt; - /* * We link our 'sibling' struct into our parent's 'children'. * Our children link their 'sibling' into our 'children'. @@ -218,10 +216,6 @@ struct cgroup { struct list_head pidlists; struct mutex pidlist_mutex; - /* For css percpu_ref killing and RCU-protected deletion */ - struct rcu_head rcu_head; - struct work_struct destroy_work; - /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; }; -- cgit v1.2.3 From 3b514d24e200fcdcde0a57c354a51d3677a86743 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:47 -0400 Subject: cgroup: skip refcnting on normal root csses and cgrp_dfl_root self css 9395a4500404 ("cgroup: enable refcnting for root csses") enabled reference counting for root csses (cgroup_subsys_states) so that cgroup's self csses can be used to manage the lifetime of the containing cgroups. Unfortunately, this change was incorrect. During early init, cgrp_dfl_root self css refcnt is used. percpu_ref can't initialized during early init and its initialization is deferred till cgroup_init() time. This means that cpu was using percpu_ref which wasn't properly initialized. Due to the way percpu variables are laid out on x86, this didn't blow up immediately on x86 but ended up incrementing and decrementing the percpu variable at offset zero, whatever it may be; however, on other archs, this caused fault and early boot failure. As cgroup self csses for root cgroups of non-dfl hierarchies need working refcounting, we can't revert 9395a4500404. This patch adds CSS_NO_REF which explicitly inhibits reference counting on the css and sets it on all normal (non-self) csses and cgroup_dfl_root self css. v2: cgrp_dfl_root.self is the offending one. Set the flag on it. Signed-off-by: Tejun Heo Reported-by: Stephen Warren Tested-by: Stephen Warren Fixes: 9395a4500404 ("cgroup: enable refcnting for root csses") --- include/linux/cgroup.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 76dadd77a120..1737db0c63fe 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -77,6 +77,7 @@ struct cgroup_subsys_state { /* bits in struct cgroup_subsys_state flags field */ enum { + CSS_NO_REF = (1 << 0), /* no reference counting for this css */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ }; @@ -88,7 +89,8 @@ enum { */ static inline void css_get(struct cgroup_subsys_state *css) { - percpu_ref_get(&css->refcnt); + if (!(css->flags & CSS_NO_REF)) + percpu_ref_get(&css->refcnt); } /** @@ -103,7 +105,9 @@ static inline void css_get(struct cgroup_subsys_state *css) */ static inline bool css_tryget_online(struct cgroup_subsys_state *css) { - return percpu_ref_tryget_live(&css->refcnt); + if (!(css->flags & CSS_NO_REF)) + return percpu_ref_tryget_live(&css->refcnt); + return true; } /** @@ -114,7 +118,8 @@ static inline bool css_tryget_online(struct cgroup_subsys_state *css) */ static inline void css_put(struct cgroup_subsys_state *css) { - percpu_ref_put(&css->refcnt); + if (!(css->flags & CSS_NO_REF)) + percpu_ref_put(&css->refcnt); } /* bits in struct cgroup flags field */ -- cgit v1.2.3 From 5c9d535b893f30266ea29fe377cb9b002fcd76aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: remove css_parent() cgroup in general is moving towards using cgroup_subsys_state as the fundamental structural component and css_parent() was introduced to convert from using cgroup->parent to css->parent. It was quite some time ago and we're moving forward with making css more prominent. This patch drops the trivial wrapper css_parent() and let the users dereference css->parent. While at it, explicitly mark fields of css which are public and immutable. v2: New usage from device_cgroup.c converted. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Neil Horman Acked-by: "David S. Miller" Acked-by: Li Zefan Cc: Vivek Goyal Cc: Jens Axboe Cc: Peter Zijlstra Cc: Johannes Weiner --- include/linux/cgroup.h | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1737db0c63fe..2549493d518d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -48,22 +48,28 @@ enum cgroup_subsys_id { }; #undef SUBSYS -/* Per-subsystem/per-cgroup state maintained by the system. */ +/* + * Per-subsystem/per-cgroup state maintained by the system. This is the + * fundamental structural building block that controllers deal with. + * + * Fields marked with "PI:" are public and immutable and may be accessed + * directly without synchronization. + */ struct cgroup_subsys_state { - /* the cgroup that this css is attached to */ + /* PI: the cgroup that this css is attached to */ struct cgroup *cgroup; - /* the cgroup subsystem that this css is attached to */ + /* PI: the cgroup subsystem that this css is attached to */ struct cgroup_subsys *ss; /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; - /* the parent css */ + /* PI: the parent css */ struct cgroup_subsys_state *parent; /* - * Subsys-unique ID. 0 is unused and root is always 1. The + * PI: Subsys-unique ID. 0 is unused and root is always 1. The * matching css can be looked up using css_from_id(). */ int id; @@ -669,19 +675,6 @@ struct cgroup_subsys { #include #undef SUBSYS -/** - * css_parent - find the parent css - * @css: the target cgroup_subsys_state - * - * Return the parent css of @css. This function is guaranteed to return - * non-NULL parent as long as @css isn't the root. - */ -static inline -struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) -{ - return css->parent; -} - /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for -- cgit v1.2.3 From d51f39b05ce0008118c45945e681b20484990571 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: remove cgroup->parent cgroup->parent is redundant as cgroup->self.parent can also be used to determine the parent cgroup and we're moving towards using cgroup_subsys_states as the fundamental structural blocks. This patch introduces cgroup_parent() which follows cgroup->self.parent and removes cgroup->parent. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2549493d518d..fd538f4c2bb6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -178,7 +178,6 @@ struct cgroup { struct list_head sibling; /* my parent's children */ struct list_head children; /* my children */ - struct cgroup *parent; /* my parent */ struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ -- cgit v1.2.3 From d5c419b68e368fdd9f1857bf8d4bb4480edb9b80 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:48 -0400 Subject: cgroup: move cgroup->sibling and ->children into cgroup_subsys_state We're moving towards using cgroup_subsys_states as the fundamental structural blocks. Let's move cgroup->sibling and ->children into cgroup_subsys_state. This is pure move without functional change and only cgroup->self's fields are actually used. Other csses will make use of the fields later. While at it, update init_and_link_css() so that it zeroes the whole css before initializing it and remove explicit zeroing of ->flags. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index fd538f4c2bb6..cf8ba26b7c6e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -68,6 +68,10 @@ struct cgroup_subsys_state { /* PI: the parent css */ struct cgroup_subsys_state *parent; + /* siblings list anchored at the parent's ->children */ + struct list_head sibling; + struct list_head children; + /* * PI: Subsys-unique ID. 0 is unused and root is always 1. The * matching css can be looked up using css_from_id(). @@ -171,13 +175,6 @@ struct cgroup { */ int populated_cnt; - /* - * We link our 'sibling' struct into our parent's 'children'. - * Our children link their 'sibling' into our 'children'. - */ - struct list_head sibling; /* my parent's children */ - struct list_head children; /* my children */ - struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ -- cgit v1.2.3 From 0cb51d71c1fa9234afe4213089844be76ec1765a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: move cgroup->serial_nr into cgroup_subsys_state We're moving towards using cgroup_subsys_states as the fundamental structural blocks. All csses including the cgroup->self and actual ones now form trees through css->children and ->sibling which follow the same rules as what cgroup->children and ->sibling followed. This patch moves cgroup->serial_nr which is used to implement css iteration into css. Note that all csses, regardless of their types, allocate their serial numbers from the same monotonically increasing counter. This doesn't affect the ordering needed by css iteration or cause any other material behavior changes. This will be used to update css iteration. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cf8ba26b7c6e..ebe7ce49f4b7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -80,6 +80,14 @@ struct cgroup_subsys_state { unsigned int flags; + /* + * Monotonically increasing unique serial number which defines a + * uniform order among all csses. It's guaranteed that all + * ->children lists are in the ascending order of ->serial_nr and + * used to allow interrupting and resuming iterations. + */ + u64 serial_nr; + /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; @@ -178,14 +186,6 @@ struct cgroup { struct kernfs_node *kn; /* cgroup kernfs entry */ struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ - /* - * Monotonically increasing unique serial number which defines a - * uniform order among all cgroups. It's guaranteed that all - * ->children lists are in the ascending order of ->serial_nr. - * It's used to allow interrupting and resuming iterations. - */ - u64 serial_nr; - /* the bitmask of subsystems enabled on the child cgroups */ unsigned int child_subsys_mask; -- cgit v1.2.3 From de3f034182ecbf0efbcef7ab8b253c6c3049a592 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:49 -0400 Subject: cgroup: introduce CSS_RELEASED and reduce css iteration fallback window css iterations allow the caller to drop RCU read lock. As long as the caller keeps the current position accessible, it can simply re-grab RCU read lock later and continue iteration. This is achieved by using CGRP_DEAD to detect whether the current positions next pointer is safe to dereference and if not re-iterate from the beginning to the next position using ->serial_nr. CGRP_DEAD is used as the marker to invalidate the next pointer and the only requirement is that the marker is set before the next sibling starts its RCU grace period. Because CGRP_DEAD is set at the end of cgroup_destroy_locked() but the cgroup is unlinked when the reference count reaches zero, we currently have a rather large window where this fallback re-iteration logic can be triggered. This patch introduces CSS_RELEASED which is set when a css is unlinked from its sibling list. This still keeps the re-iteration logic working while drastically reducing the window of its activation. While at it, rewrite the comment in css_next_child() to reflect the new flag and better explain the synchronization. This will also enable iterating csses directly instead of through cgroups. v2: CSS_RELEASED now assigned to 1 << 2 as 1 << 0 is used by CSS_NO_REF. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ebe7ce49f4b7..5375582ea5f6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -97,6 +97,7 @@ struct cgroup_subsys_state { enum { CSS_NO_REF = (1 << 0), /* no reference counting for this css */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ + CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ }; /** -- cgit v1.2.3 From c2931b70a32c705b9bd5762f5044f9eac8a52bb3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:51 -0400 Subject: cgroup: iterate cgroup_subsys_states directly Currently, css_next_child() is implemented as finding the next child cgroup which has the css enabled, which used to be the only way to do it as only cgroups participated in sibling lists and thus could be iteratd. This works as long as what's required during iteration is not missing online csses; however, it turns out that there are use cases where offlined but not yet released csses need to be iterated. This is difficult to implement through cgroup iteration the unified hierarchy as there may be multiple dying csses for the same subsystem associated with single cgroup. After the recent changes, the cgroup self and regular csses behave identically in how they're linked and unlinked from the sibling lists including assertion of CSS_RELEASED and css_next_child() can simply switch to iterating csses directly. This both simplifies the logic and ensures that all visible non-released csses are included in the iteration whether there are multiple dying csses for a subsystem or not. As all other iterators depend on css_next_child() for sibling iteration, this changes behaviors of all css iterators. Add and update explanations on the css states which are included in traversal to all iterators. As css iteration could always contain offlined csses, this shouldn't break any of the current users and new usages which need iteration of all on and offline csses can make use of the new semantics. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner --- include/linux/cgroup.h | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5375582ea5f6..f2ff578fc03a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -764,14 +764,14 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); * @pos: the css * to use as the loop cursor * @parent: css whose children to walk * - * Walk @parent's children. Must be called under rcu_read_lock(). A child - * css which hasn't finished ->css_online() or already has finished - * ->css_offline() may show up during traversal and it's each subsystem's - * responsibility to verify that each @pos is alive. + * Walk @parent's children. Must be called under rcu_read_lock(). * - * If a subsystem synchronizes against the parent in its ->css_online() and - * before starting iterating, a css which finished ->css_online() is - * guaranteed to be visible in the future iterations. + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until @@ -794,17 +794,16 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos); * @root: css whose descendants to walk * * Walk @root's descendants. @root is included in the iteration and the - * first node to be visited. Must be called under rcu_read_lock(). A - * descendant css which hasn't finished ->css_online() or already has - * finished ->css_offline() may show up during traversal and it's each - * subsystem's responsibility to verify that each @pos is alive. + * first node to be visited. Must be called under rcu_read_lock(). * - * If a subsystem synchronizes against the parent in its ->css_online() and - * before starting iterating, and synchronizes against @pos on each - * iteration, any descendant css which finished ->css_online() is - * guaranteed to be visible in the future iterations. + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. * - * In other words, the following guarantees that a descendant can't escape + * For example, the following guarantees that a descendant can't escape * state updates of its ancestors. * * my_online(@css) @@ -860,8 +859,17 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, * * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. @root is included in the iteration and the last - * node to be visited. Note that the walk visibility guarantee described - * in pre-order walk doesn't apply the same to post-order walks. + * node to be visited. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. + * + * Note that the walk visibility guarantee example described in pre-order + * walk doesn't apply the same to post-order walks. */ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ -- cgit v1.2.3 From 184faf32328c65c9d86b19577b8d8b90bdd2cd2e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:51 -0400 Subject: cgroup: use CSS_ONLINE instead of CGRP_DEAD Use CSS_ONLINE on the self css to indicate whether a cgroup has been killed instead of CGRP_DEAD. This will allow re-using css online test for cgroup liveliness test. This doesn't introduce any functional change. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f2ff578fc03a..51a339c99eb6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -143,8 +143,6 @@ static inline void css_put(struct cgroup_subsys_state *css) /* bits in struct cgroup flags field */ enum { - /* Control Group is dead */ - CGRP_DEAD, /* * Control Group has previously had a child cgroup or a task, * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) -- cgit v1.2.3 From f3d4650015301d1c880df4523f7e7ef320a38aab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:52 -0400 Subject: cgroup: convert cgroup_has_live_children() into css_has_online_children() Now that cgroup liveliness and css onliness are the same state, convert cgroup_has_live_children() into css_has_online_children() so that it can be used for actual csses too. The function now uses css_for_each_child() for iteration and is published. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 51a339c99eb6..b76999954beb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -873,6 +873,8 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) +bool css_has_online_children(struct cgroup_subsys_state *css); + /* A css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; -- cgit v1.2.3 From 6f4524d355a86769b65d5420a6ef47fb0bba9b72 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 May 2014 13:22:52 -0400 Subject: cgroup: implement css_tryget() Implement css_tryget() which tries to grab a cgroup_subsys_state's reference as long as it already hasn't reached zero. Combined with the recent css iterator changes to include offline && !released csses during traversal, this can be used to access csses regardless of its online state. v2: Take the new flag CSS_NO_REF into account. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Johannes Weiner --- include/linux/cgroup.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b76999954beb..4afe544d3547 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -112,6 +112,24 @@ static inline void css_get(struct cgroup_subsys_state *css) percpu_ref_get(&css->refcnt); } +/** + * css_tryget - try to obtain a reference on the specified css + * @css: target css + * + * Obtain a reference on @css unless it already has reached zero and is + * being released. This function doesn't care whether @css is on or + * offline. The caller naturally needs to ensure that @css is accessible + * but doesn't have to be holding a reference on it - IOW, RCU protected + * access is good enough for this function. Returns %true if a reference + * count was successfully obtained; %false otherwise. + */ +static inline bool css_tryget(struct cgroup_subsys_state *css) +{ + if (!(css->flags & CSS_NO_REF)) + return percpu_ref_tryget(&css->refcnt); + return true; +} + /** * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css -- cgit v1.2.3 From 5533e0114425dcdb878f11b291f2727af8667a7c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 May 2014 19:33:07 -0400 Subject: cgroup: disallow debug controller on the default hierarchy The debug controller, as its name suggests, exposes cgroup core internals to userland to aid debugging. Unfortunately, except for the name, there's no provision to prevent its usage in production configurations and the controller is widely enabled and mounted leaking internal details to userland. Like most other debug information, the information exposed by debug isn't interesting even for debugging itself once the related parts are working reliably. This controller has no reason for existing. This patch implements cgrp_dfl_root_inhibit_ss_mask which can suppress specific subsystems on the default hierarchy and adds the debug subsystem to it so that it can be gradually deprecated as usages move towards the unified hierarchy. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4afe544d3547..8a111dd42d7a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -305,6 +305,8 @@ enum { * the flag is not created. * * - blkcg: blk-throttle becomes properly hierarchical. + * + * - debug: disallowed on the default hierarchy. */ CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), -- cgit v1.2.3