summaryrefslogtreecommitdiff
path: root/include/linux/cgroup-defs.h
diff options
context:
space:
mode:
authorJP Kobryn <inwardvessel@gmail.com>2025-05-14 17:19:34 -0700
committerTejun Heo <tj@kernel.org>2025-05-19 10:28:59 -1000
commit5da3bfa029d6809e192d112f39fca4dbe0137aaf (patch)
treeeecf6a790edcfc794b7d6cb464d1eb2f5a0e3715 /include/linux/cgroup-defs.h
parent541a4219bd66bef56d93dbd306dc64a4d70ae99e (diff)
cgroup: use separate rstat trees for each subsystem
Different subsystems may call cgroup_rstat_updated() within the same cgroup, resulting in a tree of pending updates from multiple subsystems. When one of these subsystems is flushed via cgroup_rstat_flushed(), all other subsystems with pending updates on the tree will also be flushed. Change the paradigm of having a single rstat tree for all subsystems to having separate trees for each subsystem. This separation allows for subsystems to perform flushes without the side effects of other subsystems. As an example, flushing the cpu stats will no longer cause the memory stats to be flushed and vice versa. In order to achieve subsystem-specific trees, change the tree node type from cgroup to cgroup_subsys_state pointer. Then remove those pointers from the cgroup and instead place them on the css. Finally, change update/flush functions to make use of the different node type (css). These changes allow a specific subsystem to be associated with an update or flush. Separate rstat trees will now exist for each unique subsystem. Since updating/flushing will now be done at the subsystem level, there is no longer a need to keep track of updated css nodes at the cgroup level. The list management of these nodes done within the cgroup (rstat_css_list and related) has been removed accordingly. Conditional guards for checking validity of a given css were placed within css_rstat_updated/flush() to prevent undefined behavior occuring from kfunc usage in bpf programs. Guards were also placed within css_rstat_init/exit() in order to help consolidate calls to them. At call sites for all four functions, the existing guards were removed. Signed-off-by: JP Kobryn <inwardvessel@gmail.com> Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'include/linux/cgroup-defs.h')
-rw-r--r--include/linux/cgroup-defs.h46
1 files changed, 21 insertions, 25 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e58bfb880111..17ecaae9c5f8 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -169,6 +169,8 @@ struct cgroup_subsys_state {
/* reference count - access via css_[try]get() and css_put() */
struct percpu_ref refcnt;
+ struct css_rstat_cpu __percpu *rstat_cpu;
+
/*
* siblings list anchored at the parent's ->children
*
@@ -177,9 +179,6 @@ struct cgroup_subsys_state {
struct list_head sibling;
struct list_head children;
- /* flush target list anchored at cgrp->rstat_css_list */
- struct list_head rstat_css_node;
-
/*
* PI: Subsys-unique ID. 0 is unused and root is always 1. The
* matching css can be looked up using css_from_id().
@@ -219,6 +218,13 @@ struct cgroup_subsys_state {
* Protected by cgroup_mutex.
*/
int nr_descendants;
+
+ /*
+ * A singly-linked list of css structures to be rstat flushed.
+ * This is a scratch field to be used exclusively by
+ * css_rstat_flush() and protected by cgroup_rstat_lock.
+ */
+ struct cgroup_subsys_state *rstat_flush_next;
};
/*
@@ -329,10 +335,10 @@ struct cgroup_base_stat {
/*
* rstat - cgroup scalable recursive statistics. Accounting is done
- * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the
+ * per-cpu in css_rstat_cpu which is then lazily propagated up the
* hierarchy on reads.
*
- * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are
+ * When a stat gets updated, the css_rstat_cpu and its ancestors are
* linked into the updated tree. On the following read, propagation only
* considers and consumes the updated tree. This makes reading O(the
* number of descendants which have been active since last read) instead of
@@ -346,20 +352,20 @@ struct cgroup_base_stat {
* This struct hosts both the fields which implement the above -
* updated_children and updated_next.
*/
-struct cgroup_rstat_cpu {
+struct css_rstat_cpu {
/*
* Child cgroups with stat updates on this cpu since the last read
* are linked on the parent's ->updated_children through
- * ->updated_next.
+ * ->updated_next. updated_children is terminated by its container css.
*
- * In addition to being more compact, singly-linked list pointing
- * to the cgroup makes it unnecessary for each per-cpu struct to
- * point back to the associated cgroup.
+ * In addition to being more compact, singly-linked list pointing to
+ * the css makes it unnecessary for each per-cpu struct to point back
+ * to the associated css.
*
* Protected by per-cpu cgroup_rstat_cpu_lock.
*/
- struct cgroup *updated_children; /* terminated by self cgroup */
- struct cgroup *updated_next; /* NULL iff not on the list */
+ struct cgroup_subsys_state *updated_children;
+ struct cgroup_subsys_state *updated_next; /* NULL if not on the list */
};
/*
@@ -521,25 +527,15 @@ struct cgroup {
struct cgroup *dom_cgrp;
struct cgroup *old_dom_cgrp; /* used while enabling threaded */
- /* per-cpu recursive resource statistics */
- struct cgroup_rstat_cpu __percpu *rstat_cpu;
struct cgroup_rstat_base_cpu __percpu *rstat_base_cpu;
- struct list_head rstat_css_list;
/*
- * Add padding to separate the read mostly rstat_cpu and
- * rstat_css_list into a different cacheline from the following
- * rstat_flush_next and *bstat fields which can have frequent updates.
+ * Add padding to keep the read mostly rstat per-cpu pointer on a
+ * different cacheline than the following *bstat fields which can have
+ * frequent updates.
*/
CACHELINE_PADDING(_pad_);
- /*
- * A singly-linked list of cgroup structures to be rstat flushed.
- * This is a scratch field to be used exclusively by
- * css_rstat_flush_locked() and protected by cgroup_rstat_lock.
- */
- struct cgroup *rstat_flush_next;
-
/* cgroup basic resource statistics */
struct cgroup_base_stat last_bstat;
struct cgroup_base_stat bstat;