From 82d7e59ea707b55dc6c3ba3c56ded36742741bd4 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 2 Dec 2025 02:57:47 +0000 Subject: cgroup: switch to css_is_online() helper Use the new css_is_online() helper that has been introduced to check css online state, instead of testing the CSS_ONLINE flag directly. This improves readability and centralizes the state check logic. No functional changes intended. Signed-off-by: Chen Ridong Acked-by: Shakeel Butt Reviewed-by: Jan Kara Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e717208cfb18..34d6d4d99f97 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4948,7 +4948,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css) rcu_read_lock(); css_for_each_child(child, css) { - if (child->flags & CSS_ONLINE) { + if (css_is_online(child)) { ret = true; break; } @@ -5753,7 +5753,7 @@ static void offline_css(struct cgroup_subsys_state *css) lockdep_assert_held(&cgroup_mutex); - if (!(css->flags & CSS_ONLINE)) + if (!css_is_online(css)) return; if (ss->css_offline) -- cgit v1.2.3 From 6ee43047e8ada63c4dfee01e2ea7e7eadfcda2ab Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 26 Nov 2025 09:11:58 +0000 Subject: cpuset: Remove unnecessary checks in rebuild_sched_domains_locked Commit 406100f3da08 ("cpuset: fix race between hotplug work and later CPU offline") added a check for empty effective_cpus in partitions for cgroup v2. However, this check did not account for remote partitions, which were introduced later. After commit 2125c0034c5d ("cgroup/cpuset: Make cpuset hotplug processing synchronous"), cpuset hotplug handling is now synchronous. This eliminates the race condition with subsequent CPU offline operations that the original check aimed to fix. Instead of extending the check to support remote partitions, this patch removes all the redundant effective_cpus check. Additionally, it adds a check and warning to verify that all generated sched domains consist of active CPUs, preventing partition_sched_domains from being invoked with offline CPUs. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 50 +++++++++++++++----------------------------------- 1 file changed, 15 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6e6eb09b8db6..fea577b4016a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1103,53 +1103,33 @@ void dl_rebuild_rd_accounting(void) */ void rebuild_sched_domains_locked(void) { - struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; cpumask_var_t *doms; - struct cpuset *cs; int ndoms; + int i; lockdep_assert_cpus_held(); lockdep_assert_held(&cpuset_mutex); force_sd_rebuild = false; - /* - * If we have raced with CPU hotplug, return early to avoid - * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_handle_hotplug() will rebuild sched domains. - * - * With no CPUs in any subpartitions, top_cpuset's effective CPUs - * should be the same as the active CPUs, so checking only top_cpuset - * is enough to detect racing CPU offlines. - */ - if (cpumask_empty(subpartitions_cpus) && - !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - return; + /* Generate domain masks and attrs */ + ndoms = generate_sched_domains(&doms, &attr); /* - * With subpartition CPUs, however, the effective CPUs of a partition - * root should be only a subset of the active CPUs. Since a CPU in any - * partition root could be offlined, all must be checked. - */ - if (!cpumask_empty(subpartitions_cpus)) { - rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (!is_partition_valid(cs)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - if (!cpumask_subset(cs->effective_cpus, - cpu_active_mask)) { - rcu_read_unlock(); - return; - } - } - rcu_read_unlock(); + * cpuset_hotplug_workfn is invoked synchronously now, thus this + * function should not race with CPU hotplug. And the effective CPUs + * must not include any offline CPUs. Passing an offline CPU in the + * doms to partition_sched_domains() will trigger a kernel panic. + * + * We perform a final check here: if the doms contains any + * offline CPUs, a warning is emitted and we return directly to + * prevent the panic. + */ + for (i = 0; i < ndoms; ++i) { + if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) + return; } - /* Generate domain masks and attrs */ - ndoms = generate_sched_domains(&doms, &attr); - /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); } -- cgit v1.2.3 From 14c11e1b2ac47fbc51503c7d0f35fad9ea5ea46e Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Thu, 18 Dec 2025 09:31:36 +0000 Subject: cpuset: add lockdep_assert_cpuset_lock_held helper Add lockdep_assert_cpuset_lock_held() to allow other subsystems to verify that cpuset_mutex is held. Suggested-by: Waiman Long Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index fea577b4016a..4fa3080da3be 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -271,6 +271,11 @@ void cpuset_unlock(void) mutex_unlock(&cpuset_mutex); } +void lockdep_assert_cpuset_lock_held(void) +{ + lockdep_assert_held(&cpuset_mutex); +} + /** * cpuset_full_lock - Acquire full protection for cpuset modification * -- cgit v1.2.3 From 56805c1bb19ea3e40131ecf1485fdfce1bc0366b Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Thu, 18 Dec 2025 09:31:37 +0000 Subject: cpuset: add cpuset1_online_css helper for v1-specific operations This commit introduces the cpuset1_online_css helper to centralize v1-specific handling during cpuset online. It performs operations such as updating the CS_SPREAD_PAGE, CS_SPREAD_SLAB, and CGRP_CPUSET_CLONE_CHILDREN flags, which are unique to the cpuset v1 control group interface. The helper is now placed in cpuset-v1.c to maintain clear separation between v1 and v2 logic. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 2 ++ kernel/cgroup/cpuset-v1.c | 48 +++++++++++++++++++++++++++++++++++++++++ kernel/cgroup/cpuset.c | 39 +-------------------------------- 3 files changed, 51 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 01976c8e7d49..6c03cad02302 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -293,6 +293,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +void cpuset1_online_css(struct cgroup_subsys_state *css); #else static inline void fmeter_init(struct fmeter *fmp) {} static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, @@ -303,6 +304,7 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, bool cpus_updated, bool mems_updated) {} static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } +static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {} #endif /* CONFIG_CPUSETS_V1 */ #endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 12e76774c75b..c296ce47616a 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -499,6 +499,54 @@ out_unlock: return retval; } +void cpuset1_online_css(struct cgroup_subsys_state *css) +{ + struct cpuset *tmp_cs; + struct cgroup_subsys_state *pos_css; + struct cpuset *cs = css_cs(css); + struct cpuset *parent = parent_cs(cs); + + lockdep_assert_cpus_held(); + lockdep_assert_cpuset_lock_held(); + + if (is_spread_page(parent)) + set_bit(CS_SPREAD_PAGE, &cs->flags); + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); + + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) + return; + + /* + * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is + * set. This flag handling is implemented in cgroup core for + * historical reasons - the flag may be specified during mount. + * + * Currently, if any sibling cpusets have exclusive cpus or mem, we + * refuse to clone the configuration - thereby refusing the task to + * be entered, and as a result refusing the sys_unshare() or + * clone() which initiated it. If this becomes a problem for some + * users who wish to allow that scenario, then this could be + * changed to grant parent->cpus_allowed-sibling_cpus_exclusive + * (and likewise for mems) to the new cgroup. + */ + rcu_read_lock(); + cpuset_for_each_child(tmp_cs, pos_css, parent) { + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + + cpuset_callback_lock_irq(); + cs->mems_allowed = parent->mems_allowed; + cs->effective_mems = parent->mems_allowed; + cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); + cpuset_callback_unlock_irq(); +} + /* * for the common functions, 'private' gives the type of file */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4fa3080da3be..5d9dbd1aeed3 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3616,17 +3616,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); - struct cpuset *tmp_cs; - struct cgroup_subsys_state *pos_css; if (!parent) return 0; cpuset_full_lock(); - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); /* * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated */ @@ -3641,39 +3635,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->effective_mems; } spin_unlock_irq(&callback_lock); + cpuset1_online_css(css); - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) - goto out_unlock; - - /* - * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is - * set. This flag handling is implemented in cgroup core for - * historical reasons - the flag may be specified during mount. - * - * Currently, if any sibling cpusets have exclusive cpus or mem, we - * refuse to clone the configuration - thereby refusing the task to - * be entered, and as a result refusing the sys_unshare() or - * clone() which initiated it. If this becomes a problem for some - * users who wish to allow that scenario, then this could be - * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. - */ - rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_css, parent) { - if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { - rcu_read_unlock(); - goto out_unlock; - } - } - rcu_read_unlock(); - - spin_lock_irq(&callback_lock); - cs->mems_allowed = parent->mems_allowed; - cs->effective_mems = parent->mems_allowed; - cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); - cpumask_copy(cs->effective_cpus, parent->cpus_allowed); - spin_unlock_irq(&callback_lock); -out_unlock: cpuset_full_unlock(); return 0; } -- cgit v1.2.3 From 4ef42c645f0ec9b56f0715204013a27c2fec801e Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Thu, 18 Dec 2025 09:31:38 +0000 Subject: cpuset: add cpuset1_init helper for v1 initialization This patch introduces the cpuset1_init helper in cpuset_v1.c to initialize v1-specific fields, including the fmeter and relax_domain_level members. The relax_domain_level related code will be moved to cpuset_v1.c in a subsequent patch. After this move, v1-specific members will only be visible when CONFIG_CPUSETS_V1=y. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 10 ++++++---- kernel/cgroup/cpuset-v1.c | 7 ++++++- kernel/cgroup/cpuset.c | 4 ++-- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 6c03cad02302..a32517da8231 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -144,8 +144,6 @@ struct cpuset { */ nodemask_t old_mems_allowed; - struct fmeter fmeter; /* memory_pressure filter */ - /* * Tasks are being attached to this cpuset. Used to prevent * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). @@ -181,6 +179,10 @@ struct cpuset { /* Used to merge intersecting subsets for generate_sched_domains */ struct uf_node node; + +#ifdef CONFIG_CPUSETS_V1 + struct fmeter fmeter; /* memory_pressure filter */ +#endif }; static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) @@ -285,7 +287,6 @@ void cpuset_full_unlock(void); */ #ifdef CONFIG_CPUSETS_V1 extern struct cftype cpuset1_files[]; -void fmeter_init(struct fmeter *fmp); void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk); void cpuset1_update_tasks_flags(struct cpuset *cs); @@ -293,9 +294,9 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +void cpuset1_init(struct cpuset *cs); void cpuset1_online_css(struct cgroup_subsys_state *css); #else -static inline void fmeter_init(struct fmeter *fmp) {} static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) {} static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {} @@ -304,6 +305,7 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, bool cpus_updated, bool mems_updated) {} static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } +static inline void cpuset1_init(struct cpuset *cs) {} static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {} #endif /* CONFIG_CPUSETS_V1 */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index c296ce47616a..84f00ab9c81f 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -62,7 +62,7 @@ struct cpuset_remove_tasks_struct { #define FM_SCALE 1000 /* faux fixed point scale */ /* Initialize a frequency meter */ -void fmeter_init(struct fmeter *fmp) +static void fmeter_init(struct fmeter *fmp) { fmp->cnt = 0; fmp->val = 0; @@ -499,6 +499,11 @@ out_unlock: return retval; } +void cpuset1_init(struct cpuset *cs) +{ + fmeter_init(&cs->fmeter); +} + void cpuset1_online_css(struct cgroup_subsys_state *css) { struct cpuset *tmp_cs; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5d9dbd1aeed3..8ef8b7511659 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3602,7 +3602,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - fmeter_init(&cs->fmeter); + cpuset1_init(cs); cs->relax_domain_level = -1; /* Set CS_MEMORY_MIGRATE for default hierarchy */ @@ -3836,7 +3836,7 @@ int __init cpuset_init(void) cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); - fmeter_init(&top_cpuset.fmeter); + cpuset1_init(&top_cpuset); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); -- cgit v1.2.3 From cb33f8814c4af3c917fa249cd9ef34b17a5ef562 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Thu, 18 Dec 2025 09:31:39 +0000 Subject: cpuset: move update_domain_attr_tree to cpuset_v1.c Since relax_domain_level is only applicable to v1, move update_domain_attr_tree() to cpuset-v1.c, which solely updates relax_domain_level, Additionally, relax_domain_level is now initialized in cpuset1_inited. Accordingly, the initialization of relax_domain_level in top_cpuset is removed. The unnecessary remote_partition initialization in top_cpuset is also cleaned up. As a result, relax_domain_level can be defined in cpuset only when CONFIG_CPUSETS_V1=y. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 11 ++++++++--- kernel/cgroup/cpuset-v1.c | 28 ++++++++++++++++++++++++++++ kernel/cgroup/cpuset.c | 31 ------------------------------- 3 files changed, 36 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index a32517da8231..677053ffb913 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -150,9 +150,6 @@ struct cpuset { */ int attach_in_progress; - /* for custom sched domain */ - int relax_domain_level; - /* partition root state */ int partition_root_state; @@ -182,6 +179,9 @@ struct cpuset { #ifdef CONFIG_CPUSETS_V1 struct fmeter fmeter; /* memory_pressure filter */ + + /* for custom sched domain */ + int relax_domain_level; #endif }; @@ -296,6 +296,8 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); void cpuset1_init(struct cpuset *cs); void cpuset1_online_css(struct cgroup_subsys_state *css); +void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs); #else static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) {} @@ -307,6 +309,9 @@ static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } static inline void cpuset1_init(struct cpuset *cs) {} static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {} +static inline void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs) {} + #endif /* CONFIG_CPUSETS_V1 */ #endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 84f00ab9c81f..a4f8f1c3cfaa 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -502,6 +502,7 @@ out_unlock: void cpuset1_init(struct cpuset *cs) { fmeter_init(&cs->fmeter); + cs->relax_domain_level = -1; } void cpuset1_online_css(struct cgroup_subsys_state *css) @@ -552,6 +553,33 @@ void cpuset1_online_css(struct cgroup_subsys_state *css) cpuset_callback_unlock_irq(); } +static void +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) +{ + if (dattr->relax_domain_level < c->relax_domain_level) + dattr->relax_domain_level = c->relax_domain_level; +} + +void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs) +{ + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + /* skip the whole subtree if @cp doesn't have any CPU */ + if (cpumask_empty(cp->cpus_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + if (is_sched_load_balance(cp)) + update_domain_attr(dattr, cp); + } + rcu_read_unlock(); +} + /* * for the common functions, 'private' gives the type of file */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8ef8b7511659..cf2363a9c552 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -215,8 +215,6 @@ static struct cpuset top_cpuset = { .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, - .relax_domain_level = -1, - .remote_partition = false, }; /* @@ -755,34 +753,6 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) return cpumask_intersects(a->effective_cpus, b->effective_cpus); } -static void -update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) -{ - if (dattr->relax_domain_level < c->relax_domain_level) - dattr->relax_domain_level = c->relax_domain_level; - return; -} - -static void update_domain_attr_tree(struct sched_domain_attr *dattr, - struct cpuset *root_cs) -{ - struct cpuset *cp; - struct cgroup_subsys_state *pos_css; - - rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - /* skip the whole subtree if @cp doesn't have any CPU */ - if (cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - - if (is_sched_load_balance(cp)) - update_domain_attr(dattr, cp); - } - rcu_read_unlock(); -} - /* Must be called with cpuset_mutex held. */ static inline int nr_cpusets(void) { @@ -3603,7 +3573,6 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpuset1_init(cs); - cs->relax_domain_level = -1; /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cpuset_v2()) -- cgit v1.2.3 From 6e1d31ce495c35c12913246586bb04aac019b8e3 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Thu, 18 Dec 2025 09:31:40 +0000 Subject: cpuset: separate generate_sched_domains for v1 and v2 The generate_sched_domains() function currently handles both v1 and v2 logic. However, the underlying mechanisms for building scheduler domains differ significantly between the two versions. For cpuset v2, scheduler domains are straightforwardly derived from valid partitions, whereas cpuset v1 employs a more complex union-find algorithm to merge overlapping cpusets. Co-locating these implementations complicates maintenance. This patch, along with subsequent ones, aims to separate the v1 and v2 logic. For ease of review, this patch first copies the generate_sched_domains() function into cpuset-v1.c as cpuset1_generate_sched_domains() and removes v2-specific code. Common helpers and top_cpuset are declared in cpuset-internal.h. When operating in v1 mode, the code now calls cpuset1_generate_sched_domains(). Currently there is some code duplication, which will be largely eliminated once v1-specific code is removed from v2 in the following patch. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 23 ++++++ kernel/cgroup/cpuset-v1.c | 158 ++++++++++++++++++++++++++++++++++++++++ kernel/cgroup/cpuset.c | 31 +------- 3 files changed, 185 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 677053ffb913..8622a4666170 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -9,6 +9,7 @@ #include #include #include +#include /* See "Frequency meter" comments, below. */ @@ -185,6 +186,8 @@ struct cpuset { #endif }; +extern struct cpuset top_cpuset; + static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) { return css ? container_of(css, struct cpuset, css) : NULL; @@ -242,6 +245,21 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } +/* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? + */ +static inline int cpusets_overlap(struct cpuset *a, struct cpuset *b) +{ + return cpumask_intersects(a->effective_cpus, b->effective_cpus); +} + +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key.key) + 1; +} + /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child @@ -298,6 +316,9 @@ void cpuset1_init(struct cpuset *cs); void cpuset1_online_css(struct cgroup_subsys_state *css); void update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *root_cs); +int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes); + #else static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) {} @@ -311,6 +332,8 @@ static inline void cpuset1_init(struct cpuset *cs) {} static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {} static inline void update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *root_cs) {} +static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) { return 0; }; #endif /* CONFIG_CPUSETS_V1 */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index a4f8f1c3cfaa..ffa7a8dc6c3a 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -580,6 +580,164 @@ void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } +/* + * cpuset1_generate_sched_domains() + * + * Finding the best partition (set of domains): + * The double nested loops below over i, j scan over the load + * balanced cpusets (using the array of cpuset pointers in csa[]) + * looking for pairs of cpusets that have overlapping cpus_allowed + * and merging them using a union-find algorithm. + * + * The union of the cpus_allowed masks from the set of all cpusets + * having the same root then form the one element of the partition + * (one sched domain) to be passed to partition_sched_domains(). + */ +int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) +{ + struct cpuset *cp; /* top-down scan of cpusets */ + struct cpuset **csa; /* array of all cpuset ptrs */ + int csn; /* how many cpuset ptrs in csa so far */ + int i, j; /* indices for partition finding loops */ + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ + struct sched_domain_attr *dattr; /* attributes for custom domains */ + int ndoms = 0; /* number of sched domains in result */ + int nslot; /* next empty doms[] struct cpumask slot */ + struct cgroup_subsys_state *pos_css; + bool root_load_balance = is_sched_load_balance(&top_cpuset); + int nslot_update; + + lockdep_assert_cpuset_lock_held(); + + doms = NULL; + dattr = NULL; + csa = NULL; + + /* Special case for the 99% of systems with one, full, sched domain */ + if (root_load_balance) { + ndoms = 1; + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); + if (dattr) { + *dattr = SD_ATTR_INIT; + update_domain_attr_tree(dattr, &top_cpuset); + } + cpumask_and(doms[0], top_cpuset.effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + + goto done; + } + + csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); + if (!csa) + goto done; + csn = 0; + + rcu_read_lock(); + if (root_load_balance) + csa[csn++] = &top_cpuset; + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { + if (cp == &top_cpuset) + continue; + + /* + * Continue traversing beyond @cp iff @cp has some CPUs and + * isn't load balancing. The former is obvious. The + * latter: All child cpusets contain a subset of the + * parent's cpus, so just skip them, and then we call + * update_domain_attr_tree() to calc relax_domain_level of + * the corresponding sched domain. + */ + if (!cpumask_empty(cp->cpus_allowed) && + !(is_sched_load_balance(cp) && + cpumask_intersects(cp->cpus_allowed, + housekeeping_cpumask(HK_TYPE_DOMAIN)))) + continue; + + if (is_sched_load_balance(cp) && + !cpumask_empty(cp->effective_cpus)) + csa[csn++] = cp; + + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + continue; + } + rcu_read_unlock(); + + for (i = 0; i < csn; i++) + uf_node_init(&csa[i]->node); + + /* Merge overlapping cpusets */ + for (i = 0; i < csn; i++) { + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) + uf_union(&csa[i]->node, &csa[j]->node); + } + } + + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; + } + + /* + * Now we know how many domains to create. + * Convert to and populate cpu masks. + */ + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + /* + * The rest of the code, including the scheduler, can deal with + * dattr==NULL case. No need to abort if alloc fails. + */ + dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), + GFP_KERNEL); + + for (nslot = 0, i = 0; i < csn; i++) { + nslot_update = 0; + for (j = i; j < csn; j++) { + if (uf_find(&csa[j]->node) == &csa[i]->node) { + struct cpumask *dp = doms[nslot]; + + if (i == j) { + nslot_update = 1; + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + } + cpumask_or(dp, dp, csa[j]->effective_cpus); + cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); + if (dattr) + update_domain_attr_tree(dattr + nslot, csa[j]); + } + } + if (nslot_update) + nslot++; + } + BUG_ON(nslot != ndoms); + +done: + kfree(csa); + + /* + * Fallback to the default domain if kmalloc() failed. + * See comments in partition_sched_domains(). + */ + if (doms == NULL) + ndoms = 1; + + *domains = doms; + *attributes = dattr; + return ndoms; +} + /* * for the common functions, 'private' gives the type of file */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index cf2363a9c552..33c929b191e8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -211,7 +211,7 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) * If cpu_online_mask is used while a hotunplug operation is happening in * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ -static struct cpuset top_cpuset = { +struct cpuset top_cpuset = { .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, @@ -744,21 +744,6 @@ out: } #ifdef CONFIG_SMP -/* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? - */ -static int cpusets_overlap(struct cpuset *a, struct cpuset *b) -{ - return cpumask_intersects(a->effective_cpus, b->effective_cpus); -} - -/* Must be called with cpuset_mutex held. */ -static inline int nr_cpusets(void) -{ - /* jump label reference count + the top-level cpuset */ - return static_key_count(&cpusets_enabled_key.key) + 1; -} /* * generate_sched_domains() @@ -798,17 +783,6 @@ static inline int nr_cpusets(void) * convenient format, that can be easily compared to the prior * value to determine what partition elements (sched domains) * were changed (added or removed.) - * - * Finding the best partition (set of domains): - * The double nested loops below over i, j scan over the load - * balanced cpusets (using the array of cpuset pointers in csa[]) - * looking for pairs of cpusets that have overlapping cpus_allowed - * and merging them using a union-find algorithm. - * - * The union of the cpus_allowed masks from the set of all cpusets - * having the same root then form the one element of the partition - * (one sched domain) to be passed to partition_sched_domains(). - * */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) @@ -826,6 +800,9 @@ static int generate_sched_domains(cpumask_var_t **domains, bool cgrpv2 = cpuset_v2(); int nslot_update; + if (!cgrpv2) + return cpuset1_generate_sched_domains(domains, attributes); + doms = NULL; dattr = NULL; csa = NULL; -- cgit v1.2.3 From 7cc1720589d8cc78752cdf83a687422bb7838268 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Thu, 18 Dec 2025 09:31:41 +0000 Subject: cpuset: remove v1-specific code from generate_sched_domains Following the introduction of cpuset1_generate_sched_domains() for v1 in the previous patch, v1-specific logic can now be removed from the generic generate_sched_domains(). This patch cleans up the v1-only code and ensures uf_node is only visible when CONFIG_CPUSETS_V1=y. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 10 +-- kernel/cgroup/cpuset-v1.c | 2 +- kernel/cgroup/cpuset.c | 145 +++++++--------------------------------- 3 files changed, 28 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 8622a4666170..e718a4f54360 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -175,14 +175,14 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; - /* Used to merge intersecting subsets for generate_sched_domains */ - struct uf_node node; - #ifdef CONFIG_CPUSETS_V1 struct fmeter fmeter; /* memory_pressure filter */ /* for custom sched domain */ int relax_domain_level; + + /* Used to merge intersecting subsets for generate_sched_domains */ + struct uf_node node; #endif }; @@ -314,8 +314,6 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); void cpuset1_init(struct cpuset *cs); void cpuset1_online_css(struct cgroup_subsys_state *css); -void update_domain_attr_tree(struct sched_domain_attr *dattr, - struct cpuset *root_cs); int cpuset1_generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes); @@ -330,8 +328,6 @@ static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } static inline void cpuset1_init(struct cpuset *cs) {} static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {} -static inline void update_domain_attr_tree(struct sched_domain_attr *dattr, - struct cpuset *root_cs) {} static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { return 0; }; diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index ffa7a8dc6c3a..7303315fdba7 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -560,7 +560,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) dattr->relax_domain_level = c->relax_domain_level; } -void update_domain_attr_tree(struct sched_domain_attr *dattr, +static void update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *root_cs) { struct cpuset *cp; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 33c929b191e8..221da921b4f9 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -789,18 +789,13 @@ static int generate_sched_domains(cpumask_var_t **domains, { struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ - int csn; /* how many cpuset ptrs in csa so far */ int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ - int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; - bool root_load_balance = is_sched_load_balance(&top_cpuset); - bool cgrpv2 = cpuset_v2(); - int nslot_update; - if (!cgrpv2) + if (!cpuset_v2()) return cpuset1_generate_sched_domains(domains, attributes); doms = NULL; @@ -808,70 +803,26 @@ static int generate_sched_domains(cpumask_var_t **domains, csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && cpumask_empty(subpartitions_cpus)) { -single_root_domain: + if (cpumask_empty(subpartitions_cpus)) { ndoms = 1; - doms = alloc_sched_domains(ndoms); - if (!doms) - goto done; - - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); - if (dattr) { - *dattr = SD_ATTR_INIT; - update_domain_attr_tree(dattr, &top_cpuset); - } - cpumask_and(doms[0], top_cpuset.effective_cpus, - housekeeping_cpumask(HK_TYPE_DOMAIN)); - - goto done; + /* !csa will be checked and can be correctly handled */ + goto generate_doms; } csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); if (!csa) goto done; - csn = 0; + /* Find how many partitions and cache them to csa[] */ rcu_read_lock(); - if (root_load_balance) - csa[csn++] = &top_cpuset; cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { - if (cp == &top_cpuset) - continue; - - if (cgrpv2) - goto v2; - - /* - * v1: - * Continue traversing beyond @cp iff @cp has some CPUs and - * isn't load balancing. The former is obvious. The - * latter: All child cpusets contain a subset of the - * parent's cpus, so just skip them, and then we call - * update_domain_attr_tree() to calc relax_domain_level of - * the corresponding sched domain. - */ - if (!cpumask_empty(cp->cpus_allowed) && - !(is_sched_load_balance(cp) && - cpumask_intersects(cp->cpus_allowed, - housekeeping_cpumask(HK_TYPE_DOMAIN)))) - continue; - - if (is_sched_load_balance(cp) && - !cpumask_empty(cp->effective_cpus)) - csa[csn++] = cp; - - /* skip @cp's subtree */ - pos_css = css_rightmost_descendant(pos_css); - continue; - -v2: /* * Only valid partition roots that are not isolated and with - * non-empty effective_cpus will be saved into csn[]. + * non-empty effective_cpus will be saved into csa[]. */ if ((cp->partition_root_state == PRS_ROOT) && !cpumask_empty(cp->effective_cpus)) - csa[csn++] = cp; + csa[ndoms++] = cp; /* * Skip @cp's subtree if not a partition root and has no @@ -882,40 +833,18 @@ v2: } rcu_read_unlock(); - /* - * If there are only isolated partitions underneath the cgroup root, - * we can optimize out unneeded sched domains scanning. - */ - if (root_load_balance && (csn == 1)) - goto single_root_domain; - - for (i = 0; i < csn; i++) - uf_node_init(&csa[i]->node); - - /* Merge overlapping cpusets */ - for (i = 0; i < csn; i++) { - for (j = i + 1; j < csn; j++) { - if (cpusets_overlap(csa[i], csa[j])) { + for (i = 0; i < ndoms; i++) { + for (j = i + 1; j < ndoms; j++) { + if (cpusets_overlap(csa[i], csa[j])) /* * Cgroup v2 shouldn't pass down overlapping * partition root cpusets. */ - WARN_ON_ONCE(cgrpv2); - uf_union(&csa[i]->node, &csa[j]->node); - } + WARN_ON_ONCE(1); } } - /* Count the total number of domains */ - for (i = 0; i < csn; i++) { - if (uf_find(&csa[i]->node) == &csa[i]->node) - ndoms++; - } - - /* - * Now we know how many domains to create. - * Convert to and populate cpu masks. - */ +generate_doms: doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -932,45 +861,19 @@ v2: * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a * subset of HK_TYPE_DOMAIN housekeeping CPUs. */ - if (cgrpv2) { - for (i = 0; i < ndoms; i++) { - /* - * The top cpuset may contain some boot time isolated - * CPUs that need to be excluded from the sched domain. - */ - if (csa[i] == &top_cpuset) - cpumask_and(doms[i], csa[i]->effective_cpus, - housekeeping_cpumask(HK_TYPE_DOMAIN)); - else - cpumask_copy(doms[i], csa[i]->effective_cpus); - if (dattr) - dattr[i] = SD_ATTR_INIT; - } - goto done; - } - - for (nslot = 0, i = 0; i < csn; i++) { - nslot_update = 0; - for (j = i; j < csn; j++) { - if (uf_find(&csa[j]->node) == &csa[i]->node) { - struct cpumask *dp = doms[nslot]; - - if (i == j) { - nslot_update = 1; - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - } - cpumask_or(dp, dp, csa[j]->effective_cpus); - cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); - if (dattr) - update_domain_attr_tree(dattr + nslot, csa[j]); - } - } - if (nslot_update) - nslot++; + for (i = 0; i < ndoms; i++) { + /* + * The top cpuset may contain some boot time isolated + * CPUs that need to be excluded from the sched domain. + */ + if (!csa || csa[i] == &top_cpuset) + cpumask_and(doms[i], top_cpuset.effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + else + cpumask_copy(doms[i], csa[i]->effective_cpus); + if (dattr) + dattr[i] = SD_ATTR_INIT; } - BUG_ON(nslot != ndoms); done: kfree(csa); -- cgit v1.2.3 From 269679bdd17cdd4dc3f1f2448f76554932d7de3f Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Sat, 20 Dec 2025 10:15:57 +0000 Subject: cpuset: remove dead code in cpuset-v1.c The commit 6e1d31ce495c ("cpuset: separate generate_sched_domains for v1 and v2") introduced dead code that was originally added for cpuset-v2 partition domain generation. Remove the redundant root_load_balance check. Fixes: 6e1d31ce495c ("cpuset: separate generate_sched_domains for v1 and v2") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/cgroups/9a442808-ed53-4657-988b-882cc0014c0d@huaweicloud.com/T/ Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-v1.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 7303315fdba7..ecfea7800f0d 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -605,7 +605,6 @@ int cpuset1_generate_sched_domains(cpumask_var_t **domains, int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; - bool root_load_balance = is_sched_load_balance(&top_cpuset); int nslot_update; lockdep_assert_cpuset_lock_held(); @@ -615,7 +614,7 @@ int cpuset1_generate_sched_domains(cpumask_var_t **domains, csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance) { + if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; doms = alloc_sched_domains(ndoms); if (!doms) @@ -638,8 +637,6 @@ int cpuset1_generate_sched_domains(cpumask_var_t **domains, csn = 0; rcu_read_lock(); - if (root_load_balance) - csa[csn++] = &top_cpuset; cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { if (cp == &top_cpuset) continue; -- cgit v1.2.3 From 18bc2425a877c45b59c0972df30afb46084f8816 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 12 Jan 2026 11:00:17 -0500 Subject: cgroup/cpuset: Streamline rm_siblings_excl_cpus() If exclusive_cpus is set, effective_xcpus must be a subset of exclusive_cpus. Currently, rm_siblings_excl_cpus() checks both exclusive_cpus and effective_xcpus consecutively. It is simpler to check only exclusive_cpus if non-empty or just effective_xcpus otherwise. No functional change is expected. Signed-off-by: Waiman Long Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 221da921b4f9..da2b3b51630e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1355,23 +1355,29 @@ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, int retval = 0; if (cpumask_empty(excpus)) - return retval; + return 0; /* - * Exclude exclusive CPUs from siblings + * Remove exclusive CPUs from siblings */ rcu_read_lock(); cpuset_for_each_child(sibling, css, parent) { + struct cpumask *sibling_xcpus; + if (sibling == cs) continue; - if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { - cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); - retval++; - continue; - } - if (cpumask_intersects(excpus, sibling->effective_xcpus)) { - cpumask_andnot(excpus, excpus, sibling->effective_xcpus); + /* + * If exclusive_cpus is defined, effective_xcpus will always + * be a subset. Otherwise, effective_xcpus will only be set + * in a valid partition root. + */ + sibling_xcpus = cpumask_empty(sibling->exclusive_cpus) + ? sibling->effective_xcpus + : sibling->exclusive_cpus; + + if (cpumask_intersects(excpus, sibling_xcpus)) { + cpumask_andnot(excpus, excpus, sibling_xcpus); retval++; } } -- cgit v1.2.3 From a1a01793ae1f6f99fd7174988d49b43cd1cb36c3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 12 Jan 2026 11:00:18 -0500 Subject: cgroup/cpuset: Consistently compute effective_xcpus in update_cpumasks_hier() Since commit f62a5d39368e ("cgroup/cpuset: Remove remote_partition_check() & make update_cpumasks_hier() handle remote partition"), the compute_effective_exclusive_cpumask() helper was extended to strip exclusive CPUs from siblings when computing effective_xcpus (cpuset.cpus.exclusive.effective). This helper was later renamed to compute_excpus() in commit 86bbbd1f33ab ("cpuset: Refactor exclusive CPU mask computation logic"). This helper is supposed to be used consistently to compute effective_xcpus. However, there is an exception within the callback critical section in update_cpumasks_hier() when exclusive_cpus of a valid partition root is empty. This can cause effective_xcpus value to differ depending on where exactly it is last computed. Fix this by using compute_excpus() in this case to give a consistent result. Signed-off-by: Waiman Long Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index da2b3b51630e..894131f47f78 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2168,17 +2168,13 @@ get_css: spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; - if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) - compute_excpus(cp, cp->effective_xcpus); - /* - * Make sure effective_xcpus is properly set for a valid - * partition root. + * Need to compute effective_xcpus if either exclusive_cpus + * is non-empty or it is a valid partition root. */ - if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) - cpumask_and(cp->effective_xcpus, - cp->cpus_allowed, parent->effective_xcpus); - else if (new_prs < 0) + if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus)) + compute_excpus(cp, cp->effective_xcpus); + if (new_prs <= 0) reset_partition_data(cp); spin_unlock_irq(&callback_lock); -- cgit v1.2.3 From 6e6f13f6d5095f3a432da421e78f4d7d51ef39c8 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 12 Jan 2026 11:00:19 -0500 Subject: cgroup/cpuset: Don't fail cpuset.cpus change in v2 Commit fe8cd2736e75 ("cgroup/cpuset: Delay setting of CS_CPU_EXCLUSIVE until valid partition") introduced a new check to disallow the setting of a new cpuset.cpus.exclusive value that is a superset of a sibling's cpuset.cpus value so that there will at least be one CPU left in the sibling in case the cpuset becomes a valid partition root. This new check does have the side effect of failing a cpuset.cpus change that make it a subset of a sibling's cpuset.cpus.exclusive value. With v2, users are supposed to be allowed to set whatever value they want in cpuset.cpus without failure. To maintain this rule, the check is now restricted to only when cpuset.cpus.exclusive is being changed not when cpuset.cpus is changed. The cgroup-v2.rst doc file is also updated to reflect this change. Signed-off-by: Waiman Long Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 894131f47f78..4819ab429771 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -609,33 +609,31 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) /** * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts - * @cs1: first cpuset to check - * @cs2: second cpuset to check + * @trial: the trial cpuset to be checked + * @sibling: a sibling cpuset to be checked against + * @xcpus_changed: set if exclusive_cpus has been set * * Returns: true if CPU exclusivity conflict exists, false otherwise * * Conflict detection rules: * 1. If either cpuset is CPU exclusive, they must be mutually exclusive * 2. exclusive_cpus masks cannot intersect between cpusets - * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + * 3. The allowed CPUs of a sibling cpuset cannot be a subset of the new exclusive CPUs */ -static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling, + bool xcpus_changed) { /* If either cpuset is exclusive, check if they are mutually exclusive */ - if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) - return !cpusets_are_exclusive(cs1, cs2); + if (is_cpu_exclusive(trial) || is_cpu_exclusive(sibling)) + return !cpusets_are_exclusive(trial, sibling); /* Exclusive_cpus cannot intersect */ - if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) + if (cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus)) return true; - /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ - if (!cpumask_empty(cs1->cpus_allowed) && - cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) - return true; - - if (!cpumask_empty(cs2->cpus_allowed) && - cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */ + if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) && + cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus)) return true; return false; @@ -672,6 +670,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; + bool xcpus_changed; int ret = 0; rcu_read_lock(); @@ -728,10 +727,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * overlap. exclusive_cpus cannot overlap with each other if set. */ ret = -EINVAL; + xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus); cpuset_for_each_child(c, css, par) { if (c == cur) continue; - if (cpus_excl_conflict(trial, c)) + if (cpus_excl_conflict(trial, c, xcpus_changed)) goto out; if (mems_excl_conflict(trial, c)) goto out; -- cgit v1.2.3 From 2a3602030d800b6600ef55c31e21bc54611f7770 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 12 Jan 2026 11:00:20 -0500 Subject: cgroup/cpuset: Don't invalidate sibling partitions on cpuset.cpus conflict Currently, when setting a cpuset's cpuset.cpus to a value that conflicts with the cpuset.cpus/cpuset.cpus.exclusive of a sibling partition, the sibling's partition state becomes invalid. This is overly harsh and is probably not necessary. The cpuset.cpus.exclusive control file, if set, will override the cpuset.cpus of the same cpuset when creating a cpuset partition. So cpuset.cpus has less priority than cpuset.cpus.exclusive in setting up a partition. However, it cannot override a conflicting cpuset.cpus file in a sibling cpuset and the partition creation process will fail. This is inconsistent. That will also make using cpuset.cpus.exclusive less valuable as a tool to set up cpuset partitions as the users have to check if such a cpuset.cpus conflict exists or not. Fix these problems by making sure that once a cpuset.cpus.exclusive is set without failure, it will always be allowed to form a valid partition as long as at least one CPU can be granted from its parent irrespective of the state of the siblings' cpuset.cpus values. Of course, setting cpuset.cpus.exclusive will fail if it conflicts with the cpuset.cpus.exclusive or the cpuset.cpus.exclusive.effective value of a sibling. Partition can still be created by setting only cpuset.cpus without setting cpuset.cpus.exclusive. However, any conflicting CPUs in sibling's cpuset.cpus.exclusive.effective and cpuset.cpus.exclusive values will be removed from its cpuset.cpus.exclusive.effective as long as there is still one or more CPUs left and can be granted from its parent. This CPU stripping is currently done in rm_siblings_excl_cpus(). The new code will now try its best to enable the creation of new partitions with only cpuset.cpus set without invalidating existing ones. However it is not guaranteed that all the CPUs requested in cpuset.cpus will be used in the new partition even when all these CPUs can be granted from the parent. This is similar to the fact that cpuset.cpus.effective may not be able to include all the CPUs requested in cpuset.cpus. In this case, the parent may not able to grant all the exclusive CPUs requested in cpuset.cpus to cpuset.cpus.exclusive.effective if some of them have already been granted to other partitions earlier. With the creation of multiple sibling partitions by setting only cpuset.cpus, this does have the side effect that their exact cpuset.cpus.exclusive.effective settings will depend on the order of partition creation if there are conflicts. Due to the exclusive nature of the CPUs in a partition, it is not easy to make it fair other than the old behavior of invalidating all the conflicting partitions. For example, # echo "0-2" > A1/cpuset.cpus # echo "root" > A1/cpuset.cpus.partition # cat A1/cpuset.cpus.partition root # cat A1/cpuset.cpus.exclusive.effective 0-2 # echo "2-4" > B1/cpuset.cpus # echo "root" > B1/cpuset.cpus.partition # cat B1/cpuset.cpus.partition root # cat B1/cpuset.cpus.exclusive.effective 3-4 # cat B1/cpuset.cpus.effective 3-4 For users who want to be sure that they can get most of the CPUs they want, cpuset.cpus.exclusive should be used instead if they can set it successfully without failure. Setting cpuset.cpus.exclusive will guarantee that sibling conflicts from then onward is no longer possible. To make this change, we have to separate out the is_cpu_exclusive() check in cpus_excl_conflict() into a cgroup v1 only cpuset1_cpus_excl_conflict() helper. The cpus_allowed_validate_change() helper is now no longer needed and can be removed. Some existing tests in test_cpuset_prs.sh are updated and new ones are added to reflect the new behavior. The cgroup-v2.rst doc file is also updated the clarify what exclusive CPUs will be used when a partition is created. Reported-by: Sun Shaojie Closes: https://lore.kernel.org/lkml/20251117015708.977585-1-sunshaojie@kylinos.cn/ Signed-off-by: Waiman Long Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 3 ++ kernel/cgroup/cpuset-v1.c | 19 ++++++++++ kernel/cgroup/cpuset.c | 80 ++++++++++++++--------------------------- 3 files changed, 48 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index e718a4f54360..e8e2683cb067 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -312,6 +312,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2); void cpuset1_init(struct cpuset *cs); void cpuset1_online_css(struct cgroup_subsys_state *css); int cpuset1_generate_sched_domains(cpumask_var_t **domains, @@ -326,6 +327,8 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, bool cpus_updated, bool mems_updated) {} static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } +static inline bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, + struct cpuset *cs2) { return false; } static inline void cpuset1_init(struct cpuset *cs) {} static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {} static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains, diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index ecfea7800f0d..04124c38a774 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -373,6 +373,25 @@ out: return ret; } +/* + * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts + * to legacy (v1) + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect. + */ +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return cpumask_intersects(cs1->cpus_allowed, + cs2->cpus_allowed); + + return false; +} + #ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4819ab429771..83fb83a86b4b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -129,6 +129,17 @@ static bool force_sd_rebuild; * For simplicity, a local partition can be created under a local or remote * partition but a remote partition cannot have any partition root in its * ancestor chain except the cgroup root. + * + * A valid partition can be formed by setting exclusive_cpus or cpus_allowed + * if exclusive_cpus is not set. In the case of partition with empty + * exclusive_cpus, all the conflicting exclusive CPUs specified in the + * following cpumasks of sibling cpusets will be removed from its + * cpus_allowed in determining its effective_xcpus. + * - effective_xcpus + * - exclusive_cpus + * + * The "cpuset.cpus.exclusive" control file should be used for setting up + * partition if the users want to get as many CPUs as possible. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -616,27 +627,25 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) * Returns: true if CPU exclusivity conflict exists, false otherwise * * Conflict detection rules: - * 1. If either cpuset is CPU exclusive, they must be mutually exclusive - * 2. exclusive_cpus masks cannot intersect between cpusets - * 3. The allowed CPUs of a sibling cpuset cannot be a subset of the new exclusive CPUs + * o cgroup v1 + * See cpuset1_cpus_excl_conflict() + * o cgroup v2 + * - The exclusive_cpus values cannot overlap. + * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed. */ static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling, bool xcpus_changed) { - /* If either cpuset is exclusive, check if they are mutually exclusive */ - if (is_cpu_exclusive(trial) || is_cpu_exclusive(sibling)) - return !cpusets_are_exclusive(trial, sibling); - - /* Exclusive_cpus cannot intersect */ - if (cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus)) - return true; + if (!cpuset_v2()) + return cpuset1_cpus_excl_conflict(trial, sibling); /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */ if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) && cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus)) return true; - return false; + /* Exclusive_cpus cannot intersect */ + return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus); } static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) @@ -2312,43 +2321,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri return PERR_NONE; } -static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, - struct tmpmasks *tmp) -{ - int retval; - struct cpuset *parent = parent_cs(cs); - - retval = validate_change(cs, trialcs); - - if ((retval == -EINVAL) && cpuset_v2()) { - struct cgroup_subsys_state *css; - struct cpuset *cp; - - /* - * The -EINVAL error code indicates that partition sibling - * CPU exclusivity rule has been violated. We still allow - * the cpumask change to proceed while invalidating the - * partition. However, any conflicting sibling partitions - * have to be marked as invalid too. - */ - trialcs->prs_err = PERR_NOTEXCL; - rcu_read_lock(); - cpuset_for_each_child(cp, css, parent) { - struct cpumask *xcpus = user_xcpus(trialcs); - - if (is_partition_valid(cp) && - cpumask_intersects(xcpus, cp->effective_xcpus)) { - rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); - rcu_read_lock(); - } - } - rcu_read_unlock(); - retval = 0; - } - return retval; -} - /** * partition_cpus_change - Handle partition state changes due to CPU mask updates * @cs: The target cpuset being modified @@ -2408,15 +2380,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; - if (alloc_tmpmasks(&tmp)) - return -ENOMEM; - compute_trialcs_excpus(trialcs, cs); trialcs->prs_err = PERR_NONE; - retval = cpus_allowed_validate_change(cs, trialcs, &tmp); + retval = validate_change(cs, trialcs); if (retval < 0) - goto out_free; + return retval; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; /* * Check all the descendants in update_cpumasks_hier() if @@ -2439,7 +2411,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); -out_free: + free_tmpmasks(&tmp); return retval; } -- cgit v1.2.3 From 272bd8183376a9e20fe08bacbaa44003d7c8acaa Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 12 Jan 2026 11:00:21 -0500 Subject: cgroup/cpuset: Move the v1 empty cpus/mems check to cpuset1_validate_change() As stated in commit 1c09b195d37f ("cpuset: fix a regression in validating config change"), it is not allowed to clear masks of a cpuset if there're tasks in it. This is specific to v1 since empty "cpuset.cpus" or "cpuset.mems" will cause the v2 cpuset to inherit the effective CPUs or memory nodes from its parent. So it is OK to have empty cpus or mems even if there are tasks in the cpuset. Move this empty cpus/mems check in validate_change() to cpuset1_validate_change() to allow more flexibility in setting cpus or mems in v2. cpuset_is_populated() needs to be moved into cpuset-internal.h as it is needed by the empty cpus/mems checking code. Also add a test case to test_cpuset_prs.sh to verify that. Reported-by: Chen Ridong Closes: https://lore.kernel.org/lkml/7a3ec392-2e86-4693-aa9f-1e668a668b9c@huaweicloud.com/ Signed-off-by: Waiman Long Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 9 +++++++++ kernel/cgroup/cpuset-v1.c | 14 ++++++++++++++ kernel/cgroup/cpuset.c | 23 ----------------------- 3 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index e8e2683cb067..fd7d19842ded 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -260,6 +260,15 @@ static inline int nr_cpusets(void) return static_key_count(&cpusets_enabled_key.key) + 1; } +static inline bool cpuset_is_populated(struct cpuset *cs) +{ + lockdep_assert_cpuset_lock_held(); + + /* Cpusets in the process of attaching should be considered as populated */ + return cgroup_is_populated(cs->css.cgroup) || + cs->attach_in_progress; +} + /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 04124c38a774..7a23b9e8778f 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -368,6 +368,20 @@ int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) if (par && !is_cpuset_subset(trial, par)) goto out; + /* + * Cpusets with tasks - existing or newly being attached - can't + * be changed to have empty cpus_allowed or mems_allowed. + */ + ret = -ENOSPC; + if (cpuset_is_populated(cur)) { + if (!cpumask_empty(cur->cpus_allowed) && + cpumask_empty(trial->cpus_allowed)) + goto out; + if (!nodes_empty(cur->mems_allowed) && + nodes_empty(trial->mems_allowed)) + goto out; + } + ret = 0; out: return ret; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 83fb83a86b4b..a3dbca125588 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -370,15 +370,6 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } -static inline bool cpuset_is_populated(struct cpuset *cs) -{ - lockdep_assert_held(&cpuset_mutex); - - /* Cpusets in the process of attaching should be considered as populated */ - return cgroup_is_populated(cs->css.cgroup) || - cs->attach_in_progress; -} - /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked @@ -695,20 +686,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); - /* - * Cpusets with tasks - existing or newly being attached - can't - * be changed to have empty cpus_allowed or mems_allowed. - */ - ret = -ENOSPC; - if (cpuset_is_populated(cur)) { - if (!cpumask_empty(cur->cpus_allowed) && - cpumask_empty(trial->cpus_allowed)) - goto out; - if (!nodes_empty(cur->mems_allowed) && - nodes_empty(trial->mems_allowed)) - goto out; - } - /* * We can't shrink if we won't have enough room for SCHED_DEADLINE * tasks. This check is not done when scheduling is disabled as the -- cgit v1.2.3 From 090e0ae303c76d4026b4bf50b2543690741730ae Mon Sep 17 00:00:00 2001 From: Zhao Mengmeng Date: Mon, 12 Jan 2026 09:25:11 -1000 Subject: cpuset: replace direct lockdep_assert_held() with lockdep_assert_cpuset_lock_held() We already added lockdep_assert_cpuset_lock_held(), use this new function to keep consistency. Signed-off-by: Zhao Mengmeng Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a3dbca125588..003232dc9d2e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -336,7 +336,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) */ static inline void dec_attach_in_progress_locked(struct cpuset *cs) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); cs->attach_in_progress--; if (!cs->attach_in_progress) @@ -899,7 +899,7 @@ void dl_rebuild_rd_accounting(void) int cpu; u64 cookie = ++dl_cookie; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); @@ -950,7 +950,7 @@ void rebuild_sched_domains_locked(void) int i; lockdep_assert_cpus_held(); - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); force_sd_rebuild = false; /* Generate domain masks and attrs */ @@ -1645,7 +1645,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int parent_prs = parent->partition_root_state; bool nocpu; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ /* @@ -2213,7 +2213,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); /* * Check all its siblings and call update_cpumasks_hier() @@ -3047,7 +3047,7 @@ static nodemask_t cpuset_attach_nodemask_to; static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); if (cs != &top_cpuset) guarantee_active_cpus(task, cpus_attach); @@ -3969,7 +3969,7 @@ static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask */ void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); __cpuset_cpus_allowed_locked(tsk, pmask); } -- cgit v1.2.3 From 5eab8c588bf37b7eb498f23a2ac3fb135c258e17 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Sat, 31 Jan 2026 03:05:09 +0000 Subject: cgroup: increase maximum subsystem count from 16 to 32 The current cgroup subsystem limit of 16 is insufficient, as the number of existing subsystems has already reached this limit. When adding a new subsystem that is not yet in the mainline kernel, building with `make allmodconfig` requires first bypassing the `BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16)` restriction to allow compilation to succeed. However, the kernel still fails to boot afterward. This patch increases the maximum number of supported cgroup subsystems from 16 to 32, providing enough room for future subsystem additions. Signed-off-by: Chen Ridong Acked-by: Waiman Long Tested-by: JP Kobryn Acked-by: JP Kobryn Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 8 +++---- kernel/cgroup/cgroup-v1.c | 12 +++++------ kernel/cgroup/cgroup.c | 46 ++++++++++++++++++++--------------------- kernel/cgroup/debug.c | 2 +- 4 files changed, 34 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 22051b4f1ccb..3bfe37693d68 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -52,7 +52,7 @@ struct cgroup_fs_context { bool cpuset_clone_children; bool none; /* User explicitly requested empty subsystem */ bool all_ss; /* Seen 'all' option */ - u16 subsys_mask; /* Selected subsystems */ + u32 subsys_mask; /* Selected subsystems */ char *name; /* Hierarchy name */ char *release_agent; /* Path for release notifications */ }; @@ -146,7 +146,7 @@ struct cgroup_mgctx { struct cgroup_taskset tset; /* subsystems affected by migration */ - u16 ss_mask; + u32 ss_mask; }; #define CGROUP_TASKSET_INIT(tset) \ @@ -235,8 +235,8 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup_favor_dynmods(struct cgroup_root *root, bool favor); void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_fs_context *ctx); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask); +int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask); int cgroup_do_get_tree(struct fs_context *fc); int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index a9e029b570c8..724950c4b690 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -28,7 +28,7 @@ #define CGROUP_PIDLIST_DESTROY_DELAY HZ /* Controllers blocked by the commandline in v1 */ -static u16 cgroup_no_v1_mask; +static u32 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; @@ -1037,13 +1037,13 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) static int check_cgroupfs_options(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); - u16 mask = U16_MAX; - u16 enabled = 0; + u32 mask = U32_MAX; + u32 enabled = 0; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); + mask = ~((u32)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && @@ -1095,7 +1095,7 @@ int cgroup1_reconfigure(struct fs_context *fc) struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); int ret = 0; - u16 added_mask, removed_mask; + u32 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); @@ -1343,7 +1343,7 @@ static int __init cgroup_no_v1(char *str) continue; if (!strcmp(token, "all")) { - cgroup_no_v1_mask = U16_MAX; + cgroup_no_v1_mask = U32_MAX; continue; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 34d6d4d99f97..c86db0943b44 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -206,13 +206,13 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ -static u16 cgrp_dfl_inhibit_ss_mask; +static u32 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ -static u16 cgrp_dfl_implicit_ss_mask; +static u32 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ -static u16 cgrp_dfl_threaded_ss_mask; +static u32 cgrp_dfl_threaded_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); @@ -234,10 +234,10 @@ static u64 css_serial_nr_next = 1; * These bitmasks identify subsystems with specific features to avoid * having to do iterative checks repeatedly. */ -static u16 have_fork_callback __read_mostly; -static u16 have_exit_callback __read_mostly; -static u16 have_release_callback __read_mostly; -static u16 have_canfork_callback __read_mostly; +static u32 have_fork_callback __read_mostly; +static u32 have_exit_callback __read_mostly; +static u32 have_release_callback __read_mostly; +static u32 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); @@ -475,13 +475,13 @@ static bool cgroup_is_valid_domain(struct cgroup *cgrp) } /* subsystems visibly enabled on a cgroup */ -static u16 cgroup_control(struct cgroup *cgrp) +static u32 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); - u16 root_ss_mask = cgrp->root->subsys_mask; + u32 root_ss_mask = cgrp->root->subsys_mask; if (parent) { - u16 ss_mask = parent->subtree_control; + u32 ss_mask = parent->subtree_control; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) @@ -496,12 +496,12 @@ static u16 cgroup_control(struct cgroup *cgrp) } /* subsystems enabled on a cgroup */ -static u16 cgroup_ss_mask(struct cgroup *cgrp) +static u32 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); if (parent) { - u16 ss_mask = parent->subtree_ss_mask; + u32 ss_mask = parent->subtree_ss_mask; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) @@ -1636,9 +1636,9 @@ static umode_t cgroup_file_mode(const struct cftype *cft) * This function calculates which subsystems need to be enabled if * @subtree_control is to be applied while restricted to @this_ss_mask. */ -static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) +static u32 cgroup_calc_subtree_ss_mask(u32 subtree_control, u32 this_ss_mask) { - u16 cur_ss_mask = subtree_control; + u32 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; @@ -1647,7 +1647,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { - u16 new_ss_mask = cur_ss_mask; + u32 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; @@ -1851,12 +1851,12 @@ err: return ret; } -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) +int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, ret; - u16 dfl_disable_ss_mask = 0; + u32 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -2152,7 +2152,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -3134,7 +3134,7 @@ void cgroup_procs_write_finish(struct task_struct *task, put_task_struct(task); } -static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, u32 ss_mask) { struct cgroup_subsys *ss; bool printed = false; @@ -3499,9 +3499,9 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) cgroup_apply_control_disable(cgrp); } -static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) +static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u32 enable) { - u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; + u32 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* if nothing is getting enabled, nothing to worry about */ if (!enable) @@ -3544,7 +3544,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - u16 enable = 0, disable = 0; + u32 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -6350,7 +6350,7 @@ int __init cgroup_init(void) struct cgroup_subsys *ss; int ssid; - BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 81ea38dd6f9d..a5490097fe52 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -230,7 +230,7 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) } static void cgroup_masks_read_one(struct seq_file *seq, const char *name, - u16 mask) + u32 mask) { struct cgroup_subsys *ss; int ssid; -- cgit v1.2.3 From 8b1f3c54f930c3aeda0b5bad97bc317fc80267fd Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Thu, 29 Jan 2026 06:45:16 +0000 Subject: cpuset: fix overlap of partition effective CPUs A warning was detect: WARNING: kernel/cgroup/cpuset.c:825 at rebuild_sched_domains_locked Modules linked in: CPU: 12 UID: 0 PID: 681 Comm: rmdir 6.19.0-rc6-next-20260121+ RIP: 0010:rebuild_sched_domains_locked+0x309/0x4b0 RSP: 0018:ffffc900019bbd28 EFLAGS: 00000202 RAX: ffff888104413508 RBX: 0000000000000008 RCX: ffff888104413510 RDX: ffff888109b5f400 RSI: 000000000000ffcf RDI: 0000000000000001 RBP: 0000000000000002 R08: ffff888104413508 R09: 0000000000000002 R10: ffff888104413508 R11: 0000000000000001 R12: ffff888104413500 R13: 0000000000000002 R14: ffffc900019bbd78 R15: 0000000000000000 FS: 00007fe274b8d740(0000) GS:ffff8881b6b3c000(0000) knlGS: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe274c98b50 CR3: 00000001047a9000 CR4: 00000000000006f0 Call Trace: update_prstate+0x1c7/0x580 cpuset_css_killed+0x2f/0x50 kill_css+0x32/0x180 cgroup_destroy_locked+0xa7/0x200 cgroup_rmdir+0x28/0x100 kernfs_iop_rmdir+0x4c/0x80 vfs_rmdir+0x12c/0x280 filename_rmdir+0x19e/0x200 __x64_sys_rmdir+0x23/0x40 do_syscall_64+0x6b/0x390 It can be reproduced by steps: # cd /sys/fs/cgroup/ # mkdir A1 # mkdir B1 # mkdir C1 # echo 1-3 > A1/cpuset.cpus # echo root > A1/cpuset.cpus.partition # echo 3-5 > B1/cpuset.cpus # echo root > B1/cpuset.cpus.partition # echo 6 > C1/cpuset.cpus # echo root > C1/cpuset.cpus.partition # rmdir A1/ # rmdir C1/ Both A1 and B1 were initially configured with CPU 3, which was exclusively assigned to A1's partition. When A1 was removed, CPU 3 was returned to the root pool. However, B1 incorrectly regained access to CPU 3 when update_cpumasks_hier was triggered during C1's removal, which also updated sibling configurations. The update_sibling_cpumasks function was called to synchronize siblings' effective CPUs due to changes in their parent's effective CPUs. However, parent effective CPU changes should not affect partition-effective CPUs. To fix this issue, update_cpumasks_hier should only be invoked when the sibling is not a valid partition in the update_sibling_cpumasks. Fixes: 2a3602030d80 ("cgroup/cpuset: Don't invalidate sibling partitions on cpuset.cpus conflict") Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 003232dc9d2e..92a51316225d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2222,27 +2222,20 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus - * directly. + * directly. It should not impact valid partition. * * The update_cpumasks_hier() function may sleep. So we have to * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { - if (sibling == cs) + if (sibling == cs || is_partition_valid(sibling)) continue; - if (!is_partition_valid(sibling)) { - compute_effective_cpumask(tmp->new_cpus, sibling, - parent); - if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) - continue; - } else if (is_remote_partition(sibling)) { - /* - * Change in a sibling cpuset won't affect a remote - * partition root. - */ + + compute_effective_cpumask(tmp->new_cpus, sibling, + parent); + if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) continue; - } if (!css_tryget_online(&sibling->css)) continue; -- cgit v1.2.3