summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-04-27 16:51:27 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-04-27 16:51:27 -0700
commit3b3bea6d4b9c162f9e555905d96b8c1da67ecd5b (patch)
tree284f9fe4fdf9c12ed32588a5f42f0c1efcb507c1
parenta1a671092d29455a890af272f0702925adafdf59 (diff)
parent981cd338614c96070cf9854679014fd027c1fb1d (diff)
Merge tag 'cgroup-for-7.1-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup fixes from Tejun Heo:

 - Fix UAF race in psi pressure_write() against cgroup file release by
   extending cgroup_mutex coverage and ordering of->priv access after
   cgroup_kn_lock_live()

 - Fix integer overflow in rdmacg_try_charge() when usage equals INT_MAX
   by performing the increment in s64

 - Fix asymmetric DL bandwidth accounting on cpuset attach rollback by
   recording the CPU used by dl_bw_alloc() so cancel_attach() returns
   the reservation to the same root domain

 - Fix nr_dying_subsys_* race that briefly showed 0 in cgroup.stat after
   rmdir by incrementing from kill_css() instead of offline_css()

 - Typo fix in cgroup-v2 documentation

* tag 'cgroup-for-7.1-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  docs: cgroup: fix typo 'protetion' -> 'protection'
  cgroup: Increment nr_dying_subsys_* from rmdir context
  cgroup/cpuset: record DL BW alloc CPU for attach rollback
  cgroup/rdma: fix integer overflow in rdmacg_try_charge()
  sched/psi: fix race between file release and pressure write
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst2
-rw-r--r--kernel/cgroup/cgroup.c46
-rw-r--r--kernel/cgroup/cpuset-internal.h5
-rw-r--r--kernel/cgroup/cpuset.c13
-rw-r--r--kernel/cgroup/rdma.c2
5 files changed, 44 insertions, 24 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 8ad0b2781317..6efd0095ed99 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -220,7 +220,7 @@ cgroup v2 currently supports the following mount options.
memory_hugetlb_accounting
Count HugeTLB memory usage towards the cgroup's overall
memory usage for the memory controller (for the purpose of
- statistics reporting and memory protetion). This is a new
+ statistics reporting and memory protection). This is a new
behavior that could regress existing setups, so it must be
explicitly opted in with this mount option.
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 43adc96c7f1a..45c0b1ed687a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3934,33 +3934,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
- struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_file_ctx *ctx;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
+ ssize_t ret = 0;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
- cgroup_get(cgrp);
- cgroup_kn_unlock(of->kn);
+ ctx = of->priv;
+ if (!ctx) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
/* Allow only one trigger per file descriptor */
if (ctx->psi.trigger) {
- cgroup_put(cgrp);
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_unlock;
}
psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res, of->file, of);
if (IS_ERR(new)) {
- cgroup_put(cgrp);
- return PTR_ERR(new);
+ ret = PTR_ERR(new);
+ goto out_unlock;
}
smp_store_release(&ctx->psi.trigger, new);
- cgroup_put(cgrp);
+
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ if (ret)
+ return ret;
return nbytes;
}
@@ -5716,16 +5724,6 @@ static void offline_css(struct cgroup_subsys_state *css)
RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
wake_up_all(&css->cgroup->offline_waitq);
-
- css->cgroup->nr_dying_subsys[ss->id]++;
- /*
- * Parent css and cgroup cannot be freed until after the freeing
- * of child css, see css_free_rwork_fn().
- */
- while ((css = css->parent)) {
- css->nr_descendants--;
- css->cgroup->nr_dying_subsys[ss->id]++;
- }
}
/**
@@ -6038,6 +6036,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
*/
static void kill_css(struct cgroup_subsys_state *css)
{
+ struct cgroup_subsys *ss = css->ss;
+
lockdep_assert_held(&cgroup_mutex);
if (css->flags & CSS_DYING)
@@ -6074,6 +6074,16 @@ static void kill_css(struct cgroup_subsys_state *css)
* css is confirmed to be seen as killed on all CPUs.
*/
percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
+
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ /*
+ * Parent css and cgroup cannot be freed until after the freeing
+ * of child css, see css_free_rwork_fn().
+ */
+ while ((css = css->parent)) {
+ css->nr_descendants--;
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ }
}
/**
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index fd7d19842ded..bb4e692bea30 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -168,6 +168,11 @@ struct cpuset {
int nr_deadline_tasks;
int nr_migrate_dl_tasks;
u64 sum_migrate_dl_bw;
+ /*
+ * CPU used for temporary DL bandwidth allocation during attach;
+ * -1 if no DL bandwidth was allocated in the current attach.
+ */
+ int dl_bw_cpu;
/* Invalid partition error code, not lock protected */
enum prs_errcode prs_err;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 1335e437098e..e3a081a07c6d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -288,6 +288,7 @@ struct cpuset top_cpuset = {
.flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
+ .dl_bw_cpu = -1,
};
/**
@@ -579,6 +580,8 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
if (!trial)
return NULL;
+ trial->dl_bw_cpu = -1;
+
/* Setup cpumask pointer array */
cpumask_var_t *pmask[4] = {
&trial->cpus_allowed,
@@ -2980,6 +2983,7 @@ static void reset_migrate_dl_data(struct cpuset *cs)
{
cs->nr_migrate_dl_tasks = 0;
cs->sum_migrate_dl_bw = 0;
+ cs->dl_bw_cpu = -1;
}
/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
@@ -3056,6 +3060,8 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
reset_migrate_dl_data(cs);
goto out_unlock;
}
+
+ cs->dl_bw_cpu = cpu;
}
out_success:
@@ -3080,12 +3086,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
mutex_lock(&cpuset_mutex);
dec_attach_in_progress_locked(cs);
- if (cs->nr_migrate_dl_tasks) {
- int cpu = cpumask_any(cs->effective_cpus);
+ if (cs->dl_bw_cpu >= 0)
+ dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
- dl_bw_free(cpu, cs->sum_migrate_dl_bw);
+ if (cs->nr_migrate_dl_tasks)
reset_migrate_dl_data(cs);
- }
mutex_unlock(&cpuset_mutex);
}
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 9967fb25c563..4fdab4cf49e0 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -283,7 +283,7 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
ret = PTR_ERR(rpool);
goto err;
} else {
- new = rpool->resources[index].usage + 1;
+ new = (s64)rpool->resources[index].usage + 1;
if (new > rpool->resources[index].max) {
ret = -EAGAIN;
goto err;