diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-27 16:51:27 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-27 16:51:27 -0700 |
| commit | 3b3bea6d4b9c162f9e555905d96b8c1da67ecd5b (patch) | |
| tree | 284f9fe4fdf9c12ed32588a5f42f0c1efcb507c1 | |
| parent | a1a671092d29455a890af272f0702925adafdf59 (diff) | |
| parent | 981cd338614c96070cf9854679014fd027c1fb1d (diff) | |
Merge tag 'cgroup-for-7.1-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup fixes from Tejun Heo:
- Fix UAF race in psi pressure_write() against cgroup file release by
extending cgroup_mutex coverage and ordering of->priv access after
cgroup_kn_lock_live()
- Fix integer overflow in rdmacg_try_charge() when usage equals INT_MAX
by performing the increment in s64
- Fix asymmetric DL bandwidth accounting on cpuset attach rollback by
recording the CPU used by dl_bw_alloc() so cancel_attach() returns
the reservation to the same root domain
- Fix nr_dying_subsys_* race that briefly showed 0 in cgroup.stat after
rmdir by incrementing from kill_css() instead of offline_css()
- Typo fix in cgroup-v2 documentation
* tag 'cgroup-for-7.1-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
docs: cgroup: fix typo 'protetion' -> 'protection'
cgroup: Increment nr_dying_subsys_* from rmdir context
cgroup/cpuset: record DL BW alloc CPU for attach rollback
cgroup/rdma: fix integer overflow in rdmacg_try_charge()
sched/psi: fix race between file release and pressure write
| -rw-r--r-- | Documentation/admin-guide/cgroup-v2.rst | 2 | ||||
| -rw-r--r-- | kernel/cgroup/cgroup.c | 46 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset-internal.h | 5 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset.c | 13 | ||||
| -rw-r--r-- | kernel/cgroup/rdma.c | 2 |
5 files changed, 44 insertions, 24 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8ad0b2781317..6efd0095ed99 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -220,7 +220,7 @@ cgroup v2 currently supports the following mount options. memory_hugetlb_accounting Count HugeTLB memory usage towards the cgroup's overall memory usage for the memory controller (for the purpose of - statistics reporting and memory protetion). This is a new + statistics reporting and memory protection). This is a new behavior that could regress existing setups, so it must be explicitly opted in with this mount option. diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 43adc96c7f1a..45c0b1ed687a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3934,33 +3934,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { - struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_file_ctx *ctx; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; + ssize_t ret = 0; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - cgroup_get(cgrp); - cgroup_kn_unlock(of->kn); + ctx = of->priv; + if (!ctx) { + ret = -ENODEV; + goto out_unlock; + } /* Allow only one trigger per file descriptor */ if (ctx->psi.trigger) { - cgroup_put(cgrp); - return -EBUSY; + ret = -EBUSY; + goto out_unlock; } psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { - cgroup_put(cgrp); - return PTR_ERR(new); + ret = PTR_ERR(new); + goto out_unlock; } smp_store_release(&ctx->psi.trigger, new); - cgroup_put(cgrp); + +out_unlock: + cgroup_kn_unlock(of->kn); + if (ret) + return ret; return nbytes; } @@ -5716,16 +5724,6 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); - - css->cgroup->nr_dying_subsys[ss->id]++; - /* - * Parent css and cgroup cannot be freed until after the freeing - * of child css, see css_free_rwork_fn(). - */ - while ((css = css->parent)) { - css->nr_descendants--; - css->cgroup->nr_dying_subsys[ss->id]++; - } } /** @@ -6038,6 +6036,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref) */ static void kill_css(struct cgroup_subsys_state *css) { + struct cgroup_subsys *ss = css->ss; + lockdep_assert_held(&cgroup_mutex); if (css->flags & CSS_DYING) @@ -6074,6 +6074,16 @@ static void kill_css(struct cgroup_subsys_state *css) * css is confirmed to be seen as killed on all CPUs. */ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); + + css->cgroup->nr_dying_subsys[ss->id]++; + /* + * Parent css and cgroup cannot be freed until after the freeing + * of child css, see css_free_rwork_fn(). + */ + while ((css = css->parent)) { + css->nr_descendants--; + css->cgroup->nr_dying_subsys[ss->id]++; + } } /** diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index fd7d19842ded..bb4e692bea30 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -168,6 +168,11 @@ struct cpuset { int nr_deadline_tasks; int nr_migrate_dl_tasks; u64 sum_migrate_dl_bw; + /* + * CPU used for temporary DL bandwidth allocation during attach; + * -1 if no DL bandwidth was allocated in the current attach. + */ + int dl_bw_cpu; /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1335e437098e..e3a081a07c6d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -288,6 +288,7 @@ struct cpuset top_cpuset = { .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, + .dl_bw_cpu = -1, }; /** @@ -579,6 +580,8 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) if (!trial) return NULL; + trial->dl_bw_cpu = -1; + /* Setup cpumask pointer array */ cpumask_var_t *pmask[4] = { &trial->cpus_allowed, @@ -2980,6 +2983,7 @@ static void reset_migrate_dl_data(struct cpuset *cs) { cs->nr_migrate_dl_tasks = 0; cs->sum_migrate_dl_bw = 0; + cs->dl_bw_cpu = -1; } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ @@ -3056,6 +3060,8 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) reset_migrate_dl_data(cs); goto out_unlock; } + + cs->dl_bw_cpu = cpu; } out_success: @@ -3080,12 +3086,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) mutex_lock(&cpuset_mutex); dec_attach_in_progress_locked(cs); - if (cs->nr_migrate_dl_tasks) { - int cpu = cpumask_any(cs->effective_cpus); + if (cs->dl_bw_cpu >= 0) + dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw); - dl_bw_free(cpu, cs->sum_migrate_dl_bw); + if (cs->nr_migrate_dl_tasks) reset_migrate_dl_data(cs); - } mutex_unlock(&cpuset_mutex); } diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 9967fb25c563..4fdab4cf49e0 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -283,7 +283,7 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg, ret = PTR_ERR(rpool); goto err; } else { - new = rpool->resources[index].usage + 1; + new = (s64)rpool->resources[index].usage + 1; if (new > rpool->resources[index].max) { ret = -EAGAIN; goto err; |
