diff options
| author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2022-02-28 23:12:55 -0800 |
|---|---|---|
| committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2022-02-28 23:12:55 -0800 |
| commit | 1136fa0c07de570dc17858745af8be169d1440ba (patch) | |
| tree | 3221b003517dd3cb13df5ba4b85637cd9ed82692 /kernel/cgroup | |
| parent | ba115adf61b36b8c167126425a62b0efc23f72c0 (diff) | |
| parent | 754e0b0e35608ed5206d6a67a791563c631cec07 (diff) | |
Merge tag 'v5.17-rc4' into for-linus
Merge with mainline to get the Intel ASoC generic helpers header and
other changes.
Diffstat (limited to 'kernel/cgroup')
| -rw-r--r-- | kernel/cgroup/cgroup-internal.h | 19 | ||||
| -rw-r--r-- | kernel/cgroup/cgroup-v1.c | 64 | ||||
| -rw-r--r-- | kernel/cgroup/cgroup.c | 222 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset.c | 88 | ||||
| -rw-r--r-- | kernel/cgroup/misc.c | 31 | ||||
| -rw-r--r-- | kernel/cgroup/rstat.c | 53 |
6 files changed, 299 insertions, 178 deletions
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index bfbeabc17a9d..6e36e854b512 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -65,6 +65,25 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) return container_of(kfc, struct cgroup_fs_context, kfc); } +struct cgroup_pidlist; + +struct cgroup_file_ctx { + struct cgroup_namespace *ns; + + struct { + void *trigger; + } psi; + + struct { + bool started; + struct css_task_iter iter; + } procs; + + struct { + struct cgroup_pidlist *pidlist; + } procs1; +}; + /* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 35b920328344..0e877dbcfeea 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -63,9 +63,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) for_each_root(root) { struct cgroup *from_cgrp; - if (root == &cgrp_dfl_root) - continue; - spin_lock_irq(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); spin_unlock_irq(&css_set_lock); @@ -397,6 +394,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * next pid to display, if any */ struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; @@ -406,25 +404,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) mutex_lock(&cgrp->pidlist_mutex); /* - * !NULL @of->priv indicates that this isn't the first start() - * after open. If the matching pidlist is around, we can use that. - * Look for it. Note that @of->priv can't be used directly. It - * could already have been destroyed. + * !NULL @ctx->procs1.pidlist indicates that this isn't the first + * start() after open. If the matching pidlist is around, we can use + * that. Look for it. Note that @ctx->procs1.pidlist can't be used + * directly. It could already have been destroyed. */ - if (of->priv) - of->priv = cgroup_pidlist_find(cgrp, type); + if (ctx->procs1.pidlist) + ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. */ - if (!of->priv) { - ret = pidlist_array_load(cgrp, type, - (struct cgroup_pidlist **)&of->priv); + if (!ctx->procs1.pidlist) { + ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); if (ret) return ERR_PTR(ret); } - l = of->priv; + l = ctx->procs1.pidlist; if (pid) { int end = l->length; @@ -452,7 +449,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l = ctx->procs1.pidlist; if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, @@ -463,7 +461,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l = ctx->procs1.pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* @@ -507,10 +506,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, goto out_unlock; /* - * Even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. + * Even if we're attaching all tasks in the thread group, we only need + * to check permissions on one of them. Check permissions using the + * credentials from file open to protect against inherited fd attacks. */ - cred = current_cred(); + cred = of->file->f_cred; tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && @@ -549,6 +549,14 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((of->file->f_cred->user_ns != &init_user_ns) || + !capable(CAP_SYS_ADMIN)) + return -EPERM; + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -662,11 +670,9 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); /* - * ideally we don't want subsystems moving around while we do this. - * cgroup_mutex is also necessary to guarantee an atomic snapshot of - * subsys/hierarchy state. + * Grab the subsystems state racily. No need to add avenue to + * cgroup_mutex contention. */ - mutex_lock(&cgroup_mutex); for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", @@ -674,7 +680,6 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); - mutex_unlock(&cgroup_mutex); return 0; } @@ -701,8 +706,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) kernfs_type(kn) != KERNFS_DIR) return -EINVAL; - mutex_lock(&cgroup_mutex); - /* * We aren't being called from kernfs and there's no guarantee on * @kn->priv's validity. For this and css_tryget_online_from_dir(), @@ -710,9 +713,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) */ rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); - if (!cgrp || cgroup_is_dead(cgrp)) { + if (!cgrp || !cgroup_tryget(cgrp)) { rcu_read_unlock(); - mutex_unlock(&cgroup_mutex); return -ENOENT; } rcu_read_unlock(); @@ -740,7 +742,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } css_task_iter_end(&it); - mutex_unlock(&cgroup_mutex); + cgroup_put(cgrp); return 0; } @@ -960,6 +962,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) + return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ea08f01d0111..9d05c3ca2d5e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -30,6 +30,7 @@ #include "cgroup-internal.h" +#include <linux/bpf-cgroup.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/init_task.h> @@ -1740,6 +1741,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, i, ret; + u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -1756,8 +1758,28 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; + + /* + * Collect ssid's that need to be disabled from default + * hierarchy. + */ + if (ss->root == &cgrp_dfl_root) + dfl_disable_ss_mask |= 1 << ssid; + } while_each_subsys_mask(); + if (dfl_disable_ss_mask) { + struct cgroup *scgrp = &cgrp_dfl_root.cgrp; + + /* + * Controllers from default hierarchy that need to be rebound + * are all disabled together in one go. + */ + cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; @@ -1766,10 +1788,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) WARN_ON(!css || cgroup_css(dcgrp, ss)); - /* disable from the source */ - src_root->subsys_mask &= ~(1 << ssid); - WARN_ON(cgroup_apply_control(scgrp)); - cgroup_finalize_control(scgrp, 0); + if (src_root != &cgrp_dfl_root) { + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); @@ -2627,11 +2651,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset, if (src_cset->dead) return; - src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - if (!list_empty(&src_cset->mg_preload_node)) return; + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); @@ -3607,6 +3631,7 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { + struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; @@ -3618,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); + /* Allow only one trigger per file descriptor */ + if (ctx->psi.trigger) { + cgroup_put(cgrp); + return -EBUSY; + } + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { @@ -3625,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); } - psi_trigger_replace(&of->priv, new); - + smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; @@ -3656,12 +3686,16 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { - return psi_trigger_poll(&of->priv, of->file, pt); + struct cgroup_file_ctx *ctx = of->priv; + + return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static void cgroup_pressure_release(struct kernfs_open_file *of) { - psi_trigger_replace(&of->priv, NULL); + struct cgroup_file_ctx *ctx = of->priv; + + psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) @@ -3788,24 +3822,43 @@ static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx; + int ret; - if (cft->open) - return cft->open(of); - return 0; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->ns = current->nsproxy->cgroup_ns; + get_cgroup_ns(ctx->ns); + of->priv = ctx; + + if (!cft->open) + return 0; + + ret = cft->open(of); + if (ret) { + put_cgroup_ns(ctx->ns); + kfree(ctx); + } + return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); + put_cgroup_ns(ctx->ns); + kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; @@ -3822,7 +3875,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && - ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) @@ -4728,21 +4781,21 @@ void css_task_iter_end(struct css_task_iter *it) static void cgroup_procs_release(struct kernfs_open_file *of) { - if (of->priv) { - css_task_iter_end(of->priv); - kfree(of->priv); - } + struct cgroup_file_ctx *ctx = of->priv; + + if (ctx->procs.started) + css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; - return css_task_iter_next(it); + return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, @@ -4750,21 +4803,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ - if (!it) { + if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); - - it = kzalloc(sizeof(*it), GFP_KERNEL); - if (!it) - return ERR_PTR(-ENOMEM); - of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); + ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); @@ -4815,9 +4865,9 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb) + struct super_block *sb, + struct cgroup_namespace *ns) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *com_cgrp = src_cgrp; int ret; @@ -4846,11 +4896,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, static int cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb, bool threadgroup) + struct super_block *sb, bool threadgroup, + struct cgroup_namespace *ns) { int ret = 0; - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret; @@ -4867,8 +4918,10 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp, static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, bool threadgroup) { + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; + const struct cred *saved_cred; ssize_t ret; bool locked; @@ -4886,9 +4939,16 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); - /* process and thread migrations follow same delegation rule */ + /* + * Process and thread migrations follow same delegation rule. Check + * permissions using the credentials from file open to protect against + * inherited fd attacks. + */ + saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, threadgroup); + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); + revert_creds(saved_cred); if (ret) goto out_finish; @@ -5688,7 +5748,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) /* Create the root cgroup state for this subsystem */ ss->root = &cgrp_dfl_root; - css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); + css = ss->css_alloc(NULL); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); @@ -5911,17 +5971,20 @@ struct cgroup *cgroup_get_from_id(u64 id) struct kernfs_node *kn; struct cgroup *cgrp = NULL; - mutex_lock(&cgroup_mutex); kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) - goto out_unlock; + goto out; - cgrp = kn->priv; - if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp)) + rcu_read_lock(); + + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); + if (cgrp && !cgroup_tryget(cgrp)) cgrp = NULL; + + rcu_read_unlock(); + kernfs_put(kn); -out_unlock: - mutex_unlock(&cgroup_mutex); +out: return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_id); @@ -6104,7 +6167,8 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) goto err; ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, - !(kargs->flags & CLONE_THREAD)); + !(kargs->flags & CLONE_THREAD), + current->nsproxy->cgroup_ns); if (ret) goto err; @@ -6474,30 +6538,34 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on - * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR) - * if @path points to a non-directory. + * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already + * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory. */ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; - struct cgroup *cgrp; - - mutex_lock(&cgroup_mutex); + struct cgroup *cgrp = ERR_PTR(-ENOENT); kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); - if (kn) { - if (kernfs_type(kn) == KERNFS_DIR) { - cgrp = kn->priv; - cgroup_get_live(cgrp); - } else { - cgrp = ERR_PTR(-ENOTDIR); - } - kernfs_put(kn); - } else { - cgrp = ERR_PTR(-ENOENT); + if (!kn) + goto out; + + if (kernfs_type(kn) != KERNFS_DIR) { + cgrp = ERR_PTR(-ENOTDIR); + goto out_kernfs; } - mutex_unlock(&cgroup_mutex); + rcu_read_lock(); + + cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); + if (!cgrp || !cgroup_tryget(cgrp)) + cgrp = ERR_PTR(-ENOENT); + + rcu_read_unlock(); + +out_kernfs: + kernfs_put(kn); +out: return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_path); @@ -6625,44 +6693,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ -#ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_attach(struct cgroup *cgrp, - struct bpf_prog *prog, struct bpf_prog *replace_prog, - struct bpf_cgroup_link *link, - enum bpf_attach_type type, - u32 flags) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); - mutex_unlock(&cgroup_mutex); - return ret; -} - -int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); - mutex_unlock(&cgroup_mutex); - return ret; -} - -int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - int ret; - - mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_query(cgrp, attr, uattr); - mutex_unlock(&cgroup_mutex); - return ret; -} -#endif /* CONFIG_CGROUP_BPF */ - #ifdef CONFIG_SYSFS static ssize_t show_delegatable_files(struct cftype *files, char *buf, ssize_t size, const char *prefix) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2a9695ccb65f..4c7254e8f49a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -69,6 +69,13 @@ DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); +/* + * There could be abnormal cpuset configurations for cpu or memory + * node binding, add this key to provide a quick low-cost judgement + * of the situation. + */ +DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); + /* See "Frequency meter" comments, below. */ struct fmeter { @@ -372,6 +379,17 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); +static inline void check_insane_mems_config(nodemask_t *nodes) +{ + if (!cpusets_insane_config() && + movable_only_nodes(nodes)) { + static_branch_enable(&cpusets_insane_config_key); + pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" + "Cpuset allocations might fail even with a lot of memory available.\n", + nodemask_pr_args(nodes)); + } +} + /* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting @@ -573,6 +591,35 @@ static inline void free_cpuset(struct cpuset *cs) } /* + * validate_change_legacy() - Validate conditions specific to legacy (v1) + * behavior. + */ +static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + par = parent_cs(cur); + if (par && !is_cpuset_subset(trial, par)) + goto out; + + ret = 0; +out: + return ret; +} + +/* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. * @@ -596,28 +643,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; - int ret; + int ret = 0; rcu_read_lock(); - /* Each of our child cpusets must be a subset of us */ - ret = -EBUSY; - cpuset_for_each_child(c, css, cur) - if (!is_cpuset_subset(c, trial)) - goto out; + if (!is_in_v2_mode()) + ret = validate_change_legacy(cur, trial); + if (ret) + goto out; /* Remaining checks don't apply to root cpuset */ - ret = 0; if (cur == &top_cpuset) goto out; par = parent_cs(cur); - /* On legacy hierarchy, we must be a subset of our parent cpuset. */ - ret = -EACCES; - if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) - goto out; - /* * If either I or some sibling (!= me) is exclusive, we can't * overlap @@ -1165,9 +1205,7 @@ enum subparts_cmd { * * Because of the implicit cpu exclusive nature of a partition root, * cpumask changes that violates the cpu exclusivity rule will not be - * permitted when checked by validate_change(). The validate_change() - * function will also prevent any changes to the cpu list if it is not - * a superset of children's cpu lists. + * permitted when checked by validate_change(). */ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpumask *newmask, @@ -1512,10 +1550,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; + percpu_rwsem_assert_held(&cpuset_rwsem); + /* * Check all its siblings and call update_cpumasks_hier() * if their use_parent_ecpus flag is set in order for them * to use the right effective_cpus value. + * + * The update_cpumasks_hier() function may sleep. So we have to + * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { @@ -1523,8 +1566,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; if (!sibling->use_parent_ecpus) continue; + if (!css_tryget_online(&sibling->css)) + continue; + rcu_read_unlock(); update_cpumasks_hier(sibling, tmp); + rcu_read_lock(); + css_put(&sibling->css); } rcu_read_unlock(); } @@ -1597,8 +1645,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Make sure that subparts_cpus is a subset of cpus_allowed. */ if (cs->nr_subparts_cpus) { - cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus, - cs->cpus_allowed); + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); } spin_unlock_irq(&callback_lock); @@ -1870,6 +1917,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; + check_insane_mems_config(&trialcs->mems_allowed); + spin_lock_irq(&callback_lock); cs->mems_allowed = trialcs->mems_allowed; spin_unlock_irq(&callback_lock); @@ -3173,6 +3222,9 @@ update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); + if (mems_updated) + check_insane_mems_config(&new_mems); + if (is_in_v2_mode()) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); @@ -3513,7 +3565,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ - int allowed; /* is allocation in zone z allowed? */ + bool allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index ec02d963cad1..fe3e8a0eb7ed 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -157,13 +157,6 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, new_usage = atomic_long_add_return(amount, &res->usage); if (new_usage > READ_ONCE(res->max) || new_usage > READ_ONCE(misc_res_capacity[type])) { - if (!res->failed) { - pr_info("cgroup: charge rejected by the misc controller for %s resource in ", - misc_res_name[type]); - pr_cont_cgroup_path(i->css.cgroup); - pr_cont("\n"); - res->failed = true; - } ret = -EBUSY; goto err_charge; } @@ -171,6 +164,11 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, return 0; err_charge: + for (j = i; j; j = parent_misc(j)) { + atomic_long_inc(&j->res[type].events); + cgroup_file_notify(&j->events_file); + } + for (j = cg; j != i; j = parent_misc(j)) misc_cg_cancel_charge(type, j, amount); misc_cg_cancel_charge(type, i, amount); @@ -335,6 +333,19 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v) return 0; } +static int misc_events_show(struct seq_file *sf, void *v) +{ + struct misc_cg *cg = css_misc(seq_css(sf)); + unsigned long events, i; + + for (i = 0; i < MISC_CG_RES_TYPES; i++) { + events = atomic_long_read(&cg->res[i].events); + if (READ_ONCE(misc_res_capacity[i]) || events) + seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events); + } + return 0; +} + /* Misc cgroup interface files */ static struct cftype misc_cg_files[] = { { @@ -353,6 +364,12 @@ static struct cftype misc_cg_files[] = { .seq_show = misc_cg_capacity_show, .flags = CFTYPE_ONLY_ON_ROOT, }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct misc_cg, events_file), + .seq_show = misc_events_show, + }, {} }; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index b264ab5652ba..9d331ba44870 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -35,7 +35,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. */ - if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) + if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) return; raw_spin_lock_irqsave(cpu_lock, flags); @@ -88,6 +88,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, struct cgroup *root, int cpu) { struct cgroup_rstat_cpu *rstatc; + struct cgroup *parent; if (pos == root) return NULL; @@ -96,10 +97,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * We're gonna walk down to the first leaf and visit/remove it. We * can pick whatever unvisited node as the starting point. */ - if (!pos) + if (!pos) { pos = root; - else + /* return NULL if this subtree is not on-list */ + if (!cgroup_rstat_cpu(pos, cpu)->updated_next) + return NULL; + } else { pos = cgroup_parent(pos); + } /* walk down to the first leaf */ while (true) { @@ -115,33 +120,25 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * However, due to the way we traverse, @pos will be the first * child in most cases. The only exception is @root. */ - if (rstatc->updated_next) { - struct cgroup *parent = cgroup_parent(pos); - - if (parent) { - struct cgroup_rstat_cpu *prstatc; - struct cgroup **nextp; - - prstatc = cgroup_rstat_cpu(parent, cpu); - nextp = &prstatc->updated_children; - while (true) { - struct cgroup_rstat_cpu *nrstatc; - - nrstatc = cgroup_rstat_cpu(*nextp, cpu); - if (*nextp == pos) - break; - WARN_ON_ONCE(*nextp == parent); - nextp = &nrstatc->updated_next; - } - *nextp = rstatc->updated_next; - } + parent = cgroup_parent(pos); + if (parent) { + struct cgroup_rstat_cpu *prstatc; + struct cgroup **nextp; - rstatc->updated_next = NULL; - return pos; + prstatc = cgroup_rstat_cpu(parent, cpu); + nextp = &prstatc->updated_children; + while (*nextp != pos) { + struct cgroup_rstat_cpu *nrstatc; + + nrstatc = cgroup_rstat_cpu(*nextp, cpu); + WARN_ON_ONCE(*nextp == parent); + nextp = &nrstatc->updated_next; + } + *nextp = rstatc->updated_next; } - /* only happens for @root */ - return NULL; + rstatc->updated_next = NULL; + return pos; } /* see cgroup_rstat_flush() */ @@ -433,8 +430,6 @@ static void root_cgroup_cputime(struct task_cputime *cputime) cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; - cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST]; - cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE]; } } |
