Diffstat (limited to 'kernel/cgroup')
-rw-r--r--  kernel/cgroup/cgroup-internal.h |  14
-rw-r--r--  kernel/cgroup/cgroup-v1.c       |  18
-rw-r--r--  kernel/cgroup/cgroup.c          | 394
-rw-r--r--  kernel/cgroup/cpuset-internal.h |  59
-rw-r--r--  kernel/cgroup/cpuset-v1.c       | 272
-rw-r--r--  kernel/cgroup/cpuset.c          | 875
-rw-r--r--  kernel/cgroup/debug.c           |   4
-rw-r--r--  kernel/cgroup/dmem.c            |  83
-rw-r--r--  kernel/cgroup/legacy_freezer.c  |   2
-rw-r--r--  kernel/cgroup/misc.c            |   2
-rw-r--r--  kernel/cgroup/namespace.c       |   2
-rw-r--r--  kernel/cgroup/pids.c            |   2
-rw-r--r--  kernel/cgroup/rdma.c            |  10
13 files changed, 983 insertions, 754 deletions
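
A recurring theme in the cgroup core hunks below is widening the controller bitmask from u16 to u32 (subsys_mask, ss_mask, cgroup_no_v1_mask, the U16_MAX/U32_MAX constants, and the BUILD_BUG_ON in cgroup_init() moving from 16 to 32). The following is a minimal, self-contained userspace sketch of that bitmask pattern, not kernel code; the SUBSYS_COUNT value and names are stand-ins chosen for illustration only.

/* Sketch only: each controller owns one bit in the mask, so the mask type
 * caps how many controllers can exist; moving the type from uint16_t to
 * uint32_t lifts the compile-time cap from 16 to 32. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SUBSYS_COUNT 14u                 /* stand-in value, not the kernel's */
typedef uint32_t ss_mask_t;              /* was effectively uint16_t before  */

static_assert(SUBSYS_COUNT <= 32, "mask type too narrow for controller count");

static int ss_enabled(ss_mask_t mask, unsigned int ssid)
{
	return (mask >> ssid) & 1u;
}

int main(void)
{
	ss_mask_t mask = 0;

	mask |= (ss_mask_t)1 << 3;       /* enable controller id 3 */
	printf("id 3 enabled: %d\n", ss_enabled(mask, 3));
	printf("id 5 enabled: %d\n", ss_enabled(mask, 5));
	return 0;
}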
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 22051b4f1ccb..58797123b752 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -52,7 +52,7 @@ struct cgroup_fs_context { bool cpuset_clone_children; bool none; /* User explicitly requested empty subsystem */ bool all_ss; /* Seen 'all' option */ - u16 subsys_mask; /* Selected subsystems */ + u32 subsys_mask; /* Selected subsystems */ char *name; /* Hierarchy name */ char *release_agent; /* Path for release notifications */ }; @@ -146,7 +146,7 @@ struct cgroup_mgctx { struct cgroup_taskset tset; /* subsystems affected by migration */ - u16 ss_mask; + u32 ss_mask; }; #define CGROUP_TASKSET_INIT(tset) \ @@ -184,11 +184,6 @@ extern bool cgrp_dfl_visible; for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) -static inline bool cgroup_is_dead(const struct cgroup *cgrp) -{ - return !(cgrp->self.flags & CSS_ONLINE); -} - static inline bool notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -222,7 +217,6 @@ static inline void get_css_set(struct css_set *cset) } bool cgroup_ssid_enabled(int ssid); -bool cgroup_on_dfl(const struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, @@ -235,8 +229,8 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup_favor_dynmods(struct cgroup_root *root, bool favor); void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_fs_context *ctx); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask); +int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask); int cgroup_do_get_tree(struct fs_context *fc); int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index a9e029b570c8..a4337c9b5287 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -28,7 +28,7 @@ #define CGROUP_PIDLIST_DESTROY_DELAY HZ /* Controllers blocked by the commandline in v1 */ -static u16 cgroup_no_v1_mask; +static u32 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; @@ -317,7 +317,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, return l; /* entry not found; create a new one */ - l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + l = kzalloc_obj(struct cgroup_pidlist); if (!l) return l; @@ -352,7 +352,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, * show up until sometime later on. 
*/ length = cgroup_task_count(cgrp); - array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL); + array = kvmalloc_objs(pid_t, length); if (!array) return -ENOMEM; /* now, populate the array */ @@ -1037,13 +1037,13 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) static int check_cgroupfs_options(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); - u16 mask = U16_MAX; - u16 enabled = 0; + u32 mask = U32_MAX; + u32 enabled = 0; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); + mask = ~((u32)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && @@ -1095,7 +1095,7 @@ int cgroup1_reconfigure(struct fs_context *fc) struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); int ret = 0; - u16 added_mask, removed_mask; + u32 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); @@ -1237,7 +1237,7 @@ static int cgroup1_root_to_use(struct fs_context *fc) if (ctx->ns != &init_cgroup_ns) return -EPERM; - root = kzalloc(sizeof(*root), GFP_KERNEL); + root = kzalloc_obj(*root); if (!root) return -ENOMEM; @@ -1343,7 +1343,7 @@ static int __init cgroup_no_v1(char *str) continue; if (!strcmp(token, "all")) { - cgroup_no_v1_mask = U16_MAX; + cgroup_no_v1_mask = U32_MAX; continue; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 5f0d33b04910..6152add0c5eb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -69,14 +69,6 @@ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* - * To avoid confusing the compiler (and generating warnings) with code - * that attempts to access what would be a 0-element array (i.e. sized - * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this - * constant expression can be added. - */ -#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) - -/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * @@ -107,12 +99,6 @@ static bool cgroup_debug __read_mostly; */ static DEFINE_SPINLOCK(cgroup_idr_lock); -/* - * Protects cgroup_file->kn for !self csses. It synchronizes notifications - * against file removal/re-creation across css hiding. - */ -static DEFINE_SPINLOCK(cgroup_file_kn_lock); - DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ @@ -203,13 +189,13 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ -static u16 cgrp_dfl_inhibit_ss_mask; +static u32 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ -static u16 cgrp_dfl_implicit_ss_mask; +static u32 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ -static u16 cgrp_dfl_threaded_ss_mask; +static u32 cgrp_dfl_threaded_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); @@ -231,10 +217,10 @@ static u64 css_serial_nr_next = 1; * These bitmasks identify subsystems with specific features to avoid * having to do iterative checks repeatedly. 
*/ -static u16 have_fork_callback __read_mostly; -static u16 have_exit_callback __read_mostly; -static u16 have_release_callback __read_mostly; -static u16 have_canfork_callback __read_mostly; +static u32 have_fork_callback __read_mostly; +static u32 have_exit_callback __read_mostly; +static u32 have_release_callback __read_mostly; +static u32 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); @@ -278,10 +264,12 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); +static void cgroup_finish_destroy(struct cgroup *cgrp); +static void kill_css_sync(struct cgroup_subsys_state *css); +static void kill_css_finish(struct cgroup_subsys_state *css); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); -static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -472,13 +460,13 @@ static bool cgroup_is_valid_domain(struct cgroup *cgrp) } /* subsystems visibly enabled on a cgroup */ -static u16 cgroup_control(struct cgroup *cgrp) +static u32 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); - u16 root_ss_mask = cgrp->root->subsys_mask; + u32 root_ss_mask = cgrp->root->subsys_mask; if (parent) { - u16 ss_mask = parent->subtree_control; + u32 ss_mask = parent->subtree_control; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) @@ -493,12 +481,12 @@ static u16 cgroup_control(struct cgroup *cgrp) } /* subsystems enabled on a cgroup */ -static u16 cgroup_ss_mask(struct cgroup *cgrp) +static u32 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); if (parent) { - u16 ss_mask = parent->subtree_ss_mask; + u32 ss_mask = parent->subtree_ss_mask; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) @@ -510,27 +498,6 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp) } /** - * cgroup_css - obtain a cgroup's css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns @cgrp->self) - * - * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This - * function must be called either under cgroup_mutex or rcu_read_lock() and - * the caller is responsible for pinning the returned css if it wants to - * keep accessing it outside the said locks. This function may return - * %NULL if @cgrp doesn't have @subsys_id enabled. 
- */ -static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - if (CGROUP_HAS_SUBSYS_CONFIG && ss) - return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_mutex)); - else - return &cgrp->self; -} - -/** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) @@ -741,32 +708,6 @@ EXPORT_SYMBOL_GPL(of_css); } \ } while (false) -/* iterate over child cgrps, lock should be held throughout iteration */ -#define cgroup_for_each_live_child(child, cgrp) \ - list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - cgroup_is_dead(child); })) \ - ; \ - else - -/* walk live descendants in pre order */ -#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ - css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - (dsct) = (d_css)->cgroup; \ - cgroup_is_dead(dsct); })) \ - ; \ - else - -/* walk live descendants in postorder */ -#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ - css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - (dsct) = (d_css)->cgroup; \ - cgroup_is_dead(dsct); })) \ - ; \ - else - /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state @@ -858,6 +799,16 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (was_populated == cgroup_is_populated(cgrp)) break; + /* + * Subtree just emptied below an offlined cgrp. Fire deferred + * destroy. The transition is one-shot. + */ + if (was_populated && !css_is_online(&cgrp->self)) { + cgroup_get(cgrp); + WARN_ON_ONCE(!queue_work(cgroup_offline_wq, + &cgrp->finish_destroy_work)); + } + cgroup1_check_for_release(cgrp); TRACE_CGROUP_PATH(notify_populated, cgrp, cgroup_is_populated(cgrp)); @@ -1168,7 +1119,7 @@ static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) INIT_LIST_HEAD(tmp_links); for (i = 0; i < count; i++) { - link = kzalloc(sizeof(*link), GFP_KERNEL); + link = kzalloc_obj(*link); if (!link) { free_cgrp_cset_links(tmp_links); return -ENOMEM; @@ -1241,7 +1192,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, if (cset) return cset; - cset = kzalloc(sizeof(*cset), GFP_KERNEL); + cset = kzalloc_obj(*cset); if (!cset) return NULL; @@ -1633,9 +1584,9 @@ static umode_t cgroup_file_mode(const struct cftype *cft) * This function calculates which subsystems need to be enabled if * @subtree_control is to be applied while restricted to @this_ss_mask. 
*/ -static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) +static u32 cgroup_calc_subtree_ss_mask(u32 subtree_control, u32 this_ss_mask) { - u16 cur_ss_mask = subtree_control; + u32 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; @@ -1644,7 +1595,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { - u16 new_ss_mask = cur_ss_mask; + u32 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; @@ -1748,9 +1699,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); struct cgroup_file *cfile = (void *)css + cft->file_offset; - spin_lock_irq(&cgroup_file_kn_lock); - cfile->kn = NULL; - spin_unlock_irq(&cgroup_file_kn_lock); + spin_lock_irq(&cfile->lock); + WRITE_ONCE(cfile->kn, NULL); + spin_unlock_irq(&cfile->lock); timer_delete_sync(&cfile->notify_timer); } @@ -1848,12 +1799,12 @@ err: return ret; } -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) +int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, ret; - u16 dfl_disable_ss_mask = 0; + u32 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -2100,6 +2051,16 @@ static int cgroup_reconfigure(struct fs_context *fc) return 0; } +static void cgroup_finish_destroy_work_fn(struct work_struct *work) +{ + struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work); + + cgroup_lock(); + cgroup_finish_destroy(cgrp); + cgroup_unlock(); + cgroup_put(cgrp); +} + static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -2126,6 +2087,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) #endif init_waitqueue_head(&cgrp->offline_waitq); + INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } @@ -2149,7 +2111,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -2350,7 +2312,7 @@ static int cgroup_init_fs_context(struct fs_context *fc) { struct cgroup_fs_context *ctx; - ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL); + ctx = kzalloc_obj(struct cgroup_fs_context); if (!ctx) return -ENOMEM; @@ -2608,6 +2570,7 @@ static void cgroup_migrate_add_task(struct task_struct *task, mgctx->tset.nr_tasks++; + css_set_skip_task_iters(cset, task); list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, @@ -3131,7 +3094,7 @@ void cgroup_procs_write_finish(struct task_struct *task, put_task_struct(task); } -static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, u32 ss_mask) { struct cgroup_subsys *ss; bool printed = false; @@ -3434,7 +3397,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { - kill_css(css); + kill_css_sync(css); + kill_css_finish(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) @@ -3496,9 +3460,9 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) 
cgroup_apply_control_disable(cgrp); } -static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) +static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u32 enable) { - u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; + u32 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* if nothing is getting enabled, nothing to worry about */ if (!enable) @@ -3541,7 +3505,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - u16 enable = 0, disable = 0; + u32 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -3993,33 +3957,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { - struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_file_ctx *ctx; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; + ssize_t ret = 0; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - cgroup_get(cgrp); - cgroup_kn_unlock(of->kn); + ctx = of->priv; + if (!ctx) { + ret = -ENODEV; + goto out_unlock; + } /* Allow only one trigger per file descriptor */ if (ctx->psi.trigger) { - cgroup_put(cgrp); - return -EBUSY; + ret = -EBUSY; + goto out_unlock; } psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { - cgroup_put(cgrp); - return PTR_ERR(new); + ret = PTR_ERR(new); + goto out_unlock; } smp_store_release(&ctx->psi.trigger, new); - cgroup_put(cgrp); + +out_unlock: + cgroup_kn_unlock(of->kn); + if (ret) + return ret; return nbytes; } @@ -4251,7 +4223,7 @@ static int cgroup_file_open(struct kernfs_open_file *of) struct cgroup_file_ctx *ctx; int ret; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc_obj(*ctx); if (!ctx) return -ENOMEM; @@ -4427,10 +4399,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); - - spin_lock_irq(&cgroup_file_kn_lock); + spin_lock_init(&cfile->lock); cfile->kn = kn; - spin_unlock_irq(&cgroup_file_kn_lock); } return 0; @@ -4685,21 +4655,32 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) */ void cgroup_file_notify(struct cgroup_file *cfile) { - unsigned long flags; + unsigned long flags, last, next; + struct kernfs_node *kn = NULL; + + if (!READ_ONCE(cfile->kn)) + return; + + last = READ_ONCE(cfile->notified_at); + next = last + CGROUP_FILE_NOTIFY_MIN_INTV; + if (time_in_range(jiffies, last, next)) { + timer_reduce(&cfile->notify_timer, next); + if (timer_pending(&cfile->notify_timer)) + return; + } - spin_lock_irqsave(&cgroup_file_kn_lock, flags); + spin_lock_irqsave(&cfile->lock, flags); if (cfile->kn) { - unsigned long last = cfile->notified_at; - unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; + kn = cfile->kn; + kernfs_get(kn); + WRITE_ONCE(cfile->notified_at, jiffies); + } + spin_unlock_irqrestore(&cfile->lock, flags); - if (time_in_range(jiffies, last, next)) { - timer_reduce(&cfile->notify_timer, next); - } else { - kernfs_notify(cfile->kn); - cfile->notified_at = jiffies; - } + if (kn) { + kernfs_notify(kn); + kernfs_put(kn); } - spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } EXPORT_SYMBOL_GPL(cgroup_file_notify); @@ -4712,10 +4693,10 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; - 
spin_lock_irq(&cgroup_file_kn_lock); + spin_lock_irq(&cfile->lock); kn = cfile->kn; kernfs_get(kn); - spin_unlock_irq(&cgroup_file_kn_lock); + spin_unlock_irq(&cfile->lock); if (kn) kernfs_show(kn, show); @@ -4945,7 +4926,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css) rcu_read_lock(); css_for_each_child(child, css) { - if (child->flags & CSS_ONLINE) { + if (css_is_online(child)) { ret = true; break; } @@ -5108,6 +5089,14 @@ repeat: return; task = list_entry(it->task_pos, struct task_struct, cg_list); + /* + * Hide tasks that are exiting but not yet removed by default. Keep + * zombie leaders with live threads visible. Usages that need to walk + * every existing task can opt out via CSS_TASK_ITER_WITH_DEAD. + */ + if (!(it->flags & CSS_TASK_ITER_WITH_DEAD) && + (task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) + goto repeat; if (it->flags & CSS_TASK_ITER_PROCS) { /* if PROCS, skip over tasks which aren't group leaders */ @@ -5550,7 +5539,7 @@ static struct cftype cgroup_psi_files[] = { * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. - * Implemented in kill_css(). + * Implemented in kill_css_finish(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be @@ -5750,7 +5739,7 @@ static void offline_css(struct cgroup_subsys_state *css) lockdep_assert_held(&cgroup_mutex); - if (!(css->flags & CSS_ONLINE)) + if (!css_is_online(css)) return; if (ss->css_offline) @@ -5760,16 +5749,6 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); - - css->cgroup->nr_dying_subsys[ss->id]++; - /* - * Parent css and cgroup cannot be freed until after the freeing - * of child css, see css_free_rwork_fn(). - */ - while ((css = css->parent)) { - css->nr_descendants--; - css->cgroup->nr_dying_subsys[ss->id]++; - } } /** @@ -5844,7 +5823,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(struct_size(cgrp, _low_ancestors, level), GFP_KERNEL); + cgrp = kzalloc_flex(*cgrp, _low_ancestors, level); if (!cgrp) return ERR_PTR(-ENOMEM); @@ -6039,12 +6018,13 @@ out_unlock: /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to - * initiate destruction and put the css ref from kill_css(). + * initiate destruction and put the css ref from kill_css_finish(). 
*/ static void css_killed_work_fn(struct work_struct *work) { - struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys_state *css; + + css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork); cgroup_lock(); @@ -6065,22 +6045,21 @@ static void css_killed_ref_fn(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); if (atomic_dec_and_test(&css->online_cnt)) { - INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_offline_wq, &css->destroy_work); + INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn); + queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork); } } /** - * kill_css - destroy a css - * @css: css to destroy + * kill_css_sync - synchronous half of css teardown + * @css: css being killed * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget_online() is guaranteed to fail and when - * the reference count reaches zero, @css will be released. + * See cgroup_destroy_locked(). */ -static void kill_css(struct cgroup_subsys_state *css) +static void kill_css_sync(struct cgroup_subsys_state *css) { + struct cgroup_subsys *ss = css->ss; + lockdep_assert_held(&cgroup_mutex); if (css->flags & CSS_DYING) @@ -6100,64 +6079,100 @@ static void kill_css(struct cgroup_subsys_state *css) */ css_clear_dir(css); + css->cgroup->nr_dying_subsys[ss->id]++; + /* + * Parent css and cgroup cannot be freed until after the freeing + * of child css, see css_free_rwork_fn(). + */ + while ((css = css->parent)) { + css->nr_descendants--; + css->cgroup->nr_dying_subsys[ss->id]++; + } +} + +/** + * kill_css_finish - deferred half of css teardown + * @css: css being killed + * + * See cgroup_destroy_locked(). + */ +static void kill_css_finish(struct cgroup_subsys_state *css) +{ + lockdep_assert_held(&cgroup_mutex); + + /* + * Skip on re-entry: cgroup_apply_control_disable() may have killed @css + * earlier. cgroup_destroy_locked() can still walk it because + * offline_css() (which NULLs cgrp->subsys[ssid]) runs async. + */ + if (percpu_ref_is_dying(&css->refcnt)) + return; + /* - * Killing would put the base ref, but we need to keep it alive - * until after ->css_offline(). + * Killing would put the base ref, but we need to keep it alive until + * after ->css_offline(). */ css_get(css); /* - * cgroup core guarantees that, by the time ->css_offline() is - * invoked, no new css reference will be given out via - * css_tryget_online(). We can't simply call percpu_ref_kill() and - * proceed to offlining css's because percpu_ref_kill() doesn't - * guarantee that the ref is seen as killed on all CPUs on return. + * cgroup core guarantees that, by the time ->css_offline() is invoked, + * no new css reference will be given out via css_tryget_online(). We + * can't simply call percpu_ref_kill() and proceed to offlining css's + * because percpu_ref_kill() doesn't guarantee that the ref is seen as + * killed on all CPUs on return. * - * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. + * Use percpu_ref_kill_and_confirm() to get notifications as each css is + * confirmed to be seen as killed on all CPUs. 
*/ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } /** - * cgroup_destroy_locked - the first stage of cgroup destruction + * cgroup_destroy_locked - destroy @cgrp (called on rmdir) * @cgrp: cgroup to be destroyed * - * css's make use of percpu refcnts whose killing latency shouldn't be - * exposed to userland and are RCU protected. Also, cgroup core needs to - * guarantee that css_tryget_online() won't succeed by the time - * ->css_offline() is invoked. To satisfy all the requirements, - * destruction is implemented in the following two steps. - * - * s1. Verify @cgrp can be destroyed and mark it dying. Remove all - * userland visible parts and start killing the percpu refcnts of - * css's. Set up so that the next stage will be kicked off once all - * the percpu refcnts are confirmed to be killed. - * - * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the - * rest of destruction. Once all cgroup references are gone, the - * cgroup is RCU-freed. - * - * This function implements s1. After this step, @cgrp is gone as far as - * the userland is concerned and a new cgroup with the same name may be - * created. As cgroup doesn't care about the names internally, this - * doesn't cause any problem. + * Tear down @cgrp on behalf of rmdir. Constraints: + * + * - Userspace: rmdir must succeed when cgroup.procs and friends are empty. + * + * - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's + * subtree is still doing kernel work. A task hidden from cgroup.procs (past + * exit_signals() with signal->live cleared) can still schedule, allocate, and + * consume resources until its final context switch. Dying descendants in the + * subtree can host such tasks too. + * + * - Kernel: css_tryget_online() must fail by the time ->css_offline() runs. + * + * The destruction runs in three parts: + * + * - This function: synchronous user-visible state teardown plus kill_css_sync() + * on each subsystem css. + * + * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on + * each subsystem css. Fires once @cgrp's subtree is fully drained, either + * inline here or from cgroup_update_populated(). + * + * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn -> + * ->css_offline() -> release/free. + * + * Return 0 on success, -EBUSY if a userspace-visible task or an online child + * remains. */ static int cgroup_destroy_locked(struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; int ssid, ret; lockdep_assert_held(&cgroup_mutex); - /* - * Only migration can raise populated from zero and we're already - * holding cgroup_mutex. 
- */ - if (cgroup_is_populated(cgrp)) + css_task_iter_start(&cgrp->self, 0, &it); + task = css_task_iter_next(&it); + css_task_iter_end(&it); + if (task) return -EBUSY; /* @@ -6181,9 +6196,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) link->cset->dead = true; spin_unlock_irq(&css_set_lock); - /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) - kill_css(css); + kill_css_sync(css); /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ css_clear_dir(&cgrp->self); @@ -6214,9 +6228,29 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); + if (!cgroup_is_populated(cgrp)) + cgroup_finish_destroy(cgrp); + return 0; }; +/** + * cgroup_finish_destroy - deferred half of @cgrp destruction + * @cgrp: cgroup whose subtree just became empty + * + * See cgroup_destroy_locked() for the rationale. + */ +static void cgroup_finish_destroy(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *css; + int ssid; + + lockdep_assert_held(&cgroup_mutex); + + for_each_css(css, ssid, cgrp) + kill_css_finish(css); +} + int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; @@ -6347,7 +6381,7 @@ int __init cgroup_init(void) struct cgroup_subsys *ss; int ssid; - BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 01976c8e7d49..bb4e692bea30 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -9,6 +9,7 @@ #include <linux/cpuset.h> #include <linux/spinlock.h> #include <linux/union_find.h> +#include <linux/sched/isolation.h> /* See "Frequency meter" comments, below. */ @@ -144,17 +145,12 @@ struct cpuset { */ nodemask_t old_mems_allowed; - struct fmeter fmeter; /* memory_pressure filter */ - /* * Tasks are being attached to this cpuset. Used to prevent * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). */ int attach_in_progress; - /* for custom sched domain */ - int relax_domain_level; - /* partition root state */ int partition_root_state; @@ -172,6 +168,11 @@ struct cpuset { int nr_deadline_tasks; int nr_migrate_dl_tasks; u64 sum_migrate_dl_bw; + /* + * CPU used for temporary DL bandwidth allocation during attach; + * -1 if no DL bandwidth was allocated in the current attach. + */ + int dl_bw_cpu; /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; @@ -179,10 +180,19 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; +#ifdef CONFIG_CPUSETS_V1 + struct fmeter fmeter; /* memory_pressure filter */ + + /* for custom sched domain */ + int relax_domain_level; + /* Used to merge intersecting subsets for generate_sched_domains */ struct uf_node node; +#endif }; +extern struct cpuset top_cpuset; + static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) { return css ? container_of(css, struct cpuset, css) : NULL; @@ -240,6 +250,30 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } +/* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? 
+ */ +static inline int cpusets_overlap(struct cpuset *a, struct cpuset *b) +{ + return cpumask_intersects(a->effective_cpus, b->effective_cpus); +} + +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key.key) + 1; +} + +static inline bool cpuset_is_populated(struct cpuset *cs) +{ + lockdep_assert_cpuset_lock_held(); + + /* Cpusets in the process of attaching should be considered as populated */ + return cgroup_is_populated(cs->css.cgroup) || + cs->attach_in_progress; +} + /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child @@ -285,7 +319,6 @@ void cpuset_full_unlock(void); */ #ifdef CONFIG_CPUSETS_V1 extern struct cftype cpuset1_files[]; -void fmeter_init(struct fmeter *fmp); void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk); void cpuset1_update_tasks_flags(struct cpuset *cs); @@ -293,8 +326,13 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2); +void cpuset1_init(struct cpuset *cs); +void cpuset1_online_css(struct cgroup_subsys_state *css); +int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes); + #else -static inline void fmeter_init(struct fmeter *fmp) {} static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) {} static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {} @@ -303,6 +341,13 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, bool cpus_updated, bool mems_updated) {} static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } +static inline bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, + struct cpuset *cs2) { return false; } +static inline void cpuset1_init(struct cpuset *cs) {} +static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {} +static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) { return 0; }; + #endif /* CONFIG_CPUSETS_V1 */ #endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 12e76774c75b..7308e9b02495 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -62,7 +62,7 @@ struct cpuset_remove_tasks_struct { #define FM_SCALE 1000 /* faux fixed point scale */ /* Initialize a frequency meter */ -void fmeter_init(struct fmeter *fmp) +static void fmeter_init(struct fmeter *fmp) { fmp->cnt = 0; fmp->val = 0; @@ -316,7 +316,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, css_tryget_online(&cs->css)) { struct cpuset_remove_tasks_struct *s; - s = kzalloc(sizeof(*s), GFP_KERNEL); + s = kzalloc_obj(*s); if (WARN_ON_ONCE(!s)) { css_put(&cs->css); return; @@ -368,11 +368,44 @@ int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) if (par && !is_cpuset_subset(trial, par)) goto out; + /* + * Cpusets with tasks - existing or newly being attached - can't + * be changed to have empty cpus_allowed or mems_allowed. 
+ */ + ret = -ENOSPC; + if (cpuset_is_populated(cur)) { + if (!cpumask_empty(cur->cpus_allowed) && + cpumask_empty(trial->cpus_allowed)) + goto out; + if (!nodes_empty(cur->mems_allowed) && + nodes_empty(trial->mems_allowed)) + goto out; + } + ret = 0; out: return ret; } +/* + * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts + * to legacy (v1) + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect. + */ +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return cpumask_intersects(cs1->cpus_allowed, + cs2->cpus_allowed); + + return false; +} + #ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() @@ -499,6 +532,241 @@ out_unlock: return retval; } +void cpuset1_init(struct cpuset *cs) +{ + fmeter_init(&cs->fmeter); + cs->relax_domain_level = -1; +} + +void cpuset1_online_css(struct cgroup_subsys_state *css) +{ + struct cpuset *tmp_cs; + struct cgroup_subsys_state *pos_css; + struct cpuset *cs = css_cs(css); + struct cpuset *parent = parent_cs(cs); + + lockdep_assert_cpus_held(); + lockdep_assert_cpuset_lock_held(); + + if (is_spread_page(parent)) + set_bit(CS_SPREAD_PAGE, &cs->flags); + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); + + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) + return; + + /* + * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is + * set. This flag handling is implemented in cgroup core for + * historical reasons - the flag may be specified during mount. + * + * Currently, if any sibling cpusets have exclusive cpus or mem, we + * refuse to clone the configuration - thereby refusing the task to + * be entered, and as a result refusing the sys_unshare() or + * clone() which initiated it. If this becomes a problem for some + * users who wish to allow that scenario, then this could be + * changed to grant parent->cpus_allowed-sibling_cpus_exclusive + * (and likewise for mems) to the new cgroup. 
+ */ + rcu_read_lock(); + cpuset_for_each_child(tmp_cs, pos_css, parent) { + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + + cpuset_callback_lock_irq(); + cs->mems_allowed = parent->mems_allowed; + cs->effective_mems = parent->mems_allowed; + cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); + cpuset_callback_unlock_irq(); +} + +static void +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) +{ + if (dattr->relax_domain_level < c->relax_domain_level) + dattr->relax_domain_level = c->relax_domain_level; +} + +static void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs) +{ + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + /* skip the whole subtree if @cp doesn't have any CPU */ + if (cpumask_empty(cp->cpus_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + if (is_sched_load_balance(cp)) + update_domain_attr(dattr, cp); + } + rcu_read_unlock(); +} + +/* + * cpuset1_generate_sched_domains() + * + * Finding the best partition (set of domains): + * The double nested loops below over i, j scan over the load + * balanced cpusets (using the array of cpuset pointers in csa[]) + * looking for pairs of cpusets that have overlapping cpus_allowed + * and merging them using a union-find algorithm. + * + * The union of the cpus_allowed masks from the set of all cpusets + * having the same root then form the one element of the partition + * (one sched domain) to be passed to partition_sched_domains(). + */ +int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) +{ + struct cpuset *cp; /* top-down scan of cpusets */ + struct cpuset **csa; /* array of all cpuset ptrs */ + int csn; /* how many cpuset ptrs in csa so far */ + int i, j; /* indices for partition finding loops */ + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ + struct sched_domain_attr *dattr; /* attributes for custom domains */ + int ndoms = 0; /* number of sched domains in result */ + int nslot; /* next empty doms[] struct cpumask slot */ + struct cgroup_subsys_state *pos_css; + int nslot_update; + + lockdep_assert_cpuset_lock_held(); + + doms = NULL; + dattr = NULL; + csa = NULL; + + /* Special case for the 99% of systems with one, full, sched domain */ + if (is_sched_load_balance(&top_cpuset)) { + ndoms = 1; + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + dattr = kmalloc_obj(struct sched_domain_attr); + if (dattr) { + *dattr = SD_ATTR_INIT; + update_domain_attr_tree(dattr, &top_cpuset); + } + cpumask_and(doms[0], top_cpuset.effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + + goto done; + } + + csa = kmalloc_objs(cp, nr_cpusets()); + if (!csa) + goto done; + csn = 0; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { + if (cp == &top_cpuset) + continue; + + /* + * Continue traversing beyond @cp iff @cp has some CPUs and + * isn't load balancing. The former is obvious. The + * latter: All child cpusets contain a subset of the + * parent's cpus, so just skip them, and then we call + * update_domain_attr_tree() to calc relax_domain_level of + * the corresponding sched domain. 
+ */ + if (!cpumask_empty(cp->cpus_allowed) && + !(is_sched_load_balance(cp) && + cpumask_intersects(cp->cpus_allowed, + housekeeping_cpumask(HK_TYPE_DOMAIN)))) + continue; + + if (is_sched_load_balance(cp) && + !cpumask_empty(cp->effective_cpus)) + csa[csn++] = cp; + + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + continue; + } + rcu_read_unlock(); + + for (i = 0; i < csn; i++) + uf_node_init(&csa[i]->node); + + /* Merge overlapping cpusets */ + for (i = 0; i < csn; i++) { + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) + uf_union(&csa[i]->node, &csa[j]->node); + } + } + + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; + } + + /* + * Now we know how many domains to create. + * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. + */ + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + /* + * The rest of the code, including the scheduler, can deal with + * dattr==NULL case. No need to abort if alloc fails. + */ + dattr = kmalloc_objs(struct sched_domain_attr, ndoms); + + for (nslot = 0, i = 0; i < csn; i++) { + nslot_update = 0; + for (j = i; j < csn; j++) { + if (uf_find(&csa[j]->node) == &csa[i]->node) { + struct cpumask *dp = doms[nslot]; + + if (i == j) { + nslot_update = 1; + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + } + cpumask_or(dp, dp, csa[j]->effective_cpus); + cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); + if (dattr) + update_domain_attr_tree(dattr + nslot, csa[j]); + } + } + if (nslot_update) + nslot++; + } + BUG_ON(nslot != ndoms); + +done: + kfree(csa); + + /* + * Fallback to the default domain if kmalloc() failed. + * See comments in partition_sched_domains(). + */ + if (doms == NULL) + ndoms = 1; + + *domains = doms; + *attributes = dattr; + return ndoms; +} + /* * for the common functions, 'private' gives the type of file */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c06e2e96f79d..e3a081a07c6d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -26,7 +26,6 @@ #include <linux/mempolicy.h> #include <linux/mm.h> #include <linux/memory.h> -#include <linux/export.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/deadline.h> @@ -63,6 +62,75 @@ static const char * const perr_strings[] = { }; /* + * CPUSET Locking Convention + * ------------------------- + * + * Below are the four global/local locks guarding cpuset structures in lock + * acquisition order: + * - cpuset_top_mutex + * - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock) + * - cpuset_mutex + * - callback_lock (raw spinlock) + * + * As cpuset will now indirectly flush a number of different workqueues in + * housekeeping_update() to update housekeeping cpumasks when the set of + * isolated CPUs is going to be changed, it may be vulnerable to deadlock + * if we hold cpus_read_lock while calling into housekeeping_update(). + * + * The first cpuset_top_mutex will be held except when calling into + * cpuset_handle_hotplug() from the CPU hotplug code where cpus_write_lock + * and cpuset_mutex will be held instead. The main purpose of this mutex + * is to prevent regular cpuset control file write actions from interfering + * with the call to housekeeping_update(), though CPU hotplug operation can + * still happen in parallel. This mutex also provides protection for some + * internal variables. 
+ * + * A task must hold all the remaining three locks to modify externally visible + * or used fields of cpusets, though some of the internally used cpuset fields + * and internal variables can be modified without holding callback_lock. If only + * reliable read access of the externally used fields are needed, a task can + * hold either cpuset_mutex or callback_lock which are exposed to other + * external subsystems. + * + * If a task holds cpu_hotplug_lock and cpuset_mutex, it blocks others, + * ensuring that it is the only task able to also acquire callback_lock and + * be able to modify cpusets. It can perform various checks on the cpuset + * structure first, knowing nothing will change. It can also allocate memory + * without holding callback_lock. While it is performing these checks, various + * callback routines can briefly acquire callback_lock to query cpusets. Once + * it is ready to make the changes, it takes callback_lock, blocking everyone + * else. + * + * Calls to the kernel memory allocator cannot be made while holding + * callback_lock which is a spinlock, as the memory allocator may sleep or + * call back into cpuset code and acquire callback_lock. + * + * Now, the task_struct fields mems_allowed and mempolicy may be changed + * by other task, we use alloc_lock in the task_struct fields to protect + * them. + * + * The cpuset_common_seq_show() handlers only hold callback_lock across + * small pieces of code, such as when reading out possibly multi-word + * cpumasks and nodemasks. + */ + +static DEFINE_MUTEX(cpuset_top_mutex); +static DEFINE_MUTEX(cpuset_mutex); + +/* + * File level internal variables below follow one of the following exclusion + * rules. + * + * RWCS: Read/write-able by holding either cpus_write_lock (and optionally + * cpuset_mutex) or both cpus_read_lock and cpuset_mutex. + * + * CSCB: Readable by holding either cpuset_mutex or callback_lock. Writable + * by holding both cpuset_mutex and callback_lock. + * + * T: Read/write-able by holding the cpuset_top_mutex. + */ + +/* * For local partitions, update to subpartitions_cpus & isolated_cpus is done * in update_parent_effective_cpumask(). For remote partitions, it is done in * the remote_partition_*() and remote_cpus_update() helpers. @@ -71,25 +139,22 @@ static const char * const perr_strings[] = { * Exclusive CPUs distributed out to local or remote sub-partitions of * top_cpuset */ -static cpumask_var_t subpartitions_cpus; +static cpumask_var_t subpartitions_cpus; /* RWCS */ /* - * Exclusive CPUs in isolated partitions + * Exclusive CPUs in isolated partitions (shown in cpuset.cpus.isolated) */ -static cpumask_var_t isolated_cpus; +static cpumask_var_t isolated_cpus; /* CSCB */ /* - * isolated_cpus updating flag (protected by cpuset_mutex) - * Set if isolated_cpus is going to be updated in the current - * cpuset_mutex crtical section. + * Set if housekeeping cpumasks are to be updated. */ -static bool isolated_cpus_updating; +static bool update_housekeeping; /* RWCS */ /* - * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot + * Copy of isolated_cpus to be passed to housekeeping_update() */ -static cpumask_var_t boot_hk_cpus; -static bool have_boot_isolcpus; +static cpumask_var_t isolated_hk_cpus; /* T */ /* * A flag to force sched domain rebuild at the end of an operation. @@ -105,7 +170,7 @@ static bool have_boot_isolcpus; * Note that update_relax_domain_level() in cpuset-v1.c can still call * rebuild_sched_domains_locked() directly without using this flag. 
*/ -static bool force_sd_rebuild; +static bool force_sd_rebuild; /* RWCS */ /* * Partition root states: @@ -126,6 +191,17 @@ static bool force_sd_rebuild; * For simplicity, a local partition can be created under a local or remote * partition but a remote partition cannot have any partition root in its * ancestor chain except the cgroup root. + * + * A valid partition can be formed by setting exclusive_cpus or cpus_allowed + * if exclusive_cpus is not set. In the case of partition with empty + * exclusive_cpus, all the conflicting exclusive CPUs specified in the + * following cpumasks of sibling cpusets will be removed from its + * cpus_allowed in determining its effective_xcpus. + * - effective_xcpus + * - exclusive_cpus + * + * The "cpuset.cpus.exclusive" control file should be used for setting up + * partition if the users want to get as many CPUs as possible. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -208,50 +284,13 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) * If cpu_online_mask is used while a hotunplug operation is happening in * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ -static struct cpuset top_cpuset = { +struct cpuset top_cpuset = { .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, - .relax_domain_level = -1, - .remote_partition = false, + .dl_bw_cpu = -1, }; -/* - * There are two global locks guarding cpuset structures - cpuset_mutex and - * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel - * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset - * structures. Note that cpuset_mutex needs to be a mutex as it is used in - * paths that rely on priority inheritance (e.g. scheduler - on RT) for - * correctness. - * - * A task must hold both locks to modify cpusets. If a task holds - * cpuset_mutex, it blocks others, ensuring that it is the only task able to - * also acquire callback_lock and be able to modify cpusets. It can perform - * various checks on the cpuset structure first, knowing nothing will change. - * It can also allocate memory while just holding cpuset_mutex. While it is - * performing these checks, various callback routines can briefly acquire - * callback_lock to query cpusets. Once it is ready to make the changes, it - * takes callback_lock, blocking everyone else. - * - * Calls to the kernel memory allocator can not be made while holding - * callback_lock, as that would risk double tripping on callback_lock - * from one of the callbacks into the cpuset code from within - * __alloc_pages(). - * - * If a task is only holding callback_lock, then it has read-only - * access to cpusets. - * - * Now, the task_struct fields mems_allowed and mempolicy may be changed - * by other task, we use alloc_lock in the task_struct fields to protect - * them. - * - * The cpuset_common_seq_show() handlers only hold callback_lock across - * small pieces of code, such as when reading out possibly multi-word - * cpumasks and nodemasks. 
- */ - -static DEFINE_MUTEX(cpuset_mutex); - /** * cpuset_lock - Acquire the global cpuset mutex * @@ -268,6 +307,11 @@ void cpuset_unlock(void) mutex_unlock(&cpuset_mutex); } +void lockdep_assert_cpuset_lock_held(void) +{ + lockdep_assert_held(&cpuset_mutex); +} + /** * cpuset_full_lock - Acquire full protection for cpuset modification * @@ -276,6 +320,7 @@ void cpuset_unlock(void) */ void cpuset_full_lock(void) { + mutex_lock(&cpuset_top_mutex); cpus_read_lock(); mutex_lock(&cpuset_mutex); } @@ -284,8 +329,17 @@ void cpuset_full_unlock(void) { mutex_unlock(&cpuset_mutex); cpus_read_unlock(); + mutex_unlock(&cpuset_top_mutex); } +#ifdef CONFIG_LOCKDEP +bool lockdep_is_cpuset_held(void) +{ + return lockdep_is_held(&cpuset_mutex) || + lockdep_is_held(&cpuset_top_mutex); +} +#endif + static DEFINE_SPINLOCK(callback_lock); void cpuset_callback_lock_irq(void) @@ -319,7 +373,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) */ static inline void dec_attach_in_progress_locked(struct cpuset *cs) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); cs->attach_in_progress--; if (!cs->attach_in_progress) @@ -353,15 +407,6 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } -static inline bool cpuset_is_populated(struct cpuset *cs) -{ - lockdep_assert_held(&cpuset_mutex); - - /* Cpusets in the process of attaching should be considered as populated */ - return cgroup_is_populated(cs->css.cgroup) || - cs->attach_in_progress; -} - /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked @@ -453,9 +498,8 @@ static void guarantee_active_cpus(struct task_struct *tsk, */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { - while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) + while (!nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY])) cs = parent_cs(cs); - nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } /** @@ -532,10 +576,12 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) /* Allocate base structure */ trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) : - kzalloc(sizeof(*cs), GFP_KERNEL); + kzalloc_obj(*cs); if (!trial) return NULL; + trial->dl_bw_cpu = -1; + /* Setup cpumask pointer array */ cpumask_var_t *pmask[4] = { &trial->cpus_allowed, @@ -603,36 +649,32 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) /** * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts - * @cs1: first cpuset to check - * @cs2: second cpuset to check + * @trial: the trial cpuset to be checked + * @sibling: a sibling cpuset to be checked against + * @xcpus_changed: set if exclusive_cpus has been set * * Returns: true if CPU exclusivity conflict exists, false otherwise * * Conflict detection rules: - * 1. If either cpuset is CPU exclusive, they must be mutually exclusive - * 2. exclusive_cpus masks cannot intersect between cpusets - * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + * o cgroup v1 + * See cpuset1_cpus_excl_conflict() + * o cgroup v2 + * - The exclusive_cpus values cannot overlap. + * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed. 
*/ -static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling, + bool xcpus_changed) { - /* If either cpuset is exclusive, check if they are mutually exclusive */ - if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) - return !cpusets_are_exclusive(cs1, cs2); - - /* Exclusive_cpus cannot intersect */ - if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) - return true; - - /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ - if (!cpumask_empty(cs1->cpus_allowed) && - cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) - return true; + if (!cpuset_v2()) + return cpuset1_cpus_excl_conflict(trial, sibling); - if (!cpumask_empty(cs2->cpus_allowed) && - cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */ + if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) && + cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus)) return true; - return false; + /* Exclusive_cpus cannot intersect */ + return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus); } static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) @@ -666,6 +708,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; + bool xcpus_changed; int ret = 0; rcu_read_lock(); @@ -682,20 +725,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); /* - * Cpusets with tasks - existing or newly being attached - can't - * be changed to have empty cpus_allowed or mems_allowed. - */ - ret = -ENOSPC; - if (cpuset_is_populated(cur)) { - if (!cpumask_empty(cur->cpus_allowed) && - cpumask_empty(trial->cpus_allowed)) - goto out; - if (!nodes_empty(cur->mems_allowed) && - nodes_empty(trial->mems_allowed)) - goto out; - } - - /* * We can't shrink if we won't have enough room for SCHED_DEADLINE * tasks. This check is not done when scheduling is disabled as the * users should know what they are doing. @@ -722,10 +751,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * overlap. exclusive_cpus cannot overlap with each other if set. */ ret = -EINVAL; + xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus); cpuset_for_each_child(c, css, par) { if (c == cur) continue; - if (cpus_excl_conflict(trial, c)) + if (cpus_excl_conflict(trial, c, xcpus_changed)) goto out; if (mems_excl_conflict(trial, c)) goto out; @@ -738,49 +768,6 @@ out: } #ifdef CONFIG_SMP -/* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? 
- */ -static int cpusets_overlap(struct cpuset *a, struct cpuset *b) -{ - return cpumask_intersects(a->effective_cpus, b->effective_cpus); -} - -static void -update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) -{ - if (dattr->relax_domain_level < c->relax_domain_level) - dattr->relax_domain_level = c->relax_domain_level; - return; -} - -static void update_domain_attr_tree(struct sched_domain_attr *dattr, - struct cpuset *root_cs) -{ - struct cpuset *cp; - struct cgroup_subsys_state *pos_css; - - rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - /* skip the whole subtree if @cp doesn't have any CPU */ - if (cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - - if (is_sched_load_balance(cp)) - update_domain_attr(dattr, cp); - } - rcu_read_unlock(); -} - -/* Must be called with cpuset_mutex held. */ -static inline int nr_cpusets(void) -{ - /* jump label reference count + the top-level cpuset */ - return static_key_count(&cpusets_enabled_key.key) + 1; -} /* * generate_sched_domains() @@ -820,103 +807,46 @@ static inline int nr_cpusets(void) * convenient format, that can be easily compared to the prior * value to determine what partition elements (sched domains) * were changed (added or removed.) - * - * Finding the best partition (set of domains): - * The double nested loops below over i, j scan over the load - * balanced cpusets (using the array of cpuset pointers in csa[]) - * looking for pairs of cpusets that have overlapping cpus_allowed - * and merging them using a union-find algorithm. - * - * The union of the cpus_allowed masks from the set of all cpusets - * having the same root then form the one element of the partition - * (one sched domain) to be passed to partition_sched_domains(). - * */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ - int csn; /* how many cpuset ptrs in csa so far */ int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. 
sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ - int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; - bool root_load_balance = is_sched_load_balance(&top_cpuset); - bool cgrpv2 = cpuset_v2(); - int nslot_update; + + if (!cpuset_v2()) + return cpuset1_generate_sched_domains(domains, attributes); doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && cpumask_empty(subpartitions_cpus)) { -single_root_domain: + if (cpumask_empty(subpartitions_cpus)) { ndoms = 1; - doms = alloc_sched_domains(ndoms); - if (!doms) - goto done; - - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); - if (dattr) { - *dattr = SD_ATTR_INIT; - update_domain_attr_tree(dattr, &top_cpuset); - } - cpumask_and(doms[0], top_cpuset.effective_cpus, - housekeeping_cpumask(HK_TYPE_DOMAIN)); - - goto done; + /* !csa will be checked and can be correctly handled */ + goto generate_doms; } - csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); + csa = kmalloc_objs(cp, nr_cpusets()); if (!csa) goto done; - csn = 0; + /* Find how many partitions and cache them to csa[] */ rcu_read_lock(); - if (root_load_balance) - csa[csn++] = &top_cpuset; cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { - if (cp == &top_cpuset) - continue; - - if (cgrpv2) - goto v2; - - /* - * v1: - * Continue traversing beyond @cp iff @cp has some CPUs and - * isn't load balancing. The former is obvious. The - * latter: All child cpusets contain a subset of the - * parent's cpus, so just skip them, and then we call - * update_domain_attr_tree() to calc relax_domain_level of - * the corresponding sched domain. - */ - if (!cpumask_empty(cp->cpus_allowed) && - !(is_sched_load_balance(cp) && - cpumask_intersects(cp->cpus_allowed, - housekeeping_cpumask(HK_TYPE_DOMAIN)))) - continue; - - if (is_sched_load_balance(cp) && - !cpumask_empty(cp->effective_cpus)) - csa[csn++] = cp; - - /* skip @cp's subtree */ - pos_css = css_rightmost_descendant(pos_css); - continue; - -v2: /* * Only valid partition roots that are not isolated and with - * non-empty effective_cpus will be saved into csn[]. + * non-empty effective_cpus will be saved into csa[]. */ if ((cp->partition_root_state == PRS_ROOT) && !cpumask_empty(cp->effective_cpus)) - csa[csn++] = cp; + csa[ndoms++] = cp; /* * Skip @cp's subtree if not a partition root and has no @@ -927,40 +857,18 @@ v2: } rcu_read_unlock(); - /* - * If there are only isolated partitions underneath the cgroup root, - * we can optimize out unneeded sched domains scanning. - */ - if (root_load_balance && (csn == 1)) - goto single_root_domain; - - for (i = 0; i < csn; i++) - uf_node_init(&csa[i]->node); - - /* Merge overlapping cpusets */ - for (i = 0; i < csn; i++) { - for (j = i + 1; j < csn; j++) { - if (cpusets_overlap(csa[i], csa[j])) { + for (i = 0; i < ndoms; i++) { + for (j = i + 1; j < ndoms; j++) { + if (cpusets_overlap(csa[i], csa[j])) /* * Cgroup v2 shouldn't pass down overlapping * partition root cpusets. */ - WARN_ON_ONCE(cgrpv2); - uf_union(&csa[i]->node, &csa[j]->node); - } + WARN_ON_ONCE(1); } } - /* Count the total number of domains */ - for (i = 0; i < csn; i++) { - if (uf_find(&csa[i]->node) == &csa[i]->node) - ndoms++; - } - - /* - * Now we know how many domains to create. - * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 
- */ +generate_doms: doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -969,53 +877,26 @@ v2: * The rest of the code, including the scheduler, can deal with * dattr==NULL case. No need to abort if alloc fails. */ - dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), - GFP_KERNEL); + dattr = kmalloc_objs(struct sched_domain_attr, ndoms); /* * Cgroup v2 doesn't support domain attributes, just set all of them * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a - * subset of HK_TYPE_DOMAIN housekeeping CPUs. + * subset of HK_TYPE_DOMAIN_BOOT housekeeping CPUs. */ - if (cgrpv2) { - for (i = 0; i < ndoms; i++) { - /* - * The top cpuset may contain some boot time isolated - * CPUs that need to be excluded from the sched domain. - */ - if (csa[i] == &top_cpuset) - cpumask_and(doms[i], csa[i]->effective_cpus, - housekeeping_cpumask(HK_TYPE_DOMAIN)); - else - cpumask_copy(doms[i], csa[i]->effective_cpus); - if (dattr) - dattr[i] = SD_ATTR_INIT; - } - goto done; - } - - for (nslot = 0, i = 0; i < csn; i++) { - nslot_update = 0; - for (j = i; j < csn; j++) { - if (uf_find(&csa[j]->node) == &csa[i]->node) { - struct cpumask *dp = doms[nslot]; - - if (i == j) { - nslot_update = 1; - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - } - cpumask_or(dp, dp, csa[j]->effective_cpus); - cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); - if (dattr) - update_domain_attr_tree(dattr + nslot, csa[j]); - } - } - if (nslot_update) - nslot++; + for (i = 0; i < ndoms; i++) { + /* + * The top cpuset may contain some boot time isolated + * CPUs that need to be excluded from the sched domain. + */ + if (!csa || csa[i] == &top_cpuset) + cpumask_and(doms[i], top_cpuset.effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT)); + else + cpumask_copy(doms[i], csa[i]->effective_cpus); + if (dattr) + dattr[i] = SD_ATTR_INIT; } - BUG_ON(nslot != ndoms); done: kfree(csa); @@ -1055,7 +936,7 @@ void dl_rebuild_rd_accounting(void) int cpu; u64 cookie = ++dl_cookie; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); @@ -1100,53 +981,33 @@ void dl_rebuild_rd_accounting(void) */ void rebuild_sched_domains_locked(void) { - struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; cpumask_var_t *doms; - struct cpuset *cs; int ndoms; + int i; lockdep_assert_cpus_held(); - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); force_sd_rebuild = false; - /* - * If we have raced with CPU hotplug, return early to avoid - * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_handle_hotplug() will rebuild sched domains. - * - * With no CPUs in any subpartitions, top_cpuset's effective CPUs - * should be the same as the active CPUs, so checking only top_cpuset - * is enough to detect racing CPU offlines. - */ - if (cpumask_empty(subpartitions_cpus) && - !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - return; + /* Generate domain masks and attrs */ + ndoms = generate_sched_domains(&doms, &attr); /* - * With subpartition CPUs, however, the effective CPUs of a partition - * root should be only a subset of the active CPUs. Since a CPU in any - * partition root could be offlined, all must be checked. 
- */ - if (!cpumask_empty(subpartitions_cpus)) { - rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (!is_partition_valid(cs)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - if (!cpumask_subset(cs->effective_cpus, - cpu_active_mask)) { - rcu_read_unlock(); - return; - } - } - rcu_read_unlock(); + * cpuset_hotplug_workfn is invoked synchronously now, thus this + * function should not race with CPU hotplug. And the effective CPUs + * must not include any offline CPUs. Passing an offline CPU in the + * doms to partition_sched_domains() will trigger a kernel panic. + * + * We perform a final check here: if the doms contains any + * offline CPUs, a warning is emitted and we return directly to + * prevent the panic. + */ + for (i = 0; doms && i < ndoms; i++) { + if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) + return; } - /* Generate domain masks and attrs */ - ndoms = generate_sched_domains(&doms, &attr); - /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); } @@ -1205,11 +1066,10 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) if (top_cs) { /* + * PF_KTHREAD tasks are handled by housekeeping. * PF_NO_SETAFFINITY tasks are ignored. - * All per cpu kthreads should have PF_NO_SETAFFINITY - * flag set, see kthread_set_per_cpu(). */ - if (task->flags & PF_NO_SETAFFINITY) + if (task->flags & (PF_KTHREAD | PF_NO_SETAFFINITY)) continue; cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { @@ -1343,12 +1203,18 @@ static void reset_partition_data(struct cpuset *cs) static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) { WARN_ON_ONCE(old_prs == new_prs); - if (new_prs == PRS_ISOLATED) + lockdep_assert_held(&callback_lock); + lockdep_assert_held(&cpuset_mutex); + if (new_prs == PRS_ISOLATED) { + if (cpumask_subset(xcpus, isolated_cpus)) + return; cpumask_or(isolated_cpus, isolated_cpus, xcpus); - else + } else { + if (!cpumask_intersects(xcpus, isolated_cpus)) + return; cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); - - isolated_cpus_updating = true; + } + update_housekeeping = true; } /* @@ -1401,8 +1267,8 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent, isolated_cpus_update(old_prs, parent->partition_root_state, xcpus); - cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); + cpumask_and(parent->effective_cpus, parent->effective_cpus, cpu_active_mask); } /* @@ -1450,54 +1316,62 @@ static bool isolated_cpus_can_update(struct cpumask *add_cpus, * @new_cpus: cpu mask * Return: true if there is conflict, false otherwise * - * CPUs outside of boot_hk_cpus, if defined, can only be used in an + * CPUs outside of HK_TYPE_DOMAIN_BOOT, if defined, can only be used in an * isolated partition. 
*/ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) { - if (!have_boot_isolcpus) + if (!housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) return false; - if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) + if ((prstate != PRS_ISOLATED) && + !cpumask_subset(new_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT))) return true; return false; } /* - * update_isolation_cpumasks - Update external isolation related CPU masks + * cpuset_update_sd_hk_unlock - Rebuild sched domains, update HK & unlock * - * The following external CPU masks will be updated if necessary: - * - workqueue unbound cpumask + * Update housekeeping cpumasks and rebuild sched domains if necessary and + * then do a cpuset_full_unlock(). + * This should be called at the end of cpuset operation. */ -static void update_isolation_cpumasks(void) +static void cpuset_update_sd_hk_unlock(void) + __releases(&cpuset_mutex) + __releases(&cpuset_top_mutex) { - int ret; - - if (!isolated_cpus_updating) - return; - - lockdep_assert_cpus_held(); - - ret = workqueue_unbound_exclude_cpumask(isolated_cpus); - WARN_ON_ONCE(ret < 0); + /* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */ + if (force_sd_rebuild) + rebuild_sched_domains_locked(); - ret = tmigr_isolated_exclude_cpumask(isolated_cpus); - WARN_ON_ONCE(ret < 0); + if (update_housekeeping) { + update_housekeeping = false; + cpumask_copy(isolated_hk_cpus, isolated_cpus); - isolated_cpus_updating = false; + /* + * housekeeping_update() is now called without holding + * cpus_read_lock and cpuset_mutex. Only cpuset_top_mutex + * is still being held for mutual exclusion. + */ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + WARN_ON_ONCE(housekeeping_update(isolated_hk_cpus)); + mutex_unlock(&cpuset_top_mutex); + } else { + cpuset_full_unlock(); + } } -/** - * cpuset_cpu_is_isolated - Check if the given CPU is isolated - * @cpu: the CPU number to be checked - * Return: true if CPU is used in an isolated partition, false otherwise +/* + * Work function to invoke cpuset_update_sd_hk_unlock() */ -bool cpuset_cpu_is_isolated(int cpu) +static void hk_sd_workfn(struct work_struct *work) { - return cpumask_test_cpu(cpu, isolated_cpus); + cpuset_full_lock(); + cpuset_update_sd_hk_unlock(); } -EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); /** * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets @@ -1517,23 +1391,29 @@ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, int retval = 0; if (cpumask_empty(excpus)) - return retval; + return 0; /* - * Exclude exclusive CPUs from siblings + * Remove exclusive CPUs from siblings */ rcu_read_lock(); cpuset_for_each_child(sibling, css, parent) { + struct cpumask *sibling_xcpus; + if (sibling == cs) continue; - if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { - cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); - retval++; - continue; - } - if (cpumask_intersects(excpus, sibling->effective_xcpus)) { - cpumask_andnot(excpus, excpus, sibling->effective_xcpus); + /* + * If exclusive_cpus is defined, effective_xcpus will always + * be a subset. Otherwise, effective_xcpus will only be set + * in a valid partition root. + */ + sibling_xcpus = cpumask_empty(sibling->exclusive_cpus) + ? 
sibling->effective_xcpus + : sibling->exclusive_cpus; + + if (cpumask_intersects(excpus, sibling_xcpus)) { + cpumask_andnot(excpus, excpus, sibling_xcpus); retval++; } } @@ -1641,7 +1521,6 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, cs->remote_partition = true; cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); cpuset_force_rebuild(); cs->prs_err = 0; @@ -1686,7 +1565,6 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); cpuset_force_rebuild(); /* @@ -1757,7 +1635,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, if (xcpus) cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); if (adding || deleting) cpuset_force_rebuild(); @@ -1822,7 +1699,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int parent_prs = parent->partition_root_state; bool nocpu; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ /* @@ -2101,7 +1978,6 @@ write_error: partition_xcpus_add(new_prs, parent, tmp->delmask); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive_flag(cs, new_prs); @@ -2331,17 +2207,13 @@ get_css: spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; - if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) - compute_excpus(cp, cp->effective_xcpus); - /* - * Make sure effective_xcpus is properly set for a valid - * partition root. + * Need to compute effective_xcpus if either exclusive_cpus + * is non-empty or it is a valid partition root. */ - if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) - cpumask_and(cp->effective_xcpus, - cp->cpus_allowed, parent->effective_xcpus); - else if (new_prs < 0) + if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus)) + compute_excpus(cp, cp->effective_xcpus); + if (new_prs <= 0) reset_partition_data(cp); spin_unlock_irq(&callback_lock); @@ -2350,7 +2222,7 @@ get_css: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - cpuset_update_tasks_cpumask(cp, cp->effective_cpus); + cpuset_update_tasks_cpumask(cp, tmp->new_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE @@ -2394,7 +2266,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); /* * Check all its siblings and call update_cpumasks_hier() @@ -2403,27 +2275,20 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus - * directly. + * directly. It should not impact valid partition. * * The update_cpumasks_hier() function may sleep. So we have to * release the RCU read lock before calling it. 
*/ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { - if (sibling == cs) + if (sibling == cs || is_partition_valid(sibling)) continue; - if (!is_partition_valid(sibling)) { - compute_effective_cpumask(tmp->new_cpus, sibling, - parent); - if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) - continue; - } else if (is_remote_partition(sibling)) { - /* - * Change in a sibling cpuset won't affect a remote - * partition root. - */ + + compute_effective_cpumask(tmp->new_cpus, sibling, + parent); + if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) continue; - } if (!css_tryget_online(&sibling->css)) continue; @@ -2479,43 +2344,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri return PERR_NONE; } -static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, - struct tmpmasks *tmp) -{ - int retval; - struct cpuset *parent = parent_cs(cs); - - retval = validate_change(cs, trialcs); - - if ((retval == -EINVAL) && cpuset_v2()) { - struct cgroup_subsys_state *css; - struct cpuset *cp; - - /* - * The -EINVAL error code indicates that partition sibling - * CPU exclusivity rule has been violated. We still allow - * the cpumask change to proceed while invalidating the - * partition. However, any conflicting sibling partitions - * have to be marked as invalid too. - */ - trialcs->prs_err = PERR_NOTEXCL; - rcu_read_lock(); - cpuset_for_each_child(cp, css, parent) { - struct cpumask *xcpus = user_xcpus(trialcs); - - if (is_partition_valid(cp) && - cpumask_intersects(xcpus, cp->effective_xcpus)) { - rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); - rcu_read_lock(); - } - } - rcu_read_unlock(); - retval = 0; - } - return retval; -} - /** * partition_cpus_change - Handle partition state changes due to CPU mask updates * @cs: The target cpuset being modified @@ -2575,15 +2403,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; - if (alloc_tmpmasks(&tmp)) - return -ENOMEM; - compute_trialcs_excpus(trialcs, cs); trialcs->prs_err = PERR_NONE; - retval = cpus_allowed_validate_change(cs, trialcs, &tmp); + retval = validate_change(cs, trialcs); if (retval < 0) - goto out_free; + return retval; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; /* * Check all the descendants in update_cpumasks_hier() if @@ -2606,7 +2434,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); -out_free: + free_tmpmasks(&tmp); return retval; } @@ -2717,7 +2545,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, return; } - mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); + mwork = kzalloc_obj(*mwork); if (mwork) { mwork->mm = mm; mwork->from = *from; @@ -2739,7 +2567,7 @@ static void schedule_flush_migrate_mm(void) { struct callback_head *flush_cb; - flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); + flush_cb = kzalloc_obj(struct callback_head); if (!flush_cb) return; @@ -2859,13 +2687,13 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); - nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); + bool has_mems = nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); /* * If it becomes empty, inherit 
the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (is_in_v2_mode() && nodes_empty(*new_mems)) + if (is_in_v2_mode() && !has_mems) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -3117,7 +2945,6 @@ out: else if (isolcpus_updated) isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); @@ -3156,6 +2983,7 @@ static void reset_migrate_dl_data(struct cpuset *cs) { cs->nr_migrate_dl_tasks = 0; cs->sum_migrate_dl_bw = 0; + cs->dl_bw_cpu = -1; } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ @@ -3164,7 +2992,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct cpuset *cs, *oldcs; struct task_struct *task; - bool cpus_updated, mems_updated; + bool setsched_check; int ret; /* used later by cpuset_attach() */ @@ -3179,20 +3007,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) if (ret) goto out_unlock; - cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); - mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); + /* + * Skip rights over task setsched check in v2 when nothing changes, + * migration permission derives from hierarchy ownership in + * cgroup_procs_write_permission()). + */ + setsched_check = !cpuset_v2() || + !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) || + !nodes_equal(cs->effective_mems, oldcs->effective_mems); + + /* + * A v1 cpuset with tasks will have no CPU left only when CPU hotplug + * brings the last online CPU offline as users are not allowed to empty + * cpuset.cpus when there are active tasks inside. When that happens, + * we should allow tasks to migrate out without security check to make + * sure they will be able to run after migration. + */ + if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus)) + setsched_check = false; cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task); if (ret) goto out_unlock; - /* - * Skip rights over task check in v2 when nothing changes, - * migration permission derives from hierarchy ownership in - * cgroup_procs_write_permission()). 
- */ - if (!cpuset_v2() || (cpus_updated || mems_updated)) { + if (setsched_check) { ret = security_task_setscheduler(task); if (ret) goto out_unlock; @@ -3221,6 +3060,8 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) reset_migrate_dl_data(cs); goto out_unlock; } + + cs->dl_bw_cpu = cpu; } out_success: @@ -3245,12 +3086,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) mutex_lock(&cpuset_mutex); dec_attach_in_progress_locked(cs); - if (cs->nr_migrate_dl_tasks) { - int cpu = cpumask_any(cs->effective_cpus); + if (cs->dl_bw_cpu >= 0) + dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw); - dl_bw_free(cpu, cs->sum_migrate_dl_bw); + if (cs->nr_migrate_dl_tasks) reset_migrate_dl_data(cs); - } mutex_unlock(&cpuset_mutex); } @@ -3265,7 +3105,7 @@ static nodemask_t cpuset_attach_nodemask_to; static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); if (cs != &top_cpuset) guarantee_active_cpus(task, cpus_attach); @@ -3407,10 +3247,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, } free_cpuset(trialcs); - if (force_sd_rebuild) - rebuild_sched_domains_locked(); out_unlock: - cpuset_full_unlock(); + cpuset_update_sd_hk_unlock(); if (of_cft(of)->private == FILE_MEMLIST) schedule_flush_migrate_mm(); return retval ?: nbytes; @@ -3517,7 +3355,7 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, cpuset_full_lock(); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); - cpuset_full_unlock(); + cpuset_update_sd_hk_unlock(); return retval ?: nbytes; } @@ -3621,8 +3459,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - fmeter_init(&cs->fmeter); - cs->relax_domain_level = -1; + cpuset1_init(cs); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cpuset_v2()) @@ -3635,17 +3472,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); - struct cpuset *tmp_cs; - struct cgroup_subsys_state *pos_css; if (!parent) return 0; cpuset_full_lock(); - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); /* * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated */ @@ -3660,39 +3491,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->effective_mems; } spin_unlock_irq(&callback_lock); + cpuset1_online_css(css); - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) - goto out_unlock; - - /* - * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is - * set. This flag handling is implemented in cgroup core for - * historical reasons - the flag may be specified during mount. - * - * Currently, if any sibling cpusets have exclusive cpus or mem, we - * refuse to clone the configuration - thereby refusing the task to - * be entered, and as a result refusing the sys_unshare() or - * clone() which initiated it. If this becomes a problem for some - * users who wish to allow that scenario, then this could be - * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. 
- */ - rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_css, parent) { - if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { - rcu_read_unlock(); - goto out_unlock; - } - } - rcu_read_unlock(); - - spin_lock_irq(&callback_lock); - cs->mems_allowed = parent->mems_allowed; - cs->effective_mems = parent->mems_allowed; - cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); - cpumask_copy(cs->effective_cpus, parent->cpus_allowed); - spin_unlock_irq(&callback_lock); -out_unlock: cpuset_full_unlock(); return 0; } @@ -3729,7 +3529,7 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css) /* Reset valid partition back to member */ if (is_partition_valid(cs)) update_prstate(cs, PRS_MEMBER); - cpuset_full_unlock(); + cpuset_update_sd_hk_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -3884,6 +3684,7 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&isolated_hk_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); @@ -3892,16 +3693,13 @@ int __init cpuset_init(void) cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); - fmeter_init(&top_cpuset.fmeter); + cpuset1_init(&top_cpuset); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); - have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); - if (have_boot_isolcpus) { - BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); - cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); - cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); - } + if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) + cpumask_andnot(isolated_cpus, cpu_possible_mask, + housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT)); return 0; } @@ -4058,6 +3856,7 @@ unlock: */ static void cpuset_handle_hotplug(void) { + static DECLARE_WORK(hk_sd_work, hk_sd_workfn); static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; @@ -4139,9 +3938,25 @@ static void cpuset_handle_hotplug(void) rcu_read_unlock(); } - /* rebuild sched domains if necessary */ + /* + * rebuild_sched_domains() will always be called directly if needed + * to make sure that newly added or removed CPU will be reflected in + * the sched domains. However, if isolated partition invalidation + * or recreation is being done (update_housekeeping set), a work item + * will be queued to call housekeeping_update() to update the + * corresponding housekeeping cpumasks after some slight delay. + * + * We rely on WORK_STRUCT_PENDING_BIT to not requeue a work item that + * is still pending. Before the pending bit is cleared, the work data + * is copied out and work item dequeued. So it is possible to queue + * the work again before the hk_sd_workfn() is invoked to process the + * previously queued work. Since hk_sd_workfn() doesn't use the work + * item at all, this is not a problem. 
+ */ if (force_sd_rebuild) rebuild_sched_domains_cpuslocked(); + if (update_housekeeping) + queue_work(system_dfl_wq, &hk_sd_work); free_tmpmasks(ptmp); } @@ -4229,7 +4044,7 @@ static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask */ void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); __cpuset_cpus_allowed_locked(tsk, pmask); } @@ -4424,40 +4239,58 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) return allowed; } -bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +/** + * cpuset_nodes_allowed - return effective_mems mask from a cgroup cpuset. + * @cgroup: pointer to struct cgroup. + * @mask: pointer to struct nodemask_t to be returned. + * + * Returns effective_mems mask from a cgroup cpuset if it is cgroup v2 and + * has cpuset subsys. Otherwise, returns node_states[N_MEMORY]. + * + * This function intentionally avoids taking the cpuset_mutex or callback_lock + * when accessing effective_mems. This is because the obtained effective_mems + * is stale immediately after the query anyway (e.g., effective_mems is updated + * immediately after releasing the lock but before returning). + * + * As a result, returned @mask may be empty because cs->effective_mems can be + * rebound during this call. Besides, nodes in @mask are not guaranteed to be + * online due to hot plugins. Callers should check the mask for validity on + * return based on its subsequent use. + **/ +void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask) { struct cgroup_subsys_state *css; struct cpuset *cs; - bool allowed; /* * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy * and mems_allowed is likely to be empty even if we could get to it, - * so return true to avoid taking a global lock on the empty check. + * so return directly to avoid taking a global lock on the empty check. */ - if (!cpuset_v2()) - return true; + if (!cgroup || !cpuset_v2()) { + nodes_copy(*mask, node_states[N_MEMORY]); + return; + } css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys); - if (!css) - return true; + if (!css) { + nodes_copy(*mask, node_states[N_MEMORY]); + return; + } /* - * Normally, accessing effective_mems would require the cpuset_mutex - * or callback_lock - but node_isset is atomic and the reference - * taken via cgroup_get_e_css is sufficient to protect css. - * - * Since this interface is intended for use by migration paths, we - * relax locking here to avoid taking global locks - while accepting - * there may be rare scenarios where the result may be innaccurate. + * The reference taken via cgroup_get_e_css is sufficient to + * protect css, but it does not imply safe accesses to effective_mems. * - * Reclaim and migration are subject to these same race conditions, and - * cannot make strong isolation guarantees, so this is acceptable. + * Normally, accessing effective_mems would require the cpuset_mutex + * or callback_lock - but the correctness of this information is stale + * immediately after the query anyway. We do not acquire the lock + * during this process to save lock contention in exchange for racing + * against mems_allowed rebinds. 
*/ cs = container_of(css, struct cpuset, css); - allowed = node_isset(nid, cs->effective_mems); + nodes_copy(*mask, cs->effective_mems); css_put(css); - return allowed; } /** diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 81ea38dd6f9d..883347b87842 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -14,7 +14,7 @@ static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) { - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + struct cgroup_subsys_state *css = kzalloc_obj(*css); if (!css) return ERR_PTR(-ENOMEM); @@ -230,7 +230,7 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) } static void cgroup_masks_read_one(struct seq_file *seq, const char *name, - u16 mask) + u32 mask) { struct cgroup_subsys *ss; int ssid; diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index e12b946278b6..1ab1fb47f271 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -14,6 +14,7 @@ #include <linux/mutex.h> #include <linux/page_counter.h> #include <linux/parser.h> +#include <linux/refcount.h> #include <linux/rculist.h> #include <linux/slab.h> @@ -71,7 +72,9 @@ struct dmem_cgroup_pool_state { struct rcu_head rcu; struct page_counter cnt; + struct dmem_cgroup_pool_state *parent; + refcount_t ref; bool inited; }; @@ -88,6 +91,9 @@ struct dmem_cgroup_pool_state { static DEFINE_SPINLOCK(dmemcg_lock); static LIST_HEAD(dmem_cgroup_regions); +static void dmemcg_free_region(struct kref *ref); +static void dmemcg_pool_free_rcu(struct rcu_head *rcu); + static inline struct dmemcg_state * css_to_dmemcs(struct cgroup_subsys_state *css) { @@ -104,10 +110,38 @@ static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg) return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL; } +static void dmemcg_pool_get(struct dmem_cgroup_pool_state *pool) +{ + refcount_inc(&pool->ref); +} + +static bool dmemcg_pool_tryget(struct dmem_cgroup_pool_state *pool) +{ + return refcount_inc_not_zero(&pool->ref); +} + +static void dmemcg_pool_put(struct dmem_cgroup_pool_state *pool) +{ + if (!refcount_dec_and_test(&pool->ref)) + return; + + call_rcu(&pool->rcu, dmemcg_pool_free_rcu); +} + +static void dmemcg_pool_free_rcu(struct rcu_head *rcu) +{ + struct dmem_cgroup_pool_state *pool = container_of(rcu, typeof(*pool), rcu); + + if (pool->parent) + dmemcg_pool_put(pool->parent); + kref_put(&pool->region->ref, dmemcg_free_region); + kfree(pool); +} + static void free_cg_pool(struct dmem_cgroup_pool_state *pool) { list_del(&pool->region_node); - kfree(pool); + dmemcg_pool_put(pool); } static void @@ -188,7 +222,7 @@ static void dmemcs_free(struct cgroup_subsys_state *css) static struct cgroup_subsys_state * dmemcs_alloc(struct cgroup_subsys_state *parent_css) { - struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL); + struct dmemcg_state *dmemcs = kzalloc_obj(*dmemcs); if (!dmemcs) return ERR_PTR(-ENOMEM); @@ -325,7 +359,7 @@ alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region struct dmem_cgroup_pool_state *pool, *ppool = NULL; if (!*allocpool) { - pool = kzalloc(sizeof(*pool), GFP_NOWAIT); + pool = kzalloc_obj(*pool, GFP_NOWAIT); if (!pool) return ERR_PTR(-ENOMEM); } else { @@ -342,6 +376,12 @@ alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region page_counter_init(&pool->cnt, ppool ? 
&ppool->cnt : NULL, true); reset_all_resource_limits(pool); + refcount_set(&pool->ref, 1); + kref_get(®ion->ref); + if (ppool && !pool->parent) { + pool->parent = ppool; + dmemcg_pool_get(ppool); + } list_add_tail_rcu(&pool->css_node, &dmemcs->pools); list_add_tail(&pool->region_node, ®ion->pools); @@ -389,6 +429,10 @@ get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *regio /* Fix up parent links, mark as inited. */ pool->cnt.parent = &ppool->cnt; + if (ppool && !pool->parent) { + pool->parent = ppool; + dmemcg_pool_get(ppool); + } pool->inited = true; pool = ppool; @@ -423,7 +467,7 @@ static void dmemcg_free_region(struct kref *ref) */ void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region) { - struct list_head *entry; + struct dmem_cgroup_pool_state *pool, *next; if (!region) return; @@ -433,11 +477,10 @@ void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region) /* Remove from global region list */ list_del_rcu(®ion->region_node); - list_for_each_rcu(entry, ®ion->pools) { - struct dmem_cgroup_pool_state *pool = - container_of(entry, typeof(*pool), region_node); - + list_for_each_entry_safe(pool, next, ®ion->pools, region_node) { list_del_rcu(&pool->css_node); + list_del(&pool->region_node); + dmemcg_pool_put(pool); } /* @@ -478,7 +521,7 @@ struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt if (!region_name) return ERR_PTR(-ENOMEM); - ret = kzalloc(sizeof(*ret), GFP_KERNEL); + ret = kzalloc_obj(*ret); if (!ret) { kfree(region_name); return ERR_PTR(-ENOMEM); @@ -518,8 +561,10 @@ static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name) */ void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool) { - if (pool) + if (pool) { css_put(&pool->cs->css); + dmemcg_pool_put(pool); + } } EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put); @@ -533,6 +578,8 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) pool = find_cg_pool_locked(cg, region); if (pool && !READ_ONCE(pool->inited)) pool = NULL; + if (pool && !dmemcg_pool_tryget(pool)) + pool = NULL; rcu_read_unlock(); while (!pool) { @@ -541,6 +588,8 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) pool = get_cg_pool_locked(cg, region, &allocpool); else pool = ERR_PTR(-ENODEV); + if (!IS_ERR(pool)) + dmemcg_pool_get(pool); spin_unlock(&dmemcg_lock); if (pool == ERR_PTR(-ENOMEM)) { @@ -548,7 +597,7 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) if (WARN_ON(allocpool)) continue; - allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL); + allocpool = kzalloc_obj(*allocpool); if (allocpool) { pool = NULL; continue; @@ -576,6 +625,7 @@ void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size) page_counter_uncharge(&pool->cnt, size); css_put(&pool->cs->css); + dmemcg_pool_put(pool); } EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge); @@ -627,7 +677,9 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, if (ret_limit_pool) { *ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt); css_get(&(*ret_limit_pool)->cs->css); + dmemcg_pool_get(*ret_limit_pool); } + dmemcg_pool_put(pool); ret = -EAGAIN; goto err; } @@ -655,8 +707,7 @@ static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v) return 0; } -static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region, - u64 *new_limit) +static int dmemcg_parse_limit(char *options, u64 *new_limit) { char *end; @@ -700,6 +751,9 @@ static 
ssize_t dmemcg_limit_write(struct kernfs_open_file *of, if (!region_name[0]) continue; + if (!options || !*options) + return -EINVAL; + rcu_read_lock(); region = dmemcg_get_region_by_name(region_name); rcu_read_unlock(); @@ -707,7 +761,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of, if (!region) return -EINVAL; - err = dmemcg_parse_limit(options, region, &new_limit); + err = dmemcg_parse_limit(options, &new_limit); if (err < 0) goto out_put; @@ -719,6 +773,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of, /* And commit */ apply(pool, new_limit); + dmemcg_pool_put(pool); out_put: kref_put(®ion->ref, dmemcg_free_region); diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 817c33450fee..8545e0d1ba3d 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -81,7 +81,7 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) { struct freezer *freezer; - freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + freezer = kzalloc_obj(struct freezer); if (!freezer) return ERR_PTR(-ENOMEM); diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 6a01d91ea4cb..4a9e2557141c 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -445,7 +445,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) { cg = &root_cg; } else { - cg = kzalloc(sizeof(*cg), GFP_KERNEL); + cg = kzalloc_obj(*cg); if (!cg) return ERR_PTR(-ENOMEM); } diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index db9617556dd7..ea4ee13936be 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) struct cgroup_namespace *new_ns __free(kfree) = NULL; int ret; - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); + new_ns = kzalloc_obj(struct cgroup_namespace, GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_common_init(new_ns); diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 8f61114c36dd..ecbb839d2acb 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -80,7 +80,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent) { struct pids_cgroup *pids; - pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL); + pids = kzalloc_obj(struct pids_cgroup); if (!pids) return ERR_PTR(-ENOMEM); diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index ef5878fb2005..4fdab4cf49e0 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -134,7 +134,7 @@ get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) if (rpool) return rpool; - rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); + rpool = kzalloc_obj(*rpool); if (!rpool) return ERR_PTR(-ENOMEM); @@ -173,7 +173,7 @@ uncharge_cg_locked(struct rdma_cgroup *cg, * the system. 
*/ if (unlikely(!rpool)) { - pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); + pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg); return; } @@ -283,7 +283,7 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg, ret = PTR_ERR(rpool); goto err; } else { - new = rpool->resources[index].usage + 1; + new = (s64)rpool->resources[index].usage + 1; if (new > rpool->resources[index].max) { ret = -EAGAIN; goto err; @@ -443,7 +443,7 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, goto err; } - new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL); + new_limits = kzalloc_objs(int, RDMACG_RESOURCE_MAX); if (!new_limits) { ret = -ENOMEM; goto err; @@ -566,7 +566,7 @@ rdmacg_css_alloc(struct cgroup_subsys_state *parent) { struct rdma_cgroup *cg; - cg = kzalloc(sizeof(*cg), GFP_KERNEL); + cg = kzalloc_obj(*cg); if (!cg) return ERR_PTR(-ENOMEM); |
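
Most of the per-controller hunks above (cgroup-v1.c, cpuset.c, debug.c, dmem.c, legacy_freezer.c, misc.c, namespace.c, pids.c, rdma.c) are mechanical conversions from open-coded kzalloc(sizeof(...), GFP_KERNEL), kcalloc() and kmalloc_array() calls to type-based allocation helpers. The helper definitions themselves are not part of this diff; the sketch below is only an illustration of the semantics the converted call sites appear to assume, not the real implementation.

#include <linux/slab.h>

/*
 * Illustrative sketch only: the real kzalloc_obj()/kzalloc_objs()/
 * kmalloc_objs() definitions are not shown in this diff.  The call sites
 * above only assume that the first argument may be either a type
 * (kzalloc_obj(struct freezer)) or an object expression (kzalloc_obj(*cg)),
 * that the GFP flags default to GFP_KERNEL when omitted, and that the
 * array variants are overflow-checked like kcalloc()/kmalloc_array().
 */
#define __ALLOC_OBJ_SEL(_1, _2, fn, ...)	fn
#define __kzalloc_obj1(obj)			kzalloc(sizeof(obj), GFP_KERNEL)
#define __kzalloc_obj2(obj, gfp)		kzalloc(sizeof(obj), (gfp))
#define kzalloc_obj(...)						\
	__ALLOC_OBJ_SEL(__VA_ARGS__, __kzalloc_obj2, __kzalloc_obj1)(__VA_ARGS__)

/* n zeroed objects / n uninitialised objects, both overflow-checked */
#define kzalloc_objs(obj, n)	kcalloc((n), sizeof(obj), GFP_KERNEL)
#define kmalloc_objs(obj, n)	kmalloc_array((n), sizeof(obj), GFP_KERNEL)

Under these assumptions, kzalloc_obj(struct cgroup_namespace, GFP_KERNEL_ACCOUNT) expands to the same allocation the old kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT) call performed, while kzalloc_objs(int, RDMACG_RESOURCE_MAX) keeps the overflow checking the previous kcalloc() provided.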
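
Several cpuset.c hunks replace lockdep_assert_held(&cpuset_mutex) with lockdep_assert_cpuset_lock_held(), and cpuset_update_sd_hk_unlock() drops cpuset_mutex and cpus_read_lock while keeping the outer cpuset_top_mutex held across housekeeping_update(). The definition of the new assertion is not shown in this diff; a plausible shape, assuming it only requires that either lock be held, would be:

#include <linux/lockdep.h>
#include <linux/mutex.h>

/*
 * Hypothetical sketch, not the definition used by this diff.  cpuset_mutex
 * and cpuset_top_mutex refer to the cpuset.c locks visible in the hunks
 * above; the assertion accepts either, since some paths now run with only
 * the outer cpuset_top_mutex held.
 */
static inline void lockdep_assert_cpuset_lock_held(void)
{
	lockdep_assert(lockdep_is_held(&cpuset_mutex) ||
		       lockdep_is_held(&cpuset_top_mutex));
}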
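
The cpuset_node_allowed() to cpuset_nodes_allowed() change turns a per-node query into a snapshot of the whole effective_mems mask, with the documented caveat that the snapshot may be stale or empty. A hypothetical caller (the function name and the way the cgroup pointer is obtained are illustrative assumptions, not part of this diff) would follow the documented convention like this:

#include <linux/cpuset.h>
#include <linux/nodemask.h>

/*
 * Illustrative caller only.  Snapshot the cpuset's effective_mems, then
 * validate the result: as the kerneldoc above notes, the mask may be empty
 * or contain offline nodes by the time it is used.
 */
static bool node_allowed_for_cgroup(struct cgroup *cgrp, int nid)
{
	nodemask_t mask;

	cpuset_nodes_allowed(cgrp, &mask);

	/* An empty snapshot can happen if effective_mems was being rebound. */
	if (nodes_empty(mask))
		return false;

	return node_isset(nid, mask);
}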
