Diffstat (limited to 'kernel/cgroup')
-rw-r--r--  kernel/cgroup/cgroup-internal.h    14
-rw-r--r--  kernel/cgroup/cgroup-v1.c          18
-rw-r--r--  kernel/cgroup/cgroup.c            394
-rw-r--r--  kernel/cgroup/cpuset-internal.h    59
-rw-r--r--  kernel/cgroup/cpuset-v1.c         272
-rw-r--r--  kernel/cgroup/cpuset.c            875
-rw-r--r--  kernel/cgroup/debug.c               4
-rw-r--r--  kernel/cgroup/dmem.c               83
-rw-r--r--  kernel/cgroup/legacy_freezer.c      2
-rw-r--r--  kernel/cgroup/misc.c                2
-rw-r--r--  kernel/cgroup/namespace.c           2
-rw-r--r--  kernel/cgroup/pids.c                2
-rw-r--r--  kernel/cgroup/rdma.c               10
13 files changed, 983 insertions, 754 deletions
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 22051b4f1ccb..58797123b752 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -52,7 +52,7 @@ struct cgroup_fs_context {
bool cpuset_clone_children;
bool none; /* User explicitly requested empty subsystem */
bool all_ss; /* Seen 'all' option */
- u16 subsys_mask; /* Selected subsystems */
+ u32 subsys_mask; /* Selected subsystems */
char *name; /* Hierarchy name */
char *release_agent; /* Path for release notifications */
};
@@ -146,7 +146,7 @@ struct cgroup_mgctx {
struct cgroup_taskset tset;
/* subsystems affected by migration */
- u16 ss_mask;
+ u32 ss_mask;
};
#define CGROUP_TASKSET_INIT(tset) \
@@ -184,11 +184,6 @@ extern bool cgrp_dfl_visible;
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
- return !(cgrp->self.flags & CSS_ONLINE);
-}
-
static inline bool notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -222,7 +217,6 @@ static inline void get_css_set(struct css_set *cset)
}
bool cgroup_ssid_enabled(int ssid);
-bool cgroup_on_dfl(const struct cgroup *cgrp);
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
@@ -235,8 +229,8 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
-int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask);
+int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask);
int cgroup_do_get_tree(struct fs_context *fc);
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
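[Editor's note: the header changes above widen every subsystem bitmask from u16 to u32, matching the BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32) bump later in cgroup_init(). Below is a minimal, standalone sketch of how such a bitmask is consumed; the controller names and the loop are illustrative stand-ins, not the kernel's do_each_subsys_mask() machinery.]

#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins for controller IDs; the real ones come from cgroup_subsys.h. */
enum { CPU_ID, MEMORY_ID, IO_ID, CPUSET_ID, NR_SUBSYS };

static const char *const subsys_name[NR_SUBSYS] = {
	"cpu", "memory", "io", "cpuset",
};

int main(void)
{
	/* A u32 mask now has headroom for up to 32 controllers. */
	uint32_t ss_mask = (1u << CPU_ID) | (1u << CPUSET_ID);

	/* Walk the set bits, which is what do_each_subsys_mask() does conceptually. */
	for (uint32_t m = ss_mask; m; m &= m - 1) {
		int ssid = __builtin_ctz(m);

		printf("controller %d (%s) selected\n", ssid, subsys_name[ssid]);
	}
	return 0;
}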
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index a9e029b570c8..a4337c9b5287 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -28,7 +28,7 @@
#define CGROUP_PIDLIST_DESTROY_DELAY HZ
/* Controllers blocked by the commandline in v1 */
-static u16 cgroup_no_v1_mask;
+static u32 cgroup_no_v1_mask;
/* disable named v1 mounts */
static bool cgroup_no_v1_named;
@@ -317,7 +317,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
return l;
/* entry not found; create a new one */
- l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ l = kzalloc_obj(struct cgroup_pidlist);
if (!l)
return l;
@@ -352,7 +352,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
- array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
+ array = kvmalloc_objs(pid_t, length);
if (!array)
return -ENOMEM;
/* now, populate the array */
@@ -1037,13 +1037,13 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
static int check_cgroupfs_options(struct fs_context *fc)
{
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
- u16 mask = U16_MAX;
- u16 enabled = 0;
+ u32 mask = U32_MAX;
+ u32 enabled = 0;
struct cgroup_subsys *ss;
int i;
#ifdef CONFIG_CPUSETS
- mask = ~((u16)1 << cpuset_cgrp_id);
+ mask = ~((u32)1 << cpuset_cgrp_id);
#endif
for_each_subsys(ss, i)
if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) &&
@@ -1095,7 +1095,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
int ret = 0;
- u16 added_mask, removed_mask;
+ u32 added_mask, removed_mask;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
@@ -1237,7 +1237,7 @@ static int cgroup1_root_to_use(struct fs_context *fc)
if (ctx->ns != &init_cgroup_ns)
return -EPERM;
- root = kzalloc(sizeof(*root), GFP_KERNEL);
+ root = kzalloc_obj(*root);
if (!root)
return -ENOMEM;
@@ -1343,7 +1343,7 @@ static int __init cgroup_no_v1(char *str)
continue;
if (!strcmp(token, "all")) {
- cgroup_no_v1_mask = U16_MAX;
+ cgroup_no_v1_mask = U32_MAX;
continue;
}
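[Editor's note: the cgroup_no_v1= handling above now accumulates blocked v1 controllers into a u32 mask, with the "all" token expanding to U32_MAX. The standalone sketch below only shows that parsing shape with toy controller names; strtok() stands in for the kernel's token loop and nothing here mirrors the real option table.]

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum { CPU_ID, MEMORY_ID, IO_ID, CPUSET_ID, NR_SUBSYS };

static const char *const subsys_name[NR_SUBSYS] = {
	"cpu", "memory", "io", "cpuset",
};

/* Toy model of the cgroup_no_v1= parser: build a disable mask from a comma list. */
static uint32_t parse_no_v1(char *str)
{
	uint32_t mask = 0;

	for (char *tok = strtok(str, ","); tok; tok = strtok(NULL, ",")) {
		if (!strcmp(tok, "all"))
			return UINT32_MAX;	/* mirrors U32_MAX in the patch */
		for (int i = 0; i < NR_SUBSYS; i++)
			if (!strcmp(tok, subsys_name[i]))
				mask |= 1u << i;
	}
	return mask;
}

int main(void)
{
	char arg[] = "memory,io";

	printf("cgroup_no_v1 mask = %#x\n", (unsigned int)parse_no_v1(arg));
	return 0;
}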
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5f0d33b04910..6152add0c5eb 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -69,14 +69,6 @@
#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
/*
- * To avoid confusing the compiler (and generating warnings) with code
- * that attempts to access what would be a 0-element array (i.e. sized
- * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
- * constant expression can be added.
- */
-#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0)
-
-/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
@@ -107,12 +99,6 @@ static bool cgroup_debug __read_mostly;
*/
static DEFINE_SPINLOCK(cgroup_idr_lock);
-/*
- * Protects cgroup_file->kn for !self csses. It synchronizes notifications
- * against file removal/re-creation across css hiding.
- */
-static DEFINE_SPINLOCK(cgroup_file_kn_lock);
-
DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
#define cgroup_assert_mutex_or_rcu_locked() \
@@ -203,13 +189,13 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
bool cgrp_dfl_visible;
/* some controllers are not supported in the default hierarchy */
-static u16 cgrp_dfl_inhibit_ss_mask;
+static u32 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
-static u16 cgrp_dfl_implicit_ss_mask;
+static u32 cgrp_dfl_implicit_ss_mask;
/* some controllers can be threaded on the default hierarchy */
-static u16 cgrp_dfl_threaded_ss_mask;
+static u32 cgrp_dfl_threaded_ss_mask;
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
@@ -231,10 +217,10 @@ static u64 css_serial_nr_next = 1;
* These bitmasks identify subsystems with specific features to avoid
* having to do iterative checks repeatedly.
*/
-static u16 have_fork_callback __read_mostly;
-static u16 have_exit_callback __read_mostly;
-static u16 have_release_callback __read_mostly;
-static u16 have_canfork_callback __read_mostly;
+static u32 have_fork_callback __read_mostly;
+static u32 have_exit_callback __read_mostly;
+static u32 have_release_callback __read_mostly;
+static u32 have_canfork_callback __read_mostly;
static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
@@ -278,10 +264,12 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
+static void cgroup_finish_destroy(struct cgroup *cgrp);
+static void kill_css_sync(struct cgroup_subsys_state *css);
+static void kill_css_finish(struct cgroup_subsys_state *css);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
-static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
@@ -472,13 +460,13 @@ static bool cgroup_is_valid_domain(struct cgroup *cgrp)
}
/* subsystems visibly enabled on a cgroup */
-static u16 cgroup_control(struct cgroup *cgrp)
+static u32 cgroup_control(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
- u16 root_ss_mask = cgrp->root->subsys_mask;
+ u32 root_ss_mask = cgrp->root->subsys_mask;
if (parent) {
- u16 ss_mask = parent->subtree_control;
+ u32 ss_mask = parent->subtree_control;
/* threaded cgroups can only have threaded controllers */
if (cgroup_is_threaded(cgrp))
@@ -493,12 +481,12 @@ static u16 cgroup_control(struct cgroup *cgrp)
}
/* subsystems enabled on a cgroup */
-static u16 cgroup_ss_mask(struct cgroup *cgrp)
+static u32 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
if (parent) {
- u16 ss_mask = parent->subtree_ss_mask;
+ u32 ss_mask = parent->subtree_ss_mask;
/* threaded cgroups can only have threaded controllers */
if (cgroup_is_threaded(cgrp))
@@ -510,27 +498,6 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
}
/**
- * cgroup_css - obtain a cgroup's css for the specified subsystem
- * @cgrp: the cgroup of interest
- * @ss: the subsystem of interest (%NULL returns @cgrp->self)
- *
- * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
- * function must be called either under cgroup_mutex or rcu_read_lock() and
- * the caller is responsible for pinning the returned css if it wants to
- * keep accessing it outside the said locks. This function may return
- * %NULL if @cgrp doesn't have @subsys_id enabled.
- */
-static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
- struct cgroup_subsys *ss)
-{
- if (CGROUP_HAS_SUBSYS_CONFIG && ss)
- return rcu_dereference_check(cgrp->subsys[ss->id],
- lockdep_is_held(&cgroup_mutex));
- else
- return &cgrp->self;
-}
-
-/**
* cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -741,32 +708,6 @@ EXPORT_SYMBOL_GPL(of_css);
} \
} while (false)
-/* iterate over child cgrps, lock should be held throughout iteration */
-#define cgroup_for_each_live_child(child, cgrp) \
- list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
- if (({ lockdep_assert_held(&cgroup_mutex); \
- cgroup_is_dead(child); })) \
- ; \
- else
-
-/* walk live descendants in pre order */
-#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
- css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
- if (({ lockdep_assert_held(&cgroup_mutex); \
- (dsct) = (d_css)->cgroup; \
- cgroup_is_dead(dsct); })) \
- ; \
- else
-
-/* walk live descendants in postorder */
-#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
- css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
- if (({ lockdep_assert_held(&cgroup_mutex); \
- (dsct) = (d_css)->cgroup; \
- cgroup_is_dead(dsct); })) \
- ; \
- else
-
/*
* The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
@@ -858,6 +799,16 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
if (was_populated == cgroup_is_populated(cgrp))
break;
+ /*
+ * Subtree just emptied below an offlined cgrp. Fire deferred
+ * destroy. The transition is one-shot.
+ */
+ if (was_populated && !css_is_online(&cgrp->self)) {
+ cgroup_get(cgrp);
+ WARN_ON_ONCE(!queue_work(cgroup_offline_wq,
+ &cgrp->finish_destroy_work));
+ }
+
cgroup1_check_for_release(cgrp);
TRACE_CGROUP_PATH(notify_populated, cgrp,
cgroup_is_populated(cgrp));
@@ -1168,7 +1119,7 @@ static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
INIT_LIST_HEAD(tmp_links);
for (i = 0; i < count; i++) {
- link = kzalloc(sizeof(*link), GFP_KERNEL);
+ link = kzalloc_obj(*link);
if (!link) {
free_cgrp_cset_links(tmp_links);
return -ENOMEM;
@@ -1241,7 +1192,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
if (cset)
return cset;
- cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+ cset = kzalloc_obj(*cset);
if (!cset)
return NULL;
@@ -1633,9 +1584,9 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
* This function calculates which subsystems need to be enabled if
* @subtree_control is to be applied while restricted to @this_ss_mask.
*/
-static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
+static u32 cgroup_calc_subtree_ss_mask(u32 subtree_control, u32 this_ss_mask)
{
- u16 cur_ss_mask = subtree_control;
+ u32 cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
@@ -1644,7 +1595,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
while (true) {
- u16 new_ss_mask = cur_ss_mask;
+ u32 new_ss_mask = cur_ss_mask;
do_each_subsys_mask(ss, ssid, cur_ss_mask) {
new_ss_mask |= ss->depends_on;
@@ -1748,9 +1699,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
struct cgroup_file *cfile = (void *)css + cft->file_offset;
- spin_lock_irq(&cgroup_file_kn_lock);
- cfile->kn = NULL;
- spin_unlock_irq(&cgroup_file_kn_lock);
+ spin_lock_irq(&cfile->lock);
+ WRITE_ONCE(cfile->kn, NULL);
+ spin_unlock_irq(&cfile->lock);
timer_delete_sync(&cfile->notify_timer);
}
@@ -1848,12 +1799,12 @@ err:
return ret;
}
-int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
+int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
int ssid, ret;
- u16 dfl_disable_ss_mask = 0;
+ u32 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -2100,6 +2051,16 @@ static int cgroup_reconfigure(struct fs_context *fc)
return 0;
}
+static void cgroup_finish_destroy_work_fn(struct work_struct *work)
+{
+ struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work);
+
+ cgroup_lock();
+ cgroup_finish_destroy(cgrp);
+ cgroup_unlock();
+ cgroup_put(cgrp);
+}
+
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
@@ -2126,6 +2087,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
#endif
init_waitqueue_head(&cgrp->offline_waitq);
+ INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
@@ -2149,7 +2111,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -2350,7 +2312,7 @@ static int cgroup_init_fs_context(struct fs_context *fc)
{
struct cgroup_fs_context *ctx;
- ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
+ ctx = kzalloc_obj(struct cgroup_fs_context);
if (!ctx)
return -ENOMEM;
@@ -2608,6 +2570,7 @@ static void cgroup_migrate_add_task(struct task_struct *task,
mgctx->tset.nr_tasks++;
+ css_set_skip_task_iters(cset, task);
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
list_add_tail(&cset->mg_node,
@@ -3131,7 +3094,7 @@ void cgroup_procs_write_finish(struct task_struct *task,
put_task_struct(task);
}
-static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, u32 ss_mask)
{
struct cgroup_subsys *ss;
bool printed = false;
@@ -3434,7 +3397,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
if (css->parent &&
!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
- kill_css(css);
+ kill_css_sync(css);
+ kill_css_finish(css);
} else if (!css_visible(css)) {
css_clear_dir(css);
if (ss->css_reset)
@@ -3496,9 +3460,9 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
cgroup_apply_control_disable(cgrp);
}
-static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u32 enable)
{
- u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+ u32 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
/* if nothing is getting enabled, nothing to worry about */
if (!enable)
@@ -3541,7 +3505,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- u16 enable = 0, disable = 0;
+ u32 enable = 0, disable = 0;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -3993,33 +3957,41 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
- struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_file_ctx *ctx;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
+ ssize_t ret = 0;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
- cgroup_get(cgrp);
- cgroup_kn_unlock(of->kn);
+ ctx = of->priv;
+ if (!ctx) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
/* Allow only one trigger per file descriptor */
if (ctx->psi.trigger) {
- cgroup_put(cgrp);
- return -EBUSY;
+ ret = -EBUSY;
+ goto out_unlock;
}
psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res, of->file, of);
if (IS_ERR(new)) {
- cgroup_put(cgrp);
- return PTR_ERR(new);
+ ret = PTR_ERR(new);
+ goto out_unlock;
}
smp_store_release(&ctx->psi.trigger, new);
- cgroup_put(cgrp);
+
+out_unlock:
+ cgroup_kn_unlock(of->kn);
+ if (ret)
+ return ret;
return nbytes;
}
@@ -4251,7 +4223,7 @@ static int cgroup_file_open(struct kernfs_open_file *of)
struct cgroup_file_ctx *ctx;
int ret;
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ ctx = kzalloc_obj(*ctx);
if (!ctx)
return -ENOMEM;
@@ -4427,10 +4399,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
struct cgroup_file *cfile = (void *)css + cft->file_offset;
timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
-
- spin_lock_irq(&cgroup_file_kn_lock);
+ spin_lock_init(&cfile->lock);
cfile->kn = kn;
- spin_unlock_irq(&cgroup_file_kn_lock);
}
return 0;
@@ -4685,21 +4655,32 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
*/
void cgroup_file_notify(struct cgroup_file *cfile)
{
- unsigned long flags;
+ unsigned long flags, last, next;
+ struct kernfs_node *kn = NULL;
+
+ if (!READ_ONCE(cfile->kn))
+ return;
+
+ last = READ_ONCE(cfile->notified_at);
+ next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
+ if (time_in_range(jiffies, last, next)) {
+ timer_reduce(&cfile->notify_timer, next);
+ if (timer_pending(&cfile->notify_timer))
+ return;
+ }
- spin_lock_irqsave(&cgroup_file_kn_lock, flags);
+ spin_lock_irqsave(&cfile->lock, flags);
if (cfile->kn) {
- unsigned long last = cfile->notified_at;
- unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
+ kn = cfile->kn;
+ kernfs_get(kn);
+ WRITE_ONCE(cfile->notified_at, jiffies);
+ }
+ spin_unlock_irqrestore(&cfile->lock, flags);
- if (time_in_range(jiffies, last, next)) {
- timer_reduce(&cfile->notify_timer, next);
- } else {
- kernfs_notify(cfile->kn);
- cfile->notified_at = jiffies;
- }
+ if (kn) {
+ kernfs_notify(kn);
+ kernfs_put(kn);
}
- spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}
EXPORT_SYMBOL_GPL(cgroup_file_notify);
@@ -4712,10 +4693,10 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show)
{
struct kernfs_node *kn;
- spin_lock_irq(&cgroup_file_kn_lock);
+ spin_lock_irq(&cfile->lock);
kn = cfile->kn;
kernfs_get(kn);
- spin_unlock_irq(&cgroup_file_kn_lock);
+ spin_unlock_irq(&cfile->lock);
if (kn)
kernfs_show(kn, show);
@@ -4945,7 +4926,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
rcu_read_lock();
css_for_each_child(child, css) {
- if (child->flags & CSS_ONLINE) {
+ if (css_is_online(child)) {
ret = true;
break;
}
@@ -5108,6 +5089,14 @@ repeat:
return;
task = list_entry(it->task_pos, struct task_struct, cg_list);
+ /*
+ * Hide tasks that are exiting but not yet removed by default. Keep
+ * zombie leaders with live threads visible. Usages that need to walk
+ * every existing task can opt out via CSS_TASK_ITER_WITH_DEAD.
+ */
+ if (!(it->flags & CSS_TASK_ITER_WITH_DEAD) &&
+ (task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
+ goto repeat;
if (it->flags & CSS_TASK_ITER_PROCS) {
/* if PROCS, skip over tasks which aren't group leaders */
@@ -5550,7 +5539,7 @@ static struct cftype cgroup_psi_files[] = {
* css destruction is four-stage process.
*
* 1. Destruction starts. Killing of the percpu_ref is initiated.
- * Implemented in kill_css().
+ * Implemented in kill_css_finish().
*
* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
* and thus css_tryget_online() is guaranteed to fail, the css can be
@@ -5750,7 +5739,7 @@ static void offline_css(struct cgroup_subsys_state *css)
lockdep_assert_held(&cgroup_mutex);
- if (!(css->flags & CSS_ONLINE))
+ if (!css_is_online(css))
return;
if (ss->css_offline)
@@ -5760,16 +5749,6 @@ static void offline_css(struct cgroup_subsys_state *css)
RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
wake_up_all(&css->cgroup->offline_waitq);
-
- css->cgroup->nr_dying_subsys[ss->id]++;
- /*
- * Parent css and cgroup cannot be freed until after the freeing
- * of child css, see css_free_rwork_fn().
- */
- while ((css = css->parent)) {
- css->nr_descendants--;
- css->cgroup->nr_dying_subsys[ss->id]++;
- }
}
/**
@@ -5844,7 +5823,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(struct_size(cgrp, _low_ancestors, level), GFP_KERNEL);
+ cgrp = kzalloc_flex(*cgrp, _low_ancestors, level);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -6039,12 +6018,13 @@ out_unlock:
/*
* This is called when the refcnt of a css is confirmed to be killed.
* css_tryget_online() is now guaranteed to fail. Tell the subsystem to
- * initiate destruction and put the css ref from kill_css().
+ * initiate destruction and put the css ref from kill_css_finish().
*/
static void css_killed_work_fn(struct work_struct *work)
{
- struct cgroup_subsys_state *css =
- container_of(work, struct cgroup_subsys_state, destroy_work);
+ struct cgroup_subsys_state *css;
+
+ css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork);
cgroup_lock();
@@ -6065,22 +6045,21 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
container_of(ref, struct cgroup_subsys_state, refcnt);
if (atomic_dec_and_test(&css->online_cnt)) {
- INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_offline_wq, &css->destroy_work);
+ INIT_RCU_WORK(&css->destroy_rwork, css_killed_work_fn);
+ queue_rcu_work(cgroup_offline_wq, &css->destroy_rwork);
}
}
/**
- * kill_css - destroy a css
- * @css: css to destroy
+ * kill_css_sync - synchronous half of css teardown
+ * @css: css being killed
*
- * This function initiates destruction of @css by removing cgroup interface
- * files and putting its base reference. ->css_offline() will be invoked
- * asynchronously once css_tryget_online() is guaranteed to fail and when
- * the reference count reaches zero, @css will be released.
+ * See cgroup_destroy_locked().
*/
-static void kill_css(struct cgroup_subsys_state *css)
+static void kill_css_sync(struct cgroup_subsys_state *css)
{
+ struct cgroup_subsys *ss = css->ss;
+
lockdep_assert_held(&cgroup_mutex);
if (css->flags & CSS_DYING)
@@ -6100,64 +6079,100 @@ static void kill_css(struct cgroup_subsys_state *css)
*/
css_clear_dir(css);
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ /*
+ * Parent css and cgroup cannot be freed until after the freeing
+ * of child css, see css_free_rwork_fn().
+ */
+ while ((css = css->parent)) {
+ css->nr_descendants--;
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ }
+}
+
+/**
+ * kill_css_finish - deferred half of css teardown
+ * @css: css being killed
+ *
+ * See cgroup_destroy_locked().
+ */
+static void kill_css_finish(struct cgroup_subsys_state *css)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ /*
+ * Skip on re-entry: cgroup_apply_control_disable() may have killed @css
+ * earlier. cgroup_destroy_locked() can still walk it because
+ * offline_css() (which NULLs cgrp->subsys[ssid]) runs async.
+ */
+ if (percpu_ref_is_dying(&css->refcnt))
+ return;
+
/*
- * Killing would put the base ref, but we need to keep it alive
- * until after ->css_offline().
+ * Killing would put the base ref, but we need to keep it alive until
+ * after ->css_offline().
*/
css_get(css);
/*
- * cgroup core guarantees that, by the time ->css_offline() is
- * invoked, no new css reference will be given out via
- * css_tryget_online(). We can't simply call percpu_ref_kill() and
- * proceed to offlining css's because percpu_ref_kill() doesn't
- * guarantee that the ref is seen as killed on all CPUs on return.
+ * cgroup core guarantees that, by the time ->css_offline() is invoked,
+ * no new css reference will be given out via css_tryget_online(). We
+ * can't simply call percpu_ref_kill() and proceed to offlining css's
+ * because percpu_ref_kill() doesn't guarantee that the ref is seen as
+ * killed on all CPUs on return.
*
- * Use percpu_ref_kill_and_confirm() to get notifications as each
- * css is confirmed to be seen as killed on all CPUs.
+ * Use percpu_ref_kill_and_confirm() to get notifications as each css is
+ * confirmed to be seen as killed on all CPUs.
*/
percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}
/**
- * cgroup_destroy_locked - the first stage of cgroup destruction
+ * cgroup_destroy_locked - destroy @cgrp (called on rmdir)
* @cgrp: cgroup to be destroyed
*
- * css's make use of percpu refcnts whose killing latency shouldn't be
- * exposed to userland and are RCU protected. Also, cgroup core needs to
- * guarantee that css_tryget_online() won't succeed by the time
- * ->css_offline() is invoked. To satisfy all the requirements,
- * destruction is implemented in the following two steps.
- *
- * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
- * userland visible parts and start killing the percpu refcnts of
- * css's. Set up so that the next stage will be kicked off once all
- * the percpu refcnts are confirmed to be killed.
- *
- * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
- * rest of destruction. Once all cgroup references are gone, the
- * cgroup is RCU-freed.
- *
- * This function implements s1. After this step, @cgrp is gone as far as
- * the userland is concerned and a new cgroup with the same name may be
- * created. As cgroup doesn't care about the names internally, this
- * doesn't cause any problem.
+ * Tear down @cgrp on behalf of rmdir. Constraints:
+ *
+ * - Userspace: rmdir must succeed when cgroup.procs and friends are empty.
+ *
+ * - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's
+ * subtree is still doing kernel work. A task hidden from cgroup.procs (past
+ * exit_signals() with signal->live cleared) can still schedule, allocate, and
+ * consume resources until its final context switch. Dying descendants in the
+ * subtree can host such tasks too.
+ *
+ * - Kernel: css_tryget_online() must fail by the time ->css_offline() runs.
+ *
+ * The destruction runs in three parts:
+ *
+ * - This function: synchronous user-visible state teardown plus kill_css_sync()
+ * on each subsystem css.
+ *
+ * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on
+ * each subsystem css. Fires once @cgrp's subtree is fully drained, either
+ * inline here or from cgroup_update_populated().
+ *
+ * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn ->
+ * ->css_offline() -> release/free.
+ *
+ * Return 0 on success, -EBUSY if a userspace-visible task or an online child
+ * remains.
*/
static int cgroup_destroy_locked(struct cgroup *cgrp)
- __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
int ssid, ret;
lockdep_assert_held(&cgroup_mutex);
- /*
- * Only migration can raise populated from zero and we're already
- * holding cgroup_mutex.
- */
- if (cgroup_is_populated(cgrp))
+ css_task_iter_start(&cgrp->self, 0, &it);
+ task = css_task_iter_next(&it);
+ css_task_iter_end(&it);
+ if (task)
return -EBUSY;
/*
@@ -6181,9 +6196,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
link->cset->dead = true;
spin_unlock_irq(&css_set_lock);
- /* initiate massacre of all css's */
for_each_css(css, ssid, cgrp)
- kill_css(css);
+ kill_css_sync(css);
/* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
css_clear_dir(&cgrp->self);
@@ -6214,9 +6228,29 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
+ if (!cgroup_is_populated(cgrp))
+ cgroup_finish_destroy(cgrp);
+
return 0;
};
+/**
+ * cgroup_finish_destroy - deferred half of @cgrp destruction
+ * @cgrp: cgroup whose subtree just became empty
+ *
+ * See cgroup_destroy_locked() for the rationale.
+ */
+static void cgroup_finish_destroy(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *css;
+ int ssid;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ for_each_css(css, ssid, cgrp)
+ kill_css_finish(css);
+}
+
int cgroup_rmdir(struct kernfs_node *kn)
{
struct cgroup *cgrp;
@@ -6347,7 +6381,7 @@ int __init cgroup_init(void)
struct cgroup_subsys *ss;
int ssid;
- BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
+ BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
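[Editor's note: the cgroup.c changes above split css teardown into a synchronous half (kill_css_sync(), run under cgroup_mutex from cgroup_destroy_locked()) and a deferred half (kill_css_finish(), run from cgroup_finish_destroy() once the subtree is no longer populated, either inline when rmdir finds it already empty or from the finish_destroy_work queued by cgroup_update_populated()). The toy model below is single-threaded, uses hypothetical names, and drops the real locking, percpu_ref, and visible-task checks; it only illustrates the ordering.]

#include <stdbool.h>
#include <stdio.h>

/* Toy cgroup: tracks only what this sketch needs. */
struct toy_cgroup {
	int  nr_populated;	/* hidden exiting tasks still draining from the subtree */
	bool dying;		/* rmdir already ran */
	bool refs_killed;	/* deferred half already ran */
};

static void finish_destroy(struct toy_cgroup *cgrp)
{
	if (cgrp->refs_killed)
		return;			/* like kill_css_finish(), skip on re-entry */
	cgrp->refs_killed = true;
	printf("percpu refs killed, ->css_offline() may now be scheduled\n");
}

/* rmdir path: user-visible teardown now, ref kill possibly deferred. */
static void destroy_locked(struct toy_cgroup *cgrp)
{
	cgrp->dying = true;
	printf("interface files removed, cgroup marked dying\n");
	if (!cgrp->nr_populated)
		finish_destroy(cgrp);
}

/* populated-count drop: fires the deferred half once the subtree drains. */
static void task_drains(struct toy_cgroup *cgrp)
{
	if (--cgrp->nr_populated == 0 && cgrp->dying)
		finish_destroy(cgrp);
}

int main(void)
{
	struct toy_cgroup cgrp = { .nr_populated = 2 };

	destroy_locked(&cgrp);	/* rmdir succeeds, refs not yet killed */
	task_drains(&cgrp);
	task_drains(&cgrp);	/* subtree empty -> deferred half runs */
	return 0;
}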
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 01976c8e7d49..bb4e692bea30 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -9,6 +9,7 @@
#include <linux/cpuset.h>
#include <linux/spinlock.h>
#include <linux/union_find.h>
+#include <linux/sched/isolation.h>
/* See "Frequency meter" comments, below. */
@@ -144,17 +145,12 @@ struct cpuset {
*/
nodemask_t old_mems_allowed;
- struct fmeter fmeter; /* memory_pressure filter */
-
/*
* Tasks are being attached to this cpuset. Used to prevent
* zeroing cpus/mems_allowed between ->can_attach() and ->attach().
*/
int attach_in_progress;
- /* for custom sched domain */
- int relax_domain_level;
-
/* partition root state */
int partition_root_state;
@@ -172,6 +168,11 @@ struct cpuset {
int nr_deadline_tasks;
int nr_migrate_dl_tasks;
u64 sum_migrate_dl_bw;
+ /*
+ * CPU used for temporary DL bandwidth allocation during attach;
+ * -1 if no DL bandwidth was allocated in the current attach.
+ */
+ int dl_bw_cpu;
/* Invalid partition error code, not lock protected */
enum prs_errcode prs_err;
@@ -179,10 +180,19 @@ struct cpuset {
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
+#ifdef CONFIG_CPUSETS_V1
+ struct fmeter fmeter; /* memory_pressure filter */
+
+ /* for custom sched domain */
+ int relax_domain_level;
+
/* Used to merge intersecting subsets for generate_sched_domains */
struct uf_node node;
+#endif
};
+extern struct cpuset top_cpuset;
+
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct cpuset, css) : NULL;
@@ -240,6 +250,30 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
+/*
+ * Helper routine for generate_sched_domains().
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
+ */
+static inline int cpusets_overlap(struct cpuset *a, struct cpuset *b)
+{
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
+}
+
+static inline int nr_cpusets(void)
+{
+ /* jump label reference count + the top-level cpuset */
+ return static_key_count(&cpusets_enabled_key.key) + 1;
+}
+
+static inline bool cpuset_is_populated(struct cpuset *cs)
+{
+ lockdep_assert_cpuset_lock_held();
+
+ /* Cpusets in the process of attaching should be considered as populated */
+ return cgroup_is_populated(cs->css.cgroup) ||
+ cs->attach_in_progress;
+}
+
/**
* cpuset_for_each_child - traverse online children of a cpuset
* @child_cs: loop cursor pointing to the current child
@@ -285,7 +319,6 @@ void cpuset_full_unlock(void);
*/
#ifdef CONFIG_CPUSETS_V1
extern struct cftype cpuset1_files[];
-void fmeter_init(struct fmeter *fmp);
void cpuset1_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk);
void cpuset1_update_tasks_flags(struct cpuset *cs);
@@ -293,8 +326,13 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated);
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
+bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2);
+void cpuset1_init(struct cpuset *cs);
+void cpuset1_online_css(struct cgroup_subsys_state *css);
+int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes);
+
#else
-static inline void fmeter_init(struct fmeter *fmp) {}
static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk) {}
static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {}
@@ -303,6 +341,13 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs,
bool cpus_updated, bool mems_updated) {}
static inline int cpuset1_validate_change(struct cpuset *cur,
struct cpuset *trial) { return 0; }
+static inline bool cpuset1_cpus_excl_conflict(struct cpuset *cs1,
+ struct cpuset *cs2) { return false; }
+static inline void cpuset1_init(struct cpuset *cs) {}
+static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
+static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes) { return 0; };
+
#endif /* CONFIG_CPUSETS_V1 */
#endif /* __CPUSET_INTERNAL_H */
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 12e76774c75b..7308e9b02495 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -62,7 +62,7 @@ struct cpuset_remove_tasks_struct {
#define FM_SCALE 1000 /* faux fixed point scale */
/* Initialize a frequency meter */
-void fmeter_init(struct fmeter *fmp)
+static void fmeter_init(struct fmeter *fmp)
{
fmp->cnt = 0;
fmp->val = 0;
@@ -316,7 +316,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs,
css_tryget_online(&cs->css)) {
struct cpuset_remove_tasks_struct *s;
- s = kzalloc(sizeof(*s), GFP_KERNEL);
+ s = kzalloc_obj(*s);
if (WARN_ON_ONCE(!s)) {
css_put(&cs->css);
return;
@@ -368,11 +368,44 @@ int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
if (par && !is_cpuset_subset(trial, par))
goto out;
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * be changed to have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if (cpuset_is_populated(cur)) {
+ if (!cpumask_empty(cur->cpus_allowed) &&
+ cpumask_empty(trial->cpus_allowed))
+ goto out;
+ if (!nodes_empty(cur->mems_allowed) &&
+ nodes_empty(trial->mems_allowed))
+ goto out;
+ }
+
ret = 0;
out:
return ret;
}
+/*
+ * cpuset1_cpus_excl_conflict() - Check if two cpusets have an exclusive CPU
+ * conflict under the legacy (v1) hierarchy
+ * @cs1: first cpuset to check
+ * @cs2: second cpuset to check
+ *
+ * Returns: true if CPU exclusivity conflict exists, false otherwise
+ *
+ * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect.
+ */
+bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+{
+ if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
+ return cpumask_intersects(cs1->cpus_allowed,
+ cs2->cpus_allowed);
+
+ return false;
+}
+
#ifdef CONFIG_PROC_PID_CPUSET
/*
* proc_cpuset_show()
@@ -499,6 +532,241 @@ out_unlock:
return retval;
}
+void cpuset1_init(struct cpuset *cs)
+{
+ fmeter_init(&cs->fmeter);
+ cs->relax_domain_level = -1;
+}
+
+void cpuset1_online_css(struct cgroup_subsys_state *css)
+{
+ struct cpuset *tmp_cs;
+ struct cgroup_subsys_state *pos_css;
+ struct cpuset *cs = css_cs(css);
+ struct cpuset *parent = parent_cs(cs);
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_cpuset_lock_held();
+
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
+ return;
+
+ /*
+ * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
+ * set. This flag handling is implemented in cgroup core for
+ * historical reasons - the flag may be specified during mount.
+ *
+ * Currently, if any sibling cpusets have exclusive cpus or mem, we
+ * refuse to clone the configuration - thereby refusing the task to
+ * be entered, and as a result refusing the sys_unshare() or
+ * clone() which initiated it. If this becomes a problem for some
+ * users who wish to allow that scenario, then this could be
+ * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
+ * (and likewise for mems) to the new cgroup.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_css, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
+ cpuset_callback_lock_irq();
+ cs->mems_allowed = parent->mems_allowed;
+ cs->effective_mems = parent->mems_allowed;
+ cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+ cpuset_callback_unlock_irq();
+}
+
+static void
+update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+ if (dattr->relax_domain_level < c->relax_domain_level)
+ dattr->relax_domain_level = c->relax_domain_level;
+}
+
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
+{
+ struct cpuset *cp;
+ struct cgroup_subsys_state *pos_css;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ if (is_sched_load_balance(cp))
+ update_domain_attr(dattr, cp);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * cpuset1_generate_sched_domains()
+ *
+ * Finding the best partition (set of domains):
+ * The double nested loops below over i, j scan over the load
+ * balanced cpusets (using the array of cpuset pointers in csa[])
+ * looking for pairs of cpusets that have overlapping cpus_allowed
+ * and merging them using a union-find algorithm.
+ *
+ * The union of the cpus_allowed masks from the set of all cpusets
+ * having the same root then form the one element of the partition
+ * (one sched domain) to be passed to partition_sched_domains().
+ */
+int cpuset1_generate_sched_domains(cpumask_var_t **domains,
+ struct sched_domain_attr **attributes)
+{
+ struct cpuset *cp; /* top-down scan of cpusets */
+ struct cpuset **csa; /* array of all cpuset ptrs */
+ int csn; /* how many cpuset ptrs in csa so far */
+ int i, j; /* indices for partition finding loops */
+ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
+ struct sched_domain_attr *dattr; /* attributes for custom domains */
+ int ndoms = 0; /* number of sched domains in result */
+ int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup_subsys_state *pos_css;
+ int nslot_update;
+
+ lockdep_assert_cpuset_lock_held();
+
+ doms = NULL;
+ dattr = NULL;
+ csa = NULL;
+
+ /* Special case for the 99% of systems with one, full, sched domain */
+ if (is_sched_load_balance(&top_cpuset)) {
+ ndoms = 1;
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ dattr = kmalloc_obj(struct sched_domain_attr);
+ if (dattr) {
+ *dattr = SD_ATTR_INIT;
+ update_domain_attr_tree(dattr, &top_cpuset);
+ }
+ cpumask_and(doms[0], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+
+ goto done;
+ }
+
+ csa = kmalloc_objs(cp, nr_cpusets());
+ if (!csa)
+ goto done;
+ csn = 0;
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
+ if (cp == &top_cpuset)
+ continue;
+
+ /*
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !(is_sched_load_balance(cp) &&
+ cpumask_intersects(cp->cpus_allowed,
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
+ continue;
+
+ if (is_sched_load_balance(cp) &&
+ !cpumask_empty(cp->effective_cpus))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ rcu_read_unlock();
+
+ for (i = 0; i < csn; i++)
+ uf_node_init(&csa[i]->node);
+
+ /* Merge overlapping cpusets */
+ for (i = 0; i < csn; i++) {
+ for (j = i + 1; j < csn; j++) {
+ if (cpusets_overlap(csa[i], csa[j]))
+ uf_union(&csa[i]->node, &csa[j]->node);
+ }
+ }
+
+ /* Count the total number of domains */
+ for (i = 0; i < csn; i++) {
+ if (uf_find(&csa[i]->node) == &csa[i]->node)
+ ndoms++;
+ }
+
+ /*
+ * Now we know how many domains to create.
+ * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+ */
+ doms = alloc_sched_domains(ndoms);
+ if (!doms)
+ goto done;
+
+ /*
+ * The rest of the code, including the scheduler, can deal with
+ * dattr==NULL case. No need to abort if alloc fails.
+ */
+ dattr = kmalloc_objs(struct sched_domain_attr, ndoms);
+
+ for (nslot = 0, i = 0; i < csn; i++) {
+ nslot_update = 0;
+ for (j = i; j < csn; j++) {
+ if (uf_find(&csa[j]->node) == &csa[i]->node) {
+ struct cpumask *dp = doms[nslot];
+
+ if (i == j) {
+ nslot_update = 1;
+ cpumask_clear(dp);
+ if (dattr)
+ *(dattr + nslot) = SD_ATTR_INIT;
+ }
+ cpumask_or(dp, dp, csa[j]->effective_cpus);
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ if (dattr)
+ update_domain_attr_tree(dattr + nslot, csa[j]);
+ }
+ }
+ if (nslot_update)
+ nslot++;
+ }
+ BUG_ON(nslot != ndoms);
+
+done:
+ kfree(csa);
+
+ /*
+ * Fallback to the default domain if kmalloc() failed.
+ * See comments in partition_sched_domains().
+ */
+ if (doms == NULL)
+ ndoms = 1;
+
+ *domains = doms;
+ *attributes = dattr;
+ return ndoms;
+}
+
/*
* for the common functions, 'private' gives the type of file
*/
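[Editor's note: cpuset1_generate_sched_domains() above merges load-balanced cpusets whose effective CPUs overlap using union-find (uf_node_init/uf_union/uf_find), then emits one sched domain per remaining root by ORing the member masks and clamping to the HK_TYPE_DOMAIN housekeeping CPUs. The standalone illustration below runs the same merge step on toy 32-bit CPU masks; it uses a plain array-based union-find and omits the housekeeping clamp and the dattr handling.]

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NR_SETS 4

/* Toy "effective_cpus" masks: sets 0 and 1 overlap; 2 and 3 stand alone. */
static const uint32_t cpus[NR_SETS] = { 0x03, 0x06, 0x30, 0xc0 };

static int parent[NR_SETS];

static int uf_find(int x)
{
	while (parent[x] != x) {
		parent[x] = parent[parent[x]];	/* path halving */
		x = parent[x];
	}
	return x;
}

static void uf_union(int a, int b)
{
	parent[uf_find(a)] = uf_find(b);
}

int main(void)
{
	int ndoms = 0;

	for (int i = 0; i < NR_SETS; i++)
		parent[i] = i;

	/* Merge cpusets whose effective CPUs overlap, as the double loop above does. */
	for (int i = 0; i < NR_SETS; i++)
		for (int j = i + 1; j < NR_SETS; j++)
			if (cpus[i] & cpus[j])
				uf_union(i, j);

	/* Emit one sched domain per union-find root, ORing the member masks. */
	for (int i = 0; i < NR_SETS; i++) {
		uint32_t dom = 0;

		if (uf_find(i) != i)
			continue;
		for (int j = 0; j < NR_SETS; j++)
			if (uf_find(j) == i)
				dom |= cpus[j];
		printf("domain %d: cpumask %#" PRIx32 "\n", ndoms++, dom);
	}
	return 0;
}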
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c06e2e96f79d..e3a081a07c6d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -26,7 +26,6 @@
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
-#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
@@ -63,6 +62,75 @@ static const char * const perr_strings[] = {
};
/*
+ * CPUSET Locking Convention
+ * -------------------------
+ *
+ * Below are the four global/local locks guarding cpuset structures in lock
+ * acquisition order:
+ * - cpuset_top_mutex
+ * - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock)
+ * - cpuset_mutex
+ * - callback_lock (raw spinlock)
+ *
+ * As cpuset will now indirectly flush a number of different workqueues in
+ * housekeeping_update() to update housekeeping cpumasks when the set of
+ * isolated CPUs is going to be changed, it may be vulnerable to deadlock
+ * if we hold cpus_read_lock while calling into housekeeping_update().
+ *
+ * The first cpuset_top_mutex will be held except when calling into
+ * cpuset_handle_hotplug() from the CPU hotplug code where cpus_write_lock
+ * and cpuset_mutex will be held instead. The main purpose of this mutex
+ * is to prevent regular cpuset control file write actions from interfering
+ * with the call to housekeeping_update(), though CPU hotplug operation can
+ * still happen in parallel. This mutex also provides protection for some
+ * internal variables.
+ *
+ * A task must hold all the remaining three locks to modify externally visible
+ * or used fields of cpusets, though some of the internally used cpuset fields
+ * and internal variables can be modified without holding callback_lock. If only
+ * reliable read access of the externally used fields is needed, a task can
+ * hold either cpuset_mutex or callback_lock which are exposed to other
+ * external subsystems.
+ *
+ * If a task holds cpu_hotplug_lock and cpuset_mutex, it blocks others,
+ * ensuring that it is the only task able to also acquire callback_lock and
+ * be able to modify cpusets. It can perform various checks on the cpuset
+ * structure first, knowing nothing will change. It can also allocate memory
+ * without holding callback_lock. While it is performing these checks, various
+ * callback routines can briefly acquire callback_lock to query cpusets. Once
+ * it is ready to make the changes, it takes callback_lock, blocking everyone
+ * else.
+ *
+ * Calls to the kernel memory allocator cannot be made while holding
+ * callback_lock which is a spinlock, as the memory allocator may sleep or
+ * call back into cpuset code and acquire callback_lock.
+ *
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
+ * by other task, we use alloc_lock in the task_struct fields to protect
+ * them.
+ *
+ * The cpuset_common_seq_show() handlers only hold callback_lock across
+ * small pieces of code, such as when reading out possibly multi-word
+ * cpumasks and nodemasks.
+ */
+
+static DEFINE_MUTEX(cpuset_top_mutex);
+static DEFINE_MUTEX(cpuset_mutex);
+
+/*
+ * File level internal variables below follow one of the following exclusion
+ * rules.
+ *
+ * RWCS: Read/write-able by holding either cpus_write_lock (and optionally
+ * cpuset_mutex) or both cpus_read_lock and cpuset_mutex.
+ *
+ * CSCB: Readable by holding either cpuset_mutex or callback_lock. Writable
+ * by holding both cpuset_mutex and callback_lock.
+ *
+ * T: Read/write-able by holding the cpuset_top_mutex.
+ */
+
+/*
* For local partitions, update to subpartitions_cpus & isolated_cpus is done
* in update_parent_effective_cpumask(). For remote partitions, it is done in
* the remote_partition_*() and remote_cpus_update() helpers.
@@ -71,25 +139,22 @@ static const char * const perr_strings[] = {
* Exclusive CPUs distributed out to local or remote sub-partitions of
* top_cpuset
*/
-static cpumask_var_t subpartitions_cpus;
+static cpumask_var_t subpartitions_cpus; /* RWCS */
/*
- * Exclusive CPUs in isolated partitions
+ * Exclusive CPUs in isolated partitions (shown in cpuset.cpus.isolated)
*/
-static cpumask_var_t isolated_cpus;
+static cpumask_var_t isolated_cpus; /* CSCB */
/*
- * isolated_cpus updating flag (protected by cpuset_mutex)
- * Set if isolated_cpus is going to be updated in the current
- * cpuset_mutex crtical section.
+ * Set if housekeeping cpumasks are to be updated.
*/
-static bool isolated_cpus_updating;
+static bool update_housekeeping; /* RWCS */
/*
- * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot
+ * Copy of isolated_cpus to be passed to housekeeping_update()
*/
-static cpumask_var_t boot_hk_cpus;
-static bool have_boot_isolcpus;
+static cpumask_var_t isolated_hk_cpus; /* T */
/*
* A flag to force sched domain rebuild at the end of an operation.
@@ -105,7 +170,7 @@ static bool have_boot_isolcpus;
* Note that update_relax_domain_level() in cpuset-v1.c can still call
* rebuild_sched_domains_locked() directly without using this flag.
*/
-static bool force_sd_rebuild;
+static bool force_sd_rebuild; /* RWCS */
/*
* Partition root states:
@@ -126,6 +191,17 @@ static bool force_sd_rebuild;
* For simplicity, a local partition can be created under a local or remote
* partition but a remote partition cannot have any partition root in its
* ancestor chain except the cgroup root.
+ *
+ * A valid partition can be formed by setting exclusive_cpus or cpus_allowed
+ * if exclusive_cpus is not set. In the case of a partition with empty
+ * exclusive_cpus, all the conflicting exclusive CPUs specified in the
+ * following cpumasks of sibling cpusets will be removed from its
+ * cpus_allowed in determining its effective_xcpus.
+ * - effective_xcpus
+ * - exclusive_cpus
+ *
+ * The "cpuset.cpus.exclusive" control file should be used for setting up
+ * a partition if users want to get as many CPUs as possible.
*/
#define PRS_MEMBER 0
#define PRS_ROOT 1
@@ -208,50 +284,13 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
* If cpu_online_mask is used while a hotunplug operation is happening in
* parallel, we may leave an offline CPU in cpu_allowed or some other masks.
*/
-static struct cpuset top_cpuset = {
+struct cpuset top_cpuset = {
.flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
- .relax_domain_level = -1,
- .remote_partition = false,
+ .dl_bw_cpu = -1,
};
-/*
- * There are two global locks guarding cpuset structures - cpuset_mutex and
- * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
- * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
- * structures. Note that cpuset_mutex needs to be a mutex as it is used in
- * paths that rely on priority inheritance (e.g. scheduler - on RT) for
- * correctness.
- *
- * A task must hold both locks to modify cpusets. If a task holds
- * cpuset_mutex, it blocks others, ensuring that it is the only task able to
- * also acquire callback_lock and be able to modify cpusets. It can perform
- * various checks on the cpuset structure first, knowing nothing will change.
- * It can also allocate memory while just holding cpuset_mutex. While it is
- * performing these checks, various callback routines can briefly acquire
- * callback_lock to query cpusets. Once it is ready to make the changes, it
- * takes callback_lock, blocking everyone else.
- *
- * Calls to the kernel memory allocator can not be made while holding
- * callback_lock, as that would risk double tripping on callback_lock
- * from one of the callbacks into the cpuset code from within
- * __alloc_pages().
- *
- * If a task is only holding callback_lock, then it has read-only
- * access to cpusets.
- *
- * Now, the task_struct fields mems_allowed and mempolicy may be changed
- * by other task, we use alloc_lock in the task_struct fields to protect
- * them.
- *
- * The cpuset_common_seq_show() handlers only hold callback_lock across
- * small pieces of code, such as when reading out possibly multi-word
- * cpumasks and nodemasks.
- */
-
-static DEFINE_MUTEX(cpuset_mutex);
-
/**
* cpuset_lock - Acquire the global cpuset mutex
*
@@ -268,6 +307,11 @@ void cpuset_unlock(void)
mutex_unlock(&cpuset_mutex);
}
+void lockdep_assert_cpuset_lock_held(void)
+{
+ lockdep_assert_held(&cpuset_mutex);
+}
+
/**
* cpuset_full_lock - Acquire full protection for cpuset modification
*
@@ -276,6 +320,7 @@ void cpuset_unlock(void)
*/
void cpuset_full_lock(void)
{
+ mutex_lock(&cpuset_top_mutex);
cpus_read_lock();
mutex_lock(&cpuset_mutex);
}
@@ -284,8 +329,17 @@ void cpuset_full_unlock(void)
{
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
+ mutex_unlock(&cpuset_top_mutex);
}
+#ifdef CONFIG_LOCKDEP
+bool lockdep_is_cpuset_held(void)
+{
+ return lockdep_is_held(&cpuset_mutex) ||
+ lockdep_is_held(&cpuset_top_mutex);
+}
+#endif
+
static DEFINE_SPINLOCK(callback_lock);
void cpuset_callback_lock_irq(void)
@@ -319,7 +373,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
*/
static inline void dec_attach_in_progress_locked(struct cpuset *cs)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
cs->attach_in_progress--;
if (!cs->attach_in_progress)
@@ -353,15 +407,6 @@ static inline bool is_in_v2_mode(void)
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
-static inline bool cpuset_is_populated(struct cpuset *cs)
-{
- lockdep_assert_held(&cpuset_mutex);
-
- /* Cpusets in the process of attaching should be considered as populated */
- return cgroup_is_populated(cs->css.cgroup) ||
- cs->attach_in_progress;
-}
-
/**
* partition_is_populated - check if partition has tasks
* @cs: partition root to be checked
@@ -453,9 +498,8 @@ static void guarantee_active_cpus(struct task_struct *tsk,
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
+ while (!nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
- nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
/**
@@ -532,10 +576,12 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
/* Allocate base structure */
trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) :
- kzalloc(sizeof(*cs), GFP_KERNEL);
+ kzalloc_obj(*cs);
if (!trial)
return NULL;
+ trial->dl_bw_cpu = -1;
+
/* Setup cpumask pointer array */
cpumask_var_t *pmask[4] = {
&trial->cpus_allowed,
@@ -603,36 +649,32 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
/**
* cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts
- * @cs1: first cpuset to check
- * @cs2: second cpuset to check
+ * @trial: the trial cpuset to be checked
+ * @sibling: a sibling cpuset to be checked against
+ * @xcpus_changed: set if exclusive_cpus has been set
*
* Returns: true if CPU exclusivity conflict exists, false otherwise
*
* Conflict detection rules:
- * 1. If either cpuset is CPU exclusive, they must be mutually exclusive
- * 2. exclusive_cpus masks cannot intersect between cpusets
- * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs
+ * o cgroup v1
+ * See cpuset1_cpus_excl_conflict()
+ * o cgroup v2
+ * - The exclusive_cpus values cannot overlap.
+ * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed.
*/
-static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
+static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling,
+ bool xcpus_changed)
{
- /* If either cpuset is exclusive, check if they are mutually exclusive */
- if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
- return !cpusets_are_exclusive(cs1, cs2);
-
- /* Exclusive_cpus cannot intersect */
- if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus))
- return true;
-
- /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */
- if (!cpumask_empty(cs1->cpus_allowed) &&
- cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus))
- return true;
+ if (!cpuset_v2())
+ return cpuset1_cpus_excl_conflict(trial, sibling);
- if (!cpumask_empty(cs2->cpus_allowed) &&
- cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus))
+ /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */
+ if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) &&
+ cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus))
return true;
- return false;
+ /* Exclusive_cpus cannot intersect */
+ return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus);
}
static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
@@ -666,6 +708,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
+ bool xcpus_changed;
int ret = 0;
rcu_read_lock();
@@ -682,20 +725,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
/*
- * Cpusets with tasks - existing or newly being attached - can't
- * be changed to have empty cpus_allowed or mems_allowed.
- */
- ret = -ENOSPC;
- if (cpuset_is_populated(cur)) {
- if (!cpumask_empty(cur->cpus_allowed) &&
- cpumask_empty(trial->cpus_allowed))
- goto out;
- if (!nodes_empty(cur->mems_allowed) &&
- nodes_empty(trial->mems_allowed))
- goto out;
- }
-
- /*
* We can't shrink if we won't have enough room for SCHED_DEADLINE
* tasks. This check is not done when scheduling is disabled as the
* users should know what they are doing.
@@ -722,10 +751,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
* overlap. exclusive_cpus cannot overlap with each other if set.
*/
ret = -EINVAL;
+ xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus);
cpuset_for_each_child(c, css, par) {
if (c == cur)
continue;
- if (cpus_excl_conflict(trial, c))
+ if (cpus_excl_conflict(trial, c, xcpus_changed))
goto out;
if (mems_excl_conflict(trial, c))
goto out;
@@ -738,49 +768,6 @@ out:
}
#ifdef CONFIG_SMP
-/*
- * Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping effective cpus_allowed masks?
- */
-static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
-{
- return cpumask_intersects(a->effective_cpus, b->effective_cpus);
-}
-
-static void
-update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
-{
- if (dattr->relax_domain_level < c->relax_domain_level)
- dattr->relax_domain_level = c->relax_domain_level;
- return;
-}
-
-static void update_domain_attr_tree(struct sched_domain_attr *dattr,
- struct cpuset *root_cs)
-{
- struct cpuset *cp;
- struct cgroup_subsys_state *pos_css;
-
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- /* skip the whole subtree if @cp doesn't have any CPU */
- if (cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
-
- if (is_sched_load_balance(cp))
- update_domain_attr(dattr, cp);
- }
- rcu_read_unlock();
-}
-
-/* Must be called with cpuset_mutex held. */
-static inline int nr_cpusets(void)
-{
- /* jump label reference count + the top-level cpuset */
- return static_key_count(&cpusets_enabled_key.key) + 1;
-}
/*
* generate_sched_domains()
@@ -820,103 +807,46 @@ static inline int nr_cpusets(void)
* convenient format, that can be easily compared to the prior
* value to determine what partition elements (sched domains)
* were changed (added or removed.)
- *
- * Finding the best partition (set of domains):
- * The double nested loops below over i, j scan over the load
- * balanced cpusets (using the array of cpuset pointers in csa[])
- * looking for pairs of cpusets that have overlapping cpus_allowed
- * and merging them using a union-find algorithm.
- *
- * The union of the cpus_allowed masks from the set of all cpusets
- * having the same root then form the one element of the partition
- * (one sched domain) to be passed to partition_sched_domains().
- *
*/
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
struct cpuset *cp; /* top-down scan of cpusets */
struct cpuset **csa; /* array of all cpuset ptrs */
- int csn; /* how many cpuset ptrs in csa so far */
int i, j; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
- int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
- bool root_load_balance = is_sched_load_balance(&top_cpuset);
- bool cgrpv2 = cpuset_v2();
- int nslot_update;
+
+ if (!cpuset_v2())
+ return cpuset1_generate_sched_domains(domains, attributes);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
-single_root_domain:
+ if (cpumask_empty(subpartitions_cpus)) {
ndoms = 1;
- doms = alloc_sched_domains(ndoms);
- if (!doms)
- goto done;
-
- dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
- if (dattr) {
- *dattr = SD_ATTR_INIT;
- update_domain_attr_tree(dattr, &top_cpuset);
- }
- cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
-
- goto done;
+		/* A NULL csa is checked for and handled correctly below */
+ goto generate_doms;
}
- csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
+ csa = kmalloc_objs(cp, nr_cpusets());
if (!csa)
goto done;
- csn = 0;
+	/* Find the valid partition roots and cache them in csa[] */
rcu_read_lock();
- if (root_load_balance)
- csa[csn++] = &top_cpuset;
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
- if (cp == &top_cpuset)
- continue;
-
- if (cgrpv2)
- goto v2;
-
- /*
- * v1:
- * Continue traversing beyond @cp iff @cp has some CPUs and
- * isn't load balancing. The former is obvious. The
- * latter: All child cpusets contain a subset of the
- * parent's cpus, so just skip them, and then we call
- * update_domain_attr_tree() to calc relax_domain_level of
- * the corresponding sched domain.
- */
- if (!cpumask_empty(cp->cpus_allowed) &&
- !(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_TYPE_DOMAIN))))
- continue;
-
- if (is_sched_load_balance(cp) &&
- !cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
-
- /* skip @cp's subtree */
- pos_css = css_rightmost_descendant(pos_css);
- continue;
-
-v2:
/*
* Only valid partition roots that are not isolated and with
- * non-empty effective_cpus will be saved into csn[].
+ * non-empty effective_cpus will be saved into csa[].
*/
if ((cp->partition_root_state == PRS_ROOT) &&
!cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
+ csa[ndoms++] = cp;
/*
* Skip @cp's subtree if not a partition root and has no
@@ -927,40 +857,18 @@ v2:
}
rcu_read_unlock();
- /*
- * If there are only isolated partitions underneath the cgroup root,
- * we can optimize out unneeded sched domains scanning.
- */
- if (root_load_balance && (csn == 1))
- goto single_root_domain;
-
- for (i = 0; i < csn; i++)
- uf_node_init(&csa[i]->node);
-
- /* Merge overlapping cpusets */
- for (i = 0; i < csn; i++) {
- for (j = i + 1; j < csn; j++) {
- if (cpusets_overlap(csa[i], csa[j])) {
+ for (i = 0; i < ndoms; i++) {
+ for (j = i + 1; j < ndoms; j++) {
+ if (cpusets_overlap(csa[i], csa[j]))
/*
* Cgroup v2 shouldn't pass down overlapping
* partition root cpusets.
*/
- WARN_ON_ONCE(cgrpv2);
- uf_union(&csa[i]->node, &csa[j]->node);
- }
+ WARN_ON_ONCE(1);
}
}
- /* Count the total number of domains */
- for (i = 0; i < csn; i++) {
- if (uf_find(&csa[i]->node) == &csa[i]->node)
- ndoms++;
- }
-
- /*
- * Now we know how many domains to create.
- * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
- */
+generate_doms:
doms = alloc_sched_domains(ndoms);
if (!doms)
goto done;
@@ -969,53 +877,26 @@ v2:
* The rest of the code, including the scheduler, can deal with
* dattr==NULL case. No need to abort if alloc fails.
*/
- dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
- GFP_KERNEL);
+ dattr = kmalloc_objs(struct sched_domain_attr, ndoms);
/*
* Cgroup v2 doesn't support domain attributes, just set all of them
* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
- * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+ * subset of HK_TYPE_DOMAIN_BOOT housekeeping CPUs.
*/
- if (cgrpv2) {
- for (i = 0; i < ndoms; i++) {
- /*
- * The top cpuset may contain some boot time isolated
- * CPUs that need to be excluded from the sched domain.
- */
- if (csa[i] == &top_cpuset)
- cpumask_and(doms[i], csa[i]->effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
- else
- cpumask_copy(doms[i], csa[i]->effective_cpus);
- if (dattr)
- dattr[i] = SD_ATTR_INIT;
- }
- goto done;
- }
-
- for (nslot = 0, i = 0; i < csn; i++) {
- nslot_update = 0;
- for (j = i; j < csn; j++) {
- if (uf_find(&csa[j]->node) == &csa[i]->node) {
- struct cpumask *dp = doms[nslot];
-
- if (i == j) {
- nslot_update = 1;
- cpumask_clear(dp);
- if (dattr)
- *(dattr + nslot) = SD_ATTR_INIT;
- }
- cpumask_or(dp, dp, csa[j]->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
- if (dattr)
- update_domain_attr_tree(dattr + nslot, csa[j]);
- }
- }
- if (nslot_update)
- nslot++;
+ for (i = 0; i < ndoms; i++) {
+ /*
+ * The top cpuset may contain some boot time isolated
+ * CPUs that need to be excluded from the sched domain.
+ */
+ if (!csa || csa[i] == &top_cpuset)
+ cpumask_and(doms[i], top_cpuset.effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
+ else
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
}
- BUG_ON(nslot != ndoms);
done:
kfree(csa);
@@ -1055,7 +936,7 @@ void dl_rebuild_rd_accounting(void)
int cpu;
u64 cookie = ++dl_cookie;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex);
@@ -1100,53 +981,33 @@ void dl_rebuild_rd_accounting(void)
*/
void rebuild_sched_domains_locked(void)
{
- struct cgroup_subsys_state *pos_css;
struct sched_domain_attr *attr;
cpumask_var_t *doms;
- struct cpuset *cs;
int ndoms;
+ int i;
lockdep_assert_cpus_held();
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
force_sd_rebuild = false;
- /*
- * If we have raced with CPU hotplug, return early to avoid
- * passing doms with offlined cpu to partition_sched_domains().
- * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
- *
- * With no CPUs in any subpartitions, top_cpuset's effective CPUs
- * should be the same as the active CPUs, so checking only top_cpuset
- * is enough to detect racing CPU offlines.
- */
- if (cpumask_empty(subpartitions_cpus) &&
- !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- return;
+ /* Generate domain masks and attrs */
+ ndoms = generate_sched_domains(&doms, &attr);
/*
- * With subpartition CPUs, however, the effective CPUs of a partition
- * root should be only a subset of the active CPUs. Since a CPU in any
- * partition root could be offlined, all must be checked.
- */
- if (!cpumask_empty(subpartitions_cpus)) {
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
- if (!is_partition_valid(cs)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
- if (!cpumask_subset(cs->effective_cpus,
- cpu_active_mask)) {
- rcu_read_unlock();
- return;
- }
- }
- rcu_read_unlock();
+	 * cpuset_hotplug_workfn() is invoked synchronously now, so this
+	 * function should not race with CPU hotplug, and the effective CPUs
+	 * must not include any offline CPUs. Passing an offline CPU in
+	 * doms to partition_sched_domains() will trigger a kernel panic.
+	 *
+	 * We perform a final check here: if doms contains any offline
+	 * CPUs, a warning is emitted and we return directly to prevent
+	 * the panic.
+ */
+ for (i = 0; doms && i < ndoms; i++) {
+ if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
+ return;
}
- /* Generate domain masks and attrs */
- ndoms = generate_sched_domains(&doms, &attr);
-
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
}
@@ -1205,11 +1066,10 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
if (top_cs) {
/*
+ * PF_KTHREAD tasks are handled by housekeeping.
* PF_NO_SETAFFINITY tasks are ignored.
- * All per cpu kthreads should have PF_NO_SETAFFINITY
- * flag set, see kthread_set_per_cpu().
*/
- if (task->flags & PF_NO_SETAFFINITY)
+ if (task->flags & (PF_KTHREAD | PF_NO_SETAFFINITY))
continue;
cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
} else {
@@ -1343,12 +1203,18 @@ static void reset_partition_data(struct cpuset *cs)
static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
{
WARN_ON_ONCE(old_prs == new_prs);
- if (new_prs == PRS_ISOLATED)
+ lockdep_assert_held(&callback_lock);
+ lockdep_assert_held(&cpuset_mutex);
+ if (new_prs == PRS_ISOLATED) {
+ if (cpumask_subset(xcpus, isolated_cpus))
+ return;
cpumask_or(isolated_cpus, isolated_cpus, xcpus);
- else
+ } else {
+ if (!cpumask_intersects(xcpus, isolated_cpus))
+ return;
cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
-
- isolated_cpus_updating = true;
+ }
+ update_housekeeping = true;
}
/*
@@ -1401,8 +1267,8 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent,
isolated_cpus_update(old_prs, parent->partition_root_state,
xcpus);
- cpumask_and(xcpus, xcpus, cpu_active_mask);
cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
+ cpumask_and(parent->effective_cpus, parent->effective_cpus, cpu_active_mask);
}
/*
@@ -1450,54 +1316,62 @@ static bool isolated_cpus_can_update(struct cpumask *add_cpus,
* @new_cpus: cpu mask
* Return: true if there is conflict, false otherwise
*
- * CPUs outside of boot_hk_cpus, if defined, can only be used in an
+ * CPUs outside of HK_TYPE_DOMAIN_BOOT, if defined, can only be used in an
* isolated partition.
*/
static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
{
- if (!have_boot_isolcpus)
+ if (!housekeeping_enabled(HK_TYPE_DOMAIN_BOOT))
return false;
- if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus))
+ if ((prstate != PRS_ISOLATED) &&
+ !cpumask_subset(new_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT)))
return true;
return false;
}
/*
- * update_isolation_cpumasks - Update external isolation related CPU masks
+ * cpuset_update_sd_hk_unlock - Rebuild sched domains, update HK & unlock
*
- * The following external CPU masks will be updated if necessary:
- * - workqueue unbound cpumask
+ * Update the housekeeping cpumasks and rebuild the sched domains if
+ * necessary, then do a cpuset_full_unlock(). This should be called at
+ * the end of a cpuset operation.
*/
-static void update_isolation_cpumasks(void)
+static void cpuset_update_sd_hk_unlock(void)
+ __releases(&cpuset_mutex)
+ __releases(&cpuset_top_mutex)
{
- int ret;
-
- if (!isolated_cpus_updating)
- return;
-
- lockdep_assert_cpus_held();
-
- ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
- WARN_ON_ONCE(ret < 0);
+ /* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
- ret = tmigr_isolated_exclude_cpumask(isolated_cpus);
- WARN_ON_ONCE(ret < 0);
+ if (update_housekeeping) {
+ update_housekeeping = false;
+ cpumask_copy(isolated_hk_cpus, isolated_cpus);
- isolated_cpus_updating = false;
+ /*
+ * housekeeping_update() is now called without holding
+ * cpus_read_lock and cpuset_mutex. Only cpuset_top_mutex
+ * is still being held for mutual exclusion.
+ */
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
+ WARN_ON_ONCE(housekeeping_update(isolated_hk_cpus));
+ mutex_unlock(&cpuset_top_mutex);
+ } else {
+ cpuset_full_unlock();
+ }
}
-/**
- * cpuset_cpu_is_isolated - Check if the given CPU is isolated
- * @cpu: the CPU number to be checked
- * Return: true if CPU is used in an isolated partition, false otherwise
+/*
+ * Work function to invoke cpuset_update_sd_hk_unlock()
*/
-bool cpuset_cpu_is_isolated(int cpu)
+static void hk_sd_workfn(struct work_struct *work)
{
- return cpumask_test_cpu(cpu, isolated_cpus);
+ cpuset_full_lock();
+ cpuset_update_sd_hk_unlock();
}
-EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
/**
* rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets
@@ -1517,23 +1391,29 @@ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
int retval = 0;
if (cpumask_empty(excpus))
- return retval;
+ return 0;
/*
- * Exclude exclusive CPUs from siblings
+ * Remove exclusive CPUs from siblings
*/
rcu_read_lock();
cpuset_for_each_child(sibling, css, parent) {
+ struct cpumask *sibling_xcpus;
+
if (sibling == cs)
continue;
- if (cpumask_intersects(excpus, sibling->exclusive_cpus)) {
- cpumask_andnot(excpus, excpus, sibling->exclusive_cpus);
- retval++;
- continue;
- }
- if (cpumask_intersects(excpus, sibling->effective_xcpus)) {
- cpumask_andnot(excpus, excpus, sibling->effective_xcpus);
+ /*
+		 * If exclusive_cpus is set, effective_xcpus will always be a
+		 * subset of it. Otherwise, effective_xcpus will only be set
+		 * in a valid partition root.
+ */
+ sibling_xcpus = cpumask_empty(sibling->exclusive_cpus)
+ ? sibling->effective_xcpus
+ : sibling->exclusive_cpus;
+
+ if (cpumask_intersects(excpus, sibling_xcpus)) {
+ cpumask_andnot(excpus, excpus, sibling_xcpus);
retval++;
}
}
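The sibling walk above simply strips each sibling's exclusive range from the requested mask and counts how many siblings conflicted. A minimal userspace sketch with plain bitmasks (illustrative names, not kernel code):

#include <stdio.h>

struct fake_sibling {
	unsigned long exclusive_cpus;
	unsigned long effective_xcpus;
};

/* Illustrative stand-in for the loop body of rm_siblings_excl_cpus(). */
static int rm_siblings_excl(unsigned long *excpus,
			    const struct fake_sibling *sib, int n)
{
	int hits = 0;

	for (int i = 0; i < n; i++) {
		unsigned long sx = sib[i].exclusive_cpus ?
				   sib[i].exclusive_cpus : sib[i].effective_xcpus;

		if (*excpus & sx) {
			*excpus &= ~sx;
			hits++;
		}
	}
	return hits;
}

int main(void)
{
	struct fake_sibling sibs[] = {
		{ .exclusive_cpus = 0x0c },	/* sibling owns CPUs 2-3 */
		{ .effective_xcpus = 0x30 },	/* partition root on CPUs 4-5 */
	};
	unsigned long want = 0x3e;		/* request CPUs 1-5 */
	int hits = rm_siblings_excl(&want, sibs, 2);

	printf("hits=%d remaining=%#lx\n", hits, want);	/* hits=2 remaining=0x2 */
	return 0;
}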
@@ -1641,7 +1521,6 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
cs->remote_partition = true;
cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
cpuset_force_rebuild();
cs->prs_err = 0;
@@ -1686,7 +1565,6 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
compute_excpus(cs, cs->effective_xcpus);
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
cpuset_force_rebuild();
/*
@@ -1757,7 +1635,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
if (xcpus)
cpumask_copy(cs->exclusive_cpus, xcpus);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
if (adding || deleting)
cpuset_force_rebuild();
@@ -1822,7 +1699,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
int parent_prs = parent->partition_root_state;
bool nocpu;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */
/*
@@ -2101,7 +1978,6 @@ write_error:
partition_xcpus_add(new_prs, parent, tmp->delmask);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
if ((old_prs != new_prs) && (cmd == partcmd_update))
update_partition_exclusive_flag(cs, new_prs);
@@ -2331,17 +2207,13 @@ get_css:
spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cp->partition_root_state = new_prs;
- if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
- compute_excpus(cp, cp->effective_xcpus);
-
/*
- * Make sure effective_xcpus is properly set for a valid
- * partition root.
+ * Need to compute effective_xcpus if either exclusive_cpus
+ * is non-empty or it is a valid partition root.
*/
- if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
- cpumask_and(cp->effective_xcpus,
- cp->cpus_allowed, parent->effective_xcpus);
- else if (new_prs < 0)
+ if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus))
+ compute_excpus(cp, cp->effective_xcpus);
+ if (new_prs <= 0)
reset_partition_data(cp);
spin_unlock_irq(&callback_lock);
@@ -2350,7 +2222,7 @@ get_css:
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
- cpuset_update_tasks_cpumask(cp, cp->effective_cpus);
+ cpuset_update_tasks_cpumask(cp, tmp->new_cpus);
/*
* On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
@@ -2394,7 +2266,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
/*
* Check all its siblings and call update_cpumasks_hier()
@@ -2403,27 +2275,20 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
* It is possible a change in parent's effective_cpus
* due to a change in a child partition's effective_xcpus will impact
* its siblings even if they do not inherit parent's effective_cpus
- * directly.
+	 * directly. It should not impact valid partitions.
*
* The update_cpumasks_hier() function may sleep. So we have to
* release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
- if (sibling == cs)
+ if (sibling == cs || is_partition_valid(sibling))
continue;
- if (!is_partition_valid(sibling)) {
- compute_effective_cpumask(tmp->new_cpus, sibling,
- parent);
- if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
- continue;
- } else if (is_remote_partition(sibling)) {
- /*
- * Change in a sibling cpuset won't affect a remote
- * partition root.
- */
+
+ compute_effective_cpumask(tmp->new_cpus, sibling,
+ parent);
+ if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
continue;
- }
if (!css_tryget_online(&sibling->css))
continue;
@@ -2479,43 +2344,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri
return PERR_NONE;
}
-static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs,
- struct tmpmasks *tmp)
-{
- int retval;
- struct cpuset *parent = parent_cs(cs);
-
- retval = validate_change(cs, trialcs);
-
- if ((retval == -EINVAL) && cpuset_v2()) {
- struct cgroup_subsys_state *css;
- struct cpuset *cp;
-
- /*
- * The -EINVAL error code indicates that partition sibling
- * CPU exclusivity rule has been violated. We still allow
- * the cpumask change to proceed while invalidating the
- * partition. However, any conflicting sibling partitions
- * have to be marked as invalid too.
- */
- trialcs->prs_err = PERR_NOTEXCL;
- rcu_read_lock();
- cpuset_for_each_child(cp, css, parent) {
- struct cpumask *xcpus = user_xcpus(trialcs);
-
- if (is_partition_valid(cp) &&
- cpumask_intersects(xcpus, cp->effective_xcpus)) {
- rcu_read_unlock();
- update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp);
- rcu_read_lock();
- }
- }
- rcu_read_unlock();
- retval = 0;
- }
- return retval;
-}
-
/**
* partition_cpus_change - Handle partition state changes due to CPU mask updates
* @cs: The target cpuset being modified
@@ -2575,15 +2403,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- if (alloc_tmpmasks(&tmp))
- return -ENOMEM;
-
compute_trialcs_excpus(trialcs, cs);
trialcs->prs_err = PERR_NONE;
- retval = cpus_allowed_validate_change(cs, trialcs, &tmp);
+ retval = validate_change(cs, trialcs);
if (retval < 0)
- goto out_free;
+ return retval;
+
+ if (alloc_tmpmasks(&tmp))
+ return -ENOMEM;
/*
* Check all the descendants in update_cpumasks_hier() if
@@ -2606,7 +2434,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
-out_free:
+
free_tmpmasks(&tmp);
return retval;
}
@@ -2717,7 +2545,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
return;
}
- mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+ mwork = kzalloc_obj(*mwork);
if (mwork) {
mwork->mm = mm;
mwork->from = *from;
@@ -2739,7 +2567,7 @@ static void schedule_flush_migrate_mm(void)
{
struct callback_head *flush_cb;
- flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);
+ flush_cb = kzalloc_obj(struct callback_head);
if (!flush_cb)
return;
@@ -2859,13 +2687,13 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
- nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+ bool has_mems = nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
/*
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (is_in_v2_mode() && nodes_empty(*new_mems))
+ if (is_in_v2_mode() && !has_mems)
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -3117,7 +2945,6 @@ out:
else if (isolcpus_updated)
isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
- update_isolation_cpumasks();
/* Force update if switching back to member & update effective_xcpus */
update_cpumasks_hier(cs, &tmpmask, !new_prs);
@@ -3156,6 +2983,7 @@ static void reset_migrate_dl_data(struct cpuset *cs)
{
cs->nr_migrate_dl_tasks = 0;
cs->sum_migrate_dl_bw = 0;
+ cs->dl_bw_cpu = -1;
}
/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
@@ -3164,7 +2992,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
struct cpuset *cs, *oldcs;
struct task_struct *task;
- bool cpus_updated, mems_updated;
+ bool setsched_check;
int ret;
/* used later by cpuset_attach() */
@@ -3179,20 +3007,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
if (ret)
goto out_unlock;
- cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
- mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+ /*
+	 * Skip the security_task_setscheduler() check in v2 when nothing
+	 * changes; migration permission derives from hierarchy ownership
+	 * in cgroup_procs_write_permission().
+ */
+ setsched_check = !cpuset_v2() ||
+ !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) ||
+ !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
+ /*
+	 * A v1 cpuset with tasks can only end up with no CPU left when CPU
+	 * hotplug takes the last online CPU offline, as users are not allowed
+	 * to empty cpuset.cpus while there are active tasks inside. When that
+	 * happens, allow the tasks to migrate out without the security check
+	 * so that they can still run after migration.
+ */
+ if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus))
+ setsched_check = false;
cgroup_taskset_for_each(task, css, tset) {
ret = task_can_attach(task);
if (ret)
goto out_unlock;
- /*
- * Skip rights over task check in v2 when nothing changes,
- * migration permission derives from hierarchy ownership in
- * cgroup_procs_write_permission()).
- */
- if (!cpuset_v2() || (cpus_updated || mems_updated)) {
+ if (setsched_check) {
ret = security_task_setscheduler(task);
if (ret)
goto out_unlock;
@@ -3221,6 +3060,8 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
reset_migrate_dl_data(cs);
goto out_unlock;
}
+
+ cs->dl_bw_cpu = cpu;
}
out_success:
@@ -3245,12 +3086,11 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
mutex_lock(&cpuset_mutex);
dec_attach_in_progress_locked(cs);
- if (cs->nr_migrate_dl_tasks) {
- int cpu = cpumask_any(cs->effective_cpus);
+ if (cs->dl_bw_cpu >= 0)
+ dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
- dl_bw_free(cpu, cs->sum_migrate_dl_bw);
+ if (cs->nr_migrate_dl_tasks)
reset_migrate_dl_data(cs);
- }
mutex_unlock(&cpuset_mutex);
}
@@ -3265,7 +3105,7 @@ static nodemask_t cpuset_attach_nodemask_to;
static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
if (cs != &top_cpuset)
guarantee_active_cpus(task, cpus_attach);
@@ -3407,10 +3247,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
}
free_cpuset(trialcs);
- if (force_sd_rebuild)
- rebuild_sched_domains_locked();
out_unlock:
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
if (of_cft(of)->private == FILE_MEMLIST)
schedule_flush_migrate_mm();
return retval ?: nbytes;
@@ -3517,7 +3355,7 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
cpuset_full_lock();
if (is_cpuset_online(cs))
retval = update_prstate(cs, val);
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
return retval ?: nbytes;
}
@@ -3621,8 +3459,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- fmeter_init(&cs->fmeter);
- cs->relax_domain_level = -1;
+ cpuset1_init(cs);
/* Set CS_MEMORY_MIGRATE for default hierarchy */
if (cpuset_v2())
@@ -3635,17 +3472,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
struct cpuset *parent = parent_cs(cs);
- struct cpuset *tmp_cs;
- struct cgroup_subsys_state *pos_css;
if (!parent)
return 0;
cpuset_full_lock();
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
/*
* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
*/
@@ -3660,39 +3491,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->effective_mems;
}
spin_unlock_irq(&callback_lock);
+ cpuset1_online_css(css);
- if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
- goto out_unlock;
-
- /*
- * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
- * set. This flag handling is implemented in cgroup core for
- * historical reasons - the flag may be specified during mount.
- *
- * Currently, if any sibling cpusets have exclusive cpus or mem, we
- * refuse to clone the configuration - thereby refusing the task to
- * be entered, and as a result refusing the sys_unshare() or
- * clone() which initiated it. If this becomes a problem for some
- * users who wish to allow that scenario, then this could be
- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup.
- */
- rcu_read_lock();
- cpuset_for_each_child(tmp_cs, pos_css, parent) {
- if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
- rcu_read_unlock();
- goto out_unlock;
- }
- }
- rcu_read_unlock();
-
- spin_lock_irq(&callback_lock);
- cs->mems_allowed = parent->mems_allowed;
- cs->effective_mems = parent->mems_allowed;
- cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
- spin_unlock_irq(&callback_lock);
-out_unlock:
cpuset_full_unlock();
return 0;
}
@@ -3729,7 +3529,7 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css)
/* Reset valid partition back to member */
if (is_partition_valid(cs))
update_prstate(cs, PRS_MEMBER);
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -3884,6 +3684,7 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&isolated_hk_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
@@ -3892,16 +3693,13 @@ int __init cpuset_init(void)
cpumask_setall(top_cpuset.exclusive_cpus);
nodes_setall(top_cpuset.effective_mems);
- fmeter_init(&top_cpuset.fmeter);
+ cpuset1_init(&top_cpuset);
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
- have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN);
- if (have_boot_isolcpus) {
- BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL));
- cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN));
- cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus);
- }
+ if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT))
+ cpumask_andnot(isolated_cpus, cpu_possible_mask,
+ housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
return 0;
}
@@ -4058,6 +3856,7 @@ unlock:
*/
static void cpuset_handle_hotplug(void)
{
+ static DECLARE_WORK(hk_sd_work, hk_sd_workfn);
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
@@ -4139,9 +3938,25 @@ static void cpuset_handle_hotplug(void)
rcu_read_unlock();
}
- /* rebuild sched domains if necessary */
+ /*
+ * rebuild_sched_domains() will always be called directly if needed
+	 * to make sure that newly added or removed CPUs are reflected in
+ * the sched domains. However, if isolated partition invalidation
+ * or recreation is being done (update_housekeeping set), a work item
+ * will be queued to call housekeeping_update() to update the
+ * corresponding housekeeping cpumasks after some slight delay.
+ *
+ * We rely on WORK_STRUCT_PENDING_BIT to not requeue a work item that
+	 * is still pending. Before the pending bit is cleared, the work data
+	 * is copied out and the work item dequeued. So it is possible to queue
+ * the work again before the hk_sd_workfn() is invoked to process the
+ * previously queued work. Since hk_sd_workfn() doesn't use the work
+ * item at all, this is not a problem.
+ */
if (force_sd_rebuild)
rebuild_sched_domains_cpuslocked();
+ if (update_housekeeping)
+ queue_work(system_dfl_wq, &hk_sd_work);
free_tmpmasks(ptmp);
}
@@ -4229,7 +4044,7 @@ static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask
*/
void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
{
- lockdep_assert_held(&cpuset_mutex);
+ lockdep_assert_cpuset_lock_held();
__cpuset_cpus_allowed_locked(tsk, pmask);
}
@@ -4424,40 +4239,58 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
return allowed;
}
-bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+/**
+ * cpuset_nodes_allowed - return effective_mems mask from a cgroup cpuset.
+ * @cgroup: pointer to struct cgroup.
+ * @mask: pointer to the nodemask_t to be filled in.
+ *
+ * Returns the effective_mems mask of the cgroup's cpuset on cgroup v2 with
+ * the cpuset subsystem enabled. Otherwise, returns node_states[N_MEMORY].
+ *
+ * This function intentionally avoids taking the cpuset_mutex or callback_lock
+ * when accessing effective_mems. This is because the obtained effective_mems
+ * is stale immediately after the query anyway (e.g., effective_mems is updated
+ * immediately after releasing the lock but before returning).
+ *
+ * As a result, the returned @mask may be empty because cs->effective_mems can
+ * be rebound during this call. Besides, nodes in @mask are not guaranteed to
+ * be online due to hotplug. Callers should check the mask for validity on
+ * return based on its subsequent use.
+ */
+void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask)
{
struct cgroup_subsys_state *css;
struct cpuset *cs;
- bool allowed;
/*
* In v1, mem_cgroup and cpuset are unlikely in the same hierarchy
* and mems_allowed is likely to be empty even if we could get to it,
- * so return true to avoid taking a global lock on the empty check.
+ * so return directly to avoid taking a global lock on the empty check.
*/
- if (!cpuset_v2())
- return true;
+ if (!cgroup || !cpuset_v2()) {
+ nodes_copy(*mask, node_states[N_MEMORY]);
+ return;
+ }
css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
- if (!css)
- return true;
+ if (!css) {
+ nodes_copy(*mask, node_states[N_MEMORY]);
+ return;
+ }
/*
- * Normally, accessing effective_mems would require the cpuset_mutex
- * or callback_lock - but node_isset is atomic and the reference
- * taken via cgroup_get_e_css is sufficient to protect css.
- *
- * Since this interface is intended for use by migration paths, we
- * relax locking here to avoid taking global locks - while accepting
- * there may be rare scenarios where the result may be innaccurate.
+ * The reference taken via cgroup_get_e_css is sufficient to
+ * protect css, but it does not imply safe accesses to effective_mems.
*
- * Reclaim and migration are subject to these same race conditions, and
- * cannot make strong isolation guarantees, so this is acceptable.
+ * Normally, accessing effective_mems would require the cpuset_mutex
+	 * or callback_lock - but the information is stale immediately after
+	 * the query anyway. We do not acquire the locks here, trading the
+	 * saved lock contention for the possibility of racing against
+	 * mems_allowed rebinds.
*/
cs = container_of(css, struct cpuset, css);
- allowed = node_isset(nid, cs->effective_mems);
+ nodes_copy(*mask, cs->effective_mems);
css_put(css);
- return allowed;
}
/**
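As a rough illustration of how a caller might consume the new interface, here is a hypothetical migration-path check; only cpuset_nodes_allowed(), nodemask_t and node_isset() come from the kernel, the surrounding function is made up for this sketch.

/* Hypothetical caller sketch, not part of this patch. */
static bool migration_target_allowed(struct cgroup *cgroup, int nid)
{
	nodemask_t mask;

	cpuset_nodes_allowed(cgroup, &mask);

	/* The mask may be stale or even empty; treat the answer as advisory. */
	return node_isset(nid, mask);
}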
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 81ea38dd6f9d..883347b87842 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -14,7 +14,7 @@
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+ struct cgroup_subsys_state *css = kzalloc_obj(*css);
if (!css)
return ERR_PTR(-ENOMEM);
@@ -230,7 +230,7 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
}
static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
- u16 mask)
+ u32 mask)
{
struct cgroup_subsys *ss;
int ssid;
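The kzalloc_obj()/kzalloc_objs() conversions in this series all follow one pattern: derive the allocation size from a type or expression instead of spelling out sizeof() at every call site. A userspace analog of that pattern is sketched below; the macros are only an assumption about the presumed behaviour, not the kernel definitions.

#include <stdlib.h>

#define zalloc_obj(obj)		calloc(1, sizeof(obj))
#define zalloc_objs(obj, n)	calloc((n), sizeof(obj))

struct example { int state; };

int main(void)
{
	struct example *a = zalloc_obj(struct example);	/* was kzalloc(sizeof(struct example), ...) */
	struct example *b = zalloc_obj(*b);		/* was kzalloc(sizeof(*b), ...) */
	int *limits = zalloc_objs(int, 8);		/* was kcalloc(8, sizeof(int), ...) */

	free(limits);
	free(b);
	free(a);
	return 0;
}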
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index e12b946278b6..1ab1fb47f271 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -14,6 +14,7 @@
#include <linux/mutex.h>
#include <linux/page_counter.h>
#include <linux/parser.h>
+#include <linux/refcount.h>
#include <linux/rculist.h>
#include <linux/slab.h>
@@ -71,7 +72,9 @@ struct dmem_cgroup_pool_state {
struct rcu_head rcu;
struct page_counter cnt;
+ struct dmem_cgroup_pool_state *parent;
+ refcount_t ref;
bool inited;
};
@@ -88,6 +91,9 @@ struct dmem_cgroup_pool_state {
static DEFINE_SPINLOCK(dmemcg_lock);
static LIST_HEAD(dmem_cgroup_regions);
+static void dmemcg_free_region(struct kref *ref);
+static void dmemcg_pool_free_rcu(struct rcu_head *rcu);
+
static inline struct dmemcg_state *
css_to_dmemcs(struct cgroup_subsys_state *css)
{
@@ -104,10 +110,38 @@ static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
}
+static void dmemcg_pool_get(struct dmem_cgroup_pool_state *pool)
+{
+ refcount_inc(&pool->ref);
+}
+
+static bool dmemcg_pool_tryget(struct dmem_cgroup_pool_state *pool)
+{
+ return refcount_inc_not_zero(&pool->ref);
+}
+
+static void dmemcg_pool_put(struct dmem_cgroup_pool_state *pool)
+{
+ if (!refcount_dec_and_test(&pool->ref))
+ return;
+
+ call_rcu(&pool->rcu, dmemcg_pool_free_rcu);
+}
+
+static void dmemcg_pool_free_rcu(struct rcu_head *rcu)
+{
+ struct dmem_cgroup_pool_state *pool = container_of(rcu, typeof(*pool), rcu);
+
+ if (pool->parent)
+ dmemcg_pool_put(pool->parent);
+ kref_put(&pool->region->ref, dmemcg_free_region);
+ kfree(pool);
+}
+
static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
{
list_del(&pool->region_node);
- kfree(pool);
+ dmemcg_pool_put(pool);
}
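The new pool refcounting boils down to a get/tryget/put triple where tryget refuses to resurrect a pool whose count has already hit zero and the final put frees it. A userspace sketch with C11 atomics follows; the kernel version additionally defers the actual free through call_rcu(), which is not modelled here.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct pool {
	atomic_int ref;
	/* ... counters, list linkage ... */
};

static void pool_get(struct pool *p)
{
	atomic_fetch_add(&p->ref, 1);
}

static bool pool_tryget(struct pool *p)
{
	int old = atomic_load(&p->ref);

	/* refuse to resurrect a pool whose last reference is already gone */
	while (old != 0) {
		if (atomic_compare_exchange_weak(&p->ref, &old, old + 1))
			return true;
	}
	return false;
}

static void pool_put(struct pool *p)
{
	if (atomic_fetch_sub(&p->ref, 1) == 1)
		free(p);	/* kernel: call_rcu(&pool->rcu, ...) */
}

int main(void)
{
	struct pool *p = calloc(1, sizeof(*p));

	atomic_store(&p->ref, 1);	/* creation reference */
	if (pool_tryget(p))		/* an RCU-protected lookup would do this */
		pool_put(p);
	pool_put(p);			/* final put frees */
	return 0;
}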
static void
@@ -188,7 +222,7 @@ static void dmemcs_free(struct cgroup_subsys_state *css)
static struct cgroup_subsys_state *
dmemcs_alloc(struct cgroup_subsys_state *parent_css)
{
- struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL);
+ struct dmemcg_state *dmemcs = kzalloc_obj(*dmemcs);
if (!dmemcs)
return ERR_PTR(-ENOMEM);
@@ -325,7 +359,7 @@ alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region
struct dmem_cgroup_pool_state *pool, *ppool = NULL;
if (!*allocpool) {
- pool = kzalloc(sizeof(*pool), GFP_NOWAIT);
+ pool = kzalloc_obj(*pool, GFP_NOWAIT);
if (!pool)
return ERR_PTR(-ENOMEM);
} else {
@@ -342,6 +376,12 @@ alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region
page_counter_init(&pool->cnt,
ppool ? &ppool->cnt : NULL, true);
reset_all_resource_limits(pool);
+ refcount_set(&pool->ref, 1);
+ kref_get(&region->ref);
+ if (ppool && !pool->parent) {
+ pool->parent = ppool;
+ dmemcg_pool_get(ppool);
+ }
list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
list_add_tail(&pool->region_node, &region->pools);
@@ -389,6 +429,10 @@ get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *regio
/* Fix up parent links, mark as inited. */
pool->cnt.parent = &ppool->cnt;
+ if (ppool && !pool->parent) {
+ pool->parent = ppool;
+ dmemcg_pool_get(ppool);
+ }
pool->inited = true;
pool = ppool;
@@ -423,7 +467,7 @@ static void dmemcg_free_region(struct kref *ref)
*/
void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
{
- struct list_head *entry;
+ struct dmem_cgroup_pool_state *pool, *next;
if (!region)
return;
@@ -433,11 +477,10 @@ void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
/* Remove from global region list */
list_del_rcu(&region->region_node);
- list_for_each_rcu(entry, &region->pools) {
- struct dmem_cgroup_pool_state *pool =
- container_of(entry, typeof(*pool), region_node);
-
+ list_for_each_entry_safe(pool, next, &region->pools, region_node) {
list_del_rcu(&pool->css_node);
+ list_del(&pool->region_node);
+ dmemcg_pool_put(pool);
}
/*
@@ -478,7 +521,7 @@ struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt
if (!region_name)
return ERR_PTR(-ENOMEM);
- ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ ret = kzalloc_obj(*ret);
if (!ret) {
kfree(region_name);
return ERR_PTR(-ENOMEM);
@@ -518,8 +561,10 @@ static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
*/
void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
{
- if (pool)
+ if (pool) {
css_put(&pool->cs->css);
+ dmemcg_pool_put(pool);
+ }
}
EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);
@@ -533,6 +578,8 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
pool = find_cg_pool_locked(cg, region);
if (pool && !READ_ONCE(pool->inited))
pool = NULL;
+ if (pool && !dmemcg_pool_tryget(pool))
+ pool = NULL;
rcu_read_unlock();
while (!pool) {
@@ -541,6 +588,8 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
pool = get_cg_pool_locked(cg, region, &allocpool);
else
pool = ERR_PTR(-ENODEV);
+ if (!IS_ERR(pool))
+ dmemcg_pool_get(pool);
spin_unlock(&dmemcg_lock);
if (pool == ERR_PTR(-ENOMEM)) {
@@ -548,7 +597,7 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
if (WARN_ON(allocpool))
continue;
- allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
+ allocpool = kzalloc_obj(*allocpool);
if (allocpool) {
pool = NULL;
continue;
@@ -576,6 +625,7 @@ void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
page_counter_uncharge(&pool->cnt, size);
css_put(&pool->cs->css);
+ dmemcg_pool_put(pool);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);
@@ -627,7 +677,9 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
if (ret_limit_pool) {
*ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
css_get(&(*ret_limit_pool)->cs->css);
+ dmemcg_pool_get(*ret_limit_pool);
}
+ dmemcg_pool_put(pool);
ret = -EAGAIN;
goto err;
}
@@ -655,8 +707,7 @@ static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
return 0;
}
-static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
- u64 *new_limit)
+static int dmemcg_parse_limit(char *options, u64 *new_limit)
{
char *end;
@@ -700,6 +751,9 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
if (!region_name[0])
continue;
+ if (!options || !*options)
+ return -EINVAL;
+
rcu_read_lock();
region = dmemcg_get_region_by_name(region_name);
rcu_read_unlock();
@@ -707,7 +761,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
if (!region)
return -EINVAL;
- err = dmemcg_parse_limit(options, region, &new_limit);
+ err = dmemcg_parse_limit(options, &new_limit);
if (err < 0)
goto out_put;
@@ -719,6 +773,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
/* And commit */
apply(pool, new_limit);
+ dmemcg_pool_put(pool);
out_put:
kref_put(&region->ref, dmemcg_free_region);
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 817c33450fee..8545e0d1ba3d 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -81,7 +81,7 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct freezer *freezer;
- freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
+ freezer = kzalloc_obj(struct freezer);
if (!freezer)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 6a01d91ea4cb..4a9e2557141c 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -445,7 +445,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
if (!parent_css) {
cg = &root_cg;
} else {
- cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+ cg = kzalloc_obj(*cg);
if (!cg)
return ERR_PTR(-ENOMEM);
}
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index db9617556dd7..ea4ee13936be 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
struct cgroup_namespace *new_ns __free(kfree) = NULL;
int ret;
- new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
+ new_ns = kzalloc_obj(struct cgroup_namespace, GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
ret = ns_common_init(new_ns);
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 8f61114c36dd..ecbb839d2acb 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -80,7 +80,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent)
{
struct pids_cgroup *pids;
- pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+ pids = kzalloc_obj(struct pids_cgroup);
if (!pids)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index ef5878fb2005..4fdab4cf49e0 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -134,7 +134,7 @@ get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
if (rpool)
return rpool;
- rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
+ rpool = kzalloc_obj(*rpool);
if (!rpool)
return ERR_PTR(-ENOMEM);
@@ -173,7 +173,7 @@ uncharge_cg_locked(struct rdma_cgroup *cg,
* the system.
*/
if (unlikely(!rpool)) {
- pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
+ pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg);
return;
}
@@ -283,7 +283,7 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
ret = PTR_ERR(rpool);
goto err;
} else {
- new = rpool->resources[index].usage + 1;
+ new = (s64)rpool->resources[index].usage + 1;
if (new > rpool->resources[index].max) {
ret = -EAGAIN;
goto err;
@@ -443,7 +443,7 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
goto err;
}
- new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
+ new_limits = kzalloc_objs(int, RDMACG_RESOURCE_MAX);
if (!new_limits) {
ret = -ENOMEM;
goto err;
@@ -566,7 +566,7 @@ rdmacg_css_alloc(struct cgroup_subsys_state *parent)
{
struct rdma_cgroup *cg;
- cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+ cg = kzalloc_obj(*cg);
if (!cg)
return ERR_PTR(-ENOMEM);