Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--  kernel/cpuset.c  509
1 file changed, 311 insertions, 198 deletions
| diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 116a4164720a..52cb04c993b7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -76,8 +76,34 @@ struct cpuset {  	struct cgroup_subsys_state css;  	unsigned long flags;		/* "unsigned long" so bitops work */ -	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */ -	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */ + +	/* +	 * On default hierarchy: +	 * +	 * The user-configured masks can only be changed by writing to +	 * cpuset.cpus and cpuset.mems, and won't be limited by the +	 * parent masks. +	 * +	 * The effective masks is the real masks that apply to the tasks +	 * in the cpuset. They may be changed if the configured masks are +	 * changed or hotplug happens. +	 * +	 * effective_mask == configured_mask & parent's effective_mask, +	 * and if it ends up empty, it will inherit the parent's mask. +	 * +	 * +	 * On legacy hierachy: +	 * +	 * The user-configured masks are always the same with effective masks. +	 */ + +	/* user-configured CPUs and Memory Nodes allow to tasks */ +	cpumask_var_t cpus_allowed; +	nodemask_t mems_allowed; + +	/* effective CPUs and Memory Nodes allow to tasks */ +	cpumask_var_t effective_cpus; +	nodemask_t effective_mems;  	/*  	 * This is old Memory Nodes tasks took on. @@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = {   */  static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)  { -	while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) +	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))  		cs = parent_cs(cs); -	cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); +	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);  }  /* @@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)   */  static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)  { -	while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) +	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))  		cs = parent_cs(cs); -	nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); +	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);  }  /* @@ -339,13 +365,14 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,  					struct task_struct *tsk)  {  	if (is_spread_page(cs)) -		tsk->flags |= PF_SPREAD_PAGE; +		task_set_spread_page(tsk);  	else -		tsk->flags &= ~PF_SPREAD_PAGE; +		task_clear_spread_page(tsk); +  	if (is_spread_slab(cs)) -		tsk->flags |= PF_SPREAD_SLAB; +		task_set_spread_slab(tsk);  	else -		tsk->flags &= ~PF_SPREAD_SLAB; +		task_clear_spread_slab(tsk);  }  /* @@ -376,13 +403,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)  	if (!trial)  		return NULL; -	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { -		kfree(trial); -		return NULL; -	} -	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); +	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) +		goto free_cs; +	if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) +		goto free_cpus; +	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); +	cpumask_copy(trial->effective_cpus, cs->effective_cpus);  	return trial; + +free_cpus: +	free_cpumask_var(trial->cpus_allowed); +free_cs: +	kfree(trial); +	return NULL;  }  /** @@ -391,6 +425,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)   */  static void free_trial_cpuset(struct cpuset *trial)  { +	free_cpumask_var(trial->effective_cpus);  	free_cpumask_var(trial->cpus_allowed);  	kfree(trial);  } 
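The new struct comment above pins down how the configured and effective masks relate on the default hierarchy: effective = configured & parent's effective, with a fallback to the parent's mask when the intersection is empty. Below is a minimal user-space sketch of that rule, using plain bitmasks instead of the kernel's cpumask_var_t; mask_t and compute_effective() are illustrative names, not part of the patch.

#include <stdio.h>

/* Model CPU masks as plain bitmasks; the kernel uses cpumask_var_t. */
typedef unsigned long mask_t;

/*
 * Effective-mask rule from the new struct comment: on the default
 * hierarchy, effective = configured & parent_effective, and if that
 * intersection is empty the cpuset inherits the parent's effective mask.
 */
static mask_t compute_effective(mask_t configured, mask_t parent_effective)
{
	mask_t eff = configured & parent_effective;

	return eff ? eff : parent_effective;
}

int main(void)
{
	mask_t parent_effective = 0x0f;	/* parent's tasks may run on CPUs 0-3 */

	/* Child asks for CPUs 2-5; only 2-3 are usable. */
	printf("0x%lx\n", compute_effective(0x3c, parent_effective)); /* 0xc */

	/* Child asks for CPUs 4-5 only; it falls back to the parent's mask. */
	printf("0x%lx\n", compute_effective(0x30, parent_effective)); /* 0xf */

	return 0;
}

The fallback branch is what guarantees that effective_cpus and effective_mems are never empty, which is why the ancestor-walking helpers effective_cpumask_cpuset() and effective_nodemask_cpuset() are removed later in this patch.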
@@ -436,9 +471,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)  	par = parent_cs(cur); -	/* We must be a subset of our parent cpuset */ +	/* On legacy hiearchy, we must be a subset of our parent cpuset. */  	ret = -EACCES; -	if (!is_cpuset_subset(trial, par)) +	if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))  		goto out;  	/* @@ -480,11 +515,11 @@ out:  #ifdef CONFIG_SMP  /*   * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping cpus_allowed masks? + * Do cpusets a, b have overlapping effective cpus_allowed masks?   */  static int cpusets_overlap(struct cpuset *a, struct cpuset *b)  { -	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); +	return cpumask_intersects(a->effective_cpus, b->effective_cpus);  }  static void @@ -601,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains,  			*dattr = SD_ATTR_INIT;  			update_domain_attr_tree(dattr, &top_cpuset);  		} -		cpumask_copy(doms[0], top_cpuset.cpus_allowed); +		cpumask_copy(doms[0], top_cpuset.effective_cpus);  		goto done;  	} @@ -705,7 +740,7 @@ restart:  			struct cpuset *b = csa[j];  			if (apn == b->pn) { -				cpumask_or(dp, dp, b->cpus_allowed); +				cpumask_or(dp, dp, b->effective_cpus);  				if (dattr)  					update_domain_attr_tree(dattr + nslot, b); @@ -757,7 +792,7 @@ static void rebuild_sched_domains_locked(void)  	 * passing doms with offlined cpu to partition_sched_domains().  	 * Anyways, hotplug work item will rebuild sched domains.  	 */ -	if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) +	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))  		goto out;  	/* Generate domain masks and attrs */ @@ -781,45 +816,6 @@ void rebuild_sched_domains(void)  	mutex_unlock(&cpuset_mutex);  } -/* - * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus - * @cs: the cpuset in interest - * - * A cpuset's effective cpumask is the cpumask of the nearest ancestor - * with non-empty cpus. We use effective cpumask whenever: - * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask - *   if the cpuset they reside in has no cpus) - * - we want to retrieve task_cs(tsk)'s cpus_allowed. - * - * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an - * exception. See comments there. - */ -static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs) -{ -	while (cpumask_empty(cs->cpus_allowed)) -		cs = parent_cs(cs); -	return cs; -} - -/* - * effective_nodemask_cpuset - return nearest ancestor with non-empty mems - * @cs: the cpuset in interest - * - * A cpuset's effective nodemask is the nodemask of the nearest ancestor - * with non-empty memss. We use effective nodemask whenever: - * - we update tasks' mems_allowed. (they take on the ancestor's nodemask - *   if the cpuset they reside in has no mems) - * - we want to retrieve task_cs(tsk)'s mems_allowed. - * - * Called with cpuset_mutex held. - */ -static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) -{ -	while (nodes_empty(cs->mems_allowed)) -		cs = parent_cs(cs); -	return cs; -} -  /**   * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.   
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -830,53 +826,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)   */  static void update_tasks_cpumask(struct cpuset *cs)  { -	struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);  	struct css_task_iter it;  	struct task_struct *task;  	css_task_iter_start(&cs->css, &it);  	while ((task = css_task_iter_next(&it))) -		set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); +		set_cpus_allowed_ptr(task, cs->effective_cpus);  	css_task_iter_end(&it);  }  /* - * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. - * @root_cs: the root cpuset of the hierarchy - * @update_root: update root cpuset or not? + * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree + * @cs: the cpuset to consider + * @new_cpus: temp variable for calculating new effective_cpus + * + * When congifured cpumask is changed, the effective cpumasks of this cpuset + * and all its descendants need to be updated.   * - * This will update cpumasks of tasks in @root_cs and all other empty cpusets - * which take on cpumask of @root_cs. + * On legacy hierachy, effective_cpus will be the same with cpu_allowed.   *   * Called with cpuset_mutex held   */ -static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) +static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)  {  	struct cpuset *cp;  	struct cgroup_subsys_state *pos_css; +	bool need_rebuild_sched_domains = false;  	rcu_read_lock(); -	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { -		if (cp == root_cs) { -			if (!update_root) -				continue; -		} else { -			/* skip the whole subtree if @cp have some CPU */ -			if (!cpumask_empty(cp->cpus_allowed)) { -				pos_css = css_rightmost_descendant(pos_css); -				continue; -			} +	cpuset_for_each_descendant_pre(cp, pos_css, cs) { +		struct cpuset *parent = parent_cs(cp); + +		cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus); + +		/* +		 * If it becomes empty, inherit the effective mask of the +		 * parent, which is guaranteed to have some CPUs. +		 */ +		if (cpumask_empty(new_cpus)) +			cpumask_copy(new_cpus, parent->effective_cpus); + +		/* Skip the whole subtree if the cpumask remains the same. */ +		if (cpumask_equal(new_cpus, cp->effective_cpus)) { +			pos_css = css_rightmost_descendant(pos_css); +			continue;  		} +  		if (!css_tryget_online(&cp->css))  			continue;  		rcu_read_unlock(); +		mutex_lock(&callback_mutex); +		cpumask_copy(cp->effective_cpus, new_cpus); +		mutex_unlock(&callback_mutex); + +		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && +			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); +  		update_tasks_cpumask(cp); +		/* +		 * If the effective cpumask of any non-empty cpuset is changed, +		 * we need to rebuild sched domains. 
+		 */ +		if (!cpumask_empty(cp->cpus_allowed) && +		    is_sched_load_balance(cp)) +			need_rebuild_sched_domains = true; +  		rcu_read_lock();  		css_put(&cp->css);  	}  	rcu_read_unlock(); + +	if (need_rebuild_sched_domains) +		rebuild_sched_domains_locked();  }  /** @@ -889,7 +912,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,  			  const char *buf)  {  	int retval; -	int is_load_balanced;  	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */  	if (cs == &top_cpuset) @@ -908,7 +930,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,  		if (retval < 0)  			return retval; -		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) +		if (!cpumask_subset(trialcs->cpus_allowed, +				    top_cpuset.cpus_allowed))  			return -EINVAL;  	} @@ -920,16 +943,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,  	if (retval < 0)  		return retval; -	is_load_balanced = is_sched_load_balance(trialcs); -  	mutex_lock(&callback_mutex);  	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);  	mutex_unlock(&callback_mutex); -	update_tasks_cpumask_hier(cs, true); - -	if (is_load_balanced) -		rebuild_sched_domains_locked(); +	/* use trialcs->cpus_allowed as a temp variable */ +	update_cpumasks_hier(cs, trialcs->cpus_allowed);  	return 0;  } @@ -951,15 +970,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,  							const nodemask_t *to)  {  	struct task_struct *tsk = current; -	struct cpuset *mems_cs;  	tsk->mems_allowed = *to;  	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);  	rcu_read_lock(); -	mems_cs = effective_nodemask_cpuset(task_cs(tsk)); -	guarantee_online_mems(mems_cs, &tsk->mems_allowed); +	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);  	rcu_read_unlock();  } @@ -1028,13 +1045,12 @@ static void *cpuset_being_rebound;  static void update_tasks_nodemask(struct cpuset *cs)  {  	static nodemask_t newmems;	/* protected by cpuset_mutex */ -	struct cpuset *mems_cs = effective_nodemask_cpuset(cs);  	struct css_task_iter it;  	struct task_struct *task;  	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */ -	guarantee_online_mems(mems_cs, &newmems); +	guarantee_online_mems(cs, &newmems);  	/*  	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't @@ -1077,36 +1093,52 @@ static void update_tasks_nodemask(struct cpuset *cs)  }  /* - * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. - * @cs: the root cpuset of the hierarchy - * @update_root: update the root cpuset or not? + * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree + * @cs: the cpuset to consider + * @new_mems: a temp variable for calculating new effective_mems   * - * This will update nodemasks of tasks in @root_cs and all other empty cpusets - * which take on nodemask of @root_cs. + * When configured nodemask is changed, the effective nodemasks of this cpuset + * and all its descendants need to be updated. + * + * On legacy hiearchy, effective_mems will be the same with mems_allowed.   
*   * Called with cpuset_mutex held   */ -static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) +static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)  {  	struct cpuset *cp;  	struct cgroup_subsys_state *pos_css;  	rcu_read_lock(); -	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { -		if (cp == root_cs) { -			if (!update_root) -				continue; -		} else { -			/* skip the whole subtree if @cp have some CPU */ -			if (!nodes_empty(cp->mems_allowed)) { -				pos_css = css_rightmost_descendant(pos_css); -				continue; -			} +	cpuset_for_each_descendant_pre(cp, pos_css, cs) { +		struct cpuset *parent = parent_cs(cp); + +		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); + +		/* +		 * If it becomes empty, inherit the effective mask of the +		 * parent, which is guaranteed to have some MEMs. +		 */ +		if (nodes_empty(*new_mems)) +			*new_mems = parent->effective_mems; + +		/* Skip the whole subtree if the nodemask remains the same. */ +		if (nodes_equal(*new_mems, cp->effective_mems)) { +			pos_css = css_rightmost_descendant(pos_css); +			continue;  		} +  		if (!css_tryget_online(&cp->css))  			continue;  		rcu_read_unlock(); +		mutex_lock(&callback_mutex); +		cp->effective_mems = *new_mems; +		mutex_unlock(&callback_mutex); + +		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && +			!nodes_equal(cp->mems_allowed, cp->effective_mems)); +  		update_tasks_nodemask(cp);  		rcu_read_lock(); @@ -1156,8 +1188,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,  			goto done;  		if (!nodes_subset(trialcs->mems_allowed, -				node_states[N_MEMORY])) { -			retval =  -EINVAL; +				  top_cpuset.mems_allowed)) { +			retval = -EINVAL;  			goto done;  		}  	} @@ -1174,7 +1206,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,  	cs->mems_allowed = trialcs->mems_allowed;  	mutex_unlock(&callback_mutex); -	update_tasks_nodemask_hier(cs, true); +	/* use trialcs->mems_allowed as a temp variable */ +	update_nodemasks_hier(cs, &cs->mems_allowed);  done:  	return retval;  } @@ -1389,12 +1422,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,  	mutex_lock(&cpuset_mutex); -	/* -	 * We allow to move tasks into an empty cpuset if sane_behavior -	 * flag is set. -	 */ +	/* allow moving tasks into an empty cpuset if on default hierarchy */  	ret = -ENOSPC; -	if (!cgroup_sane_behavior(css->cgroup) && +	if (!cgroup_on_dfl(css->cgroup) &&  	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))  		goto out_unlock; @@ -1452,8 +1482,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css,  	struct task_struct *leader = cgroup_taskset_first(tset);  	struct cpuset *cs = css_cs(css);  	struct cpuset *oldcs = cpuset_attach_old_cs; -	struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); -	struct cpuset *mems_cs = effective_nodemask_cpuset(cs);  	mutex_lock(&cpuset_mutex); @@ -1461,9 +1489,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,  	if (cs == &top_cpuset)  		cpumask_copy(cpus_attach, cpu_possible_mask);  	else -		guarantee_online_cpus(cpus_cs, cpus_attach); +		guarantee_online_cpus(cs, cpus_attach); -	guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); +	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);  	cgroup_taskset_for_each(task, tset) {  		/* @@ -1480,11 +1508,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,  	 * Change mm, possibly for multiple threads in a threadgroup. This is  	 * expensive and may sleep.  	 
*/ -	cpuset_attach_nodemask_to = cs->mems_allowed; +	cpuset_attach_nodemask_to = cs->effective_mems;  	mm = get_task_mm(leader);  	if (mm) { -		struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); -  		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);  		/* @@ -1495,7 +1521,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,  		 * mm from.  		 */  		if (is_memory_migrate(cs)) { -			cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, +			cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,  					  &cpuset_attach_nodemask_to);  		}  		mmput(mm); @@ -1516,6 +1542,8 @@ typedef enum {  	FILE_MEMORY_MIGRATE,  	FILE_CPULIST,  	FILE_MEMLIST, +	FILE_EFFECTIVE_CPULIST, +	FILE_EFFECTIVE_MEMLIST,  	FILE_CPU_EXCLUSIVE,  	FILE_MEM_EXCLUSIVE,  	FILE_MEM_HARDWALL, @@ -1694,6 +1722,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)  	case FILE_MEMLIST:  		s += nodelist_scnprintf(s, count, cs->mems_allowed);  		break; +	case FILE_EFFECTIVE_CPULIST: +		s += cpulist_scnprintf(s, count, cs->effective_cpus); +		break; +	case FILE_EFFECTIVE_MEMLIST: +		s += nodelist_scnprintf(s, count, cs->effective_mems); +		break;  	default:  		ret = -EINVAL;  		goto out_unlock; @@ -1779,6 +1813,18 @@ static struct cftype files[] = {  	},  	{ +		.name = "effective_cpus", +		.seq_show = cpuset_common_seq_show, +		.private = FILE_EFFECTIVE_CPULIST, +	}, + +	{ +		.name = "effective_mems", +		.seq_show = cpuset_common_seq_show, +		.private = FILE_EFFECTIVE_MEMLIST, +	}, + +	{  		.name = "cpu_exclusive",  		.read_u64 = cpuset_read_u64,  		.write_u64 = cpuset_write_u64, @@ -1869,18 +1915,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)  	cs = kzalloc(sizeof(*cs), GFP_KERNEL);  	if (!cs)  		return ERR_PTR(-ENOMEM); -	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { -		kfree(cs); -		return ERR_PTR(-ENOMEM); -	} +	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) +		goto free_cs; +	if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) +		goto free_cpus;  	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);  	cpumask_clear(cs->cpus_allowed);  	nodes_clear(cs->mems_allowed); +	cpumask_clear(cs->effective_cpus); +	nodes_clear(cs->effective_mems);  	fmeter_init(&cs->fmeter);  	cs->relax_domain_level = -1;  	return &cs->css; + +free_cpus: +	free_cpumask_var(cs->cpus_allowed); +free_cs: +	kfree(cs); +	return ERR_PTR(-ENOMEM);  }  static int cpuset_css_online(struct cgroup_subsys_state *css) @@ -1903,6 +1957,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)  	cpuset_inc(); +	mutex_lock(&callback_mutex); +	if (cgroup_on_dfl(cs->css.cgroup)) { +		cpumask_copy(cs->effective_cpus, parent->effective_cpus); +		cs->effective_mems = parent->effective_mems; +	} +	mutex_unlock(&callback_mutex); +  	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))  		goto out_unlock; @@ -1962,20 +2023,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)  {  	struct cpuset *cs = css_cs(css); +	free_cpumask_var(cs->effective_cpus);  	free_cpumask_var(cs->cpus_allowed);  	kfree(cs);  } +static void cpuset_bind(struct cgroup_subsys_state *root_css) +{ +	mutex_lock(&cpuset_mutex); +	mutex_lock(&callback_mutex); + +	if (cgroup_on_dfl(root_css->cgroup)) { +		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); +		top_cpuset.mems_allowed = node_possible_map; +	} else { +		cpumask_copy(top_cpuset.cpus_allowed, +			     top_cpuset.effective_cpus); +		top_cpuset.mems_allowed = top_cpuset.effective_mems; +	} + +	mutex_unlock(&callback_mutex); +	mutex_unlock(&cpuset_mutex); +} 
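cpuset_css_alloc() and alloc_trial_cpuset() now have two masks to allocate, so they unwind failures through goto labels instead of freeing inline. The following is a short user-space sketch of the same idiom, with calloc() standing in for alloc_cpumask_var() and hypothetical names (two_masks_alloc, two_masks_free) not taken from the patch.

#include <stdlib.h>

struct two_masks {
	unsigned long *allowed;
	unsigned long *effective;
};

/*
 * Same unwind shape as the reworked cpuset_css_alloc(): each later
 * failure jumps to a label that frees everything allocated before it,
 * so there is exactly one error path per resource.
 */
static struct two_masks *two_masks_alloc(size_t words)
{
	struct two_masks *tm = calloc(1, sizeof(*tm));

	if (!tm)
		return NULL;

	tm->allowed = calloc(words, sizeof(unsigned long));
	if (!tm->allowed)
		goto free_tm;

	tm->effective = calloc(words, sizeof(unsigned long));
	if (!tm->effective)
		goto free_allowed;

	return tm;

free_allowed:
	free(tm->allowed);
free_tm:
	free(tm);
	return NULL;
}

static void two_masks_free(struct two_masks *tm)
{
	if (!tm)
		return;
	/* Free in reverse allocation order, mirroring cpuset_css_free(). */
	free(tm->effective);
	free(tm->allowed);
	free(tm);
}

int main(void)
{
	struct two_masks *tm = two_masks_alloc(4);

	two_masks_free(tm);
	return 0;
}

Cleanup runs in reverse allocation order and the success path stays linear, which makes it hard to leak the first mask when the second allocation fails.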
+  struct cgroup_subsys cpuset_cgrp_subsys = { -	.css_alloc = cpuset_css_alloc, -	.css_online = cpuset_css_online, -	.css_offline = cpuset_css_offline, -	.css_free = cpuset_css_free, -	.can_attach = cpuset_can_attach, -	.cancel_attach = cpuset_cancel_attach, -	.attach = cpuset_attach, -	.base_cftypes = files, -	.early_init = 1, +	.css_alloc	= cpuset_css_alloc, +	.css_online	= cpuset_css_online, +	.css_offline	= cpuset_css_offline, +	.css_free	= cpuset_css_free, +	.can_attach	= cpuset_can_attach, +	.cancel_attach	= cpuset_cancel_attach, +	.attach		= cpuset_attach, +	.bind		= cpuset_bind, +	.legacy_cftypes	= files, +	.early_init	= 1,  };  /** @@ -1990,9 +2071,13 @@ int __init cpuset_init(void)  	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))  		BUG(); +	if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) +		BUG();  	cpumask_setall(top_cpuset.cpus_allowed);  	nodes_setall(top_cpuset.mems_allowed); +	cpumask_setall(top_cpuset.effective_cpus); +	nodes_setall(top_cpuset.effective_mems);  	fmeter_init(&top_cpuset.fmeter);  	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); @@ -2035,6 +2120,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)  	}  } +static void +hotplug_update_tasks_legacy(struct cpuset *cs, +			    struct cpumask *new_cpus, nodemask_t *new_mems, +			    bool cpus_updated, bool mems_updated) +{ +	bool is_empty; + +	mutex_lock(&callback_mutex); +	cpumask_copy(cs->cpus_allowed, new_cpus); +	cpumask_copy(cs->effective_cpus, new_cpus); +	cs->mems_allowed = *new_mems; +	cs->effective_mems = *new_mems; +	mutex_unlock(&callback_mutex); + +	/* +	 * Don't call update_tasks_cpumask() if the cpuset becomes empty, +	 * as the tasks will be migratecd to an ancestor. +	 */ +	if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) +		update_tasks_cpumask(cs); +	if (mems_updated && !nodes_empty(cs->mems_allowed)) +		update_tasks_nodemask(cs); + +	is_empty = cpumask_empty(cs->cpus_allowed) || +		   nodes_empty(cs->mems_allowed); + +	mutex_unlock(&cpuset_mutex); + +	/* +	 * Move tasks to the nearest ancestor with execution resources, +	 * This is full cgroup operation which will also call back into +	 * cpuset. Should be done outside any lock. 
+	 */ +	if (is_empty) +		remove_tasks_in_empty_cpuset(cs); + +	mutex_lock(&cpuset_mutex); +} + +static void +hotplug_update_tasks(struct cpuset *cs, +		     struct cpumask *new_cpus, nodemask_t *new_mems, +		     bool cpus_updated, bool mems_updated) +{ +	if (cpumask_empty(new_cpus)) +		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); +	if (nodes_empty(*new_mems)) +		*new_mems = parent_cs(cs)->effective_mems; + +	mutex_lock(&callback_mutex); +	cpumask_copy(cs->effective_cpus, new_cpus); +	cs->effective_mems = *new_mems; +	mutex_unlock(&callback_mutex); + +	if (cpus_updated) +		update_tasks_cpumask(cs); +	if (mems_updated) +		update_tasks_nodemask(cs); +} +  /**   * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug   * @cs: cpuset in interest @@ -2045,11 +2190,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)   */  static void cpuset_hotplug_update_tasks(struct cpuset *cs)  { -	static cpumask_t off_cpus; -	static nodemask_t off_mems; -	bool is_empty; -	bool sane = cgroup_sane_behavior(cs->css.cgroup); - +	static cpumask_t new_cpus; +	static nodemask_t new_mems; +	bool cpus_updated; +	bool mems_updated;  retry:  	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -2064,51 +2208,20 @@ retry:  		goto retry;  	} -	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); -	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); - -	mutex_lock(&callback_mutex); -	cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); -	mutex_unlock(&callback_mutex); - -	/* -	 * If sane_behavior flag is set, we need to update tasks' cpumask -	 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't -	 * call update_tasks_cpumask() if the cpuset becomes empty, as -	 * the tasks in it will be migrated to an ancestor. -	 */ -	if ((sane && cpumask_empty(cs->cpus_allowed)) || -	    (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) -		update_tasks_cpumask(cs); +	cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); +	nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); -	mutex_lock(&callback_mutex); -	nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); -	mutex_unlock(&callback_mutex); - -	/* -	 * If sane_behavior flag is set, we need to update tasks' nodemask -	 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't -	 * call update_tasks_nodemask() if the cpuset becomes empty, as -	 * the tasks in it will be migratd to an ancestor. -	 */ -	if ((sane && nodes_empty(cs->mems_allowed)) || -	    (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) -		update_tasks_nodemask(cs); +	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); +	mems_updated = !nodes_equal(new_mems, cs->effective_mems); -	is_empty = cpumask_empty(cs->cpus_allowed) || -		nodes_empty(cs->mems_allowed); +	if (cgroup_on_dfl(cs->css.cgroup)) +		hotplug_update_tasks(cs, &new_cpus, &new_mems, +				     cpus_updated, mems_updated); +	else +		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, +					    cpus_updated, mems_updated);  	mutex_unlock(&cpuset_mutex); - -	/* -	 * If sane_behavior flag is set, we'll keep tasks in empty cpusets. -	 * -	 * Otherwise move tasks to the nearest ancestor with execution -	 * resources.  This is full cgroup operation which will -	 * also call back into cpuset.  Should be done outside any lock. 
-	 */ -	if (!sane && is_empty) -		remove_tasks_in_empty_cpuset(cs);  }  /** @@ -2132,6 +2245,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)  	static cpumask_t new_cpus;  	static nodemask_t new_mems;  	bool cpus_updated, mems_updated; +	bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);  	mutex_lock(&cpuset_mutex); @@ -2139,13 +2253,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work)  	cpumask_copy(&new_cpus, cpu_active_mask);  	new_mems = node_states[N_MEMORY]; -	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); -	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); +	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); +	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);  	/* synchronize cpus_allowed to cpu_active_mask */  	if (cpus_updated) {  		mutex_lock(&callback_mutex); -		cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); +		if (!on_dfl) +			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); +		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);  		mutex_unlock(&callback_mutex);  		/* we don't mess with cpumasks of tasks in top_cpuset */  	} @@ -2153,7 +2269,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)  	/* synchronize mems_allowed to N_MEMORY */  	if (mems_updated) {  		mutex_lock(&callback_mutex); -		top_cpuset.mems_allowed = new_mems; +		if (!on_dfl) +			top_cpuset.mems_allowed = new_mems; +		top_cpuset.effective_mems = new_mems;  		mutex_unlock(&callback_mutex);  		update_tasks_nodemask(&top_cpuset);  	} @@ -2228,6 +2346,9 @@ void __init cpuset_init_smp(void)  	top_cpuset.mems_allowed = node_states[N_MEMORY];  	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; +	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); +	top_cpuset.effective_mems = node_states[N_MEMORY]; +  	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);  } @@ -2244,23 +2365,17 @@ void __init cpuset_init_smp(void)  void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)  { -	struct cpuset *cpus_cs; -  	mutex_lock(&callback_mutex);  	rcu_read_lock(); -	cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); -	guarantee_online_cpus(cpus_cs, pmask); +	guarantee_online_cpus(task_cs(tsk), pmask);  	rcu_read_unlock();  	mutex_unlock(&callback_mutex);  }  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)  { -	struct cpuset *cpus_cs; -  	rcu_read_lock(); -	cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); -	do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed); +	do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);  	rcu_read_unlock();  	/* @@ -2299,13 +2414,11 @@ void cpuset_init_current_mems_allowed(void)  nodemask_t cpuset_mems_allowed(struct task_struct *tsk)  { -	struct cpuset *mems_cs;  	nodemask_t mask;  	mutex_lock(&callback_mutex);  	rcu_read_lock(); -	mems_cs = effective_nodemask_cpuset(task_cs(tsk)); -	guarantee_online_mems(mems_cs, &mask); +	guarantee_online_mems(task_cs(tsk), &mask);  	rcu_read_unlock();  	mutex_unlock(&callback_mutex); | 
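The hotplug path now recomputes a cpuset's masks against its parent's effective masks rather than against the top cpuset, and on the default hierarchy it only touches effective_cpus/effective_mems, leaving the user-configured masks intact. Here is a rough user-space model of that recomputation, again using bitmask stand-ins and a hypothetical helper hotplug_recompute(); it is a sketch of the behaviour described by the patch, not kernel code.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long mask_t;

struct cs_model {
	mask_t cpus_allowed;	/* user-configured mask */
	mask_t effective_cpus;	/* mask actually applied to tasks */
};

/*
 * Models cpuset_hotplug_update_tasks() on the default hierarchy:
 * recompute the effective mask from the configured mask and the
 * parent's effective mask, inherit the parent's mask if the result
 * is empty, and report whether tasks would need re-binding.
 */
static bool hotplug_recompute(struct cs_model *cs, mask_t parent_effective)
{
	mask_t new_cpus = cs->cpus_allowed & parent_effective;

	if (!new_cpus)
		new_cpus = parent_effective;	/* never leave tasks with no CPUs */

	if (new_cpus == cs->effective_cpus)
		return false;			/* nothing changed, skip the update */

	cs->effective_cpus = new_cpus;		/* cpus_allowed is left untouched */
	return true;
}

int main(void)
{
	struct cs_model cs = { .cpus_allowed = 0x0c, .effective_cpus = 0x0c };

	/* CPUs 2-3 go offline in the parent: fall back to what remains. */
	if (hotplug_recompute(&cs, 0x03))
		printf("re-bind tasks to 0x%lx\n", cs.effective_cpus);

	return 0;
}

On the legacy hierarchy the configured masks shrink as well (hotplug_update_tasks_legacy()), and a cpuset that ends up empty has its tasks moved to an ancestor, which is why that path drops cpuset_mutex around remove_tasks_in_empty_cpuset().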
