diff options
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r-- | kernel/cpuset.c | 167 |
1 files changed, 133 insertions, 34 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cfaf6419d817..d94a8f7c4c29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,8 @@
 #include <asm/atomic.h>
 #include <linux/mutex.h>
 #include <linux/kfifo.h>
+#include <linux/workqueue.h>
+#include <linux/cgroup.h>
 
 /*
  * Tracks how many cpusets are currently defined in system.
@@ -96,6 +98,9 @@ struct cpuset {
 
 	/* partition number for rebuild_sched_domains() */
 	int pn;
+
+	/* used for walking a cpuset hierarchy */
+	struct list_head stack_list;
 };
 
 /* Retrieve the cpuset for a cgroup */
@@ -111,7 +116,10 @@ static inline struct cpuset *task_cs(struct task_struct *task)
 	return container_of(task_subsys_state(task, cpuset_subsys_id),
 			    struct cpuset, css);
 }
-
+struct cpuset_hotplug_scanner {
+	struct cgroup_scanner scan;
+	struct cgroup *to;
+};
 
 /* bits in struct cpuset flags field */
 typedef enum {
@@ -1687,53 +1695,146 @@ int __init cpuset_init(void)
 	return 0;
 }
 
+/**
+ * cpuset_do_move_task - move a given task to another cpuset
+ * @tsk: pointer to task_struct the task to move
+ * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
+ *
+ * Called by cgroup_scan_tasks() for each task in a cgroup.
+ * Returns nothing, so it cannot stop the walk through the tasks.
+ */
+void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
+{
+	struct cpuset_hotplug_scanner *chsp;
+
+	chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
+	cgroup_attach_task(chsp->to, tsk);
+}
+
+/**
+ * move_member_tasks_to_cpuset - move tasks from one cpuset to another
+ * @from: cpuset in which the tasks currently reside
+ * @to: cpuset to which the tasks will be moved
+ *
+ * Called with cgroup_lock() held.
+ * callback_mutex must not be held, as cgroup_attach_task() will take it.
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+	struct cpuset_hotplug_scanner scan;
+
+	scan.scan.cg = from->css.cgroup;
+	scan.scan.test_task = NULL; /* select all tasks in cgroup */
+	scan.scan.process_task = cpuset_do_move_task;
+	scan.scan.heap = NULL;
+	scan.to = to->css.cgroup;
+	/* hand over the embedded scanner; callback uses container_of() */
+	if (cgroup_scan_tasks(&scan.scan))
+		printk(KERN_ERR "move_member_tasks_to_cpuset: "
+				"cgroup_scan_tasks failed\n");
+}
+
 /*
  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
  * removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then the guarantee_online_cpus()
- * or guarantee_online_mems() code will use that emptied cpusets
- * parent online CPUs or nodes. Cpusets that were already empty of
- * CPUs or nodes are left empty.
- *
- * This routine is intentionally inefficient in a couple of regards.
- * It will check all cpusets in a subtree even if the top cpuset of
- * the subtree has no offline CPUs or nodes. It checks both CPUs and
- * nodes, even though the caller could have been coded to know that
- * only one of CPUs or nodes needed to be checked on a given call.
- * This was done to minimize text size rather than cpu cycles.
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
  *
- * Call with both manage_mutex and callback_mutex held.
+ * The parent cpuset has some superset of the 'mems' nodes that the
+ * newly empty cpuset held, so no migration of memory is necessary.
  *
- * Recursive, on depth of cpuset subtree.
+ * Called with cgroup_lock() held and callback_mutex dropped by the caller.
  */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+	struct cpuset *parent;
+
+	/* the cgroup's css_sets list is in use if there are tasks
+	   in the cpuset; the list is empty if there are none;
+	   the cs->css.refcnt seems always 0 */
+	if (list_empty(&cs->css.cgroup->css_sets))
+		return;
 
-static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+	/*
+	 * Find its next-highest parent that has both cpus and mems (the
+	 * top cpuset tracks what is online, so the climb should end there).
+	 */
+	parent = cs->parent;
+	while (cpus_empty(parent->cpus_allowed) ||
+	       nodes_empty(parent->mems_allowed)) {
+		/*
+		 * An ancestor missing either cpus or mems cannot
+		 * run the tasks; keep climbing toward the top.
+		 */
+		parent = parent->parent;
+	}
+
+	move_member_tasks_to_cpuset(cs, parent);
+}
+
+/*
+ * Walk the specified cpuset subtree and look for empty cpusets.
+ * The tasks of such cpuset must be moved to a parent cpuset.
+ *
+ * Note that such a notify_on_release cpuset must have had, at some time,
+ * member tasks or cpuset descendants and cpus and memory, before it can
+ * be a candidate for release.
+ *
+ * Called with manage_mutex held. We take callback_mutex to modify
+ * cpus_allowed and mems_allowed.
+ *
+ * This walk processes the tree from top to bottom, completing one layer
+ * before dropping down to the next. It always processes a node before
+ * any of its children.
+ *
+ * For now, since we lack memory hot unplug, we'll never see a cpuset
+ * that has tasks along with an empty 'mems'. But if we did see such
+ * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ */
+static void scan_for_empty_cpusets(struct cpuset *root)
 {
+	struct cpuset *cp;	/* scans cpusets being updated */
+	struct cpuset *child;	/* scans child cpusets of cp */
+	struct list_head queue;
 	struct cgroup *cont;
-	struct cpuset *c;
 
-	/* Each of our child cpusets mems must be online */
-	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
-		c = cgroup_cs(cont);
-		guarantee_online_cpus_mems_in_subtree(c);
-		if (!cpus_empty(c->cpus_allowed))
-			guarantee_online_cpus(c, &c->cpus_allowed);
-		if (!nodes_empty(c->mems_allowed))
-			guarantee_online_mems(c, &c->mems_allowed);
+	INIT_LIST_HEAD(&queue);
+
+	list_add_tail(&root->stack_list, &queue);
+
+	mutex_lock(&callback_mutex);
+	while (!list_empty(&queue)) {
+		cp = container_of(queue.next, struct cpuset, stack_list);
+		list_del(queue.next);
+		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+			child = cgroup_cs(cont);
+			list_add_tail(&child->stack_list, &queue);
+		}
+
+		/* Remove offline cpus and mems from this cpuset. */
+		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
+		nodes_and(cp->mems_allowed, cp->mems_allowed,
+						node_states[N_HIGH_MEMORY]);
+		if ((cpus_empty(cp->cpus_allowed) ||
+		     nodes_empty(cp->mems_allowed))) {
+			/* Move tasks from the empty cpuset to a parent */
+			mutex_unlock(&callback_mutex);
+			remove_tasks_in_empty_cpuset(cp);
+			mutex_lock(&callback_mutex);
+		}
 	}
+	mutex_unlock(&callback_mutex);
+	return;
 }
 
 /*
  * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
  * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug
- * event.
- *
- * To ensure that we don't remove a CPU or node from the top cpuset
- * that is currently in use by a child cpuset (which would violate
- * the rule that cpusets must be subsets of their parent), we first
- * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ * track what's online after any CPU or memory node hotplug or unplug event.
  *
  * Since there are two callers of this routine, one for CPU hotplug
  * events and one for memory node hotplug events, we could have coded
@@ -1744,13 +1845,14 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
 static void common_cpu_mem_hotplug_unplug(void)
 {
 	cgroup_lock();
 	mutex_lock(&callback_mutex);
 
-	guarantee_online_cpus_mems_in_subtree(&top_cpuset);
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
 	mutex_unlock(&callback_mutex);
+	/* The scan takes callback_mutex itself, so it runs after the unlock. */
+	scan_for_empty_cpusets(&top_cpuset);
 	cgroup_unlock();
 }
 