From aa24163b2ee5c92120e32e99b5a93143a0f4258e Mon Sep 17 00:00:00 2001
From: Prateek Sood
Date: Wed, 15 Nov 2017 19:50:14 +0530
Subject: cgroup/cpuset: remove circular dependency deadlock

Remove a circular dependency deadlock that arises when a CPU is being
hot-unplugged while cgroup and cpuset updates are triggered from
userspace:

Process A => kthreadd => Process B => Process C => Process A

Process A
cpu_subsys_offline();
  cpu_down();
    _cpu_down();
      percpu_down_write(&cpu_hotplug_lock); //held
      cpuhp_invoke_callback();
        workqueue_offline_cpu();
          queue_work_on(); // unbind_work on system_highpri_wq
            __queue_work();
              insert_work();
                wake_up_worker();
          flush_work();
            wait_for_completion();

worker_thread();
  manage_workers();
    create_worker();
      kthread_create_on_node();
        wake_up_process(kthreadd_task);

kthreadd
kthreadd();
  kernel_thread();
    do_fork();
      copy_process();
        percpu_down_read(&cgroup_threadgroup_rwsem);
          __rwsem_down_read_failed_common(); //waiting

Process B
kernfs_fop_write();
  cgroup_file_write();
    cgroup_procs_write();
      percpu_down_write(&cgroup_threadgroup_rwsem); //held
      cgroup_attach_task();
        cgroup_migrate();
          cgroup_migrate_execute();
            cpuset_can_attach();
              mutex_lock(&cpuset_mutex); //waiting

Process C
kernfs_fop_write();
  cgroup_file_write();
    cpuset_write_resmask();
      mutex_lock(&cpuset_mutex); //held
      update_cpumask();
        update_cpumasks_hier();
          rebuild_sched_domains_locked();
            get_online_cpus();
              percpu_down_read(&cpu_hotplug_lock); //waiting

Eliminate the deadlock by reversing the locking order of cpuset_mutex
and cpu_hotplug_lock, so that cpu_hotplug_lock is always taken first.

Signed-off-by: Prateek Sood
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 53 ++++++++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

(limited to 'kernel/cgroup')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f7efa7b4d825..cab5fd1ee767 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -812,6 +812,18 @@ done:
 	return ndoms;
 }
 
+static void cpuset_sched_change_begin(void)
+{
+	cpus_read_lock();
+	mutex_lock(&cpuset_mutex);
+}
+
+static void cpuset_sched_change_end(void)
+{
+	mutex_unlock(&cpuset_mutex);
+	cpus_read_unlock();
+}
+
 /*
  * Rebuild scheduler domains.
  *
@@ -821,16 +833,14 @@ done:
  * 'cpus' is removed, then call this routine to rebuild the
  * scheduler's dynamic sched domains.
  *
- * Call with cpuset_mutex held.  Takes get_online_cpus().
  */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
 	int ndoms;
 
 	lockdep_assert_held(&cpuset_mutex);
-	get_online_cpus();
 
 	/*
 	 * We have raced with CPU hotplug. Don't do anything to avoid
@@ -838,27 +848,25 @@ static void rebuild_sched_domains_locked(void)
 	 * Anyways, hotplug work item will rebuild sched domains.
 	 */
 	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
-		goto out;
+		return;
 
 	/* Generate domain masks and attrs */
 	ndoms = generate_sched_domains(&doms, &attr);
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
-out:
-	put_online_cpus();
 }
 #else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
 }
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
 {
-	mutex_lock(&cpuset_mutex);
-	rebuild_sched_domains_locked();
-	mutex_unlock(&cpuset_mutex);
+	cpuset_sched_change_begin();
+	rebuild_sched_domains_cpuslocked();
+	cpuset_sched_change_end();
 }
 
 /**
@@ -944,7 +952,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 	rcu_read_unlock();
 
 	if (need_rebuild_sched_domains)
-		rebuild_sched_domains_locked();
+		rebuild_sched_domains_cpuslocked();
 }
 
 /**
@@ -1276,7 +1284,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 		cs->relax_domain_level = val;
 		if (!cpumask_empty(cs->cpus_allowed) &&
 		    is_sched_load_balance(cs))
-			rebuild_sched_domains_locked();
+			rebuild_sched_domains_cpuslocked();
 	}
 
 	return 0;
@@ -1309,7 +1317,6 @@ static void update_tasks_flags(struct cpuset *cs)
  *
  * Call with cpuset_mutex held.
  */
-
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 		       int turning_on)
 {
@@ -1342,7 +1349,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	spin_unlock_irq(&callback_lock);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-		rebuild_sched_domains_locked();
+		rebuild_sched_domains_cpuslocked();
 
 	if (spread_flag_changed)
 		update_tasks_flags(cs);
@@ -1610,7 +1617,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = 0;
 
-	mutex_lock(&cpuset_mutex);
+	cpuset_sched_change_begin();
 	if (!is_cpuset_online(cs)) {
 		retval = -ENODEV;
 		goto out_unlock;
@@ -1646,7 +1653,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 out_unlock:
-	mutex_unlock(&cpuset_mutex);
+	cpuset_sched_change_end();
 	return retval;
 }
 
@@ -1657,7 +1664,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = -ENODEV;
 
-	mutex_lock(&cpuset_mutex);
+	cpuset_sched_change_begin();
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
@@ -1670,7 +1677,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 out_unlock:
-	mutex_unlock(&cpuset_mutex);
+	cpuset_sched_change_end();
 	return retval;
 }
 
@@ -1709,7 +1716,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	kernfs_break_active_protection(of->kn);
 	flush_work(&cpuset_hotplug_work);
 
-	mutex_lock(&cpuset_mutex);
+	cpuset_sched_change_begin();
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
@@ -1733,7 +1740,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	free_trial_cpuset(trialcs);
 out_unlock:
-	mutex_unlock(&cpuset_mutex);
+	cpuset_sched_change_end();
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
 	flush_workqueue(cpuset_migrate_mm_wq);
@@ -2034,14 +2041,14 @@ out_unlock:
 /*
  * If the cpuset being removed has its flag 'sched_load_balance'
  * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
+ * will call rebuild_sched_domains_cpuslocked().
  */
 static void cpuset_css_offline(struct cgroup_subsys_state *css)
 {
 	struct cpuset *cs = css_cs(css);
 
-	mutex_lock(&cpuset_mutex);
+	cpuset_sched_change_begin();
 
 	if (is_sched_load_balance(cs))
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
@@ -2049,7 +2056,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	cpuset_dec();
 	clear_bit(CS_ONLINE, &cs->flags);
 
-	mutex_unlock(&cpuset_mutex);
+	cpuset_sched_change_end();
 }
 
 static void cpuset_css_free(struct cgroup_subsys_state *css)
--
cgit v1.2.3

From 1599a185f0e6113be185b9fb809c621c73865829 Mon Sep 17 00:00:00 2001
From: Prateek Sood
Date: Wed, 15 Nov 2017 19:50:15 +0530
Subject: cpuset: Make cpuset hotplug synchronous

Convert cpuset_hotplug_workfn() into a synchronous call for the CPU
hotplug path; for the memory hotplug path it is still queued as a work
item. Since the call is now synchronous for CPU hotplug, there is no
longer any need to wait for cpuset hotplug while thawing processes.

Signed-off-by: Prateek Sood
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

(limited to 'kernel/cgroup')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index cab5fd1ee767..227bc25d951d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2277,15 +2277,8 @@ retry:
 	mutex_unlock(&cpuset_mutex);
 }
 
-static bool force_rebuild;
-
-void cpuset_force_rebuild(void)
-{
-	force_rebuild = true;
-}
-
 /**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset
  *
  * This function is called after either CPU or memory configuration has
  * changed and updates cpuset accordingly.  The top_cpuset is always
@@ -2300,7 +2293,7 @@ void cpuset_force_rebuild()
  * Note that CPU offlining during suspend is ignored.  We don't modify
  * cpusets across suspend/resume cycles at all.
  */
-static void cpuset_hotplug_workfn(struct work_struct *work)
+static void cpuset_hotplug(bool use_cpu_hp_lock)
 {
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
@@ -2358,25 +2351,31 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	}
 
 	/* rebuild sched domains if cpus_allowed has changed */
-	if (cpus_updated || force_rebuild) {
-		force_rebuild = false;
-		rebuild_sched_domains();
+	if (cpus_updated) {
+		if (use_cpu_hp_lock)
+			rebuild_sched_domains();
+		else {
+			/* Acquiring cpu_hotplug_lock is not required.
+			 * When cpuset_hotplug() is called in hotplug path,
+			 * cpu_hotplug_lock is held by the hotplug context
+			 * which is waiting for cpuhp_thread_fun to indicate
+			 * completion of callback.
+			 */
+			mutex_lock(&cpuset_mutex);
+			rebuild_sched_domains_cpuslocked();
+			mutex_unlock(&cpuset_mutex);
+		}
 	}
 }
 
-void cpuset_update_active_cpus(void)
+static void cpuset_hotplug_workfn(struct work_struct *work)
 {
-	/*
-	 * We're inside cpu hotplug critical region which usually nests
-	 * inside cgroup synchronization.  Bounce actual hotplug processing
-	 * to a work item to avoid reverse locking order.
-	 */
-	schedule_work(&cpuset_hotplug_work);
+	cpuset_hotplug(true);
 }
 
-void cpuset_wait_for_hotplug(void)
+void cpuset_update_active_cpus(void)
 {
-	flush_work(&cpuset_hotplug_work);
+	cpuset_hotplug(false);
 }
 
 /*
--
cgit v1.2.3

From ddf7005f32212f28669032651e09bd8d2245c35d Mon Sep 17 00:00:00 2001
From: Wang Long
Date: Sun, 19 Nov 2017 16:08:37 -0500
Subject: debug cgroup: use task_css_set instead of rcu_dereference

The task_css_set() macro verifies that the caller is inside a proper
critical section when the kernel is built with CONFIG_PROVE_RCU=y, so
use it instead of an open-coded rcu_dereference().

Signed-off-by: Wang Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/debug.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/cgroup')

diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 5f780d8f6a9d..9caeda610249 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v)
 
 	spin_lock_irq(&css_set_lock);
 	rcu_read_lock();
-	cset = rcu_dereference(current->cgroups);
+	cset = task_css_set(current);
 	refcnt = refcount_read(&cset->refcount);
 	seq_printf(seq, "css_set %pK %d", cset, refcnt);
 	if (refcnt > cset->nr_tasks)
@@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 
 	spin_lock_irq(&css_set_lock);
 	rcu_read_lock();
-	cset = rcu_dereference(current->cgroups);
+	cset = task_css_set(current);
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
 
--
cgit v1.2.3

From 52cf373c37a684f8fc279d541307fad39d206376 Mon Sep 17 00:00:00 2001
From: Lucas Stach
Date: Tue, 28 Nov 2017 13:59:25 +0100
Subject: cgroup: properly init u64_stats

Lockdep complains that the stats update is trying to register a
non-static key. This is because u64_stats is implemented with a
seqlock on 32-bit arches, which needs to be initialized before use.

Fixes: 041cd640b2f3 (cgroup: Implement cgroup2 basic CPU usage accounting)
Signed-off-by: Lucas Stach
Signed-off-by: Tejun Heo
---
 kernel/cgroup/stat.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel/cgroup')

diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
index 133b465691d6..1e111dd455c4 100644
--- a/kernel/cgroup/stat.c
+++ b/kernel/cgroup/stat.c
@@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp)
 	}
 
 	/* ->updated_children list is self terminated */
-	for_each_possible_cpu(cpu)
-		cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+	for_each_possible_cpu(cpu) {
+		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+		cstat->updated_children = cgrp;
+		u64_stats_init(&cstat->sync);
+	}
 
 	prev_cputime_init(&cgrp->stat.prev_cputime);
 
--
cgit v1.2.3

From 11db855c3d06e82f432cb1bafd73296586d5ceec Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 4 Dec 2017 14:41:11 -0800
Subject: Revert "cpuset: Make cpuset hotplug synchronous"

This reverts commit 1599a185f0e6113be185b9fb809c621c73865829.

This and the previous commit led to another circular locking scenario,
and the scenario the reverted commit fixed no longer exists after
e8b3f8db7aad ("workqueue/hotplug: simplify workqueue_offline_cpu()"),
which removed work item flushing from the hotplug path. Revert it for
now.
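For reference, the lock-ordering rule these patches wrestle with can be
pictured with a minimal userspace sketch (pthread mutexes standing in
for cpu_hotplug_lock and cpuset_mutex; all names here are illustrative,
not kernel API). Both paths take the outer lock before the inner one;
if either path took them in the opposite order, the result would be the
classic ABBA deadlock behind the circular scenarios above:

	#include <pthread.h>
	#include <stdio.h>

	/* Stand-ins for cpu_hotplug_lock (outer) and cpuset_mutex (inner). */
	static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t cpuset_lock  = PTHREAD_MUTEX_INITIALIZER;

	/* Both paths follow the same order: outer lock, then inner lock. */
	static void *locked_path(void *name)
	{
		pthread_mutex_lock(&hotplug_lock);
		pthread_mutex_lock(&cpuset_lock);
		printf("%s: rebuilding sched domains\n", (const char *)name);
		pthread_mutex_unlock(&cpuset_lock);
		pthread_mutex_unlock(&hotplug_lock);
		return NULL;
	}

	int main(void)
	{
		pthread_t t1, t2;

		pthread_create(&t1, NULL, locked_path, "hotplug path");
		pthread_create(&t2, NULL, locked_path, "cpuset write path");
		pthread_join(t1, NULL);
		pthread_join(t2, NULL);
		return 0;
	}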
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

(limited to 'kernel/cgroup')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 227bc25d951d..cab5fd1ee767 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2277,8 +2277,15 @@ retry:
 	mutex_unlock(&cpuset_mutex);
 }
 
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+	force_rebuild = true;
+}
+
 /**
- * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
  *
  * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
@@ -2293,7 +2300,7 @@ retry:
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */
-static void cpuset_hotplug(bool use_cpu_hp_lock)
+static void cpuset_hotplug_workfn(struct work_struct *work)
 {
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
@@ -2351,31 +2358,25 @@ static void cpuset_hotplug(bool use_cpu_hp_lock)
 	}
 
 	/* rebuild sched domains if cpus_allowed has changed */
-	if (cpus_updated) {
-		if (use_cpu_hp_lock)
-			rebuild_sched_domains();
-		else {
-			/* Acquiring cpu_hotplug_lock is not required.
-			 * When cpuset_hotplug() is called in hotplug path,
-			 * cpu_hotplug_lock is held by the hotplug context
-			 * which is waiting for cpuhp_thread_fun to indicate
-			 * completion of callback.
-			 */
-			mutex_lock(&cpuset_mutex);
-			rebuild_sched_domains_cpuslocked();
-			mutex_unlock(&cpuset_mutex);
-		}
+	if (cpus_updated || force_rebuild) {
+		force_rebuild = false;
+		rebuild_sched_domains();
 	}
 }
 
-static void cpuset_hotplug_workfn(struct work_struct *work)
+void cpuset_update_active_cpus(void)
 {
-	cpuset_hotplug(true);
+	/*
+	 * We're inside cpu hotplug critical region which usually nests
+	 * inside cgroup synchronization.  Bounce actual hotplug processing
+	 * to a work item to avoid reverse locking order.
+	 */
+	schedule_work(&cpuset_hotplug_work);
 }
 
-void cpuset_update_active_cpus(void)
+void cpuset_wait_for_hotplug(void)
 {
-	cpuset_hotplug(false);
+	flush_work(&cpuset_hotplug_work);
 }
 
 /*
--
cgit v1.2.3

From bdfbbda90aeb75ce0951413fd7f495d4d377bd5e Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 4 Dec 2017 14:55:59 -0800
Subject: Revert "cgroup/cpuset: remove circular dependency deadlock"

This reverts commit aa24163b2ee5c92120e32e99b5a93143a0f4258e.

This and the following commit led to another circular locking
scenario, and the scenario the reverted commit fixed no longer exists
after e8b3f8db7aad ("workqueue/hotplug: simplify
workqueue_offline_cpu()"), which removed work item flushing from the
hotplug path. Revert it for now.

Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 53 ++++++++++++++++++++++----------------------------
 1 file changed, 23 insertions(+), 30 deletions(-)

(limited to 'kernel/cgroup')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index cab5fd1ee767..f7efa7b4d825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -812,18 +812,6 @@ done:
 	return ndoms;
 }
 
-static void cpuset_sched_change_begin(void)
-{
-	cpus_read_lock();
-	mutex_lock(&cpuset_mutex);
-}
-
-static void cpuset_sched_change_end(void)
-{
-	mutex_unlock(&cpuset_mutex);
-	cpus_read_unlock();
-}
-
 /*
  * Rebuild scheduler domains.
  *
@@ -833,14 +821,16 @@ done:
  * 'cpus' is removed, then call this routine to rebuild the
  * scheduler's dynamic sched domains.
  *
+ * Call with cpuset_mutex held.  Takes get_online_cpus().
  */
-static void rebuild_sched_domains_cpuslocked(void)
+static void rebuild_sched_domains_locked(void)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
 	int ndoms;
 
 	lockdep_assert_held(&cpuset_mutex);
+	get_online_cpus();
 
 	/*
 	 * We have raced with CPU hotplug. Don't do anything to avoid
@@ -848,25 +838,27 @@ static void rebuild_sched_domains_cpuslocked(void)
 	 * Anyways, hotplug work item will rebuild sched domains.
 	 */
 	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
-		return;
+		goto out;
 
 	/* Generate domain masks and attrs */
 	ndoms = generate_sched_domains(&doms, &attr);
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
+out:
+	put_online_cpus();
 }
 #else /* !CONFIG_SMP */
-static void rebuild_sched_domains_cpuslocked(void)
+static void rebuild_sched_domains_locked(void)
 {
 }
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
 {
-	cpuset_sched_change_begin();
-	rebuild_sched_domains_cpuslocked();
-	cpuset_sched_change_end();
+	mutex_lock(&cpuset_mutex);
+	rebuild_sched_domains_locked();
+	mutex_unlock(&cpuset_mutex);
 }
 
 /**
@@ -952,7 +944,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 	rcu_read_unlock();
 
 	if (need_rebuild_sched_domains)
-		rebuild_sched_domains_cpuslocked();
+		rebuild_sched_domains_locked();
 }
 
 /**
@@ -1284,7 +1276,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 		cs->relax_domain_level = val;
 		if (!cpumask_empty(cs->cpus_allowed) &&
 		    is_sched_load_balance(cs))
-			rebuild_sched_domains_cpuslocked();
+			rebuild_sched_domains_locked();
 	}
 
 	return 0;
@@ -1317,6 +1309,7 @@ static void update_tasks_flags(struct cpuset *cs)
  *
  * Call with cpuset_mutex held.
  */
+
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 		       int turning_on)
 {
@@ -1349,7 +1342,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	spin_unlock_irq(&callback_lock);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-		rebuild_sched_domains_cpuslocked();
+		rebuild_sched_domains_locked();
 
 	if (spread_flag_changed)
 		update_tasks_flags(cs);
@@ -1617,7 +1610,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = 0;
 
-	cpuset_sched_change_begin();
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs)) {
 		retval = -ENODEV;
 		goto out_unlock;
@@ -1653,7 +1646,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 out_unlock:
-	cpuset_sched_change_end();
+	mutex_unlock(&cpuset_mutex);
 	return retval;
 }
 
@@ -1664,7 +1657,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = -ENODEV;
 
-	cpuset_sched_change_begin();
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
@@ -1677,7 +1670,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 		break;
 	}
 out_unlock:
-	cpuset_sched_change_end();
+	mutex_unlock(&cpuset_mutex);
 	return retval;
 }
 
@@ -1716,7 +1709,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	kernfs_break_active_protection(of->kn);
 	flush_work(&cpuset_hotplug_work);
 
-	cpuset_sched_change_begin();
+	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
@@ -1740,7 +1733,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	free_trial_cpuset(trialcs);
 out_unlock:
-	cpuset_sched_change_end();
+	mutex_unlock(&cpuset_mutex);
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
 	flush_workqueue(cpuset_migrate_mm_wq);
@@ -2041,14 +2034,14 @@ out_unlock:
 /*
  * If the cpuset being removed has its flag 'sched_load_balance'
  * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_cpuslocked().
+ * will call rebuild_sched_domains_locked().
  */
 static void cpuset_css_offline(struct cgroup_subsys_state *css)
 {
 	struct cpuset *cs = css_cs(css);
 
-	cpuset_sched_change_begin();
+	mutex_lock(&cpuset_mutex);
 
 	if (is_sched_load_balance(cs))
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
@@ -2056,7 +2049,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	cpuset_dec();
 	clear_bit(CS_ONLINE, &cs->flags);
 
-	cpuset_sched_change_end();
+	mutex_unlock(&cpuset_mutex);
 }
 
 static void cpuset_css_free(struct cgroup_subsys_state *css)
--
cgit v1.2.3

From e7fd37ba12170cc414be8b639dfc2c5f7172fac2 Mon Sep 17 00:00:00 2001
From: Ma Shimiao
Date: Tue, 12 Dec 2017 09:43:49 +0800
Subject: cgroup: avoid copying strings longer than the buffers

The cgroup root name and file name have maximum length limits; avoid
copying names longer than those limits into the destination buffers.

tj: minor update to $SUBJ.

Signed-off-by: Ma Shimiao
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cgroup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/cgroup')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0b1ffe147f24..18d71fbd3923 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
 			 cft->name);
 	else
-		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+		strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 	return buf;
 }
 
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
 
 	root->flags = opts->flags;
 	if (opts->release_agent)
-		strcpy(root->release_agent_path, opts->release_agent);
+		strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
 	if (opts->name)
-		strcpy(root->name, opts->name);
+		strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
 	if (opts->cpuset_clone_children)
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
--
cgit v1.2.3

From 50034ed49645463a16327cad05694e201e6b4126 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Fri, 15 Dec 2017 05:09:47 -0800
Subject: cgroup: use strlcpy() instead of strscpy() to avoid spurious warning

As long as cft->name is guaranteed to be NUL-terminated, using
strlcpy() works just as well and avoids the spurious truncation
warning, so the change below could be folded into the previous commit.

Signed-off-by: Arnd Bergmann
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cgroup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/cgroup')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 18d71fbd3923..f4c2f8cb5748 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
 			 cft->name);
 	else
-		strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+		strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 	return buf;
 }
 
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
 
 	root->flags = opts->flags;
 	if (opts->release_agent)
-		strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
+		strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX);
 	if (opts->name)
-		strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
+		strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
 	if (opts->cpuset_clone_children)
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
--
cgit v1.2.3

From 116d2f7496c51b2e02e8e4ecdd2bdf5fb9d5a641 Mon Sep 17 00:00:00 2001
From: Prateek Sood
Date: Tue, 19 Dec 2017 12:56:57 +0530
Subject: cgroup: Fix deadlock in cpu hotplug path

A deadlock can occur during cgroup migration from the cpu hotplug path
when a task T is being moved from a source to a destination cgroup:

kworker/0:0
cpuset_hotplug_workfn()
  cpuset_hotplug_update_tasks()
    hotplug_update_tasks_legacy()
      remove_tasks_in_empty_cpuset()
        cgroup_transfer_tasks() // stuck in iterator loop
          cgroup_migrate()
            cgroup_migrate_add_task()

In cgroup_migrate_add_task() the PF_EXITING flag of task T is checked,
so task T will not migrate to the destination cgroup. The iterator
started by css_task_iter_start() keeps pointing to task T in the loop,
waiting for task T's cg_list node to be removed.

Task T
do_exit()
  exit_signals() // sets PF_EXITING
  exit_task_namespaces()
    switch_task_namespaces()
      free_nsproxy()
        put_mnt_ns()
          drop_collected_mounts()
            namespace_unlock()
              synchronize_rcu()
                _synchronize_rcu_expedited()
                  schedule_work() // on cpu0 low priority worker pool
                  wait_event() // waiting for work item to execute

Task T has inserted a work item on the worklist of cpu0's low-priority
worker pool and is waiting for the expedited grace period work item to
execute. That work item will only be executed once kworker/0:0
completes cpuset_hotplug_workfn(), closing the cycle summarized below.
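The shape of the fix — advance the iterator past tasks that are
mid-exit instead of retrying the same task forever — can be shown with
a standalone toy in plain C (types and names here are illustrative
stand-ins, not the kernel's iterator API):

	#include <stdio.h>

	#define PF_EXITING 0x00000004	/* flag value borrowed for illustration */

	struct task {
		const char *name;
		unsigned int flags;
		struct task *next;
	};

	/* Keep advancing while the current task is exiting, mirroring the
	 * do/while loop added to cgroup_transfer_tasks() in the diff below. */
	static struct task *next_migratable(struct task *it)
	{
		while (it && (it->flags & PF_EXITING))
			it = it->next;
		return it;
	}

	int main(void)
	{
		struct task c = { "C", 0, NULL };
		struct task t = { "T", PF_EXITING, &c };
		struct task *task = next_migratable(&t);

		printf("migrate: %s\n", task ? task->name : "(none)");
		return 0;
	}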
kworker/0:0 ==> Task T ==> kworker/0:0

If a PF_EXITING task is encountered while migrating tasks from the
source cgroup, skip it and migrate the next available task instead.

Signed-off-by: Prateek Sood
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cgroup-v1.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel/cgroup')

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085daab1a..a2c05d2476ac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	 */
 	do {
 		css_task_iter_start(&from->self, 0, &it);
-		task = css_task_iter_next(&it);
+
+		do {
+			task = css_task_iter_next(&it);
+		} while (task && (task->flags & PF_EXITING));
+
 		if (task)
 			get_task_struct(task);
 		css_task_iter_end(&it);
--
cgit v1.2.3

From 74d0833c659a8a54735e5efdd44f4b225af68586 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 20 Dec 2017 07:09:19 -0800
Subject: cgroup: fix css_task_iter crash on CSS_TASK_ITER_PROC

While teaching css_task_iter to handle skipping over tasks which
aren't group leaders, bc2fb7ed089f ("cgroup: add @flags to
css_task_iter_start() and implement CSS_TASK_ITER_PROCS") introduced a
silly bug.

CSS_TASK_ITER_PROCS is implemented by repeating css_task_iter_advance()
while the advanced cursor is pointing to a non-leader thread. However,
the cursor variable, @l, wasn't updated when the iteration has to
advance to the next css_set, and the following repetition would operate
on the terminal @l from the previous iteration, which isn't pointing to
a valid task, leading to oopses like the following or infinite looping:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000254
IP: __task_pid_nr_ns+0xc7/0xf0
PGD 0 P4D 0
Oops: 0000 [#1] SMP
...
CPU: 2 PID: 1 Comm: systemd Not tainted 4.14.4-200.fc26.x86_64 #1
Hardware name: System manufacturer System Product Name/PRIME B350M-A, BIOS 3203 11/09/2017
task: ffff88c4baee8000 task.stack: ffff96d5c3158000
RIP: 0010:__task_pid_nr_ns+0xc7/0xf0
RSP: 0018:ffff96d5c315bd50 EFLAGS: 00010206
RAX: 0000000000000000 RBX: ffff88c4b68c6000 RCX: 0000000000000250
RDX: ffffffffa5e47960 RSI: 0000000000000000 RDI: ffff88c490f6ab00
RBP: ffff96d5c315bd50 R08: 0000000000001000 R09: 0000000000000005
R10: ffff88c4be006b80 R11: ffff88c42f1b8004 R12: ffff96d5c315bf18
R13: ffff88c42d7dd200 R14: ffff88c490f6a510 R15: ffff88c4b68c6000
FS:  00007f9446f8ea00(0000) GS:ffff88c4be680000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000254 CR3: 00000007f956f000 CR4: 00000000003406e0
Call Trace:
 cgroup_procs_show+0x19/0x30
 cgroup_seqfile_show+0x4c/0xb0
 kernfs_seq_show+0x21/0x30
 seq_read+0x2ec/0x3f0
 kernfs_fop_read+0x134/0x180
 __vfs_read+0x37/0x160
 ? security_file_permission+0x9b/0xc0
 vfs_read+0x8e/0x130
 SyS_read+0x55/0xc0
 entry_SYSCALL_64_fastpath+0x1a/0xa5
RIP: 0033:0x7f94455f942d
RSP: 002b:00007ffe81ba2d00 EFLAGS: 00000293 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 00005574e2233f00 RCX: 00007f94455f942d
RDX: 0000000000001000 RSI: 00005574e2321a90 RDI: 000000000000002b
RBP: 0000000000000000 R08: 00005574e2321a90 R09: 00005574e231de60
R10: 00007f94458c8b38 R11: 0000000000000293 R12: 00007f94458c8ae0
R13: 00007ffe81ba3800 R14: 0000000000000000 R15: 00005574e2116560
Code: 04 74 0e 89 f6 48 8d 04 76 48 8d 04 c5 f0 05 00 00 48 8b bf b8 05 00 00 48 01 c7 31 c0 48 8b 0f 48 85 c9 74 18 8b b2 30 08 00 00 <3b> 71 04 77 0d 48 c1 e6 05 48 01 f1 48 3b 51 38 74 09 5d c3 8b
RIP: __task_pid_nr_ns+0xc7/0xf0 RSP: ffff96d5c315bd50

Fix it by moving the initialization of the cursor below the repeat
label. While at it, rename it to @next for readability.

Signed-off-by: Tejun Heo
Fixes: bc2fb7ed089f ("cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS")
Cc: stable@vger.kernel.org # v4.14+
Reported-by: Laura Abbott
Reported-by: Bronek Kozicki
Reported-by: George Amanakis
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cgroup.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel/cgroup')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index f4c2f8cb5748..2cf06c274e4c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 
 static void css_task_iter_advance(struct css_task_iter *it)
 {
-	struct list_head *l = it->task_pos;
+	struct list_head *next;
 
 	lockdep_assert_held(&css_set_lock);
-	WARN_ON_ONCE(!l);
-
 repeat:
 	/*
 	 * Advance iterator to find next entry.  cset->tasks is consumed
 	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
 	 * next cset.
 	 */
-	l = l->next;
+	next = it->task_pos->next;
 
-	if (l == it->tasks_head)
-		l = it->mg_tasks_head->next;
+	if (next == it->tasks_head)
+		next = it->mg_tasks_head->next;
 
-	if (l == it->mg_tasks_head)
+	if (next == it->mg_tasks_head)
 		css_task_iter_advance_css_set(it);
 	else
-		it->task_pos = l;
+		it->task_pos = next;
 
 	/* if PROCS, skip over tasks which aren't group leaders */
 	if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
--
cgit v1.2.3

From 4f58424da3deead2605e39a9df65f5f06107a3cb Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Wed, 10 Jan 2018 04:35:12 -0800
Subject: cgroup: make cgroup.threads delegatable

Make the cgroup.threads file delegatable. The behavior of
cgroup.threads should follow the behavior of cgroup.procs.

Signed-off-by: Roman Gushchin
Discovered-by: Michael Kerrisk
Cc: Tejun Heo
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cgroup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/cgroup')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 2cf06c274e4c..7e4c44538119 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4447,6 +4447,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "cgroup.threads",
+		.flags = CFTYPE_NS_DELEGATABLE,
 		.release = cgroup_procs_release,
 		.seq_start = cgroup_threads_start,
 		.seq_next = cgroup_procs_next,
--
cgit v1.2.3
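With that flag set, a delegatee in a threaded subtree can move
individual threads by writing their TIDs to cgroup.threads, just as it
moves whole processes via cgroup.procs. A minimal userspace sketch
(the cgroup path is hypothetical; adjust it to the delegated subtree):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		/* Hypothetical delegated, threaded cgroup. */
		const char *path = "/sys/fs/cgroup/app/worker/cgroup.threads";
		char buf[32];
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Writing a TID moves just that thread, not the whole process. */
		snprintf(buf, sizeof(buf), "%ld", (long)syscall(SYS_gettid));
		if (write(fd, buf, strlen(buf)) < 0)
			perror("write");
		close(fd);
		return 0;
	}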