Diffstat (limited to 'kernel')
78 files changed, 3369 insertions, 2049 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index 8d6e145138bb..808a86ff229d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -55,7 +55,7 @@ #include <linux/times.h> #include <linux/syscalls.h> #include <linux/mount.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ #include <linux/pid_namespace.h> @@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) spin_lock(&acct_lock); if (file != acct->file) { if (act) - res = act>0; + res = act > 0; goto out; } @@ -262,7 +262,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); if (IS_ERR(tmp)) - return (PTR_ERR(tmp)); + return PTR_ERR(tmp); error = acct_on(tmp); putname(tmp); } else { diff --git a/kernel/audit.c b/kernel/audit.c index 47845c57eb19..f30106459a32 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -44,7 +44,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/init.h> -#include <asm/types.h> +#include <linux/types.h> #include <linux/atomic.h> #include <linux/mm.h> #include <linux/export.h> diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f251a5e8d17a..21eae3c05ec0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -728,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) return AUDIT_BUILD_CONTEXT; } +static int audit_in_mask(const struct audit_krule *rule, unsigned long val) +{ + int word, bit; + + if (val > 0xffffffff) + return false; + + word = AUDIT_WORD(val); + if (word >= AUDIT_BITMASK_SIZE) + return false; + + bit = AUDIT_BIT(val); + + return rule->mask[word] & bit; +} + /* At syscall entry and exit time, this filter is called if the * audit_state is not low enough that auditing cannot take place, but is * also not high enough that we already know we have to write an audit @@ -745,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, rcu_read_lock(); if (!list_empty(list)) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && + if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, NULL, &state, false)) { rcu_read_unlock(); @@ -769,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, static int audit_filter_inode_name(struct task_struct *tsk, struct audit_names *n, struct audit_context *ctx) { - int word, bit; int h = audit_hash_ino((u32)n->ino); struct list_head *list = &audit_inode_hash[h]; struct audit_entry *e; enum audit_state state; - word = AUDIT_WORD(ctx->major); - bit = AUDIT_BIT(ctx->major); - if (list_empty(list)) return 0; list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && + if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { ctx->current_state = state; return 1; diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a5e026bc45c4..1323360d90e3 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -19,8 +19,8 @@ static void backtrace_test_normal(void) { - printk("Testing a backtrace from process context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a backtrace from process context.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); dump_stack(); } @@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, 
&backtrace_test_irq_callback, 0); static void backtrace_test_irq(void) { - printk("Testing a backtrace from irq context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a backtrace from irq context.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); init_completion(&backtrace_work); tasklet_schedule(&backtrace_tasklet); @@ -51,8 +51,8 @@ static void backtrace_test_saved(void) struct stack_trace trace; unsigned long entries[8]; - printk("Testing a saved backtrace.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a saved backtrace.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); trace.nr_entries = 0; trace.max_entries = ARRAY_SIZE(entries); @@ -65,19 +65,19 @@ static void backtrace_test_saved(void) #else static void backtrace_test_saved(void) { - printk("Saved backtrace test skipped.\n"); + pr_info("Saved backtrace test skipped.\n"); } #endif static int backtrace_regression_test(void) { - printk("====[ backtrace testing ]===========\n"); + pr_info("====[ backtrace testing ]===========\n"); backtrace_test_normal(); backtrace_test_irq(); backtrace_test_saved(); - printk("====[ end of backtrace testing ]====\n"); + pr_info("====[ end of backtrace testing ]====\n"); return 0; } diff --git a/kernel/capability.c b/kernel/capability.c index a8d63df0c322..a5cf13c018ce 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -24,7 +24,6 @@ */ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; - EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; @@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to - * unexpectidly fail; the capget/modify/capset aborts + * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. */ @@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable); * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. */ -bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +bool file_ns_capable(const struct file *file, struct user_namespace *ns, + int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; @@ -424,23 +424,19 @@ bool capable(int cap) EXPORT_SYMBOL(capable); /** - * inode_capable - Check superior capability over inode + * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question * @cap: The capability in question * - * Return true if the current task has the given superior capability - * targeted at it's own user namespace and that the given inode is owned - * by the current user namespace or a child namespace. - * - * Currently we check to see if an inode is owned by the current - * user namespace by seeing if the inode's owner maps into the - * current user namespace. - * + * Return true if the current task has the given capability targeted at + * its own user namespace and that the given inode's uid and gid are + * mapped into the current user namespace. 
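A minimal sketch of the kind of caller the renamed helper is aimed at, loosely modeled on inode_owner_or_capable(); the function name below is invented and the body is illustrative only:

        static bool example_owner_or_capable(const struct inode *inode)
        {
                /* direct owner: no capability needed */
                if (uid_eq(current_fsuid(), inode->i_uid))
                        return true;
                /* otherwise CAP_FOWNER, honoured only if both i_uid and
                 * i_gid map into the caller's user namespace */
                return capable_wrt_inode_uidgid(inode, CAP_FOWNER);
        }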
*/ -bool inode_capable(const struct inode *inode, int cap) +bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); - return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); + return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && + kgid_has_mapping(ns, inode->i_gid); } -EXPORT_SYMBOL(inode_capable); +EXPORT_SYMBOL(capable_wrt_inode_uidgid); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ceee0c54c6a4..7868fc3c0bc5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -26,6 +26,8 @@ * distribution for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/cgroup.h> #include <linux/cred.h> #include <linux/ctype.h> @@ -70,15 +72,6 @@ MAX_CFTYPE_NAME + 2) /* - * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file - * creation/removal and hierarchy changing operations including cgroup - * creation, removal, css association and controller rebinding. This outer - * lock is needed mainly to resolve the circular dependency between kernfs - * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. - */ -static DEFINE_MUTEX(cgroup_tree_mutex); - -/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * @@ -99,16 +92,21 @@ static DECLARE_RWSEM(css_set_rwsem); #endif /* + * Protects cgroup_idr and css_idr so that IDs can be released without + * grabbing cgroup_mutex. + */ +static DEFINE_SPINLOCK(cgroup_idr_lock); + +/* * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. */ static DEFINE_SPINLOCK(release_agent_path_lock); -#define cgroup_assert_mutexes_or_rcu_locked() \ +#define cgroup_assert_mutex_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex), \ - "cgroup_[tree_]mutex or RCU read lock required"); + "cgroup_mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot @@ -151,6 +149,13 @@ struct cgroup_root cgrp_dfl_root; */ static bool cgrp_dfl_root_visible; +/* some controllers are not supported in the default hierarchy */ +static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 +#ifdef CONFIG_CGROUP_DEBUG + | (1 << debug_cgrp_id) +#endif + ; + /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -160,14 +165,13 @@ static int cgroup_root_count; static DEFINE_IDR(cgroup_hierarchy_idr); /* - * Assign a monotonically increasing serial number to cgroups. It - * guarantees cgroups with bigger numbers are newer than those with smaller - * numbers. Also, as cgroups are always appended to the parent's - * ->children list, it guarantees that sibling cgroups are always sorted in - * the ascending serial number order on the list. Protected by - * cgroup_mutex. + * Assign a monotonically increasing serial number to csses. It guarantees + * cgroups with bigger numbers are newer than those with smaller numbers. + * Also, as csses are always appended to the parent's ->children list, it + * guarantees that sibling csses are always sorted in the ascending serial + * number order on the list. Protected by cgroup_mutex. */ -static u64 cgroup_serial_nr_next = 1; +static u64 css_serial_nr_next = 1; /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. 
This avoids us having to do @@ -180,17 +184,59 @@ static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask); -static void cgroup_destroy_css_killed(struct cgroup *cgrp); + unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void css_release(struct percpu_ref *ref); +static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); +/* IDR wrappers which synchronize using cgroup_idr_lock */ +static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, + gfp_t gfp_mask) +{ + int ret; + + idr_preload(gfp_mask); + spin_lock_bh(&cgroup_idr_lock); + ret = idr_alloc(idr, ptr, start, end, gfp_mask); + spin_unlock_bh(&cgroup_idr_lock); + idr_preload_end(); + return ret; +} + +static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) +{ + void *ret; + + spin_lock_bh(&cgroup_idr_lock); + ret = idr_replace(idr, ptr, id); + spin_unlock_bh(&cgroup_idr_lock); + return ret; +} + +static void cgroup_idr_remove(struct idr *idr, int id) +{ + spin_lock_bh(&cgroup_idr_lock); + idr_remove(idr, id); + spin_unlock_bh(&cgroup_idr_lock); +} + +static struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *parent_css = cgrp->self.parent; + + if (parent_css) + return container_of(parent_css, struct cgroup, self); + return NULL; +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns the dummy_css) + * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and @@ -203,23 +249,49 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, { if (ss) return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_tree_mutex) || lockdep_is_held(&cgroup_mutex)); else - return &cgrp->dummy_css; + return &cgrp->self; +} + +/** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns @cgrp->self) + * + * Similar to cgroup_css() but returns the effctive css, which is defined + * as the matching css of the nearest ancestor including self which has @ss + * enabled. If @ss is associated with the hierarchy @cgrp is on, this + * function is guaranteed to return non-NULL css. 
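The ancestor walk that the comment above describes can be restated as a small self-contained model; the structure and field names below are invented for illustration and are not the kernel's:

        struct toy_cgroup {
                struct toy_cgroup *parent;
                unsigned int child_subsys_mask; /* controllers enabled for children */
                void *css[8];                   /* per-subsystem state */
        };

        /* nearest ancestor (including self) with subsystem @ssid enabled */
        static void *toy_effective_css(struct toy_cgroup *cgrp, int ssid)
        {
                while (cgrp->parent &&
                       !(cgrp->parent->child_subsys_mask & (1u << ssid)))
                        cgrp = cgrp->parent;
                return cgrp->css[ssid];
        }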
+ */ +static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + lockdep_assert_held(&cgroup_mutex); + + if (!ss) + return &cgrp->self; + + if (!(cgrp->root->subsys_mask & (1 << ss->id))) + return NULL; + + while (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + cgrp = cgroup_parent(cgrp); + + return cgroup_css(cgrp, ss); } /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { - return test_bit(CGRP_DEAD, &cgrp->flags); + return !(cgrp->self.flags & CSS_ONLINE); } -struct cgroup_subsys_state *seq_css(struct seq_file *seq) +struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { - struct kernfs_open_file *of = seq->private; struct cgroup *cgrp = of->kn->parent->priv; - struct cftype *cft = seq_cft(seq); + struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). @@ -232,9 +304,9 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq) if (cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else - return &cgrp->dummy_css; + return &cgrp->self; } -EXPORT_SYMBOL_GPL(seq_css); +EXPORT_SYMBOL_GPL(of_css); /** * cgroup_is_descendant - test ancestry @@ -250,7 +322,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) while (cgrp) { if (cgrp == ancestor) return true; - cgrp = cgrp->parent; + cgrp = cgroup_parent(cgrp); } return false; } @@ -274,17 +346,30 @@ static int notify_on_release(const struct cgroup *cgrp) * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * - * Should be called under cgroup_mutex. + * Should be called under cgroup_[tree_]mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ - lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex)))) { } \ else /** + * for_each_e_css - iterate all effective css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_[tree_]mutex. + */ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ + ; \ + else + +/** * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end @@ -297,22 +382,13 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the mutex should be later unlocked. On - * failure returns false with no lock held. 
- */ -static bool cgroup_lock_live_group(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_mutex); - if (cgroup_is_dead(cgrp)) { - mutex_unlock(&cgroup_mutex); - return false; - } - return true; -} +/* iterate over child cgrps, lock should be held throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ @@ -360,6 +436,43 @@ struct css_set init_css_set = { static int css_set_count = 1; /* 1 for init_css_set */ +/** + * cgroup_update_populated - updated populated count of a cgroup + * @cgrp: the target cgroup + * @populated: inc or dec populated count + * + * @cgrp is either getting the first task (css_set) or losing the last. + * Update @cgrp->populated_cnt accordingly. The count is propagated + * towards root so that a given cgroup's populated_cnt is zero iff the + * cgroup and all its descendants are empty. + * + * @cgrp's interface file "cgroup.populated" is zero if + * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt + * changes from or to zero, userland is notified that the content of the + * interface file has changed. This can be used to detect when @cgrp and + * its descendants become populated or empty. + */ +static void cgroup_update_populated(struct cgroup *cgrp, bool populated) +{ + lockdep_assert_held(&css_set_rwsem); + + do { + bool trigger; + + if (populated) + trigger = !cgrp->populated_cnt++; + else + trigger = !--cgrp->populated_cnt; + + if (!trigger) + break; + + if (cgrp->populated_kn) + kernfs_notify(cgrp->populated_kn); + cgrp = cgroup_parent(cgrp); + } while (cgrp); +} + /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into @@ -384,6 +497,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) static void put_css_set_locked(struct css_set *cset, bool taskexit) { struct cgrp_cset_link *link, *tmp_link; + struct cgroup_subsys *ss; + int ssid; lockdep_assert_held(&css_set_rwsem); @@ -391,6 +506,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) return; /* This css_set is dead. unlink it and release cgroup refcounts */ + for_each_subsys(ss, ssid) + list_del(&cset->e_cset_node[ssid]); hash_del(&cset->hlist); css_set_count--; @@ -401,10 +518,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) list_del(&link->cgrp_link); /* @cgrp can't go away while we're holding css_set_rwsem */ - if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); + if (list_empty(&cgrp->cset_links)) { + cgroup_update_populated(cgrp, false); + if (notify_on_release(cgrp)) { + if (taskexit) + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } } kfree(link); @@ -453,20 +573,20 @@ static bool compare_css_sets(struct css_set *cset, { struct list_head *l1, *l2; - if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { - /* Not all subsystems matched */ + /* + * On the default hierarchy, there can be csets which are + * associated with the same set of cgroups but different csses. + * Let's first ensure that csses match. 
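Earlier in this hunk, cgroup_update_populated() ties the new "cgroup.populated" file to kernfs_notify(). A userspace sketch of how a daemon might wait on it; the mount path is an example and the read-then-poll(POLLPRI) pattern is an assumption based on how other kernfs attributes are watched:

        #include <fcntl.h>
        #include <poll.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                char buf[4];
                int fd = open("/sys/fs/cgroup/mygrp/cgroup.populated", O_RDONLY);
                struct pollfd pfd = { .fd = fd, .events = POLLPRI };

                if (fd < 0)
                        return 1;
                for (;;) {
                        if (pread(fd, buf, sizeof(buf), 0) < 1)  /* re-read to re-arm */
                                break;
                        printf("populated: %c\n", buf[0]);
                        poll(&pfd, 1, -1);                       /* block until notified */
                }
                return 0;
        }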
+ */ + if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; - } /* * Compare cgroup pointers in order to distinguish between - * different cgroups in heirarchies with no subsystems. We - * could get by with just this check alone (and skip the - * memcmp above) but on most setups the memcmp check will - * avoid the need for this more expensive check on almost all - * candidates. + * different cgroups in hierarchies. As different cgroups may + * share the same effective css, this comparison is always + * necessary. */ - l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { @@ -530,14 +650,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->cgrp.subsys_mask & (1UL << i)) { - /* Subsystem is in this hierarchy. So we want - * the subsystem state from the new - * cgroup */ - template[i] = cgroup_css(cgrp, ss); + if (root->subsys_mask & (1UL << i)) { + /* + * @ss is in this hierarchy, so we want the + * effective css from @cgrp. + */ + template[i] = cgroup_e_css(cgrp, ss); } else { - /* Subsystem is not in this hierarchy, so we - * don't want to change the subsystem state */ + /* + * @ss is not in this hierarchy, so we don't want + * to change the css. + */ template[i] = old_cset->subsys[i]; } } @@ -603,10 +726,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); + + if (cgroup_on_dfl(cgrp)) + cset->dfl_cgrp = cgrp; + link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; + + if (list_empty(&cgrp->cset_links)) + cgroup_update_populated(cgrp, true); list_move(&link->cset_link, &cgrp->cset_links); + /* * Always add links to the tail of the list so that the list * is sorted by order of hierarchy creation @@ -629,7 +760,9 @@ static struct css_set *find_css_set(struct css_set *old_cset, struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; + struct cgroup_subsys *ss; unsigned long key; + int ssid; lockdep_assert_held(&cgroup_mutex); @@ -680,10 +813,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, css_set_count++; - /* Add this cgroup group to the hash table */ + /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); + for_each_subsys(ss, ssid) + list_add_tail(&cset->e_cset_node[ssid], + &cset->subsys[ssid]->cgroup->e_csets[ssid]); + up_write(&css_set_rwsem); return cset; @@ -736,14 +873,13 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); BUG_ON(atomic_read(&root->nr_cgrps)); - BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); + rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -766,7 +902,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); @@ -849,7 +984,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -static int 
cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask); static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; @@ -884,79 +1019,95 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write_u64 || cft->write_s64 || cft->write_string || - cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write) mode |= S_IWUSR; return mode; } -static void cgroup_free_fn(struct work_struct *work) +static void cgroup_get(struct cgroup *cgrp) { - struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - - atomic_dec(&cgrp->root->nr_cgrps); - cgroup_pidlist_destroy_all(cgrp); - - if (cgrp->parent) { - /* - * We get a ref to the parent, and put the ref when this - * cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. - */ - cgroup_put(cgrp->parent); - kernfs_put(cgrp->kn); - kfree(cgrp); - } else { - /* - * This is root cgroup's refcnt reaching zero, which - * indicates that the root should be released. - */ - cgroup_destroy_root(cgrp->root); - } + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + css_get(&cgrp->self); } -static void cgroup_free_rcu(struct rcu_head *head) +static void cgroup_put(struct cgroup *cgrp) { - struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); - - INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - queue_work(cgroup_destroy_wq, &cgrp->destroy_work); + css_put(&cgrp->self); } -static void cgroup_get(struct cgroup *cgrp) +/** + * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper undoes cgroup_kn_lock_live() and should be invoked before + * the method finishes if locking succeeded. Note that once this function + * returns the cgroup returned by cgroup_kn_lock_live() may become + * inaccessible any time. If the caller intends to continue to access the + * cgroup, it should pin it before invoking this function. + */ +static void cgroup_kn_unlock(struct kernfs_node *kn) { - WARN_ON_ONCE(cgroup_is_dead(cgrp)); - WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); - atomic_inc(&cgrp->refcnt); + struct cgroup *cgrp; + + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; + + mutex_unlock(&cgroup_mutex); + + kernfs_unbreak_active_protection(kn); + cgroup_put(cgrp); } -static void cgroup_put(struct cgroup *cgrp) +/** + * cgroup_kn_lock_live - locking helper for cgroup kernfs methods + * @kn: the kernfs_node being serviced + * + * This helper is to be used by a cgroup kernfs method currently servicing + * @kn. It breaks the active protection, performs cgroup locking and + * verifies that the associated cgroup is alive. Returns the cgroup if + * alive; otherwise, %NULL. A successful return should be undone by a + * matching cgroup_kn_unlock() invocation. + * + * Any cgroup kernfs method implementation which requires locking the + * associated cgroup should use this helper. It avoids nesting cgroup + * locking under kernfs active protection and allows all kernfs operations + * including self-removal. 
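The calling pattern the cgroup_kn_lock_live()/cgroup_kn_unlock() pair documented above expects, mirroring how the rewritten handlers later in this patch are structured; the handler name and body are placeholders:

        static ssize_t example_write(struct kernfs_open_file *of, char *buf,
                                     size_t nbytes, loff_t off)
        {
                struct cgroup *cgrp;
                int ret = 0;

                cgrp = cgroup_kn_lock_live(of->kn);
                if (!cgrp)
                        return -ENODEV;

                /* ... operate on @cgrp under cgroup_mutex ... */

                cgroup_kn_unlock(of->kn);
                return ret ?: nbytes;
        }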
+ */ +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) { - if (!atomic_dec_and_test(&cgrp->refcnt)) - return; - if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) - return; + struct cgroup *cgrp; + + if (kernfs_type(kn) == KERNFS_DIR) + cgrp = kn->priv; + else + cgrp = kn->parent->priv; /* - * XXX: cgrp->id is only used to look up css's. As cgroup and - * css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. + * We're gonna grab cgroup_mutex which nests outside kernfs + * active_ref. cgroup liveliness check alone provides enough + * protection against removal. Ensure @cgrp stays accessible and + * break the active_ref protection. */ + cgroup_get(cgrp); + kernfs_break_active_protection(kn); + mutex_lock(&cgroup_mutex); - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - mutex_unlock(&cgroup_mutex); - cgrp->id = -1; - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); + if (!cgroup_is_dead(cgrp)) + return cgrp; + + cgroup_kn_unlock(kn); + return NULL; } static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; - lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } @@ -965,7 +1116,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) * @cgrp: target cgroup * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) { struct cgroup_subsys *ss; int i; @@ -973,40 +1124,40 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) for_each_subsys(ss, i) { struct cftype *cfts; - if (!test_bit(i, &subsys_mask)) + if (!(subsys_mask & (1 << i))) continue; list_for_each_entry(cfts, &ss->cfts, node) cgroup_addrm_files(cgrp, cfts, false); } } -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) { struct cgroup_subsys *ss; - int ssid, ret; + unsigned int tmp_ss_mask; + int ssid, i, ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); for_each_subsys(ss, ssid) { if (!(ss_mask & (1 << ssid))) continue; - /* if @ss is on the dummy_root, we can always move it */ - if (ss->root == &cgrp_dfl_root) - continue; - - /* if @ss has non-root cgroups attached to it, can't move */ - if (!list_empty(&ss->root->cgrp.children)) + /* if @ss has non-root csses attached to it, can't move */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) return -EBUSY; /* can't move between two non-dummy roots either */ - if (dst_root != &cgrp_dfl_root) + if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; } - ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); + /* skip creating root files on dfl_root for inhibited subsystems */ + tmp_ss_mask = ss_mask; + if (dst_root == &cgrp_dfl_root) + tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; + + ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); if (ret) { if (dst_root != &cgrp_dfl_root) return ret; @@ -1018,9 +1169,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Just warn about it and continue. 
*/ if (cgrp_dfl_root_visible) { - pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", - ret, ss_mask); - pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); + pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", + ret, ss_mask); + pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); } } @@ -1028,15 +1179,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ - mutex_unlock(&cgroup_mutex); for_each_subsys(ss, ssid) if (ss_mask & (1 << ssid)) cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); - mutex_lock(&cgroup_mutex); for_each_subsys(ss, ssid) { struct cgroup_root *src_root; struct cgroup_subsys_state *css; + struct css_set *cset; if (!(ss_mask & (1 << ssid))) continue; @@ -1051,8 +1201,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->root = dst_root; css->cgroup = &dst_root->cgrp; - src_root->cgrp.subsys_mask &= ~(1 << ssid); - dst_root->cgrp.subsys_mask |= 1 << ssid; + down_write(&css_set_rwsem); + hash_for_each(css_set_table, i, cset, hlist) + list_move_tail(&cset->e_cset_node[ss->id], + &dst_root->cgrp.e_csets[ss->id]); + up_write(&css_set_rwsem); + + src_root->subsys_mask &= ~(1 << ssid); + src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + + /* default hierarchy doesn't enable controllers by default */ + dst_root->subsys_mask |= 1 << ssid; + if (dst_root != &cgrp_dfl_root) + dst_root->cgrp.child_subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -1070,7 +1231,7 @@ static int cgroup_show_options(struct seq_file *seq, int ssid; for_each_subsys(ss, ssid) - if (root->cgrp.subsys_mask & (1 << ssid)) + if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ -1092,8 +1253,8 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned long subsys_mask; - unsigned long flags; + unsigned int subsys_mask; + unsigned int flags; char *release_agent; bool cpuset_clone_children; char *name; @@ -1101,24 +1262,16 @@ struct cgroup_sb_opts { bool none; }; -/* - * Convert a hierarchy specifier into a bitmask of subsystems and - * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] - * array. This function takes refcounts on subsystems to be used, unless it - * returns error, in which case no refcounts are taken. 
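For context, the option string that parse_cgroupfs_options() receives normally comes straight from mount(2); a userspace sketch, with the target directory and controller names chosen as examples:

        #include <sys/mount.h>

        int main(void)
        {
                /* "cpu,cpuacct" becomes opts->subsys_mask; flags such as
                 * noprefix or xattr would be parsed from the same string */
                return mount("none", "/sys/fs/cgroup/cpu,cpuacct", "cgroup",
                             0, "cpu,cpuacct");
        }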
- */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned long mask = (unsigned long)-1; + unsigned int mask = -1U; struct cgroup_subsys *ss; int i; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - #ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_cgrp_id); + mask = ~(1U << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1199,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) return -EINVAL; - set_bit(i, &opts->subsys_mask); + opts->subsys_mask |= (1 << i); one_ss = true; break; @@ -1211,12 +1364,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Consistency checks */ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || opts->cpuset_clone_children || opts->release_agent || opts->name) { - pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); + pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } } else { @@ -1228,7 +1381,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) if (!ss->disabled) - set_bit(i, &opts->subsys_mask); + opts->subsys_mask |= (1 << i); /* * We either have to specify by name or by subsystems. 
(So @@ -1259,14 +1412,13 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned long added_mask, removed_mask; + unsigned int added_mask, removed_mask; if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("cgroup: sane_behavior: remount is not allowed\n"); + pr_err("sane_behavior: remount is not allowed\n"); return -EINVAL; } - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* See what subsystems are wanted */ @@ -1274,17 +1426,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) - pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", - task_tgid_nr(current), current->comm); + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; - removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || (opts.name && strcmp(opts.name, root->name))) { - pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", + pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", root->flags & CGRP_ROOT_OPTION_MASK, root->name); ret = -EINVAL; @@ -1292,7 +1444,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->cgrp.children)) { + if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } @@ -1312,7 +1464,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) kfree(opts.release_agent); kfree(opts.name); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -1370,14 +1521,22 @@ out_unlock: static void init_cgroup_housekeeping(struct cgroup *cgrp) { - atomic_set(&cgrp->refcnt, 1); - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); + struct cgroup_subsys *ss; + int ssid; + + INIT_LIST_HEAD(&cgrp->self.sibling); + INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); - cgrp->dummy_css.cgroup = cgrp; + cgrp->self.cgroup = cgrp; + cgrp->self.flags |= CSS_ONLINE; + + for_each_subsys(ss, ssid) + INIT_LIST_HEAD(&cgrp->e_csets[ssid]); + + init_waitqueue_head(&cgrp->offline_waitq); } static void init_cgroup_root(struct cgroup_root *root, @@ -1400,21 +1559,24 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; struct css_set *cset; int i, ret; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + ret = 
cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); if (ret < 0) goto out; root_cgrp->id = ret; + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + if (ret) + goto out; + /* * We're accessing css_set_count without locking css_set_rwsem here, * but that's OK - it can only be increased by someone holding @@ -1423,11 +1585,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) */ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) - goto out; + goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) - goto out; + goto cancel_ref; root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED, @@ -1463,7 +1625,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) link_css_set(&tmp_links, cset, root_cgrp); up_write(&css_set_rwsem); - BUG_ON(!list_empty(&root_cgrp->children)); + BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); kernfs_activate(root_cgrp->kn); @@ -1475,6 +1637,8 @@ destroy_root: root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); +cancel_ref: + percpu_ref_cancel_init(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; @@ -1497,14 +1661,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); if (ret) goto out_unlock; -retry: + /* look for a matching existing root */ if (!opts.subsys_mask && !opts.none && !opts.name) { cgrp_dfl_root_visible = true; @@ -1536,7 +1699,7 @@ retry: * subsystems) then they must match. */ if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->cgrp.subsys_mask)) { + (opts.subsys_mask != root->subsys_mask)) { if (!name_match) continue; ret = -EBUSY; @@ -1545,28 +1708,27 @@ retry: if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { - pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + pr_err("sane_behavior: new mount options should match the existing superblock\n"); ret = -EINVAL; goto out_unlock; } else { - pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); } } /* - * A root's lifetime is governed by its root cgroup. Zero - * ref indicate that the root is being destroyed. Wait for - * destruction to complete so that the subsystems are free. - * We can use wait_queue for the wait but this path is - * super cold. Let's just sleep for a bit and retry. + * A root's lifetime is governed by its root cgroup. + * tryget_live failure indicate that the root is being + * destroyed. Wait for destruction to complete so that the + * subsystems are free. We can use wait_queue for the wait + * but this path is super cold. Let's just sleep for a bit + * and retry. 
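The root cgroup's lifetime now follows the stock percpu_ref pattern; a condensed sketch with an invented example object, using only the calls that appear in this patch (percpu_ref_init() above, percpu_ref_tryget_live() and percpu_ref_kill() below):

        #include <linux/percpu-refcount.h>
        #include <linux/slab.h>

        struct foo {
                struct percpu_ref ref;
        };

        static void foo_release(struct percpu_ref *ref)
        {
                /* reached zero after foo_destroy(); safe to free */
                kfree(container_of(ref, struct foo, ref));
        }

        static int foo_create(struct foo *f)
        {
                return percpu_ref_init(&f->ref, foo_release);
        }

        static bool foo_tryget(struct foo *f)
        {
                /* fails once foo_destroy() has run; callers retry or bail,
                 * as cgroup_mount() does with restart_syscall() */
                return percpu_ref_tryget_live(&f->ref);
        }

        static void foo_destroy(struct foo *f)
        {
                percpu_ref_kill(&f->ref);       /* foo_release() on last put */
        }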
*/ - if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { + if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); msleep(10); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - goto retry; + ret = restart_syscall(); + goto out_free; } ret = 0; @@ -1597,8 +1759,7 @@ retry: out_unlock: mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - +out_free: kfree(opts.release_agent); kfree(opts.name); @@ -1617,7 +1778,19 @@ static void cgroup_kill_sb(struct super_block *sb) struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); - cgroup_put(&root->cgrp); + /* + * If @root doesn't have any mounts or children, start killing it. + * This prevents new mounts by disabling percpu_ref_tryget_live(). + * cgroup_mount() may wait for @root's release. + * + * And don't kill the default root. + */ + if (css_has_online_children(&root->cgrp.self) || + root == &cgrp_dfl_root) + cgroup_put(&root->cgrp); + else + percpu_ref_kill(&root->cgrp.self.refcnt); + kernfs_kill_sb(sb); } @@ -1739,7 +1912,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) /** * cgroup_task_migrate - move a task from one cgroup to another. - * @old_cgrp; the cgroup @tsk is being migrated from + * @old_cgrp: the cgroup @tsk is being migrated from * @tsk: the task being migrated * @new_cset: the new css_set @tsk is being attached to * @@ -1831,10 +2004,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - /* nothing to do if this cset already belongs to the cgroup */ - if (src_cgrp == dst_cgrp) - return; - if (!list_empty(&src_cset->mg_preload_node)) return; @@ -1849,13 +2018,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @dst_cgrp: the destination cgroup + * @dst_cgrp: the destination cgroup (may be %NULL) * @preloaded_csets: list of preloaded source css_sets * * Tasks are about to be moved to @dst_cgrp and all the source css_sets * have been preloaded to @preloaded_csets. This function looks up and - * pins all destination css_sets, links each to its source, and put them on - * @preloaded_csets. + * pins all destination css_sets, links each to its source, and append them + * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each + * source css_set is assumed to be its cgroup on the default hierarchy. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed @@ -1866,19 +2036,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, struct list_head *preloaded_csets) { LIST_HEAD(csets); - struct css_set *src_cset; + struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. 
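The rule stated above is enforced from both directions on the default hierarchy: tasks cannot be moved into a cgroup that has controllers enabled for its children, and controllers cannot be enabled while a cgroup (other than the root) still has member tasks. A userspace sketch of the second case, assuming an example mount point; the write fails with EBUSY until the tasks are moved away:

        #include <errno.h>
        #include <fcntl.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/sys/fs/cgroup/mygrp/cgroup.subtree_control",
                              O_WRONLY);

                if (fd < 0)
                        return 1;
                if (write(fd, "+memory", 7) < 0)
                        fprintf(stderr, "enable: %s\n", strerror(errno));
                close(fd);
                return 0;
        }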
+ */ + if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && + dst_cgrp->child_subsys_mask) + return -EBUSY; + /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { + list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - dst_cset = find_css_set(src_cset, dst_cgrp); + dst_cset = find_css_set(src_cset, + dst_cgrp ?: src_cset->dfl_cgrp); if (!dst_cset) goto err; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); + + /* + * If src cset equals dst, it's noop. Drop the src. + * cgroup_migrate() will skip the cset too. Note that we + * can't handle src == dst as some nodes are used by both. + */ + if (src_cset == dst_cset) { + src_cset->mg_src_cgrp = NULL; + list_del_init(&src_cset->mg_preload_node); + put_css_set(src_cset, false); + put_css_set(dst_cset, false); + continue; + } + src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_preload_node)) @@ -1887,7 +2080,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, put_css_set(dst_cset, false); } - list_splice(&csets, preloaded_csets); + list_splice_tail(&csets, preloaded_csets); return 0; err: cgroup_migrate_finish(&csets); @@ -1968,7 +2161,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, return 0; /* check that we can legitimately attach to the cgroup */ - for_each_css(css, i, cgrp) { + for_each_e_css(css, i, cgrp) { if (css->ss->can_attach) { ret = css->ss->can_attach(css, &tset); if (ret) { @@ -1998,7 +2191,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, */ tset.csets = &tset.dst_csets; - for_each_css(css, i, cgrp) + for_each_e_css(css, i, cgrp) if (css->ss->attach) css->ss->attach(css, &tset); @@ -2006,7 +2199,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, goto out_release_tset; out_cancel_attach: - for_each_css(css, i, cgrp) { + for_each_e_css(css, i, cgrp) { if (css == failed_css) break; if (css->ss->cancel_attach) @@ -2065,13 +2258,20 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, * function to attach either it or all tasks in its threadgroup. Will lock * cgroup_mutex and threadgroup. 
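With the switch below from a u64 cftype handler to a raw write handler, the PID arrives as the string written to "cgroup.procs" or "tasks"; a userspace sketch of the corresponding write, taking the file path and PID on the command line:

        #include <fcntl.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                int fd;

                if (argc != 3)
                        return 1;
                fd = open(argv[1], O_WRONLY);   /* e.g. .../mygrp/cgroup.procs */
                if (fd < 0)
                        return 1;
                if (write(fd, argv[2], strlen(argv[2])) < 0)
                        perror("write");
                close(fd);
                return 0;
        }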
*/ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; + struct cgroup *cgrp; + pid_t pid; int ret; - if (!cgroup_lock_live_group(cgrp)) + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; retry_find_task: @@ -2137,8 +2337,8 @@ retry_find_task: put_task_struct(tsk); out_unlock_cgroup: - mutex_unlock(&cgroup_mutex); - return ret; + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; } /** @@ -2172,43 +2372,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 pid) +static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return attach_task_by_pid(css->cgroup, pid, false); + return __cgroup_procs_write(of, buf, nbytes, off, false); } -static int cgroup_procs_write(struct cgroup_subsys_state *css, - struct cftype *cft, u64 tgid) +static ssize_t cgroup_procs_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - return attach_task_by_pid(css->cgroup, tgid, true); + return __cgroup_procs_write(of, buf, nbytes, off, true); } -static int cgroup_release_agent_write(struct cgroup_subsys_state *css, - struct cftype *cft, char *buffer) +static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct cgroup_root *root = css->cgroup->root; + struct cgroup *cgrp; - BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); - if (!cgroup_lock_live_group(css->cgroup)) + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); - strlcpy(root->release_agent_path, buffer, - sizeof(root->release_agent_path)); + strlcpy(cgrp->root->release_agent_path, strstrip(buf), + sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); - mutex_unlock(&cgroup_mutex); - return 0; + cgroup_kn_unlock(of->kn); + return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + spin_lock(&release_agent_path_lock); seq_puts(seq, cgrp->root->release_agent_path); + spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); - mutex_unlock(&cgroup_mutex); return 0; } @@ -2220,6 +2421,320 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } +static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) +{ + struct cgroup_subsys *ss; + bool printed = false; + int ssid; + + for_each_subsys(ss, ssid) { + if (ss_mask & (1 << ssid)) { + if (printed) + seq_putc(seq, ' '); + seq_printf(seq, "%s", ss->name); + printed = true; + } + } + if (printed) + seq_putc(seq, '\n'); +} + +/* show controllers which are currently attached to the default hierarchy */ +static int cgroup_root_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & + ~cgrp_dfl_root_inhibit_ss_mask); + return 0; +} + +/* show controllers which are enabled from the parent */ +static 
int cgroup_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); + return 0; +} + +/* show controllers which are enabled for a given cgroup's children */ +static int cgroup_subtree_control_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); + return 0; +} + +/** + * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy + * @cgrp: root of the subtree to update csses for + * + * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) + * css associations need to be updated accordingly. This function looks up + * all css_sets which are attached to the subtree, creates the matching + * updated css_sets and migrates the tasks to the new ones. + */ +static int cgroup_update_dfl_csses(struct cgroup *cgrp) +{ + LIST_HEAD(preloaded_csets); + struct cgroup_subsys_state *css; + struct css_set *src_cset; + int ret; + + lockdep_assert_held(&cgroup_mutex); + + /* look up all csses currently attached to @cgrp's subtree */ + down_read(&css_set_rwsem); + css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + struct cgrp_cset_link *link; + + /* self is not affected by child_subsys_mask change */ + if (css->cgroup == cgrp) + continue; + + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, cgrp, + &preloaded_csets); + } + up_read(&css_set_rwsem); + + /* NULL dst indicates self on default hierarchy */ + ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + if (ret) + goto out_finish; + + list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + struct task_struct *last_task = NULL, *task; + + /* src_csets precede dst_csets, break on the first dst_cset */ + if (!src_cset->mg_src_cgrp) + break; + + /* + * All tasks in src_cset need to be migrated to the + * matching dst_cset. Empty it process by process. We + * walk tasks but migrate processes. The leader might even + * belong to a different cset but such src_cset would also + * be among the target src_csets because the default + * hierarchy enforces per-process membership. + */ + while (true) { + down_read(&css_set_rwsem); + task = list_first_entry_or_null(&src_cset->tasks, + struct task_struct, cg_list); + if (task) { + task = task->group_leader; + WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); + get_task_struct(task); + } + up_read(&css_set_rwsem); + + if (!task) + break; + + /* guard against possible infinite loop */ + if (WARN(last_task == task, + "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) + goto out_finish; + last_task = task; + + threadgroup_lock(task); + /* raced against de_thread() from another thread? 
*/ + if (!thread_group_leader(task)) { + threadgroup_unlock(task); + put_task_struct(task); + continue; + } + + ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); + + threadgroup_unlock(task); + put_task_struct(task); + + if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) + goto out_finish; + } + } + +out_finish: + cgroup_migrate_finish(&preloaded_csets); + return ret; +} + +/* change the enabled child controllers for a cgroup in the default hierarchy */ +static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + unsigned int enable = 0, disable = 0; + struct cgroup *cgrp, *child; + struct cgroup_subsys *ss; + char *tok; + int ssid, ret; + + /* + * Parse input - space separated list of subsystem names prefixed + * with either + or -. + */ + buf = strstrip(buf); + while ((tok = strsep(&buf, " "))) { + if (tok[0] == '\0') + continue; + for_each_subsys(ss, ssid) { + if (ss->disabled || strcmp(tok + 1, ss->name) || + ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) + continue; + + if (*tok == '+') { + enable |= 1 << ssid; + disable &= ~(1 << ssid); + } else if (*tok == '-') { + disable |= 1 << ssid; + enable &= ~(1 << ssid); + } else { + return -EINVAL; + } + break; + } + if (ssid == CGROUP_SUBSYS_COUNT) + return -EINVAL; + } + + cgrp = cgroup_kn_lock_live(of->kn); + if (!cgrp) + return -ENODEV; + + for_each_subsys(ss, ssid) { + if (enable & (1 << ssid)) { + if (cgrp->child_subsys_mask & (1 << ssid)) { + enable &= ~(1 << ssid); + continue; + } + + /* + * Because css offlining is asynchronous, userland + * might try to re-enable the same controller while + * the previous instance is still around. In such + * cases, wait till it's gone using offline_waitq. + */ + cgroup_for_each_live_child(child, cgrp) { + DEFINE_WAIT(wait); + + if (!cgroup_css(child, ss)) + continue; + + cgroup_get(child); + prepare_to_wait(&child->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + cgroup_kn_unlock(of->kn); + schedule(); + finish_wait(&child->offline_waitq, &wait); + cgroup_put(child); + + return restart_syscall(); + } + + /* unavailable or not enabled on the parent? */ + if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || + (cgroup_parent(cgrp) && + !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { + ret = -ENOENT; + goto out_unlock; + } + } else if (disable & (1 << ssid)) { + if (!(cgrp->child_subsys_mask & (1 << ssid))) { + disable &= ~(1 << ssid); + continue; + } + + /* a child has it enabled? */ + cgroup_for_each_live_child(child, cgrp) { + if (child->child_subsys_mask & (1 << ssid)) { + ret = -EBUSY; + goto out_unlock; + } + } + } + } + + if (!enable && !disable) { + ret = 0; + goto out_unlock; + } + + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Create csses for enables and update child_subsys_mask. This + * changes cgroup_e_css() results which in turn makes the + * subsequent cgroup_update_dfl_csses() associate all tasks in the + * subtree to the updated csses. 
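For reference, the syntax accepted by cgroup_subtree_control_write() above is a space-separated list of controller names, each prefixed with '+' (enable) or '-' (disable), a later token overriding an earlier one for the same controller. A toy userspace restatement of that parsing; the controller table is invented:

        #include <string.h>

        static const char * const ctrl[] = { "cpu", "memory", "io" };
        #define NCTRL 3

        static int parse(char *buf, unsigned int *enable, unsigned int *disable)
        {
                char *tok;

                *enable = *disable = 0;
                for (tok = strtok(buf, " "); tok; tok = strtok(NULL, " ")) {
                        unsigned int i;

                        for (i = 0; i < NCTRL; i++) {
                                if (strcmp(tok + 1, ctrl[i]))
                                        continue;
                                if (tok[0] == '+') {
                                        *enable |= 1u << i;
                                        *disable &= ~(1u << i);
                                } else if (tok[0] == '-') {
                                        *disable |= 1u << i;
                                        *enable &= ~(1u << i);
                                } else {
                                        return -1;
                                }
                                break;
                        }
                        if (i == NCTRL)
                                return -1;      /* unknown name or bad prefix */
                }
                return 0;
        }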
+ */ + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + ret = create_css(child, ss); + if (ret) + goto err_undo_css; + } + } + + cgrp->child_subsys_mask |= enable; + cgrp->child_subsys_mask &= ~disable; + + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + goto err_undo_css; + + /* all tasks are now migrated away from the old csses, kill them */ + for_each_subsys(ss, ssid) { + if (!(disable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) + kill_css(cgroup_css(child, ss)); + } + + kernfs_activate(cgrp->kn); + ret = 0; +out_unlock: + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; + +err_undo_css: + cgrp->child_subsys_mask &= ~enable; + cgrp->child_subsys_mask |= disable; + + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + struct cgroup_subsys_state *css = cgroup_css(child, ss); + if (css) + kill_css(css); + } + } + goto out_unlock; +} + +static int cgroup_populated_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); + return 0; +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -2228,6 +2743,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, struct cgroup_subsys_state *css; int ret; + if (cft->write) + return cft->write(of, buf, nbytes, off); + /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and @@ -2238,9 +2756,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); - if (cft->write_string) { - ret = cft->write_string(css, cft, strstrip(buf)); - } else if (cft->write_u64) { + if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) @@ -2250,8 +2766,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); - } else if (cft->trigger) { - ret = cft->trigger(css, (unsigned int)cft->private); } else { ret = -EINVAL; } @@ -2328,20 +2842,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, return -EPERM; /* - * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref - * protection. Break them before grabbing cgroup_tree_mutex. + * protection. Break them before grabbing cgroup_mutex. 
*/ kernfs_break_active_protection(new_parent); kernfs_break_active_protection(kn); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); kernfs_unbreak_active_protection(kn); kernfs_unbreak_active_protection(new_parent); @@ -2379,9 +2891,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return PTR_ERR(kn); ret = cgroup_kn_set_ugid(kn); - if (ret) + if (ret) { kernfs_remove(kn); - return ret; + return ret; + } + + if (cft->seq_show == cgroup_populated_show) + cgrp->populated_kn = kn; + return 0; } /** @@ -2401,7 +2918,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], struct cftype *cft; int ret; - lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ @@ -2409,16 +2926,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], continue; if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) continue; - if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; - if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if (is_add) { ret = cgroup_add_file(cgrp, cft); if (ret) { - pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", - cft->name, ret); + pr_warn("%s: failed to add %s, err=%d\n", + __func__, cft->name, ret); return ret; } } else { @@ -2436,11 +2953,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) struct cgroup_subsys_state *css; int ret = 0; - lockdep_assert_held(&cgroup_tree_mutex); - - /* don't bother if @ss isn't attached */ - if (ss->root == &cgrp_dfl_root) - return 0; + lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { @@ -2508,7 +3021,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) static int cgroup_rm_cftypes_locked(struct cftype *cfts) { - lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); if (!cfts || !cfts[0].ss) return -ENOENT; @@ -2534,9 +3047,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) { int ret; - mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); ret = cgroup_rm_cftypes_locked(cfts); - mutex_unlock(&cgroup_tree_mutex); + mutex_unlock(&cgroup_mutex); return ret; } @@ -2558,6 +3071,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; + if (ss->disabled) + return 0; + if (!cfts || cfts[0].name[0] == '\0') return 0; @@ -2565,14 +3081,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (ret) return ret; - mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); - mutex_unlock(&cgroup_tree_mutex); + mutex_unlock(&cgroup_mutex); return ret; } @@ -2596,57 +3112,65 @@ static int cgroup_task_count(const struct cgroup *cgrp) /** * css_next_child - find the next child of a given css - * @pos_css: the current position (%NULL to initiate traversal) - * @parent_css: css whose children to walk + * @pos: the current position (%NULL to initiate traversal) + * @parent: css whose children to walk * - * This function returns the next child 
of @parent_css and should be called + * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is - * that @parent_css and @pos_css are accessible. The next sibling is - * guaranteed to be returned regardless of their states. + * that @parent and @pos are accessible. The next sibling is guaranteed to + * be returned regardless of their states. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ -struct cgroup_subsys_state * -css_next_child(struct cgroup_subsys_state *pos_css, - struct cgroup_subsys_state *parent_css) +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent) { - struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; - struct cgroup *cgrp = parent_css->cgroup; - struct cgroup *next; + struct cgroup_subsys_state *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* - * @pos could already have been removed. Once a cgroup is removed, - * its ->sibling.next is no longer updated when its next sibling - * changes. As CGRP_DEAD assertion is serialized and happens - * before the cgroup is taken off the ->sibling list, if we see it - * unasserted, it's guaranteed that the next sibling hasn't - * finished its grace period even if it's already removed, and thus - * safe to dereference from this RCU critical section. If - * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed - * to be visible as %true here. + * @pos could already have been unlinked from the sibling list. + * Once a cgroup is removed, its ->sibling.next is no longer + * updated when its next sibling changes. CSS_RELEASED is set when + * @pos is taken off list, at which time its next pointer is valid, + * and, as releases are serialized, the one pointed to by the next + * pointer is guaranteed to not have started release yet. This + * implies that if we observe !CSS_RELEASED on @pos in this RCU + * critical section, the one pointed to by its next pointer is + * guaranteed to not have finished its RCU grace period even if we + * have dropped rcu_read_lock() inbetween iterations. * - * If @pos is dead, its next pointer can't be dereferenced; - * however, as each cgroup is given a monotonically increasing - * unique serial number and always appended to the sibling list, - * the next one can be found by walking the parent's children until - * we see a cgroup with higher serial number than @pos's. While - * this path can be slower, it's taken only when either the current - * cgroup is removed or iteration and removal race. + * If @pos has CSS_RELEASED set, its next pointer can't be + * dereferenced; however, as each css is given a monotonically + * increasing unique serial number and always appended to the + * sibling list, the next one can be found by walking the parent's + * children until the first css with higher serial number than + * @pos's. While this path can be slower, it happens iff iteration + * races against release and the race window is very small. 
*/ if (!pos) { - next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); - } else if (likely(!cgroup_is_dead(pos))) { - next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); + } else if (likely(!(pos->flags & CSS_RELEASED))) { + next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { - list_for_each_entry_rcu(next, &cgrp->children, sibling) + list_for_each_entry_rcu(next, &parent->children, sibling) if (next->serial_nr > pos->serial_nr) break; } - if (&next->sibling == &cgrp->children) - return NULL; - - return cgroup_css(next, parent_css->ss); + /* + * @next, if not pointing to the head, can be dereferenced and is + * the next sibling. + */ + if (&next->sibling != &parent->children) + return next; + return NULL; } /** @@ -2662,6 +3186,13 @@ css_next_child(struct cgroup_subsys_state *pos_css, * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct next descendant as long * as both @pos and @root are accessible and @pos is a descendant of @root. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, @@ -2669,7 +3200,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -2682,10 +3213,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return next; - pos = css_parent(pos); + pos = pos->parent; } return NULL; @@ -2709,7 +3240,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); do { last = pos; @@ -2749,6 +3280,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * section. This function will return the correct next descendant as long * as both @pos and @cgroup are accessible and @pos is a descendant of * @cgroup. + * + * If a subsystem synchronizes ->css_online() and the start of iteration, a + * css which finished ->css_online() is guaranteed to be visible in the + * future iterations and will stay visible until the last reference is put. + * A css which hasn't finished ->css_online() or already finished + * ->css_offline() may show up during traversal. It's each subsystem's + * responsibility to synchronize against on/offlining. 
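The serial-number trick css_next_child() relies on above can be shown with a toy sibling list: nodes are only ever appended, so the list stays sorted by a monotonically increasing serial number, and if the cursor has already been released the walk resumes at the first remaining sibling with a larger serial. The sketch below is plain userspace C with invented names; the released flag stands in for CSS_RELEASED and there is no RCU.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
        unsigned long serial;
        bool released;
        struct node *next;      /* sibling list, creation order */
};

static struct node *head;
static unsigned long serial_next = 1;

static struct node *node_create(void)
{
        struct node *n = calloc(1, sizeof(*n)), **p = &head;

        n->serial = serial_next++;
        while (*p)
                p = &(*p)->next;
        *p = n;                 /* always append: list stays sorted by serial */
        return n;
}

static void node_release(struct node *n)
{
        struct node **p = &head;

        while (*p && *p != n)
                p = &(*p)->next;
        if (*p)
                *p = n->next;   /* unlink; the struct itself stays valid */
        n->released = true;
}

/* next sibling after @pos; @pos may already have been released */
static struct node *next_child(struct node *pos)
{
        struct node *n;

        if (!pos)
                return head;
        if (!pos->released)
                return pos->next;

        /* cursor gone: first remaining sibling with a higher serial */
        for (n = head; n; n = n->next)
                if (n->serial > pos->serial)
                        return n;
        return NULL;
}

int main(void)
{
        struct node *b, *c;

        node_create();          /* serial 1 */
        b = node_create();      /* serial 2 */
        c = node_create();      /* serial 3 */

        node_release(b);        /* "removed" while we were looking at it */

        printf("resume at serial %lu\n", next_child(b)->serial);  /* -> 3 */
        printf("after that: %s\n", next_child(c) ? "more" : "end of list");
        return 0;
}

The real code additionally depends on RCU to keep a released css's memory valid for the duration of the walk; the toy simply never frees released nodes.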
*/ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, @@ -2756,7 +3294,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutexes_or_rcu_locked(); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -2767,12 +3305,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ - next = css_next_child(pos, css_parent(pos)); + next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ - return css_parent(pos); + return pos->parent; +} + +/** + * css_has_online_children - does a css have online children + * @css: the target css + * + * Returns %true if @css has any online children; otherwise, %false. This + * function can be called from any context but the caller is responsible + * for synchronizing against on/offlining as necessary. + */ +bool css_has_online_children(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys_state *child; + bool ret = false; + + rcu_read_lock(); + css_for_each_child(child, css) { + if (css->flags & CSS_ONLINE) { + ret = true; + break; + } + } + rcu_read_unlock(); + return ret; } /** @@ -2783,27 +3345,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, */ static void css_advance_task_iter(struct css_task_iter *it) { - struct list_head *l = it->cset_link; + struct list_head *l = it->cset_pos; struct cgrp_cset_link *link; struct css_set *cset; /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &it->origin_css->cgroup->cset_links) { - it->cset_link = NULL; + if (l == it->cset_head) { + it->cset_pos = NULL; return; } - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; + + if (it->ss) { + cset = container_of(l, struct css_set, + e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); - it->cset_link = l; + it->cset_pos = l; if (!list_empty(&cset->tasks)) - it->task = cset->tasks.next; + it->task_pos = cset->tasks.next; else - it->task = cset->mg_tasks.next; + it->task_pos = cset->mg_tasks.next; + + it->tasks_head = &cset->tasks; + it->mg_tasks_head = &cset->mg_tasks; } /** @@ -2829,8 +3400,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, down_read(&css_set_rwsem); - it->origin_css = css; - it->cset_link = &css->cgroup->cset_links; + it->ss = css->ss; + + if (it->ss) + it->cset_pos = &css->cgroup->e_csets[css->ss->id]; + else + it->cset_pos = &css->cgroup->cset_links; + + it->cset_head = it->cset_pos; css_advance_task_iter(it); } @@ -2846,12 +3423,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; - struct list_head *l = it->task; - struct cgrp_cset_link *link = list_entry(it->cset_link, - struct cgrp_cset_link, cset_link); + struct list_head *l = it->task_pos; /* If the iterator cg is NULL, we have no tasks */ - if (!it->cset_link) + if (!it->cset_pos) return NULL; res = list_entry(l, struct task_struct, cg_list); @@ -2862,13 +3437,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) */ l = l->next; - if (l == &link->cset->tasks) - l = link->cset->mg_tasks.next; + if (l == it->tasks_head) + l = it->mg_tasks_head->next; - if (l 
== &link->cset->mg_tasks) + if (l == it->mg_tasks_head) css_advance_task_iter(it); else - it->task = l; + it->task_pos = l; return res; } @@ -2921,7 +3496,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * ->can_attach() fails. */ do { - css_task_iter_start(&from->dummy_css, &it); + css_task_iter_start(&from->self, &it); task = css_task_iter_next(&it); if (task) get_task_struct(task); @@ -3186,7 +3761,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; @@ -3248,7 +3823,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) /* * We aren't being called from kernfs and there's no guarantee on - * @kn->priv's validity. For this and css_tryget_from_dir(), + * @kn->priv's validity. For this and css_tryget_online_from_dir(), * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); @@ -3260,7 +3835,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } rcu_read_unlock(); - css_task_iter_start(&cgrp->dummy_css, &it); + css_task_iter_start(&cgrp->self, &it); while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: @@ -3390,17 +3965,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v) return seq_printf(s, "%d\n", *(int *)v); } -/* - * seq_operations functions for iterating on pidlists through seq_file - - * independent of whether it's tasks or procs - */ -static const struct seq_operations cgroup_pidlist_seq_operations = { - .start = cgroup_pidlist_start, - .stop = cgroup_pidlist_stop, - .next = cgroup_pidlist_next, - .show = cgroup_pidlist_show, -}; - static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -3442,7 +4006,7 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, - .write_u64 = cgroup_procs_write, + .write = cgroup_procs_write, .mode = S_IRUGO | S_IWUSR, }, { @@ -3456,6 +4020,27 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_root_controllers_show, + }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_controllers_show, + }, + { + .name = "cgroup.subtree_control", + .flags = CFTYPE_ONLY_ON_DFL, + .seq_show = cgroup_subtree_control_show, + .write = cgroup_subtree_control_write, + }, + { + .name = "cgroup.populated", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_populated_show, + }, /* * Historical crazy stuff. These don't have "cgroup." 
prefix and @@ -3470,7 +4055,7 @@ static struct cftype cgroup_base_files[] = { .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, - .write_u64 = cgroup_tasks_write, + .write = cgroup_tasks_write, .mode = S_IRUGO | S_IWUSR, }, { @@ -3483,7 +4068,7 @@ static struct cftype cgroup_base_files[] = { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, - .write_string = cgroup_release_agent_write, + .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, { } /* terminate */ @@ -3496,7 +4081,7 @@ static struct cftype cgroup_base_files[] = { * * On failure, no file is added. */ -static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) { struct cgroup_subsys *ss; int i, ret = 0; @@ -3505,7 +4090,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) for_each_subsys(ss, i) { struct cftype *cfts; - if (!test_bit(i, &subsys_mask)) + if (!(subsys_mask & (1 << i))) continue; list_for_each_entry(cfts, &ss->cfts, node) { @@ -3527,9 +4112,9 @@ err: * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs - * and thus css_tryget() is guaranteed to fail, the css can be offlined - * by invoking offline_css(). After offlining, the base ref is put. - * Implemented in css_killed_work_fn(). + * and thus css_tryget_online() is guaranteed to fail, the css can be + * offlined by invoking offline_css(). After offlining, the base ref is + * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. css_release() schedules the @@ -3548,11 +4133,37 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - if (css->parent) - css_put(css->parent); + if (css->ss) { + /* css free path */ + if (css->parent) + css_put(css->parent); - css->ss->css_free(css); - cgroup_put(cgrp); + css->ss->css_free(css); + cgroup_put(cgrp); + } else { + /* cgroup free path */ + atomic_dec(&cgrp->root->nr_cgrps); + cgroup_pidlist_destroy_all(cgrp); + + if (cgroup_parent(cgrp)) { + /* + * We get a ref to the parent, and put the ref when + * this cgroup is being freed, so it's guaranteed + * that the parent won't be destroyed before its + * children. + */ + cgroup_put(cgroup_parent(cgrp)); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is root cgroup's refcnt reaching zero, + * which indicates that the root should be + * released. 
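The cgroup free path above leans on one invariant: a cgroup takes a reference on its parent at creation time and drops it only from its own free path, so a parent can never be freed before its children. A stripped-down sketch of that invariant follows, using a plain integer refcount instead of percpu_ref and invented names.

#include <stdio.h>
#include <stdlib.h>

struct cnode {
        int refcnt;
        struct cnode *parent;
        const char *name;
};

static struct cnode *cnode_get(struct cnode *n)
{
        if (n)
                n->refcnt++;
        return n;
}

static void cnode_put(struct cnode *n)
{
        while (n && --n->refcnt == 0) {
                struct cnode *parent = n->parent;

                printf("freeing %s\n", n->name);
                free(n);
                n = parent;     /* drop the ref this child held on its parent */
        }
}

static struct cnode *cnode_create(const char *name, struct cnode *parent)
{
        struct cnode *n = malloc(sizeof(*n));

        n->refcnt = 1;                  /* base reference */
        n->name = name;
        n->parent = cnode_get(parent);  /* pin the parent */
        return n;
}

int main(void)
{
        struct cnode *root = cnode_create("root", NULL);
        struct cnode *child = cnode_create("child", root);

        cnode_put(root);        /* root survives: the child still pins it */
        cnode_put(child);       /* frees child, then cascades to root */
        return 0;
}

Running it prints "freeing child" before "freeing root" even though the root's own base reference was dropped first.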
+ */ + cgroup_destroy_root(cgrp->root); + } + } } static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -3564,26 +4175,59 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) queue_work(cgroup_destroy_wq, &css->destroy_work); } +static void css_release_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; + + mutex_lock(&cgroup_mutex); + + css->flags |= CSS_RELEASED; + list_del_rcu(&css->sibling); + + if (ss) { + /* css release path */ + cgroup_idr_remove(&ss->css_idr, css->id); + } else { + /* cgroup release path */ + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + } + + mutex_unlock(&cgroup_mutex); + + call_rcu(&css->rcu_head, css_free_rcu_fn); +} + static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); - call_rcu(&css->rcu_head, css_free_rcu_fn); + INIT_WORK(&css->destroy_work, css_release_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void init_and_link_css(struct cgroup_subsys_state *css, + struct cgroup_subsys *ss, struct cgroup *cgrp) { + lockdep_assert_held(&cgroup_mutex); + + cgroup_get(cgrp); + + memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; - css->flags = 0; + INIT_LIST_HEAD(&css->sibling); + INIT_LIST_HEAD(&css->children); + css->serial_nr = css_serial_nr_next++; - if (cgrp->parent) - css->parent = cgroup_css(cgrp->parent, ss); - else - css->flags |= CSS_ROOT; + if (cgroup_parent(cgrp)) { + css->parent = cgroup_css(cgroup_parent(cgrp), ss); + css_get(css->parent); + } BUG_ON(cgroup_css(cgrp, ss)); } @@ -3594,14 +4238,12 @@ static int online_css(struct cgroup_subsys_state *css) struct cgroup_subsys *ss = css->ss; int ret = 0; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (ss->css_online) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; - css->cgroup->nr_css++; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); } return ret; @@ -3612,7 +4254,6 @@ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) @@ -3622,8 +4263,9 @@ static void offline_css(struct cgroup_subsys_state *css) ss->css_offline(css); css->flags &= ~CSS_ONLINE; - css->cgroup->nr_css--; - RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + + wake_up_all(&css->cgroup->offline_waitq); } /** @@ -3637,111 +4279,102 @@ static void offline_css(struct cgroup_subsys_state *css) */ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { - struct cgroup *parent = cgrp->parent; + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err; lockdep_assert_held(&cgroup_mutex); - css = ss->css_alloc(cgroup_css(parent, ss)); + css = ss->css_alloc(parent_css); if (IS_ERR(css)) return PTR_ERR(css); + init_and_link_css(css, ss, cgrp); + err = percpu_ref_init(&css->refcnt, css_release); if (err) goto err_free_css; - init_css(css, ss, cgrp); + err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, 
GFP_NOWAIT); + if (err < 0) + goto err_free_percpu_ref; + css->id = err; err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) - goto err_free_percpu_ref; + goto err_free_id; + + /* @css is ready to be brought online now, make it visible */ + list_add_tail_rcu(&css->sibling, &parent_css->children); + cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) - goto err_clear_dir; - - cgroup_get(cgrp); - css_get(css->parent); - - cgrp->subsys_mask |= 1 << ss->id; + goto err_list_del; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { - pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); + cgroup_parent(parent)) { + pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); ss->warned_broken_hierarchy = true; } return 0; -err_clear_dir: +err_list_del: + list_del_rcu(&css->sibling); cgroup_clear_dir(css->cgroup, 1 << css->ss->id); +err_free_id: + cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); err_free_css: - ss->css_free(css); + call_rcu(&css->rcu_head, css_free_rcu_fn); return err; } -/** - * cgroup_create - create a cgroup - * @parent: cgroup that will be parent of the new cgroup - * @name: name of the new cgroup - * @mode: mode to set on new cgroup - */ -static long cgroup_create(struct cgroup *parent, const char *name, - umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { - struct cgroup *cgrp; - struct cgroup_root *root = parent->root; - int ssid, err; + struct cgroup *parent, *cgrp; + struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; + int ssid, ret; - /* - * XXX: The default hierarchy isn't fully implemented yet. Block - * !root cgroup creation on it for now. - */ - if (root == &cgrp_dfl_root) - return -EINVAL; + parent = cgroup_kn_lock_live(parent_kn); + if (!parent) + return -ENODEV; + root = parent->root; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); - if (!cgrp) - return -ENOMEM; - - mutex_lock(&cgroup_tree_mutex); - - /* - * Only live parents can have children. Note that the liveliness - * check isn't strictly necessary because cgroup_mkdir() and - * cgroup_rmdir() are fully synchronized by i_mutex; however, do it - * anyway so that locking is contained inside cgroup proper and we - * don't get nasty surprises if we ever grow another caller. - */ - if (!cgroup_lock_live_group(parent)) { - err = -ENODEV; - goto err_unlock_tree; + if (!cgrp) { + ret = -ENOMEM; + goto out_unlock; } + ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + if (ret) + goto out_free_cgrp; + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. 
*/ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); + cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); if (cgrp->id < 0) { - err = -ENOMEM; - goto err_unlock; + ret = -ENOMEM; + goto out_cancel_ref; } init_cgroup_housekeeping(cgrp); - cgrp->parent = parent; - cgrp->dummy_css.parent = &parent->dummy_css; - cgrp->root = parent->root; + cgrp->self.parent = &parent->self; + cgrp->root = root; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -3752,8 +4385,8 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { - err = PTR_ERR(kn); - goto err_free_id; + ret = PTR_ERR(kn); + goto out_free_id; } cgrp->kn = kn; @@ -3763,10 +4396,10 @@ static long cgroup_create(struct cgroup *parent, const char *name, */ kernfs_get(kn); - cgrp->serial_nr = cgroup_serial_nr_next++; + cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ - list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); + list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get(parent); @@ -3774,107 +4407,66 @@ static long cgroup_create(struct cgroup *parent, const char *name, * @cgrp is now fully operational. If something fails after this * point, it'll be released via the normal destruction path. */ - idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - err = cgroup_kn_set_ugid(kn); - if (err) - goto err_destroy; + ret = cgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; - err = cgroup_addrm_files(cgrp, cgroup_base_files, true); - if (err) - goto err_destroy; + ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); + if (ret) + goto out_destroy; /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->cgrp.subsys_mask & (1 << ssid)) { - err = create_css(cgrp, ss); - if (err) - goto err_destroy; + if (parent->child_subsys_mask & (1 << ssid)) { + ret = create_css(cgrp, ss); + if (ret) + goto out_destroy; } } - kernfs_activate(kn); + /* + * On the default hierarchy, a child doesn't automatically inherit + * child_subsys_mask from the parent. Each is configured manually. + */ + if (!cgroup_on_dfl(cgrp)) + cgrp->child_subsys_mask = parent->child_subsys_mask; - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); + kernfs_activate(kn); - return 0; + ret = 0; + goto out_unlock; -err_free_id: - idr_remove(&root->cgroup_idr, cgrp->id); -err_unlock: - mutex_unlock(&cgroup_mutex); -err_unlock_tree: - mutex_unlock(&cgroup_tree_mutex); +out_free_id: + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_cancel_ref: + percpu_ref_cancel_init(&cgrp->self.refcnt); +out_free_cgrp: kfree(cgrp); - return err; +out_unlock: + cgroup_kn_unlock(parent_kn); + return ret; -err_destroy: +out_destroy: cgroup_destroy_locked(cgrp); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - return err; -} - -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) -{ - struct cgroup *parent = parent_kn->priv; - int ret; - - /* - * cgroup_create() grabs cgroup_tree_mutex which nests outside - * kernfs active_ref and cgroup_create() already synchronizes - * properly against removal through cgroup_lock_live_group(). - * Break it before calling cgroup_create(). 
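cgroup_mkdir() above reserves the ID with a NULL pointer via cgroup_idr_alloc() and only publishes the cgroup with cgroup_idr_replace() once setup has completed, so a concurrent idr_find() sees either NULL or a fully initialized cgroup, never a half-baked one. The same reserve-then-publish pattern is sketched below with a toy fixed-size slot table rather than the kernel IDR; every name here is hypothetical.

#include <stdio.h>
#include <string.h>

#define MAX_IDS 16

static void *slots[MAX_IDS];
static int used[MAX_IDS];

static int id_alloc(void *ptr)          /* reserve an id, possibly with NULL */
{
        int id;

        for (id = 1; id < MAX_IDS; id++) {      /* 0 stays reserved for root */
                if (!used[id]) {
                        used[id] = 1;
                        slots[id] = ptr;
                        return id;
                }
        }
        return -1;
}

static void id_replace(int id, void *ptr)       /* publish the real object */
{
        slots[id] = ptr;
}

static void *id_find(int id)
{
        return (id > 0 && id < MAX_IDS && used[id]) ? slots[id] : NULL;
}

struct toy_group { char name[16]; int id; };

int main(void)
{
        struct toy_group g;

        g.id = id_alloc(NULL);  /* lookups now return NULL, not garbage */
        /* ... long, possibly failing initialization would go here ... */
        strcpy(g.name, "child");
        id_replace(g.id, &g);   /* fully set up, make it visible */

        printf("id %d -> %s\n", g.id,
               ((struct toy_group *)id_find(g.id))->name);
        return 0;
}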
- */ - cgroup_get(parent); - kernfs_break_active_protection(parent_kn); - - ret = cgroup_create(parent, name, mode); - - kernfs_unbreak_active_protection(parent_kn); - cgroup_put(parent); - return ret; + goto out_unlock; } /* * This is called when the refcnt of a css is confirmed to be killed. - * css_tryget() is now guaranteed to fail. + * css_tryget_online() is now guaranteed to fail. Tell the subsystem to + * initate destruction and put the css ref from kill_css(). */ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); - struct cgroup *cgrp = css->cgroup; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - - /* - * css_tryget() is guaranteed to fail now. Tell subsystems to - * initate destruction. - */ offline_css(css); - - /* - * If @cgrp is marked dead, it's waiting for refs of all css's to - * be disabled before proceeding to the second phase of cgroup - * destruction. If we are the last one, kick it off. - */ - if (!cgrp->nr_css && cgroup_is_dead(cgrp)) - cgroup_destroy_css_killed(cgrp); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - /* - * Put the css refs from kill_css(). Each css holds an extra - * reference to the cgroup's dentry and cgroup removal proceeds - * regardless of css refs. On the last put of each css, whenever - * that may be, the extra dentry ref is put so that dentry - * destruction happens only after all css's are released. - */ css_put(css); } @@ -3888,9 +4480,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref) queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void __kill_css(struct cgroup_subsys_state *css) +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget_online() is guaranteed to fail and when + * the reference count reaches zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) { - lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* * This must happen before css is disassociated with its cgroup. @@ -3907,7 +4508,7 @@ static void __kill_css(struct cgroup_subsys_state *css) /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via - * css_tryget(). We can't simply call percpu_ref_kill() and + * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * @@ -3918,36 +4519,14 @@ static void __kill_css(struct cgroup_subsys_state *css) } /** - * kill_css - destroy a css - * @css: css to destroy - * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. 
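The lifetime rules kill_css() documents above reduce to: once a css is killed, css_tryget_online() must fail, while references taken earlier keep the object alive until the final put triggers release. Below is a deliberately single-threaded C11 sketch of just those semantics, using one atomic counter in place of a percpu_ref and skipping the confirm-kill step; all names are invented.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_css {
        atomic_long refcnt;             /* starts at 1: the base reference */
        atomic_bool online;
};

static bool toy_tryget_online(struct toy_css *css)
{
        if (!atomic_load(&css->online))
                return false;           /* killed (or not yet online) */
        atomic_fetch_add(&css->refcnt, 1);
        return true;
}

static void toy_put(struct toy_css *css)
{
        if (atomic_fetch_sub(&css->refcnt, 1) == 1)
                printf("released\n");   /* kernel: css_release() path */
}

static void toy_kill(struct toy_css *css)
{
        atomic_store(&css->online, false);      /* new trygets now fail */
        printf("offlined\n");                   /* kernel: offline_css() */
        toy_put(css);                           /* drop the base reference */
}

int main(void)
{
        struct toy_css css = { .refcnt = 1, .online = true };

        toy_tryget_online(&css);        /* some user grabs a reference */
        toy_kill(&css);                 /* offline; base ref dropped */
        printf("tryget after kill: %d\n", toy_tryget_online(&css)); /* 0 */
        toy_put(&css);                  /* last user ref -> released */
        return 0;
}

In the kernel it is percpu_ref_kill_and_confirm() that closes the window between the online check and the increment; this toy makes no such concurrency guarantee.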
- */ -static void kill_css(struct cgroup_subsys_state *css) -{ - struct cgroup *cgrp = css->cgroup; - - lockdep_assert_held(&cgroup_tree_mutex); - - /* if already killed, noop */ - if (cgrp->subsys_mask & (1 << css->ss->id)) { - cgrp->subsys_mask &= ~(1 << css->ss->id); - __kill_css(css); - } -} - -/** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to - * guarantee that css_tryget() won't succeed by the time ->css_offline() is - * invoked. To satisfy all the requirements, destruction is implemented in - * the following two steps. + * guarantee that css_tryget_online() won't succeed by the time + * ->css_offline() is invoked. To satisfy all the requirements, + * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of @@ -3966,12 +4545,10 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct cgroup *child; struct cgroup_subsys_state *css; bool empty; int ssid; - lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* @@ -3985,127 +4562,68 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Make sure there's no live children. We can't test ->children - * emptiness as dead children linger on it while being destroyed; - * otherwise, "rmdir parent/child parent" may fail with -EBUSY. + * Make sure there's no live children. We can't test emptiness of + * ->self.children as dead children linger on it while being + * drained; otherwise, "rmdir parent/child parent" may fail. */ - empty = true; - rcu_read_lock(); - list_for_each_entry_rcu(child, &cgrp->children, sibling) { - empty = cgroup_is_dead(child); - if (!empty) - break; - } - rcu_read_unlock(); - if (!empty) + if (css_has_online_children(&cgrp->self)) return -EBUSY; /* * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by css_next_child() to - * resume iteration after dropping RCU read lock. See - * css_next_child() for details. + * creation by disabling cgroup_lock_live_group(). */ - set_bit(CGRP_DEAD, &cgrp->flags); + cgrp->self.flags &= ~CSS_ONLINE; - /* - * Initiate massacre of all css's. cgroup_destroy_css_killed() - * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. This - * involves removing the subsystem's files, drop cgroup_mutex. - */ - mutex_unlock(&cgroup_mutex); + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); - mutex_lock(&cgroup_mutex); - /* CGRP_DEAD is set, remove from ->release_list for the last time */ + /* CSS_ONLINE is clear, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); /* - * If @cgrp has css's attached, the second stage of cgroup - * destruction is kicked off from css_killed_work_fn() after the - * refs of all attached css's are killed. If @cgrp doesn't have - * any css, we kick it off here. + * Remove @cgrp directory along with the base files. 
@cgrp has an + * extra ref on its kn. */ - if (!cgrp->nr_css) - cgroup_destroy_css_killed(cgrp); - - /* remove @cgrp directory along with the base files */ - mutex_unlock(&cgroup_mutex); + kernfs_remove(cgrp->kn); - /* - * There are two control paths which try to determine cgroup from - * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_from_dir(). Those are supported by RCU protecting - * clearing of cgrp->kn->priv backpointer, which should happen - * after all files under it have been removed. - */ - kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ - RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); + check_for_release(cgroup_parent(cgrp)); - mutex_lock(&cgroup_mutex); + /* put the base reference */ + percpu_ref_kill(&cgrp->self.refcnt); return 0; }; -/** - * cgroup_destroy_css_killed - the second step of cgroup destruction - * @work: cgroup->destroy_free_work - * - * This function is invoked from a work item for a cgroup which is being - * destroyed after all css's are offlined and performs the rest of - * destruction. This is the second step of destruction described in the - * comment above cgroup_destroy_locked(). - */ -static void cgroup_destroy_css_killed(struct cgroup *cgrp) -{ - struct cgroup *parent = cgrp->parent; - - lockdep_assert_held(&cgroup_tree_mutex); - lockdep_assert_held(&cgroup_mutex); - - /* delete this cgroup from parent->children */ - list_del_rcu(&cgrp->sibling); - - cgroup_put(cgrp); - - set_bit(CGRP_RELEASABLE, &parent->flags); - check_for_release(parent); -} - static int cgroup_rmdir(struct kernfs_node *kn) { - struct cgroup *cgrp = kn->priv; + struct cgroup *cgrp; int ret = 0; - /* - * This is self-destruction but @kn can't be removed while this - * callback is in progress. Let's break active protection. Once - * the protection is broken, @cgrp can be destroyed at any point. - * Pin it so that it stays accessible. - */ - cgroup_get(cgrp); - kernfs_break_active_protection(kn); + cgrp = cgroup_kn_lock_live(kn); + if (!cgrp) + return 0; + cgroup_get(cgrp); /* for @kn->priv clearing */ - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); + ret = cgroup_destroy_locked(cgrp); + + cgroup_kn_unlock(kn); /* - * @cgrp might already have been destroyed while we're trying to - * grab the mutexes. + * There are two control paths which try to determine cgroup from + * dentry without going through kernfs - cgroupstats_build() and + * css_tryget_online_from_dir(). Those are supported by RCU + * protecting clearing of cgrp->kn->priv backpointer, which should + * happen after all files under it have been removed. 
*/ - if (!cgroup_is_dead(cgrp)) - ret = cgroup_destroy_locked(cgrp); - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); + if (!ret) + RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); - kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); return ret; } @@ -4118,15 +4636,15 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .rename = cgroup_rename, }; -static void __init cgroup_init_subsys(struct cgroup_subsys *ss) +static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); + idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ @@ -4134,7 +4652,21 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_css(css, ss, &cgrp_dfl_root.cgrp); + init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); + + /* + * Root csses are never destroyed and we can't initialize + * percpu_ref during early init. Disable refcnting. + */ + css->flags |= CSS_NO_REF; + + if (early) { + /* allocation can't be done safely during early init */ + css->id = 1; + } else { + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); + BUG_ON(css->id < 0); + } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4151,10 +4683,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); } /** @@ -4171,6 +4700,8 @@ int __init cgroup_init_early(void) int i; init_cgroup_root(&cgrp_dfl_root, &opts); + cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; + RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { @@ -4185,7 +4716,7 @@ int __init cgroup_init_early(void) ss->name = cgroup_subsys_name[i]; if (ss->early_init) - cgroup_init_subsys(ss); + cgroup_init_subsys(ss, true); } return 0; } @@ -4204,7 +4735,6 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ @@ -4214,18 +4744,31 @@ int __init cgroup_init(void) BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); for_each_subsys(ss, ssid) { - if (!ss->early_init) - cgroup_init_subsys(ss); + if (ss->early_init) { + struct cgroup_subsys_state *css = + init_css_set.subsys[ss->id]; + + css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, + GFP_KERNEL); + BUG_ON(css->id < 0); + } else { + cgroup_init_subsys(ss, false); + } + + list_add_tail(&init_css_set.e_cset_node[ssid], + &cgrp_dfl_root.cgrp.e_csets[ssid]); /* - * cftype registration needs kmalloc and can't be done - * during early_init. Register base cftypes separately. + * Setting dfl_root subsys_mask needs to consider the + * disabled flag and cftype registration needs kmalloc, + * both of which aren't available during early_init. 
*/ - if (ss->base_cftypes) + if (!ss->disabled) { + cgrp_dfl_root.subsys_mask |= 1 << ss->id; WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); + } } cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); @@ -4308,7 +4851,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) - if (root->cgrp.subsys_mask & (1 << ssid)) + if (root->subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", @@ -4503,8 +5046,8 @@ void cgroup_exit(struct task_struct *tsk) static void check_for_release(struct cgroup *cgrp) { - if (cgroup_is_releasable(cgrp) && - list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { + if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && + !css_has_online_children(&cgrp->self)) { /* * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue @@ -4621,7 +5164,7 @@ static int __init cgroup_disable(char *str) __setup("cgroup_disable=", cgroup_disable); /** - * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * @@ -4629,8 +5172,8 @@ __setup("cgroup_disable=", cgroup_disable); * to get the corresponding css and return it. If such css doesn't exist * or can't be pinned, an ERR_PTR value is returned. */ -struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup_subsys_state *css = NULL; @@ -4646,13 +5189,13 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU - * protected for this access. See destroy_locked() for details. + * protected for this access. See cgroup_rmdir() for details. 
*/ cgrp = rcu_dereference(kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); - if (!css || !css_tryget(css)) + if (!css || !css_tryget_online(css)) css = ERR_PTR(-ENOENT); rcu_read_unlock(); @@ -4669,14 +5212,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { - struct cgroup *cgrp; - - cgroup_assert_mutexes_or_rcu_locked(); - - cgrp = idr_find(&ss->root->cgroup_idr, id); - if (cgrp) - return cgroup_css(cgrp, ss); - return NULL; + WARN_ON_ONCE(!rcu_read_lock_held()); + return idr_find(&ss->css_idr, id); } #ifdef CONFIG_CGROUP_DEBUG diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 345628c78b5b..a79e40f9d700 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -59,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task) static struct freezer *parent_freezer(struct freezer *freezer) { - return css_freezer(css_parent(&freezer->css)); + return css_freezer(freezer->css.parent); } bool cgroup_freezing(struct task_struct *task) @@ -73,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task) return ret; } -/* - * cgroups_write_string() limits the size of freezer state strings to - * CGROUP_LOCAL_BUFFER_SIZE - */ static const char *freezer_state_strs(unsigned int state) { if (state & CGROUP_FROZEN) @@ -304,7 +300,7 @@ static int freezer_read(struct seq_file *m, void *v) /* update states bottom-up */ css_for_each_descendant_post(pos, css) { - if (!css_tryget(pos)) + if (!css_tryget_online(pos)) continue; rcu_read_unlock(); @@ -404,7 +400,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); - if (!css_tryget(pos)) + if (!css_tryget_online(pos)) continue; rcu_read_unlock(); @@ -423,20 +419,22 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) mutex_unlock(&freezer_mutex); } -static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, - char *buffer) +static ssize_t freezer_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { bool freeze; - if (strcmp(buffer, freezer_state_strs(0)) == 0) + buf = strstrip(buf); + + if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) freeze = true; else return -EINVAL; - freezer_change_state(css_freezer(css), freeze); - return 0; + freezer_change_state(css_freezer(of_css(of)), freeze); + return nbytes; } static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, @@ -460,7 +458,7 @@ static struct cftype files[] = { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = freezer_read, - .write_string = freezer_write, + .write = freezer_write, }, { .name = "self_freezing", diff --git a/kernel/compat.c b/kernel/compat.c index e40b0430b562..633394f442f8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) - return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; + return copy_from_user(tv, utv, sizeof(*tv)) ? 
-EFAULT : 0; else return __compat_get_timeval(tv, utv); } @@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval); int compat_put_timeval(const struct timeval *tv, void __user *utv) { if (COMPAT_USE_64BIT_TIME) - return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; + return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; else return __compat_put_timeval(tv, utv); } @@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval); int compat_get_timespec(struct timespec *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else return __compat_get_timespec(ts, uts); } @@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec); int compat_put_timespec(const struct timespec *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; + return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else return __compat_put_timespec(ts, uts); } diff --git a/kernel/cpu.c b/kernel/cpu.c index 759feaaa9bb0..a343bde710b1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -284,8 +284,7 @@ static inline void check_for_tasks(int cpu) task_cputime(p, &utime, &stime); if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (utime || stime)) - printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " - "(state = %ld, flags = %x)\n", + pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, p->state, p->flags); } @@ -337,8 +336,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (err) { nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - printk("%s: attempt to take down CPU %u failed\n", - __func__, cpu); + pr_warn("%s: attempt to take down CPU %u failed\n", + __func__, cpu); goto out_release; } @@ -445,8 +444,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", - __func__, cpu); + pr_warn("%s: attempt to bring up CPU %u failed\n", + __func__, cpu); goto out_notify; } @@ -476,11 +475,10 @@ int cpu_up(unsigned int cpu) int err = 0; if (!cpu_possible(cpu)) { - printk(KERN_ERR "can't online cpu %d because it is not " - "configured as may-hotadd at boot time\n", cpu); + pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", + cpu); #if defined(CONFIG_IA64) - printk(KERN_ERR "please check additional_cpus= boot " - "parameter\n"); + pr_err("please check additional_cpus= boot parameter\n"); #endif return -EINVAL; } @@ -519,7 +517,7 @@ int disable_nonboot_cpus(void) */ cpumask_clear(frozen_cpus); - printk("Disabling non-boot CPUs ...\n"); + pr_info("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; @@ -529,8 +527,7 @@ int disable_nonboot_cpus(void) if (!error) cpumask_set_cpu(cpu, frozen_cpus); else { - printk(KERN_ERR "Error taking CPU%d down: %d\n", - cpu, error); + pr_err("Error taking CPU%d down: %d\n", cpu, error); break; } } @@ -540,7 +537,7 @@ int disable_nonboot_cpus(void) /* Make sure the CPUs won't be enabled by someone else */ cpu_hotplug_disabled = 1; } else { - printk(KERN_ERR "Non-boot CPUs are not disabled\n"); + pr_err("Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); return error; @@ -564,7 +561,7 @@ void __ref enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - printk(KERN_INFO "Enabling non-boot 
CPUs ...\n"); + pr_info("Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); @@ -573,10 +570,10 @@ void __ref enable_nonboot_cpus(void) error = _cpu_up(cpu, 1); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { - printk(KERN_INFO "CPU%d is up\n", cpu); + pr_info("CPU%d is up\n", cpu); continue; } - printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); + pr_warn("Error taking CPU%d up: %d\n", cpu, error); } arch_enable_nonboot_cpus_end(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d54c418bd06..f6b33c696224 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,12 +61,7 @@ #include <linux/cgroup.h> #include <linux/wait.h> -/* - * Tracks how many cpusets are currently defined in system. - * When there is only one cpuset (the root cpuset) we can - * short circuit some hooks. - */ -int number_of_cpusets __read_mostly; +struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; /* See "Frequency meter" comments, below. */ @@ -124,7 +119,7 @@ static inline struct cpuset *task_cs(struct task_struct *task) static inline struct cpuset *parent_cs(struct cpuset *cs) { - return css_cs(css_parent(&cs->css)); + return css_cs(cs->css.parent); } #ifdef CONFIG_NUMA @@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; } - csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); + csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; @@ -696,11 +691,8 @@ restart: if (nslot == ndoms) { static int warnings = 10; if (warnings) { - printk(KERN_WARNING - "rebuild_sched_domains confused:" - " nslot %d, ndoms %d, csn %d, i %d," - " apn %d\n", - nslot, ndoms, csn, i, apn); + pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", + nslot, ndoms, csn, i, apn); warnings--; } continue; @@ -875,7 +867,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) continue; } } - if (!css_tryget(&cp->css)) + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); @@ -890,6 +882,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) /** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider + * @trialcs: trial cpuset * @buf: buffer of cpu numbers written to this cpuset */ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, @@ -1110,7 +1103,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) continue; } } - if (!css_tryget(&cp->css)) + if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); @@ -1605,13 +1598,15 @@ out_unlock: /* * Common handling for a write to a "cpus" or "mems" file. 
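The number_of_cpusets removal in the cpuset hunks above moves the bookkeeping into cpusets_enabled_key. The helpers themselves live in include/linux/cpuset.h and are not part of this diff, so their exact definitions are an assumption here, but the visible calls (cpuset_inc()/cpuset_dec() from css_online/css_offline, nr_cpusets() in generate_sched_domains(), and init no longer seeding the count with 1) imply counting semantics like the toy below, which uses a plain atomic counter where the kernel uses a static_key jump label.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int cpusets_enabled_count;        /* stand-in for the static_key */

static bool cpusets_enabled(void)
{
        return atomic_load(&cpusets_enabled_count) > 0;
}

static int nr_cpusets(void)
{
        /* the root cpuset never bumps the key, hence the +1 */
        return atomic_load(&cpusets_enabled_count) + 1;
}

static void cpuset_inc(void) { atomic_fetch_add(&cpusets_enabled_count, 1); }
static void cpuset_dec(void) { atomic_fetch_sub(&cpusets_enabled_count, 1); }

int main(void)
{
        printf("enabled=%d nr=%d\n", cpusets_enabled(), nr_cpusets());
        cpuset_inc();           /* a non-root cpuset came online */
        printf("enabled=%d nr=%d\n", cpusets_enabled(), nr_cpusets());
        cpuset_dec();           /* ...and went away again */
        printf("enabled=%d nr=%d\n", cpusets_enabled(), nr_cpusets());
        return 0;
}

The point of the static_key in the real code is that the common "no cpusets configured" check becomes a patched branch rather than a memory load; the counting shown here is the same either way.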
*/ -static int cpuset_write_resmask(struct cgroup_subsys_state *css, - struct cftype *cft, char *buf) +static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) { - struct cpuset *cs = css_cs(css); + struct cpuset *cs = css_cs(of_css(of)); struct cpuset *trialcs; int retval = -ENODEV; + buf = strstrip(buf); + /* * CPU or memory hotunplug may leave @cs w/o any execution * resources, in which case the hotplug code asynchronously updates @@ -1635,7 +1630,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, goto out_unlock; } - switch (cft->private) { + switch (of_cft(of)->private) { case FILE_CPULIST: retval = update_cpumask(cs, trialcs, buf); break; @@ -1650,7 +1645,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); - return retval; + return retval ?: nbytes; } /* @@ -1752,7 +1747,7 @@ static struct cftype files[] = { { .name = "cpus", .seq_show = cpuset_common_seq_show, - .write_string = cpuset_write_resmask, + .write = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, }, @@ -1760,7 +1755,7 @@ static struct cftype files[] = { { .name = "mems", .seq_show = cpuset_common_seq_show, - .write_string = cpuset_write_resmask, + .write = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, }, @@ -1888,7 +1883,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); - number_of_cpusets++; + cpuset_inc(); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -1939,7 +1934,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - number_of_cpusets--; + cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); @@ -1992,7 +1987,6 @@ int __init cpuset_init(void) if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) BUG(); - number_of_cpusets = 1; return 0; } @@ -2017,7 +2011,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) parent = parent_cs(parent); if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); + pr_err("cpuset: failed to transfer tasks out of empty cpuset "); pr_cont_cgroup_name(cs->css.cgroup); pr_cont("\n"); } @@ -2155,7 +2149,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (cs == &top_cpuset || !css_tryget(&cs->css)) + if (cs == &top_cpuset || !css_tryget_online(&cs->css)) continue; rcu_read_unlock(); @@ -2536,7 +2530,7 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, /** * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed - * @task: pointer to task_struct of some task. + * @tsk: pointer to task_struct of some task. * * Description: Prints @task's name, cpuset name, and cached copy of its * mems_allowed to the kernel log. 
@@ -2554,7 +2548,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) cgrp = task_cs(tsk)->css.cgroup; nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); - printk(KERN_INFO "%s cpuset=", tsk->comm); + pr_info("%s cpuset=", tsk->comm); pr_cont_cgroup_name(cgrp); pr_cont(" mems_allowed=%s\n", cpuset_nodelist); @@ -2646,10 +2640,10 @@ out: /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { - seq_printf(m, "Mems_allowed:\t"); + seq_puts(m, "Mems_allowed:\t"); seq_nodemask(m, &task->mems_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Mems_allowed_list:\t"); + seq_puts(m, "\n"); + seq_puts(m, "Mems_allowed_list:\t"); seq_nodemask_list(m, &task->mems_allowed); - seq_printf(m, "\n"); + seq_puts(m, "\n"); } diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index b03e0e814e43..fe15fff5df53 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -21,7 +21,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { int old_lvl = console_loglevel; - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; kdb_set_current_task(p); if (addr) { diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 14ff4849262c..7c70812caea5 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -710,7 +710,7 @@ kdb_printit: } if (logging) { saved_loglevel = console_loglevel; - console_loglevel = 0; + console_loglevel = CONSOLE_LOGLEVEL_SILENT; printk(KERN_INFO "%s", kdb_buffer); } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0b097c8a1e50..2f7c760305ca 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv) static void kdb_dumpregs(struct pt_regs *regs) { int old_lvl = console_loglevel; - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; show_regs(regs); kdb_trap_printk--; diff --git a/kernel/events/core.c b/kernel/events/core.c index 689237a0c5e8..24d35cc38e42 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -608,7 +608,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); + css = css_tryget_online_from_dir(f.file->f_dentry, + &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d1edc5e6fd03..adcd76a96839 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1291,14 +1291,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - /* Initialize the slot */ - copy_to_page(area->page, xol_vaddr, - &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); - /* - * We probably need flush_icache_user_range() but it needs vma. - * This should work on supported architectures too. 
- */ - flush_dcache_page(area->page); + arch_uprobe_copy_ixol(area->page, xol_vaddr, + &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; } @@ -1341,6 +1335,21 @@ static void xol_free_insn_slot(struct task_struct *tsk) } } +void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len) +{ + /* Initialize the slot */ + copy_to_page(page, vaddr, src, len); + + /* + * We probably need flush_icache_user_range() but it needs vma. + * This should work on most of architectures by default. If + * architecture needs to do something different it can define + * its own version of the function. + */ + flush_dcache_page(page); +} + /** * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs * @regs: Reflects the saved state of the task after it has hit a breakpoint diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0dbeae374225..83d4382f5699 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -37,7 +37,7 @@ static unsigned long ident_map[32] = { struct exec_domain default_exec_domain = { .name = "Linux", /* name */ .handler = default_handler, /* lcall7 causes a seg fault. */ - .pers_low = 0, /* PER_LINUX personality. */ + .pers_low = 0, /* PER_LINUX personality. */ .pers_high = 0, /* PER_LINUX personality. */ .signal_map = ident_map, /* Identity map signals. */ .signal_invmap = ident_map, /* - both ways. */ @@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality) ep = &default_exec_domain; out: read_unlock(&exec_domains_lock); - return (ep); + return ep; } int @@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep) out: write_unlock(&exec_domains_lock); - return (err); + return err; } +EXPORT_SYMBOL(register_exec_domain); int unregister_exec_domain(struct exec_domain *ep) @@ -133,6 +134,7 @@ unregister: write_unlock(&exec_domains_lock); return 0; } +EXPORT_SYMBOL(unregister_exec_domain); int __set_personality(unsigned int personality) { @@ -144,6 +146,7 @@ int __set_personality(unsigned int personality) return 0; } +EXPORT_SYMBOL(__set_personality); #ifdef CONFIG_PROC_FS static int execdomains_proc_show(struct seq_file *m, void *v) @@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) return old; } - - -EXPORT_SYMBOL(register_exec_domain); -EXPORT_SYMBOL(unregister_exec_domain); -EXPORT_SYMBOL(__set_personality); diff --git a/kernel/exit.c b/kernel/exit.c index 6ed6a1d552b5..e5c4668f1799 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -313,46 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -/* - * Let kernel threads use this to say that they allow a certain signal. - * Must not be used if kthread was cloned with CLONE_SIGHAND. - */ -int allow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - /* This is only needed for daemonize()'ed kthreads */ - sigdelset(¤t->blocked, sig); - /* - * Kernel threads handle their own signals. Let the signal code - * know it'll be handled, so that they don't get converted to - * SIGKILL or just silently dropped. 
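The uprobes change above moves the slot copy into a __weak arch_uprobe_copy_ixol() so an architecture can supply its own cache-maintenance variant. The standalone program below only illustrates that weak-symbol mechanism (the names here are invented, not kernel API): the weak definition is used unless a strong one is linked in.

#include <stdio.h>
#include <string.h>

/* Generic fallback, marked weak like the kernel's arch_uprobe_copy_ixol(). */
__attribute__((weak))
void arch_copy_ixol(void *dst, const void *src, unsigned long len)
{
	memcpy(dst, src, len);
	printf("generic copy, %lu bytes (no extra cache maintenance)\n", len);
}

/* An "architecture" that needs special handling would link a strong version:
 *
 * void arch_copy_ixol(void *dst, const void *src, unsigned long len)
 * {
 *         memcpy(dst, src, len);
 *         printf("arch-specific copy with icache flush\n");
 * }
 */

int main(void)
{
	char slot[16];

	arch_copy_ixol(slot, "\xcc", 1);	/* whichever definition was linked wins */
	return 0;
}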
- */ - current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(allow_signal); - -int disallow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(disallow_signal); - -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ @@ -395,14 +356,18 @@ retry: } /* - * Search through everything else. We should not get - * here often + * Search through everything else, we should not get here often. */ - do_each_thread(g, c) { - if (c->mm == mm) - goto assign_new_owner; - } while_each_thread(g, c); - + for_each_process(g) { + if (g->flags & PF_KTHREAD) + continue; + for_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + if (c->mm) + break; + } + } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are @@ -434,7 +399,7 @@ assign_new_owner: task_unlock(c); put_task_struct(c); } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we diff --git a/kernel/fork.c b/kernel/fork.c index 54a8d26f612f..d2799d1fc952 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, - THREAD_SIZE_ORDER); + struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; @@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { mm->owner = p; } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Initialize POSIX timer handling for a single task. @@ -1606,10 +1606,12 @@ long do_fork(unsigned long clone_flags, */ if (!IS_ERR(p)) { struct completion vfork; + struct pid *pid; trace_sched_process_fork(current, p); - nr = task_pid_vnr(p); + pid = get_task_pid(p, PIDTYPE_PID); + nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); @@ -1624,12 +1626,14 @@ long do_fork(unsigned long clone_flags, /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) - ptrace_event(trace, nr); + ptrace_event_pid(trace, pid); if (clone_flags & CLONE_VFORK) { if (!wait_for_vfork_done(p, &vfork)) - ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); + ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } + + put_pid(pid); } else { nr = PTR_ERR(p); } diff --git a/kernel/futex.c b/kernel/futex.c index 89bc9d59ac65..b632b5f3f094 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -743,10 +743,58 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); } +/* + * We need to check the following states: + * + * Waiter | pi_state | pi->owner | uTID | uODIED | ? 
+ * + * [1] NULL | --- | --- | 0 | 0/1 | Valid + * [2] NULL | --- | --- | >0 | 0/1 | Valid + * + * [3] Found | NULL | -- | Any | 0/1 | Invalid + * + * [4] Found | Found | NULL | 0 | 1 | Valid + * [5] Found | Found | NULL | >0 | 1 | Invalid + * + * [6] Found | Found | task | 0 | 1 | Valid + * + * [7] Found | Found | NULL | Any | 0 | Invalid + * + * [8] Found | Found | task | ==taskTID | 0/1 | Valid + * [9] Found | Found | task | 0 | 0 | Invalid + * [10] Found | Found | task | !=taskTID | 0/1 | Invalid + * + * [1] Indicates that the kernel can acquire the futex atomically. We + * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * + * [2] Valid, if TID does not belong to a kernel thread. If no matching + * thread is found then it indicates that the owner TID has died. + * + * [3] Invalid. The waiter is queued on a non PI futex + * + * [4] Valid state after exit_robust_list(), which sets the user space + * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. + * + * [5] The user space value got manipulated between exit_robust_list() + * and exit_pi_state_list() + * + * [6] Valid state after exit_pi_state_list() which sets the new owner in + * the pi_state but cannot access the user space value. + * + * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. + * + * [8] Owner and user space value match + * + * [9] There is no transient state which sets the user space TID to 0 + * except exit_robust_list(), but this is indicated by the + * FUTEX_OWNER_DIED bit. See [4] + * + * [10] There is no transient state which leaves owner and user space + * TID out of sync. + */ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps, - struct task_struct *task) + union futex_key *key, struct futex_pi_state **ps) { struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; @@ -756,12 +804,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex(&this->key, key)) { /* - * Another waiter already exists - bump up - * the refcount and return its pi_state: + * Sanity check the waiter before increasing + * the refcount and attaching to it. */ pi_state = this->pi_state; /* - * Userspace might have messed up non-PI and PI futexes + * Userspace might have messed up non-PI and + * PI futexes [3] */ if (unlikely(!pi_state)) return -EINVAL; @@ -769,44 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, WARN_ON(!atomic_read(&pi_state->refcount)); /* - * When pi_state->owner is NULL then the owner died - * and another waiter is on the fly. pi_state->owner - * is fixed up by the task which acquires - * pi_state->rt_mutex. - * - * We do not check for pid == 0 which can happen when - * the owner died and robust_list_exit() cleared the - * TID. + * Handle the owner died case: */ - if (pid && pi_state->owner) { + if (uval & FUTEX_OWNER_DIED) { /* - * Bail out if user space manipulated the - * futex value. + * exit_pi_state_list sets owner to NULL and + * wakes the topmost waiter. The task which + * acquires the pi_state->rt_mutex will fixup + * owner. */ - if (pid != task_pid_vnr(pi_state->owner)) + if (!pi_state->owner) { + /* + * No pi state owner, but the user + * space TID is not 0. Inconsistent + * state. [5] + */ + if (pid) + return -EINVAL; + /* + * Take a ref on the state and + * return. 
[4] + */ + goto out_state; + } + + /* + * If TID is 0, then either the dying owner + * has not yet executed exit_pi_state_list() + * or some waiter acquired the rtmutex in the + * pi state, but did not yet fixup the TID in + * user space. + * + * Take a ref on the state and return. [6] + */ + if (!pid) + goto out_state; + } else { + /* + * If the owner died bit is not set, + * then the pi_state must have an + * owner. [7] + */ + if (!pi_state->owner) return -EINVAL; } /* - * Protect against a corrupted uval. If uval - * is 0x80000000 then pid is 0 and the waiter - * bit is set. So the deadlock check in the - * calling code has failed and we did not fall - * into the check above due to !pid. + * Bail out if user space manipulated the + * futex value. If pi state exists then the + * owner TID must be the same as the user + * space TID. [9/10] */ - if (task && pi_state->owner == task) - return -EDEADLK; + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; + out_state: atomic_inc(&pi_state->refcount); *ps = pi_state; - return 0; } } /* * We are the first waiter - try to look up the real owner and attach - * the new pi_state to it, but bail out when TID = 0 + * the new pi_state to it, but bail out when TID = 0 [1] */ if (!pid) return -ESRCH; @@ -839,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return ret; } + /* + * No existing pi state. First waiter. [2] + */ pi_state = alloc_pi_state(); /* @@ -910,10 +988,18 @@ retry: return -EDEADLK; /* - * Surprise - we got the lock. Just return to userspace: + * Surprise - we got the lock, but we do not trust user space at all. */ - if (unlikely(!curval)) - return 1; + if (unlikely(!curval)) { + /* + * We verify whether there is kernel state for this + * futex. If not, we can safely assume, that the 0 -> + * TID transition is correct. If state exists, we do + * not bother to fixup the user space state as it was + * corrupted already. + */ + return futex_top_waiter(hb, key) ? -EINVAL : 1; + } uval = curval; @@ -951,7 +1037,7 @@ retry: * We dont have the lock. Look up the PI state (or create it if * we are the first waiter): */ - ret = lookup_pi_state(uval, hb, key, ps, task); + ret = lookup_pi_state(uval, hb, key, ps); if (unlikely(ret)) { switch (ret) { @@ -1044,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; + int ret = 0; if (!pi_state) return -EINVAL; @@ -1067,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) new_owner = this->task; /* - * We pass it to the next owner. (The WAITERS bit is always - * kept enabled while there is PI state around. We must also - * preserve the owner died bit.) + * We pass it to the next owner. The WAITERS bit is always + * kept enabled while there is PI state around. We cleanup the + * owner died bit, because we are the owner. 
*/ - if (!(uval & FUTEX_OWNER_DIED)) { - int ret = 0; - - newval = FUTEX_WAITERS | task_pid_vnr(new_owner); + newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) - ret = -EFAULT; - else if (curval != uval) - ret = -EINVAL; - if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - return ret; - } + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) + ret = -EFAULT; + else if (curval != uval) + ret = -EINVAL; + if (ret) { + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + return ret; } raw_spin_lock_irq(&pi_state->owner->pi_lock); @@ -1442,6 +1525,13 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, if (requeue_pi) { /* + * Requeue PI only works on two distinct uaddrs. This + * check is only valid for private futexes. See below. + */ + if (uaddr1 == uaddr2) + return -EINVAL; + + /* * requeue_pi requires a pi_state, try to allocate it now * without any locks in case it fails. */ @@ -1479,6 +1569,15 @@ retry: if (unlikely(ret != 0)) goto out_put_key1; + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ + if (requeue_pi && match_futex(&key1, &key2)) { + ret = -EINVAL; + goto out_put_keys; + } + hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -1544,7 +1643,7 @@ retry_private: * rereading and handing potential crap to * lookup_pi_state. */ - ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL); + ret = lookup_pi_state(ret, hb2, &key2, &pi_state); } switch (ret) { @@ -2327,9 +2426,10 @@ retry: /* * To avoid races, try to do the TID -> 0 atomic transition * again. If it succeeds then we can return without waking - * anyone else up: + * anyone else up. We only try this if neither the waiters nor + * the owner died bit are set. */ - if (!(uval & FUTEX_OWNER_DIED) && + if (!(uval & ~FUTEX_TID_MASK) && cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) goto pi_faulted; /* @@ -2359,11 +2459,9 @@ retry: /* * No waiters - kernel unlocks the futex: */ - if (!(uval & FUTEX_OWNER_DIED)) { - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; - } + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; out_unlock: spin_unlock(&hb->lock); @@ -2525,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (ret) goto out_key2; + /* + * The check above which compares uaddrs is not sufficient for + * shared futexes. We need to compare the keys: + */ + if (match_futex(&q.key, &key2)) { + ret = -EINVAL; + goto out_put_keys; + } + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index f45b75b713c0..b358a802fd18 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_ior); +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. 
*/ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); + /** * gcov_enable_events - enable event reporting through gcov_event() * diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 2c6e4631c814..826ba9fb5e32 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,12 @@ #include <linux/vmalloc.h> #include "gcov.h" +#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9 +#define GCOV_COUNTERS 9 +#else #define GCOV_COUNTERS 8 +#endif + #define GCOV_TAG_FUNCTION_LENGTH 3 static struct gcov_info *gcov_info_head; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06bb1417b063..06db12434d72 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic = static int __init hung_task_panic_setup(char *str) { - sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); + int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); + if (rc) + return rc; return 1; } __setup("hung_task_panic=", hung_task_panic_setup); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 07cbdfea9ae2..d269cecdfbf0 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -5,6 +5,10 @@ menu "IRQ subsystem" config MAY_HAVE_SPARSE_IRQ bool +# Legacy support, required for itanic +config GENERIC_IRQ_LEGACY + bool + # Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool @@ -17,6 +21,11 @@ config GENERIC_IRQ_SHOW config GENERIC_IRQ_SHOW_LEVEL bool +# Facility to allocate a hardware interrupt. This is legacy support +# and should not be used in new code. Use irq domains instead. +config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + bool + # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6397df2d6945..a2b28a2fd7b1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -40,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) irq_put_desc_unlock(desc, flags); /* * For !CONFIG_SPARSE_IRQ make the irq show up in - * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is - * already marked, and this call is harmless. + * allocated_irqs. 
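The kernel/hung_task.c hunk above swaps simple_strtoul(), which silently ignores trailing junk, for kstrtouint(), which rejects it and reports overflow. A rough userspace analogue of that stricter parsing, assuming only standard C (the function name is made up):

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Returns 0 on success, negative errno otherwise, roughly like kstrtouint(). */
static int parse_uint_strict(const char *s, unsigned int base, unsigned int *res)
{
	char *end;
	unsigned long val;

	errno = 0;
	val = strtoul(s, &end, base);
	if (end == s || *end != '\0')
		return -EINVAL;		/* empty string or trailing garbage */
	if (errno == ERANGE || val > UINT_MAX)
		return -ERANGE;
	*res = (unsigned int)val;
	return 0;
}

int main(void)
{
	unsigned int v;

	printf("\"1\"  -> %d\n", parse_uint_strict("1", 0, &v));	/* 0, v == 1 */
	printf("\"1x\" -> %d\n", parse_uint_strict("1x", 0, &v));	/* -EINVAL */
	return 0;
}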
*/ - irq_reserve_irq(irq); + irq_mark_irq(irq); return 0; } EXPORT_SYMBOL(irq_set_chip); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ddf1ffeb79f1..099ea2e0eb88 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -33,7 +33,7 @@ enum { }; /* - * Bit masks for desc->state + * Bit masks for desc->core_internal_state__do_not_mess_with_it * * IRQS_AUTODETECT - autodetection in progress * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt @@ -76,6 +76,12 @@ extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); +#ifdef CONFIG_SPARSE_IRQ +static inline void irq_mark_irq(unsigned int irq) { } +#else +extern void irq_mark_irq(unsigned int irq); +#endif + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index bb07f2928f4b..7339e42a85ab 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -278,7 +278,12 @@ EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { - dynamic_irq_cleanup(irq); + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc_set_defaults(irq, desc, desc_node(desc), NULL); + raw_spin_unlock_irqrestore(&desc->lock, flags); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, @@ -299,6 +304,20 @@ static int irq_expand_nr_irqs(unsigned int nr) return -ENOMEM; } +void irq_mark_irq(unsigned int irq) +{ + mutex_lock(&sparse_irq_lock); + bitmap_set(allocated_irqs, irq, 1); + mutex_unlock(&sparse_irq_lock); +} + +#ifdef CONFIG_GENERIC_IRQ_LEGACY +void irq_init_desc(unsigned int irq) +{ + free_desc(irq); +} +#endif + #endif /* !CONFIG_SPARSE_IRQ */ /** @@ -396,30 +415,56 @@ err: } EXPORT_SYMBOL_GPL(__irq_alloc_descs); +#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ /** - * irq_reserve_irqs - mark irqs allocated - * @from: mark from irq number - * @cnt: number of irqs to mark + * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware + * @cnt: number of interrupts to allocate + * @node: node on which to allocate * - * Returns 0 on success or an appropriate error code + * Returns an interrupt number > 0 or 0, if the allocation fails. 
*/ -int irq_reserve_irqs(unsigned int from, unsigned int cnt) +unsigned int irq_alloc_hwirqs(int cnt, int node) { - unsigned int start; - int ret = 0; + int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); - if (!cnt || (from + cnt) > nr_irqs) - return -EINVAL; + if (irq < 0) + return 0; - mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); - if (start == from) - bitmap_set(allocated_irqs, start, cnt); - else - ret = -EEXIST; - mutex_unlock(&sparse_irq_lock); - return ret; + for (i = irq; cnt > 0; i++, cnt--) { + if (arch_setup_hwirq(i, node)) + goto err; + irq_clear_status_flags(i, _IRQ_NOREQUEST); + } + return irq; + +err: + for (i--; i >= irq; i--) { + irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); + arch_teardown_hwirq(i); + } + irq_free_descs(irq, cnt); + return 0; +} +EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); + +/** + * irq_free_hwirqs - Free irq descriptor and cleanup the hardware + * @from: Free from irq number + * @cnt: number of interrupts to free + * + */ +void irq_free_hwirqs(unsigned int from, int cnt) +{ + int i; + + for (i = from; cnt > 0; i++, cnt--) { + irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); + arch_teardown_hwirq(i); + } + irq_free_descs(from, cnt); } +EXPORT_SYMBOL_GPL(irq_free_hwirqs); +#endif /** * irq_get_next_irq - get next allocated irq number @@ -482,20 +527,6 @@ int irq_set_percpu_devid(unsigned int irq) return 0; } -/** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_cleanup(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, desc_node(desc), NULL); - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f14033700c25..eb5e10e32e05 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -27,14 +27,14 @@ static struct irq_domain *irq_default_domain; * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller * @size: Size of linear map; 0 for radix mapping only + * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no * direct mapping * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * - * Allocates and initialize and irq_domain structure. Caller is expected to - * register allocated irq_domain with irq_domain_register(). Returns pointer - * to IRQ domain, or NULL on failure. + * Allocates and initialize and irq_domain structure. + * Returns pointer to IRQ domain, or NULL on failure. 
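irq_alloc_hwirqs() above follows the usual allocate-then-unwind pattern: set up descriptors one by one and, on the first arch_setup_hwirq() failure, tear down the ones already initialized in reverse before freeing the whole range. A generic, self-contained sketch of that pattern (setup_one()/teardown_one() are stand-ins for the arch hooks, not kernel functions):

#include <stdio.h>

static int setup_one(int i)
{
	return (i == 3) ? -1 : 0;	/* pretend the 4th item fails */
}

static void teardown_one(int i)
{
	printf("teardown %d\n", i);
}

/* Set up items [first, first + cnt); on failure unwind what was done. */
static int setup_range(int first, int cnt)
{
	int i;

	for (i = first; i < first + cnt; i++) {
		if (setup_one(i))
			goto err;
	}
	return 0;

err:
	for (i--; i >= first; i--)	/* walk back over the successful ones only */
		teardown_one(i);
	return -1;
}

int main(void)
{
	return setup_range(0, 5) ? 1 : 0;
}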
*/ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, irq_hw_number_t hwirq_max, int direct_max, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d34131ca372b..3dc6a61bf06a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -886,8 +886,8 @@ static int irq_thread(void *data) irq_thread_check_affinity(desc, action); action_ret = handler_fn(desc, action); - if (!noirqdebug) - note_interrupt(action->irq, desc, action_ret); + if (action_ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); wake_threads_waitq(desc); } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index a1d8cc63b56e..e2514b0e439e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -270,6 +270,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, return action && (action->flags & IRQF_IRQPOLL); } +#define SPURIOUS_DEFERRED 0x80000000 + void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { @@ -277,15 +279,111 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, irq_settings_is_polled(desc)) return; - /* we get here again via the threaded handler */ - if (action_ret == IRQ_WAKE_THREAD) - return; - if (bad_action_ret(action_ret)) { report_bad_irq(irq, desc, action_ret); return; } + /* + * We cannot call note_interrupt from the threaded handler + * because we need to look at the compound of all handlers + * (primary and threaded). Aside of that in the threaded + * shared case we have no serialization against an incoming + * hardware interrupt while we are dealing with a threaded + * result. + * + * So in case a thread is woken, we just note the fact and + * defer the analysis to the next hardware interrupt. + * + * The threaded handlers store whether they sucessfully + * handled an interrupt and we check whether that number + * changed versus the last invocation. + * + * We could handle all interrupts with the delayed by one + * mechanism, but for the non forced threaded case we'd just + * add pointless overhead to the straight hardirq interrupts + * for the sake of a few lines less code. + */ + if (action_ret & IRQ_WAKE_THREAD) { + /* + * There is a thread woken. Check whether one of the + * shared primary handlers returned IRQ_HANDLED. If + * not we defer the spurious detection to the next + * interrupt. + */ + if (action_ret == IRQ_WAKE_THREAD) { + int handled; + /* + * We use bit 31 of thread_handled_last to + * denote the deferred spurious detection + * active. No locking necessary as + * thread_handled_last is only accessed here + * and we have the guarantee that hard + * interrupts are not reentrant. + */ + if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) { + desc->threads_handled_last |= SPURIOUS_DEFERRED; + return; + } + /* + * Check whether one of the threaded handlers + * returned IRQ_HANDLED since the last + * interrupt happened. + * + * For simplicity we just set bit 31, as it is + * set in threads_handled_last as well. So we + * avoid extra masking. And we really do not + * care about the high bits of the handled + * count. We just care about the count being + * different than the one we saw before. + */ + handled = atomic_read(&desc->threads_handled); + handled |= SPURIOUS_DEFERRED; + if (handled != desc->threads_handled_last) { + action_ret = IRQ_HANDLED; + /* + * Note: We keep the SPURIOUS_DEFERRED + * bit set. We are handling the + * previous invocation right now. + * Keep it for the current one, so the + * next hardware interrupt will + * account for it. 
+ */ + desc->threads_handled_last = handled; + } else { + /* + * None of the threaded handlers felt + * responsible for the last interrupt + * + * We keep the SPURIOUS_DEFERRED bit + * set in threads_handled_last as we + * need to account for the current + * interrupt as well. + */ + action_ret = IRQ_NONE; + } + } else { + /* + * One of the primary handlers returned + * IRQ_HANDLED. So we don't care about the + * threaded handlers on the same line. Clear + * the deferred detection bit. + * + * In theory we could/should check whether the + * deferred bit is set and take the result of + * the previous run into account here as + * well. But it's really not worth the + * trouble. If every other interrupt is + * handled we never trigger the spurious + * detector. And if this is just the one out + * of 100k unhandled ones which is handled + * then we merily delay the spurious detection + * by one hard interrupt. Not a real problem. + */ + desc->threads_handled_last &= ~SPURIOUS_DEFERRED; + } + } + if (unlikely(action_ret == IRQ_NONE)) { /* * If we are seeing only the odd spurious IRQ caused by diff --git a/kernel/kexec.c b/kernel/kexec.c index 28c57069ef68..6748688813d0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -125,8 +125,8 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned long dest); static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) + unsigned long nr_segments, + struct kexec_segment __user *segments) { size_t segment_bytes; struct kimage *image; @@ -257,13 +257,13 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); + pr_err("Could not allocate control_code_buffer\n"); goto out_free; } image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { - printk(KERN_ERR "Could not allocate swap buffer\n"); + pr_err("Could not allocate swap buffer\n"); goto out_free; } @@ -332,7 +332,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { - printk(KERN_ERR "Could not allocate control_code_buffer\n"); + pr_err("Could not allocate control_code_buffer\n"); goto out_free; } @@ -621,8 +621,8 @@ static void kimage_terminate(struct kimage *image) #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ - ptr = (entry & IND_INDIRECTION)? \ - phys_to_virt((entry & PAGE_MASK)): ptr +1) + ptr = (entry & IND_INDIRECTION) ? \ + phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { @@ -650,8 +650,7 @@ static void kimage_free(struct kimage *image) * done with it. */ ind = entry; - } - else if (entry & IND_SOURCE) + } else if (entry & IND_SOURCE) kimage_free_entry(entry); } /* Free the final indirection page */ @@ -774,8 +773,7 @@ static struct page *kimage_alloc_page(struct kimage *image, addr = old_addr; page = old_page; break; - } - else { + } else { /* Place the page on the destination list I * will use it later. 
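The note_interrupt() rework just above defers spurious accounting for threaded handlers: the hard-irq path snapshots the threads_handled counter, marks bit 31 of the saved value as "deferral active", and on the next interrupt counts the line as handled only if the counter moved. The toy below models just that snapshot-compare (the field names mirror the kernel's, but the harness is invented and there is no concurrency here):

#include <stdio.h>

#define SPURIOUS_DEFERRED 0x80000000u

static unsigned int threads_handled;		/* bumped by the "threaded handler" */
static unsigned int threads_handled_last;	/* snapshot + deferral flag */

/* Returns 1 if the previous wakeup counts as handled, 0 if unhandled,
 * -1 if the decision is deferred to the next interrupt. */
static int check_deferred(void)
{
	unsigned int handled;

	if (!(threads_handled_last & SPURIOUS_DEFERRED)) {
		threads_handled_last |= SPURIOUS_DEFERRED;
		return -1;			/* first wakeup: just arm the deferral */
	}

	handled = threads_handled | SPURIOUS_DEFERRED;
	if (handled != threads_handled_last) {
		threads_handled_last = handled;	/* counter moved: someone handled it */
		return 1;
	}
	return 0;				/* nothing handled since last time */
}

int main(void)
{
	printf("%d\n", check_deferred());	/* -1: deferred */
	threads_handled++;			/* threaded handler returned IRQ_HANDLED */
	printf("%d\n", check_deferred());	/* 1: counter changed */
	printf("%d\n", check_deferred());	/* 0: unchanged -> looks spurious */
	return 0;
}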
*/ @@ -1059,7 +1057,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, return -EINVAL; ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); - for (i=0; i < nr_segments; i++) { + for (i = 0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); if (result) return -EFAULT; @@ -1214,14 +1212,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) * squirrelled away. ELF notes happen to provide * all of that, so there is no need to invent something new. */ - buf = (u32*)per_cpu_ptr(crash_notes, cpu); + buf = (u32 *)per_cpu_ptr(crash_notes, cpu); if (!buf) return; memset(&prstatus, 0, sizeof(prstatus)); prstatus.pr_pid = current->pid; elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, - &prstatus, sizeof(prstatus)); + &prstatus, sizeof(prstatus)); final_note(buf); } @@ -1230,8 +1228,7 @@ static int __init crash_notes_memory_init(void) /* Allocate memory for saving cpu registers. */ crash_notes = alloc_percpu(note_buf_t); if (!crash_notes) { - printk("Kexec: Memory allocation for saving cpu register" - " states failed\n"); + pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); return -ENOMEM; } return 0; @@ -1253,10 +1250,10 @@ subsys_initcall(crash_notes_memory_init); * * The function returns 0 on success and -EINVAL on failure. */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) { char *cur = cmdline, *tmp; @@ -1267,12 +1264,12 @@ static int __init parse_crashkernel_mem(char *cmdline, /* get the start of the range */ start = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("crashkernel: Memory value expected\n"); + pr_warn("crashkernel: Memory value expected\n"); return -EINVAL; } cur = tmp; if (*cur != '-') { - pr_warning("crashkernel: '-' expected\n"); + pr_warn("crashkernel: '-' expected\n"); return -EINVAL; } cur++; @@ -1281,31 +1278,30 @@ static int __init parse_crashkernel_mem(char *cmdline, if (*cur != ':') { end = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("crashkernel: Memory " - "value expected\n"); + pr_warn("crashkernel: Memory value expected\n"); return -EINVAL; } cur = tmp; if (end <= start) { - pr_warning("crashkernel: end <= start\n"); + pr_warn("crashkernel: end <= start\n"); return -EINVAL; } } if (*cur != ':') { - pr_warning("crashkernel: ':' expected\n"); + pr_warn("crashkernel: ':' expected\n"); return -EINVAL; } cur++; size = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("Memory value expected\n"); + pr_warn("Memory value expected\n"); return -EINVAL; } cur = tmp; if (size >= system_ram) { - pr_warning("crashkernel: invalid size\n"); + pr_warn("crashkernel: invalid size\n"); return -EINVAL; } @@ -1323,8 +1319,7 @@ static int __init parse_crashkernel_mem(char *cmdline, cur++; *crash_base = memparse(cur, &tmp); if (cur == tmp) { - pr_warning("Memory value expected " - "after '@'\n"); + pr_warn("Memory value expected after '@'\n"); return -EINVAL; } } @@ -1336,26 +1331,26 @@ static int __init parse_crashkernel_mem(char *cmdline, /* * That function parses "simple" (old) crashkernel command lines like * - * crashkernel=size[@offset] + * crashkernel=size[@offset] * * It returns 0 on success and -EINVAL on failure. 
*/ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) { char *cur = cmdline; *crash_size = memparse(cmdline, &cur); if (cmdline == cur) { - pr_warning("crashkernel: memory value expected\n"); + pr_warn("crashkernel: memory value expected\n"); return -EINVAL; } if (*cur == '@') *crash_base = memparse(cur+1, &cur); else if (*cur != ' ' && *cur != '\0') { - pr_warning("crashkernel: unrecognized char\n"); + pr_warn("crashkernel: unrecognized char\n"); return -EINVAL; } @@ -1691,7 +1686,7 @@ int kernel_kexec(void) * CPU hotplug again; so re-enable it here. */ cpu_hotplug_enable(); - printk(KERN_EMERG "Starting new kernel\n"); + pr_emerg("Starting new kernel\n"); machine_shutdown(); } diff --git a/kernel/kmod.c b/kernel/kmod.c index 0ac67a5861c5..8637e041a247 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -285,10 +285,7 @@ static int wait_for_helper(void *data) pid_t pid; /* If SIGCLD is ignored sys_wait4 won't populate the status. */ - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; - spin_unlock_irq(¤t->sighand->siglock); - + kernel_sigaction(SIGCHLD, SIG_DFL); pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); if (pid < 0) { sub_info->retval = pid; diff --git a/kernel/kthread.c b/kernel/kthread.c index 9a130ec06f7a..c2390f41307b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create) * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * - * Returns a task_struct or ERR_PTR(-ENOMEM). + * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, @@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), * that thread. */ if (xchg(&create->done, NULL)) - return ERR_PTR(-ENOMEM); + return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. 
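parse_crashkernel_simple() above pulls "crashkernel=size[@offset]" apart with memparse(), which reads a number with an optional K/M/G suffix and reports where parsing stopped. A minimal userspace stand-in for that helper and its use, assuming only standard C (memparse_ish and the sample argument are made up):

#include <stdio.h>
#include <stdlib.h>

/* Rough memparse(): number plus optional K/M/G suffix; *retptr tells the
 * caller where parsing stopped, so cur == start means "no number here". */
static unsigned long long memparse_ish(const char *ptr, char **retptr)
{
	char *end;
	unsigned long long ret = strtoull(ptr, &end, 0);

	switch (*end) {
	case 'G': case 'g': ret <<= 10;	/* fall through */
	case 'M': case 'm': ret <<= 10;	/* fall through */
	case 'K': case 'k': ret <<= 10; end++;
	default: break;
	}
	if (retptr)
		*retptr = end;
	return ret;
}

int main(void)
{
	const char *arg = "256M@16M";	/* hypothetical crashkernel= value */
	char *cur;
	unsigned long long size, base = 0;

	size = memparse_ish(arg, &cur);
	if (cur == arg)
		return 1;		/* "memory value expected" */
	if (*cur == '@')
		base = memparse_ish(cur + 1, &cur);
	printf("size=%llu base=%llu\n", size, base);
	return 0;
}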
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a462b317f9a0..a02812743a7e 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void) } static void __sched -account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) +account_global_scheduler_latency(struct task_struct *tsk, + struct latency_record *lat) { int firstnonnull = MAXLR + 1; int i; @@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v) break; seq_printf(m, " %ps", (void *)bt); } - seq_printf(m, "\n"); + seq_puts(m, "\n"); } } return 0; diff --git a/kernel/module.c b/kernel/module.c index 079c4615607d..81e727cf6df9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3020,21 +3020,6 @@ static int do_init_module(struct module *mod) */ current->flags &= ~PF_USED_ASYNC; - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); - - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); - - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3165,9 +3150,26 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ mod->state = MODULE_STATE_COMING; + mutex_unlock(&module_mutex); + + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + return 0; out: mutex_unlock(&module_mutex); @@ -3190,6 +3192,7 @@ static int load_module(struct load_info *info, const char __user *uargs, { struct module *mod; long err; + char *after_dashes; err = module_sig_check(info); if (err) @@ -3277,10 +3280,15 @@ static int load_module(struct load_info *info, const char __user *uargs, goto ddebug_cleanup; /* Module is ready to execute: parsing args may do that. */ - err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, unknown_module_param_cb); - if (err < 0) + after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, + -32768, 32767, unknown_module_param_cb); + if (IS_ERR(after_dashes)) { + err = PTR_ERR(after_dashes); goto bug_cleanup; + } else if (after_dashes) { + pr_warn("%s: parameters '%s' after `--' ignored\n", + mod->name, after_dashes); + } /* Link in to syfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); diff --git a/kernel/panic.c b/kernel/panic.c index d02fa9fef46a..62e16cef9cc2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,6 +32,7 @@ static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); +static bool crash_kexec_post_notifiers; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -112,9 +113,11 @@ void panic(const char *fmt, ...) /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. - * Do we want to call this before we try to display a message? 
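load_module() above now checks parse_args() with IS_ERR(): the kernel/params.c conversion a little further down makes parse_args() return NULL on success, an ERR_PTR() on failure, or a pointer to the text after a bare "--" so the caller can warn about ignored arguments. The kernel's ERR_PTR()/IS_ERR() helpers encode small negative errnos in the pointer value; below is a self-contained imitation of that convention (the helpers are re-implemented here, not the real <linux/err.h> ones, and the toy parser is invented):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)   { return (void *)(intptr_t)error; }
static inline long PTR_ERR(const void *p) { return (long)(intptr_t)p; }
static inline int IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

/* Toy parser: fails on "bad", stops at "--", succeeds otherwise. */
static char *parse(char *args)
{
	if (strcmp(args, "bad") == 0)
		return ERR_PTR(-EINVAL);
	if (strncmp(args, "-- ", 3) == 0)
		return args + 3;		/* unparsed tail after "--" */
	return NULL;				/* all parsed OK */
}

int main(void)
{
	char buf[] = "-- extra args";
	char *after = parse(buf);

	if (IS_ERR(after))
		printf("error %ld\n", PTR_ERR(after));
	else if (after)
		printf("parameters '%s' after `--' ignored\n", after);
	else
		printf("ok\n");
	return 0;
}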
+ * If we want to run this after calling panic_notifiers, pass + * the "crash_kexec_post_notifiers" option to the kernel. */ - crash_kexec(NULL); + if (!crash_kexec_post_notifiers) + crash_kexec(NULL); /* * Note smp_send_stop is the usual smp shutdown function, which @@ -131,6 +134,15 @@ void panic(const char *fmt, ...) kmsg_dump(KMSG_DUMP_PANIC); + /* + * If you doubt kdump always works fine in any situation, + * "crash_kexec_post_notifiers" offers you a chance to run + * panic_notifiers and dumping kmsg before kdump. + * Note: since some panic_notifiers can make crashed kernel + * more unstable, it can increase risks of the kdump failure too. + */ + crash_kexec(NULL); + bust_spinlocks(0); if (!panic_blink) @@ -472,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); +static int __init setup_crash_kexec_post_notifiers(char *s) +{ + crash_kexec_post_notifiers = true; + return 0; +} +early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers); + static int __init oops_setup(char *s) { if (!s) diff --git a/kernel/params.c b/kernel/params.c index b00142e7f3ba..1e52ca233fd9 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val) } /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ -int parse_args(const char *doing, - char *args, - const struct kernel_param *params, - unsigned num, - s16 min_level, - s16 max_level, - int (*unknown)(char *param, char *val, const char *doing)) +char *parse_args(const char *doing, + char *args, + const struct kernel_param *params, + unsigned num, + s16 min_level, + s16 max_level, + int (*unknown)(char *param, char *val, const char *doing)) { char *param, *val; @@ -198,6 +198,9 @@ int parse_args(const char *doing, int irq_was_disabled; args = next_arg(args, ¶m, &val); + /* Stop at -- */ + if (!val && strcmp(param, "--") == 0) + return args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, min_level, max_level, unknown); @@ -208,22 +211,22 @@ int parse_args(const char *doing, switch (ret) { case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); - return ret; + return ERR_PTR(ret); case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); - return ret; + return ERR_PTR(ret); case 0: break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); - return ret; + return ERR_PTR(ret); } } /* All parsed OK. */ - return 0; + return NULL; } /* Lazy bastard, eh? */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 221229cf0190..ea2d5f6962ed 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -54,20 +54,16 @@ #include "console_cmdline.h" #include "braille.h" -/* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL - -/* We show everything that is MORE important than this.. 
*/ -#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ - int console_printk[4] = { - DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ - MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ - DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ + CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; +/* Deferred messaged from sched code are marked by this special level */ +#define SCHED_MESSAGE_LOGLEVEL -2 + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -91,6 +87,29 @@ static struct lockdep_map console_lock_dep_map = { #endif /* + * Helper macros to handle lockdep when locking/unlocking console_sem. We use + * macros instead of functions so that _RET_IP_ contains useful information. + */ +#define down_console_sem() do { \ + down(&console_sem);\ + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ +} while (0) + +static int __down_trylock_console_sem(unsigned long ip) +{ + if (down_trylock(&console_sem)) + return 1; + mutex_acquire(&console_lock_dep_map, 0, 1, ip); + return 0; +} +#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) + +#define up_console_sem() do { \ + mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ + up(&console_sem);\ +} while (0) + +/* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ @@ -206,8 +225,9 @@ struct printk_log { }; /* - * The logbuf_lock protects kmsg buffer, indices, counters. It is also - * used in interesting ways to provide interlocking in console_unlock(); + * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken + * within the scheduler's rq lock. It must be released before calling + * console_unlock() or anything else that might wake up a process. */ static DEFINE_RAW_SPINLOCK(logbuf_lock); @@ -250,9 +270,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; -/* cpu currently holding logbuf_lock */ -static volatile unsigned int logbuf_cpu = UINT_MAX; - /* human readable text of the record */ static char *log_text(const struct printk_log *msg) { @@ -297,34 +314,106 @@ static u32 log_next(u32 idx) return idx + msg->len; } -/* insert record into the buffer, discard old ones, update heads */ -static void log_store(int facility, int level, - enum log_flags flags, u64 ts_nsec, - const char *dict, u16 dict_len, - const char *text, u16 text_len) +/* + * Check whether there is enough free space for the given message. + * + * The same values of first_idx and next_idx mean that the buffer + * is either empty or full. + * + * If the buffer is empty, we must respect the position of the indexes. + * They cannot be reset to the beginning of the buffer. 
+ */ +static int logbuf_has_space(u32 msg_size, bool empty) { - struct printk_log *msg; - u32 size, pad_len; + u32 free; - /* number of '\0' padding bytes to next message */ - size = sizeof(struct printk_log) + text_len + dict_len; - pad_len = (-size) & (LOG_ALIGN - 1); - size += pad_len; + if (log_next_idx > log_first_idx || empty) + free = max(log_buf_len - log_next_idx, log_first_idx); + else + free = log_first_idx - log_next_idx; + /* + * We need space also for an empty header that signalizes wrapping + * of the buffer. + */ + return free >= msg_size + sizeof(struct printk_log); +} + +static int log_make_free_space(u32 msg_size) +{ while (log_first_seq < log_next_seq) { - u32 free; + if (logbuf_has_space(msg_size, false)) + return 0; + /* drop old messages until we have enough continuous space */ + log_first_idx = log_next(log_first_idx); + log_first_seq++; + } - if (log_next_idx > log_first_idx) - free = max(log_buf_len - log_next_idx, log_first_idx); - else - free = log_first_idx - log_next_idx; + /* sequence numbers are equal, so the log buffer is empty */ + if (logbuf_has_space(msg_size, true)) + return 0; - if (free >= size + sizeof(struct printk_log)) - break; + return -ENOMEM; +} - /* drop old messages until we have enough contiuous space */ - log_first_idx = log_next(log_first_idx); - log_first_seq++; +/* compute the message size including the padding bytes */ +static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) +{ + u32 size; + + size = sizeof(struct printk_log) + text_len + dict_len; + *pad_len = (-size) & (LOG_ALIGN - 1); + size += *pad_len; + + return size; +} + +/* + * Define how much of the log buffer we could take at maximum. The value + * must be greater than two. Note that only half of the buffer is available + * when the index points to the middle. + */ +#define MAX_LOG_TAKE_PART 4 +static const char trunc_msg[] = "<truncated>"; + +static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, + u16 *dict_len, u32 *pad_len) +{ + /* + * The message should not take the whole buffer. Otherwise, it might + * get removed too soon. 
+ */ + u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; + if (*text_len > max_text_len) + *text_len = max_text_len; + /* enable the warning message */ + *trunc_msg_len = strlen(trunc_msg); + /* disable the "dict" completely */ + *dict_len = 0; + /* compute the size again, count also the warning message */ + return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); +} + +/* insert record into the buffer, discard old ones, update heads */ +static int log_store(int facility, int level, + enum log_flags flags, u64 ts_nsec, + const char *dict, u16 dict_len, + const char *text, u16 text_len) +{ + struct printk_log *msg; + u32 size, pad_len; + u16 trunc_msg_len = 0; + + /* number of '\0' padding bytes to next message */ + size = msg_used_size(text_len, dict_len, &pad_len); + + if (log_make_free_space(size)) { + /* truncate the message if it is too long for empty buffer */ + size = truncate_msg(&text_len, &trunc_msg_len, + &dict_len, &pad_len); + /* survive when the log buffer is too small for trunc_msg */ + if (log_make_free_space(size)) + return 0; } if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { @@ -341,6 +430,10 @@ static void log_store(int facility, int level, msg = (struct printk_log *)(log_buf + log_next_idx); memcpy(log_text(msg), text, text_len); msg->text_len = text_len; + if (trunc_msg_len) { + memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); + msg->text_len += trunc_msg_len; + } memcpy(log_dict(msg), dict, dict_len); msg->dict_len = dict_len; msg->facility = facility; @@ -356,6 +449,8 @@ static void log_store(int facility, int level, /* insert message */ log_next_idx += msg->len; log_next_seq++; + + return msg->text_len; } #ifdef CONFIG_SECURITY_DMESG_RESTRICT @@ -1303,7 +1398,10 @@ static void zap_locks(void) sema_init(&console_sem, 1); } -/* Check if we have any console registered that can be called early in boot. */ +/* + * Check if we have any console that is capable of printing while cpu is + * booting or shutting down. Requires console_sem. + */ static int have_callable_console(void) { struct console *con; @@ -1318,10 +1416,9 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. */ static inline int can_use_console(unsigned int cpu) { @@ -1333,36 +1430,24 @@ static inline int can_use_console(unsigned int cpu) * messages from a 'printk'. Return true (and with the * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. - * - * This gets called with the 'logbuf_lock' spinlock held and - * interrupts disabled. It should return with 'lockbuf_lock' - * released but interrupts still disabled. */ -static int console_trylock_for_printk(unsigned int cpu) - __releases(&logbuf_lock) +static int console_trylock_for_printk(void) { - int retval = 0, wake = 0; - - if (console_trylock()) { - retval = 1; + unsigned int cpu = smp_processor_id(); - /* - * If we can't use the console, we need to release - * the console semaphore by hand to avoid flushing - * the buffer. We need to hold the console semaphore - * in order to do this test safely. 
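logbuf_has_space() above decides whether a new record fits in the flat log buffer: records never wrap, so when the write index is ahead of the read index the usable space is the larger of the tail gap and the head gap, and an extra header must always fit to mark the wrap point. A small standalone version of just that calculation (the buffer geometry and sizes here are made up):

#include <stdio.h>

#define LOG_BUF_LEN	256u
#define HDR_SIZE	8u	/* stand-in for sizeof(struct printk_log) */

/* first/next are byte offsets of the oldest record and the next write. */
static int logbuf_has_space(unsigned int first, unsigned int next,
			    unsigned int msg_size, int empty)
{
	unsigned int free;

	if (next > first || empty)
		/* records don't wrap: use the bigger of the two gaps */
		free = (LOG_BUF_LEN - next > first) ? LOG_BUF_LEN - next : first;
	else
		free = first - next;

	/* keep room for an empty header that signals the wrap */
	return free >= msg_size + HDR_SIZE;
}

int main(void)
{
	/* oldest record at 200, next write at 240: tail gap 16, head gap 200 */
	printf("%d\n", logbuf_has_space(200, 240, 100, 0));	/* 1: fits at the head  */
	printf("%d\n", logbuf_has_space(200, 240, 196, 0));	/* 0: 196 + 8 > 200     */
	return 0;
}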
- */ - if (!can_use_console(cpu)) { - console_locked = 0; - wake = 1; - retval = 0; - } + if (!console_trylock()) + return 0; + /* + * If we can't use the console, we need to release the console + * semaphore by hand to avoid flushing the buffer. We need to hold the + * console semaphore in order to do this test safely. + */ + if (!can_use_console(cpu)) { + console_locked = 0; + up_console_sem(); + return 0; } - logbuf_cpu = UINT_MAX; - raw_spin_unlock(&logbuf_lock); - if (wake) - up(&console_sem); - return retval; + return 1; } int printk_delay_msec __read_mostly; @@ -1490,11 +1575,19 @@ asmlinkage int vprintk_emit(int facility, int level, static int recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; - size_t text_len; + size_t text_len = 0; enum log_flags lflags = 0; unsigned long flags; int this_cpu; int printed_len = 0; + bool in_sched = false; + /* cpu currently holding logbuf_lock in this function */ + static volatile unsigned int logbuf_cpu = UINT_MAX; + + if (level == SCHED_MESSAGE_LOGLEVEL) { + level = -1; + in_sched = true; + } boot_delay_msec(level); printk_delay(); @@ -1516,7 +1609,8 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - goto out_restore_irqs; + local_irq_restore(flags); + return 0; } zap_locks(); } @@ -1530,17 +1624,22 @@ asmlinkage int vprintk_emit(int facility, int level, "BUG: recent printk recursion!"; recursion_bug = 0; - printed_len += strlen(recursion_msg); + text_len = strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, printed_len); + printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, recursion_msg, text_len); } /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. */ - text_len = vscnprintf(text, sizeof(textbuf), fmt, args); + if (in_sched) + text_len = scnprintf(text, sizeof(textbuf), + KERN_WARNING "[sched_delayed] "); + + text_len += vscnprintf(text + text_len, + sizeof(textbuf) - text_len, fmt, args); /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { @@ -1586,9 +1685,12 @@ asmlinkage int vprintk_emit(int facility, int level, cont_flush(LOG_NEWLINE); /* buffer line if possible, otherwise store it right away */ - if (!cont_add(facility, level, text, text_len)) - log_store(facility, level, lflags | LOG_CONT, 0, - dict, dictlen, text, text_len); + if (cont_add(facility, level, text, text_len)) + printed_len += text_len; + else + printed_len += log_store(facility, level, + lflags | LOG_CONT, 0, + dict, dictlen, text, text_len); } else { bool stored = false; @@ -1607,26 +1709,35 @@ asmlinkage int vprintk_emit(int facility, int level, cont_flush(LOG_NEWLINE); } - if (!stored) - log_store(facility, level, lflags, 0, - dict, dictlen, text, text_len); + if (stored) + printed_len += text_len; + else + printed_len += log_store(facility, level, lflags, 0, + dict, dictlen, text, text_len); } - printed_len += text_len; + logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); + + /* If called from the scheduler, we can not call up(). */ + if (in_sched) + return printed_len; + + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to console + */ + preempt_disable(); /* * Try to acquire and then immediately release the console semaphore. 
* The release will print out buffers and wake up /dev/kmsg and syslog() * users. - * - * The console_trylock_for_printk() function will release 'logbuf_lock' - * regardless of whether it actually gets the console semaphore or not. */ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk()) console_unlock(); - - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); + preempt_enable(); return printed_len; } @@ -1882,16 +1993,14 @@ void suspend_console(void) printk("Suspending console(s) (use no_console_suspend to debug)\n"); console_lock(); console_suspended = 1; - up(&console_sem); - mutex_release(&console_lock_dep_map, 1, _RET_IP_); + up_console_sem(); } void resume_console(void) { if (!console_suspend_enabled) return; - down(&console_sem); - mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); + down_console_sem(); console_suspended = 0; console_unlock(); } @@ -1933,12 +2042,11 @@ void console_lock(void) { might_sleep(); - down(&console_sem); + down_console_sem(); if (console_suspended) return; console_locked = 1; console_may_schedule = 1; - mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); } EXPORT_SYMBOL(console_lock); @@ -1952,15 +2060,14 @@ EXPORT_SYMBOL(console_lock); */ int console_trylock(void) { - if (down_trylock(&console_sem)) + if (down_trylock_console_sem()) return 0; if (console_suspended) { - up(&console_sem); + up_console_sem(); return 0; } console_locked = 1; console_may_schedule = 0; - mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2022,7 +2129,7 @@ void console_unlock(void) bool retry; if (console_suspended) { - up(&console_sem); + up_console_sem(); return; } @@ -2043,10 +2150,15 @@ again: } if (console_seq < log_first_seq) { + len = sprintf(text, "** %u printk messages dropped ** ", + (unsigned)(log_first_seq - console_seq)); + /* messages are gone, move to first one */ console_seq = log_first_seq; console_idx = log_first_idx; console_prev = 0; + } else { + len = 0; } skip: if (console_seq == log_next_seq) @@ -2071,8 +2183,8 @@ skip: } level = msg->level; - len = msg_print_text(msg, console_prev, false, - text, sizeof(text)); + len += msg_print_text(msg, console_prev, false, + text + len, sizeof(text) - len); console_idx = log_next(console_idx); console_seq++; console_prev = msg->flags; @@ -2084,7 +2196,6 @@ skip: local_irq_restore(flags); } console_locked = 0; - mutex_release(&console_lock_dep_map, 1, _RET_IP_); /* Release the exclusive_console once it is used */ if (unlikely(exclusive_console)) @@ -2092,7 +2203,7 @@ skip: raw_spin_unlock(&logbuf_lock); - up(&console_sem); + up_console_sem(); /* * Someone could have filled up the buffer again, so re-check if there's @@ -2137,7 +2248,7 @@ void console_unblank(void) * oops_in_progress is set to 1.. 
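When console_seq has fallen behind log_first_seq, console_unlock() above now prepends a dropped-message notice to the text it hands to the console drivers. A condensed userspace model of that accounting, with the sequence values invented for the example:

#include <stdio.h>

int main(void)
{
	unsigned long long log_first_seq = 120;	/* oldest record still stored */
	unsigned long long console_seq = 100;	/* next record the console wants */
	char text[128] = "";
	int len = 0;

	if (console_seq < log_first_seq) {
		len = sprintf(text, "** %u printk messages dropped ** ",
			      (unsigned int)(log_first_seq - console_seq));
		/* messages are gone, move to the first one that still exists */
		console_seq = log_first_seq;
	}

	/* The next record would be formatted at text + len. */
	printf("%s(prefix length %d, console_seq now %llu)\n",
	       text, len, console_seq);
	return 0;
}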
*/ if (oops_in_progress) { - if (down_trylock(&console_sem) != 0) + if (down_trylock_console_sem() != 0) return; } else console_lock(); @@ -2438,21 +2549,19 @@ late_initcall(printk_late_init); /* * Delayed printk version, for scheduler-internal messages: */ -#define PRINTK_BUF_SIZE 512 - #define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 +#define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = __this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - pr_warn("[sched_delayed] %s", buf); + if (pending & PRINTK_PENDING_OUTPUT) { + /* If trylock fails, someone else is doing the printing */ + if (console_trylock()) + console_unlock(); } if (pending & PRINTK_PENDING_WAKEUP) @@ -2474,23 +2583,19 @@ void wake_up_klogd(void) preempt_enable(); } -int printk_sched(const char *fmt, ...) +int printk_deferred(const char *fmt, ...) { - unsigned long flags; va_list args; - char *buf; int r; - local_irq_save(flags); - buf = __get_cpu_var(printk_sched_buf); - + preempt_disable(); va_start(args, fmt); - r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); + r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); va_end(args); - __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); - local_irq_restore(flags); + preempt_enable(); return r; } diff --git a/kernel/profile.c b/kernel/profile.c index cb980f0c731b..54bf5ba26420 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -52,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex); int profile_setup(char *str) { - static char schedstr[] = "schedule"; - static char sleepstr[] = "sleep"; - static char kvmstr[] = "kvm"; + static const char schedstr[] = "schedule"; + static const char sleepstr[] = "sleep"; + static const char kvmstr[] = "kvm"; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { @@ -64,12 +64,10 @@ int profile_setup(char *str) str += strlen(sleepstr) + 1; if (get_option(&str, &par)) prof_shift = par; - printk(KERN_INFO - "kernel sleep profiling enabled (shift: %ld)\n", + pr_info("kernel sleep profiling enabled (shift: %ld)\n", prof_shift); #else - printk(KERN_WARNING - "kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); + pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); #endif /* CONFIG_SCHEDSTATS */ } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; @@ -77,8 +75,7 @@ int profile_setup(char *str) str += strlen(schedstr) + 1; if (get_option(&str, &par)) prof_shift = par; - printk(KERN_INFO - "kernel schedule profiling enabled (shift: %ld)\n", + pr_info("kernel schedule profiling enabled (shift: %ld)\n", prof_shift); } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; @@ -86,13 +83,12 @@ int profile_setup(char *str) str += strlen(kvmstr) + 1; if (get_option(&str, &par)) prof_shift = par; - printk(KERN_INFO - "kernel KVM profiling enabled (shift: %ld)\n", + pr_info("kernel KVM profiling enabled (shift: %ld)\n", prof_shift); } else if (get_option(&str, &par)) { prof_shift = par; prof_on = CPU_PROFILING; - printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", + pr_info("kernel profiling enabled (shift: %ld)\n", prof_shift); } return 1; diff --git a/kernel/reboot.c b/kernel/reboot.c index 
662c83fc16b7..a3a9e240fcdb 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -388,15 +388,22 @@ static int __init reboot_setup(char *str) break; case 's': - if (isdigit(*(str+1))) - reboot_cpu = simple_strtoul(str+1, NULL, 0); - else if (str[1] == 'm' && str[2] == 'p' && - isdigit(*(str+3))) - reboot_cpu = simple_strtoul(str+3, NULL, 0); - else + { + int rc; + + if (isdigit(*(str+1))) { + rc = kstrtoint(str+1, 0, &reboot_cpu); + if (rc) + return rc; + } else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) { + rc = kstrtoint(str+3, 0, &reboot_cpu); + if (rc) + return rc; + } else reboot_mode = REBOOT_SOFT; break; - + } case 'g': reboot_mode = REBOOT_GPIO; break; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 51dbac6a3633..e791130f85a7 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf, /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ if (*buf == '-') { - res = simple_strtoull(buf + 1, &end, 10); - if (res != 1 || *end != '\0') + int rc = kstrtoull(buf + 1, 10, &res); + + if (rc) + return rc; + if (res != 1) return -EINVAL; *resp = RES_COUNTER_MAX; return 0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 913c6d6cc2c1..c6b98793d647 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1367,7 +1367,7 @@ out: * leave kernel. */ if (p->mm && printk_ratelimit()) { - printk_sched("process %d (%s) no longer affine to cpu%d\n", + printk_deferred("process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } } @@ -3723,7 +3723,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (retval) return retval; - if (attr.sched_policy < 0) + if ((int)attr.sched_policy < 0) return -EINVAL; rcu_read_lock(); @@ -7669,7 +7669,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css_parent(css)); + struct task_group *parent = css_tg(css->parent); if (parent) sched_online_group(tg, parent); @@ -7800,8 +7800,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) /* restart the period timer (if active) to handle new period expiry */ if (runtime_enabled && cfs_b->timer_active) { /* force a reprogram */ - cfs_b->timer_active = 0; - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, true); } raw_spin_unlock_irq(&cfs_b->lock); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index c143ee380e3a..9cf350c94ec4 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) static inline struct cpuacct *parent_ca(struct cpuacct *ca) { - return css_ca(css_parent(&ca->css)); + return css_ca(ca->css.parent); } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f9ca7d19781a..2b8cbf09d1a4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -348,12 +348,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. 
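The reboot_setup() change above replaces simple_strtoul() with kstrtoint() so that a malformed "reboot=s..." cpu number is rejected instead of being silently parsed. A rough userspace analogue of that stricter parse, using strtol in place of kstrtoint, might look like this:

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Strict integer parse: fail on empty input, trailing junk or overflow. */
static int parse_int_strict(const char *s, int *out)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(s, &end, 0);
	if (end == s || *end != '\0')
		return -EINVAL;
	if (errno == ERANGE || val < INT_MIN || val > INT_MAX)
		return -ERANGE;

	*out = (int)val;
	return 0;
}

int main(void)
{
	int cpu = -1;
	int rc;

	rc = parse_int_strict("3", &cpu);
	printf("\"3\"  -> rc=%d cpu=%d\n", rc, cpu);

	rc = parse_int_strict("3x", &cpu);
	printf("\"3x\" -> rc=%d\n", rc);
	return 0;
}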
*/ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - static bool lag_once = false; - - if (!lag_once) { - lag_once = true; - printk_sched("sched: DL replenish lagged to much\n"); - } + printk_deferred_once("sched: DL replenish lagged to much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -513,9 +508,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); - struct rq *rq = task_rq(p); + struct rq *rq; +again: + rq = task_rq(p); raw_spin_lock(&rq->lock); + if (rq != task_rq(p)) { + /* Task was moved, retrying. */ + raw_spin_unlock(&rq->lock); + goto again; + } + /* * We need to take care of a possible races here. In fact, the * task might have changed its scheduling policy to something diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c9617b73bcc0..9855e87d671a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1120,7 +1120,7 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; /* Would this change make things worse? */ - return (old_imb > imb); + return (imb > old_imb); } /* @@ -1745,18 +1745,19 @@ no_join: void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; - int i; void *numa_faults = p->numa_faults_memory; + unsigned long flags; + int i; if (grp) { - spin_lock_irq(&grp->lock); + spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; list_del(&p->numa_entry); grp->nr_tasks--; - spin_unlock_irq(&grp->lock); + spin_unlock_irqrestore(&grp->lock, flags); rcu_assign_pointer(p->numa_group, NULL); put_numa_group(grp); } @@ -3179,7 +3180,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) */ if (!cfs_b->timer_active) { __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); } if (cfs_b->runtime > 0) { @@ -3358,7 +3359,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); if (!cfs_b->timer_active) - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); } @@ -3740,7 +3741,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) } /* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) { /* * The timer may be active because we're trying to set a new bandwidth @@ -3755,7 +3756,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cpu_relax(); raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ - if (cfs_b->timer_active) + if (!force && cfs_b->timer_active) return; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0ebfd7a29472..b3512f1afce9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -890,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. 
*/ if (likely(rt_b->rt_runtime)) { - static bool once = false; - rt_rq->rt_throttled = 1; - - if (!once) { - once = true; - printk_sched("sched: RT throttling activated\n"); - } + printk_deferred_once("sched: RT throttling activated\n"); } else { /* * In case we did anyway, make it go away, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 600e2291a75c..e47679b04d16 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -278,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); extern void free_rt_sched_group(struct task_group *tg); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b35c21503a36..f6d76bebe69f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -39,7 +39,7 @@ * is only needed for handling filters shared across tasks. * @prev: points to a previously installed, or inherited, filter * @len: the number of instructions in the program - * @insns: the BPF program instructions to evaluate + * @insnsi: the BPF program instructions to evaluate * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -220,7 +220,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) return -ENOMEM; /* - * Installing a seccomp filter requires that the task have + * Installing a seccomp filter requires that the task has * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. * This avoids scenarios where unprivileged tasks can affect the * behavior of privileged children. diff --git a/kernel/signal.c b/kernel/signal.c index 6ea13c09ae56..a4077e90f19f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -277,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task) { if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { task->jobctl &= ~JOBCTL_TRAPPING; + smp_mb(); /* advised by wake_up_bit() */ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); } } @@ -705,11 +706,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) * Returns 1 if any signals were found. * * All callers must be holding the siglock. - * - * This version takes a sigset mask and looks at all signals, - * not just those in the first mask word. */ -static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) +static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; @@ -727,29 +725,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) } return 1; } -/* - * Remove signals in mask from the pending set and queue. - * Returns 1 if any signals were found. - * - * All callers must be holding the siglock. 
- */ -static int rm_from_queue(unsigned long mask, struct sigpending *s) -{ - struct sigqueue *q, *n; - - if (!sigtestsetmask(&s->signal, mask)) - return 0; - - sigdelsetmask(&s->signal, mask); - list_for_each_entry_safe(q, n, &s->list, list) { - if (q->info.si_signo < SIGRTMIN && - (mask & sigmask(q->info.si_signo))) { - list_del_init(&q->list); - __sigqueue_free(q); - } - } - return 1; -} static inline int is_si_special(const struct siginfo *info) { @@ -861,6 +836,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; + sigset_t flush; if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { if (signal->flags & SIGNAL_GROUP_COREDUMP) @@ -872,26 +848,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) /* * This is a stop signal. Remove SIGCONT from all queues. */ - rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); - t = p; - do { - rm_from_queue(sigmask(SIGCONT), &t->pending); - } while_each_thread(p, t); + siginitset(&flush, sigmask(SIGCONT)); + flush_sigqueue_mask(&flush, &signal->shared_pending); + for_each_thread(p, t) + flush_sigqueue_mask(&flush, &t->pending); } else if (sig == SIGCONT) { unsigned int why; /* * Remove all stop signals from all queues, wake all threads. */ - rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); - t = p; - do { + siginitset(&flush, SIG_KERNEL_STOP_MASK); + flush_sigqueue_mask(&flush, &signal->shared_pending); + for_each_thread(p, t) { + flush_sigqueue_mask(&flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); if (likely(!(t->ptrace & PT_SEIZED))) wake_up_state(t, __TASK_STOPPED); else ptrace_trap_notify(t); - } while_each_thread(p, t); + } /* * Notify the parent with CLD_CONTINUED if we were stopped. 
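The signal.c conversion above drops the word-mask based rm_from_queue() and drives the remaining flush_sigqueue_mask() with a full sigset_t built via siginitset()/sigemptyset()/sigaddset(). The userspace sigset API mirrors that set construction, for example:

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t flush;

	/* Build the same kind of mask prepare_signal() now passes around. */
	sigemptyset(&flush);
	sigaddset(&flush, SIGCONT);

	printf("SIGCONT in set: %d\n", sigismember(&flush, SIGCONT));
	printf("SIGSTOP in set: %d\n", sigismember(&flush, SIGSTOP));
	return 0;
}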
@@ -2854,7 +2829,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); - siginitset(&tsk->real_blocked, 0); + sigemptyset(&tsk->real_blocked); sig = dequeue_signal(tsk, &mask, info); } spin_unlock_irq(&tsk->sighand->siglock); @@ -3091,18 +3066,39 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, } #endif +/* + * For kthreads only, must not be used if cloned with CLONE_SIGHAND + */ +void kernel_sigaction(int sig, __sighandler_t action) +{ + spin_lock_irq(¤t->sighand->siglock); + current->sighand->action[sig - 1].sa.sa_handler = action; + if (action == SIG_IGN) { + sigset_t mask; + + sigemptyset(&mask); + sigaddset(&mask, sig); + + flush_sigqueue_mask(&mask, ¤t->signal->shared_pending); + flush_sigqueue_mask(&mask, ¤t->pending); + recalc_sigpending(); + } + spin_unlock_irq(¤t->sighand->siglock); +} +EXPORT_SYMBOL(kernel_sigaction); + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { - struct task_struct *t = current; + struct task_struct *p = current, *t; struct k_sigaction *k; sigset_t mask; if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) return -EINVAL; - k = &t->sighand->action[sig-1]; + k = &p->sighand->action[sig-1]; - spin_lock_irq(¤t->sighand->siglock); + spin_lock_irq(&p->sighand->siglock); if (oact) *oact = *k; @@ -3121,21 +3117,20 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ - if (sig_handler_ignored(sig_handler(t, sig), sig)) { + if (sig_handler_ignored(sig_handler(p, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); - rm_from_queue_full(&mask, &t->signal->shared_pending); - do { - rm_from_queue_full(&mask, &t->pending); - } while_each_thread(current, t); + flush_sigqueue_mask(&mask, &p->signal->shared_pending); + for_each_thread(p, t) + flush_sigqueue_mask(&mask, &t->pending); } } - spin_unlock_irq(¤t->sighand->siglock); + spin_unlock_irq(&p->sighand->siglock); return 0; } -static int +static int do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) { stack_t oss; @@ -3496,7 +3491,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, } #endif -#ifdef __ARCH_WANT_SYS_SGETMASK +#ifdef CONFIG_SGETMASK_SYSCALL /* * For backwards compatibility. Functionality superseded by sigprocmask. @@ -3517,7 +3512,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) return old; } -#endif /* __ARCH_WANT_SGETMASK */ +#endif /* CONFIG_SGETMASK_SYSCALL */ #ifdef __ARCH_WANT_SYS_SIGNAL /* diff --git a/kernel/smp.c b/kernel/smp.c index 06d574e42c72..306f8180b0d5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -185,14 +185,26 @@ void generic_smp_call_function_single_interrupt(void) { struct llist_node *entry; struct call_single_data *csd, *csd_next; + static bool warned; + + entry = llist_del_all(&__get_cpu_var(call_single_queue)); + entry = llist_reverse_order(entry); /* * Shouldn't receive this interrupt on a cpu that is not yet online. */ - WARN_ON_ONCE(!cpu_online(smp_processor_id())); + if (unlikely(!cpu_online(smp_processor_id()) && !warned)) { + warned = true; + WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); - entry = llist_del_all(&__get_cpu_var(call_single_queue)); - entry = llist_reverse_order(entry); + /* + * We don't have to use the _safe() variant here + * because we are not invoking the IPI handlers yet. 
+ */ + llist_for_each_entry(csd, entry, llist) + pr_warn("IPI callback %pS sent to offline CPU\n", + csd->func); + } llist_for_each_entry_safe(csd, csd_next, entry, llist) { csd->func(csd->info); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 01fbae5b97b7..695f0c6cd169 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * * @cpu: cpu to stop * @fn: function to execute * @arg: argument to @fn + * @work_buf: pointer to cpu_stop_work structure * * Similar to stop_one_cpu() but doesn't wait for completion. The * caller is responsible for ensuring @work_buf is currently unused diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bc8d1b74a6b9..36441b51b5df 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16); cond_syscall(sys_setresuid16); cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); +cond_syscall(sys_sgetmask); +cond_syscall(sys_ssetmask); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); cond_syscall(sys_ipc); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bc966a8ffc3e..db19e3e2aa4b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -173,6 +173,13 @@ extern int no_unaligned_warning; #endif #ifdef CONFIG_PROC_SYSCTL + +#define SYSCTL_WRITES_LEGACY -1 +#define SYSCTL_WRITES_WARN 0 +#define SYSCTL_WRITES_STRICT 1 + +static int sysctl_writes_strict = SYSCTL_WRITES_WARN; + static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); static int proc_taint(struct ctl_table *table, int write, @@ -195,7 +202,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; -static int sysrq_sysctl_handler(ctl_table *table, int write, +static int sysrq_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -495,6 +502,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_taint, }, + { + .procname = "sysctl_writes_strict", + .data = &sysctl_writes_strict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, + .extra2 = &one, + }, #endif #ifdef CONFIG_LATENCYTOP { @@ -1418,8 +1434,13 @@ static struct ctl_table vm_table[] = { (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { .procname = "vdso_enabled", +#ifdef CONFIG_X86_32 + .data = &vdso32_enabled, + .maxlen = sizeof(vdso32_enabled), +#else .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), +#endif .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, @@ -1698,8 +1719,8 @@ int __init sysctl_init(void) #ifdef CONFIG_PROC_SYSCTL -static int _proc_do_string(void* data, int maxlen, int write, - void __user *buffer, +static int _proc_do_string(char *data, int maxlen, int write, + char __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -1712,21 +1733,30 @@ static int _proc_do_string(void* data, int maxlen, int write, } if (write) { - len = 0; + if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { + /* Only continue writes not past the end of buffer. */ + len = strlen(data); + if (len > maxlen - 1) + len = maxlen - 1; + + if (*ppos > len) + return 0; + len = *ppos; + } else { + /* Start writing from beginning of buffer. 
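With kernel.sysctl_writes_strict set to 1, a string sysctl written at a non-zero file position continues the stored value from that offset instead of clobbering it from the beginning. A simplified userspace model of that decision; only the position handling is modelled, the byte-by-byte copy that follows in the hunk is condensed into a plain loop, and the buffer size and mode constants are assumptions:

#include <stdio.h>
#include <string.h>

#define WRITES_LEGACY  (-1)
#define WRITES_WARN    0
#define WRITES_STRICT  1

/* Write 'src' into 'data', honouring file position 'ppos' in strict mode. */
static void do_string_write(char *data, size_t maxlen, const char *src,
			    size_t ppos, int mode)
{
	size_t len = 0;

	if (mode == WRITES_STRICT) {
		len = strlen(data);
		if (len > maxlen - 1)
			len = maxlen - 1;
		if (ppos > len)
			return;		/* write starts past the end: ignore */
		len = ppos;		/* continue from the file position */
	}

	while (len < maxlen - 1 && *src && *src != '\n')
		data[len++] = *src++;
	data[len] = '\0';
}

int main(void)
{
	char value[16] = "hello";

	do_string_write(value, sizeof(value), "world", 5, WRITES_STRICT);
	printf("%s\n", value);		/* prints "helloworld" */
	return 0;
}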
*/ + len = 0; + } + + *ppos += *lenp; p = buffer; - while (len < *lenp) { + while ((p - buffer) < *lenp && len < maxlen - 1) { if (get_user(c, p++)) return -EFAULT; if (c == 0 || c == '\n') break; - len++; + data[len++] = c; } - if (len >= maxlen) - len = maxlen-1; - if(copy_from_user(data, buffer, len)) - return -EFAULT; - ((char *) data)[len] = 0; - *ppos += *lenp; + data[len] = 0; } else { len = strlen(data); if (len > maxlen) @@ -1743,10 +1773,10 @@ static int _proc_do_string(void* data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if(copy_to_user(buffer, data, len)) + if (copy_to_user(buffer, data, len)) return -EFAULT; if (len < *lenp) { - if(put_user('\n', ((char __user *) buffer) + len)) + if (put_user('\n', buffer + len)) return -EFAULT; len++; } @@ -1756,6 +1786,14 @@ static int _proc_do_string(void* data, int maxlen, int write, return 0; } +static void warn_sysctl_write(struct ctl_table *table) +{ + pr_warn_once("%s wrote to %s when file position was not 0!\n" + "This will not be supported in the future. To silence this\n" + "warning, set kernel.sysctl_writes_strict = -1\n", + current->comm, table->procname); +} + /** * proc_dostring - read a string sysctl * @table: the sysctl table @@ -1776,8 +1814,11 @@ static int _proc_do_string(void* data, int maxlen, int write, int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, - buffer, lenp, ppos); + if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) + warn_sysctl_write(table); + + return _proc_do_string((char *)(table->data), table->maxlen, write, + (char __user *)buffer, lenp, ppos); } static size_t proc_skip_spaces(char **buf) @@ -1951,6 +1992,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, conv = do_proc_dointvec_conv; if (write) { + if (*ppos) { + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + goto out; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + break; + default: + break; + } + } + if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; page = __get_free_page(GFP_TEMPORARY); @@ -2008,6 +2061,7 @@ free: return err ? : -EINVAL; } *lenp -= left; +out: *ppos += *lenp; return err; } @@ -2200,6 +2254,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int left = *lenp; if (write) { + if (*ppos) { + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + goto out; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + break; + default: + break; + } + } + if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; page = __get_free_page(GFP_TEMPORARY); @@ -2255,6 +2321,7 @@ free: return err ? 
: -EINVAL; } *lenp -= left; +out: *ppos += *lenp; return err; } diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 419a52cecd20..33db43a39515 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq) static inline int is_error_status(int status) { - return (time_status & (STA_UNSYNC|STA_CLOCKERR)) + return (status & (STA_UNSYNC|STA_CLOCKERR)) /* PPS signal lost when either PPS time or * PPS frequency synchronization requested */ - || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) - && !(time_status & STA_PPSSIGNAL)) + || ((status & (STA_PPSFREQ|STA_PPSTIME)) + && !(status & STA_PPSSIGNAL)) /* PPS jitter exceeded when * PPS time synchronization requested */ - || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) + || ((status & (STA_PPSTIME|STA_PPSJITTER)) == (STA_PPSTIME|STA_PPSJITTER)) /* PPS wander exceeded or calibration error when * PPS frequency synchronization requested */ - || ((time_status & STA_PPSFREQ) - && (time_status & (STA_PPSWANDER|STA_PPSERROR))); + || ((status & STA_PPSFREQ) + && (status & (STA_PPSWANDER|STA_PPSERROR))); } static inline void pps_fill_timex(struct timex *txc) @@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) time_status |= STA_PPSERROR; pps_errcnt++; pps_dec_freq_interval(); - pr_err("hardpps: PPSERROR: interval too long - %ld s\n", - freq_norm.sec); + printk_deferred(KERN_ERR + "hardpps: PPSERROR: interval too long - %ld s\n", + freq_norm.sec); return 0; } @@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { - pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); + printk_deferred(KERN_WARNING + "hardpps: PPSWANDER: change=%ld\n", delta); time_status |= STA_PPSWANDER; pps_stbcnt++; pps_dec_freq_interval(); @@ -844,8 +846,9 @@ static void hardpps_update_phase(long error) * the time offset is updated. 
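The is_error_status() fix above makes the helper test its status argument rather than the global time_status it was accidentally reading. The same predicate can be compiled and exercised in userspace against the STA_* bits from <sys/timex.h>:

#include <stdio.h>
#include <sys/timex.h>

/* Mirrors the fixed helper: every test is on the passed-in status. */
static int is_error_status(int status)
{
	return (status & (STA_UNSYNC|STA_CLOCKERR))
		/* PPS signal lost while PPS time/frequency sync requested */
		|| ((status & (STA_PPSFREQ|STA_PPSTIME))
		    && !(status & STA_PPSSIGNAL))
		/* PPS jitter exceeded while PPS time sync requested */
		|| ((status & (STA_PPSTIME|STA_PPSJITTER))
		    == (STA_PPSTIME|STA_PPSJITTER))
		/* PPS wander exceeded or calibration error */
		|| ((status & STA_PPSFREQ)
		    && (status & (STA_PPSWANDER|STA_PPSERROR)));
}

int main(void)
{
	printf("STA_UNSYNC          -> %d\n", is_error_status(STA_UNSYNC));
	printf("STA_PPSSIGNAL only  -> %d\n", is_error_status(STA_PPSSIGNAL));
	printf("STA_PPSTIME|JITTER  -> %d\n",
	       is_error_status(STA_PPSTIME | STA_PPSJITTER));
	return 0;
}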
*/ if (jitter > (pps_jitter << PPS_POPCORN)) { - pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", - jitter, (pps_jitter << PPS_POPCORN)); + printk_deferred(KERN_WARNING + "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { @@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - pr_err("hardpps: PPSJITTER: bad pulse\n"); + printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); return; } @@ -923,7 +926,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) static int __init ntp_tick_adj_setup(char *str) { - ntp_tick_adj = simple_strtol(str, NULL, 0); + int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); + + if (rc) + return rc; ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 4d23dc4d8139..445106d2c729 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); } -static u32 __read_mostly (*read_sched_clock_32)(void); - -static u64 notrace read_sched_clock_32_wrapper(void) -{ - return read_sched_clock_32(); -} - static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -176,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, pr_debug("Registered %pF as sched_clock source\n", read); } -void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) -{ - read_sched_clock_32 = read; - sched_clock_register(read_sched_clock_32_wrapper, bits, rate); -} - void __init sched_clock_postinit(void) { /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f7df8ea21707..32d8d6aaedb8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, struct timespec *delta) { if (!timespec_valid_strict(delta)) { - printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " - "sleep delta value!\n"); + printk_deferred(KERN_WARNING + "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); @@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) if (unlikely(tk->clock->maxadj && (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { - printk_once(KERN_WARNING + printk_deferred_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->clock->name, (long)tk->mult + adj, (long)tk->clock->mult + tk->clock->maxadj); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8639819f6cef..d4409356f40d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -535,6 +535,36 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. +config TRACEPOINT_BENCHMARK + bool "Add tracepoint that benchmarks tracepoints" + help + This option creates the tracepoint "benchmark:benchmark_event". + When the tracepoint is enabled, it kicks off a kernel thread that + goes into an infinite loop (calling cond_sched() to let other tasks + run), and calls the tracepoint. 
Each iteration will record the time + it took to write to the tracepoint and the next iteration that + data will be passed to the tracepoint itself. That is, the tracepoint + will report the time it took to do the previous tracepoint. + The string written to the tracepoint is a static string of 128 bytes + to keep the time the same. The initial string is simply a write of + "START". The second string records the cold cache time of the first + write which is not added to the rest of the calculations. + + As it is a tight loop, it benchmarks as hot cache. That's fine because + we care most about hot paths that are probably in cache already. + + An example of the output: + + START + first=3672 [COLD CACHED] + last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 + last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 + last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 + last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 + last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 + last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 + + config RING_BUFFER_BENCHMARK tristate "Ring buffer benchmark stress tester" depends on RING_BUFFER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1378e84fbe39..2611613f14f1 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +CFLAGS_trace_benchmark.o := -I$(src) CFLAGS_trace_events_filter.o := -I$(src) obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o @@ -62,4 +63,6 @@ endif obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o +obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o + libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4a54a25afa2f..5b372e3ed675 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,7 +62,7 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 -#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) #ifdef CONFIG_DYNAMIC_FTRACE #define INIT_REGEX_LOCK(opsname) \ @@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; @@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void) return cnt; } -static void -ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) -{ - int bit; - - bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); - if (bit < 0) - return; - - do_for_each_ftrace_op(op, ftrace_global_list) { - op->func(ip, parent_ip, op, regs); - } while_for_each_ftrace_op(op); - - trace_clear_recursion(bit); -} - static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { @@ -237,43 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops) return 0; } -static void update_global_ops(void) -{ - ftrace_func_t func = ftrace_global_list_func; - void *private = NULL; - - /* The list has its own recursion protection. 
*/ - global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; - - /* - * If there's only one function registered, then call that - * function directly. Otherwise, we need to iterate over the - * registered callers. - */ - if (ftrace_global_list == &ftrace_list_end || - ftrace_global_list->next == &ftrace_list_end) { - func = ftrace_global_list->func; - private = ftrace_global_list->private; - /* - * As we are calling the function directly. - * If it does not have recursion protection, - * the function_trace_op needs to be updated - * accordingly. - */ - if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)) - global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; - } - - /* If we filter on pids, update to use the pid function */ - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } - - global_ops.func = func; - global_ops.private = private; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -301,8 +246,6 @@ static void update_ftrace_function(void) { ftrace_func_t func; - update_global_ops(); - /* * If we are at the end of the list and this ops is * recursion safe and not dynamic and the arch supports passing ops, @@ -314,10 +257,7 @@ static void update_ftrace_function(void) (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ - if (ftrace_ops_list == &global_ops) - set_function_trace_op = ftrace_global_list; - else - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; } else { /* Just use the default ftrace_ops */ @@ -373,6 +313,11 @@ static void update_ftrace_function(void) ftrace_trace_function = func; } +int using_ftrace_ops_list_func(void) +{ + return ftrace_trace_function == ftrace_ops_list_func; +} + static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) { ops->next = *list; @@ -434,16 +379,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (ops->flags & FTRACE_OPS_FL_DELETED) return -EINVAL; - if (FTRACE_WARN_ON(ops == &global_ops)) - return -EINVAL; - if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; - /* We don't support both control and global flags set. 
*/ - if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) - return -EINVAL; - #ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS /* * If the ftrace_ops specifies SAVE_REGS, then it only can be used @@ -461,10 +399,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); - ops->flags |= FTRACE_OPS_FL_ENABLED; - } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (ops->flags & FTRACE_OPS_FL_CONTROL) { if (control_ops_alloc(ops)) return -ENOMEM; add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); @@ -484,15 +419,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; - if (FTRACE_WARN_ON(ops == &global_ops)) - return -EINVAL; - - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ret = remove_ftrace_list_ops(&ftrace_global_list, - &global_ops, ops); - if (!ret) - ops->flags &= ~FTRACE_OPS_FL_ENABLED; - } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (ops->flags & FTRACE_OPS_FL_CONTROL) { ret = remove_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); } else @@ -895,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) goto out; @@ -926,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) unsigned long flags; local_irq_save(flags); - stat = &__get_cpu_var(ftrace_profile_stats); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) goto out; @@ -1178,7 +1105,7 @@ struct ftrace_page { static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; -static bool ftrace_hash_empty(struct ftrace_hash *hash) +static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) { return !hash || !hash->count; } @@ -1625,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); /* + * If filter_hash is set, we want to match all functions + * that are in the hash but not in the other hash. * + * If filter_hash is not set, then we are decrementing. + * That means we match anything that is in the hash + * and also in the other_hash. That is, we need to turn + * off functions in the other hash because they are disabled + * by this hash. */ if (filter_hash && in_hash && !in_other_hash) match = 1; @@ -1767,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) /* * If this record is being updated from a nop, then * return UPDATE_MAKE_CALL. - * Otherwise, if the EN flag is set, then return - * UPDATE_MODIFY_CALL_REGS to tell the caller to convert - * from the non-save regs, to a save regs function. * Otherwise, * return UPDATE_MODIFY_CALL to tell the caller to convert - * from the save regs, to a non-save regs function. + * from the save regs, to a non-save regs function or + * vice versa. 
*/ if (flag & FTRACE_FL_ENABLED) return FTRACE_UPDATE_MAKE_CALL; - else if (rec->flags & FTRACE_FL_REGS_EN) - return FTRACE_UPDATE_MODIFY_CALL_REGS; - else - return FTRACE_UPDATE_MODIFY_CALL; + + return FTRACE_UPDATE_MODIFY_CALL; } if (update) { @@ -1821,6 +1751,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) return ftrace_check_record(rec, enable, 0); } +/** + * ftrace_get_addr_new - Get the call address to set to + * @rec: The ftrace record descriptor + * + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. + * + * Returns the address of the trampoline to set to + */ +unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +/** + * ftrace_get_addr_curr - Get the call address that is already there + * @rec: The ftrace record descriptor + * + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + * + * Returns the address of the trampoline that is currently being called + */ +unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS_EN) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { @@ -1828,12 +1794,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ftrace_addr; int ret; - ret = ftrace_update_record(rec, enable); + ftrace_addr = ftrace_get_addr_new(rec); - if (rec->flags & FTRACE_FL_REGS) - ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; - else - ftrace_addr = (unsigned long)FTRACE_ADDR; + /* This needs to be done before we call ftrace_update_record */ + ftrace_old_addr = ftrace_get_addr_curr(rec); + + ret = ftrace_update_record(rec, enable); switch (ret) { case FTRACE_UPDATE_IGNORE: @@ -1845,13 +1811,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_MAKE_NOP: return ftrace_make_nop(NULL, rec, ftrace_addr); - case FTRACE_UPDATE_MODIFY_CALL_REGS: case FTRACE_UPDATE_MODIFY_CALL: - if (rec->flags & FTRACE_FL_REGS) - ftrace_old_addr = (unsigned long)FTRACE_ADDR; - else - ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; - return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } @@ -2115,7 +2075,6 @@ static void ftrace_startup_enable(int command) static int ftrace_startup(struct ftrace_ops *ops, int command) { - bool hash_enable = true; int ret; if (unlikely(ftrace_disabled)) @@ -2128,18 +2087,9 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; - /* ops marked global share the filter hashes */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - /* Don't update hash if global is already set */ - if (global_start_up) - hash_enable = false; - global_start_up++; - } - ops->flags |= FTRACE_OPS_FL_ENABLED; - if (hash_enable) - ftrace_hash_rec_enable(ops, 1); + + ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); @@ -2148,7 +2098,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) static int ftrace_shutdown(struct ftrace_ops *ops, int command) { - bool hash_disable = true; int ret; if (unlikely(ftrace_disabled)) @@ -2166,21 +2115,9 @@ static int 
ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); - if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ops = &global_ops; - global_start_up--; - WARN_ON_ONCE(global_start_up < 0); - /* Don't update hash if global still has users */ - if (global_start_up) { - WARN_ON_ONCE(!ftrace_start_up); - hash_disable = false; - } - } - - if (hash_disable) - ftrace_hash_rec_disable(ops, 1); + ftrace_hash_rec_disable(ops, 1); - if (ops != &global_ops || !global_start_up) + if (!global_start_up) ops->flags &= ~FTRACE_OPS_FL_ENABLED; command |= FTRACE_UPDATE_CALLS; @@ -3524,10 +3461,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, struct ftrace_hash *hash; int ret; - /* All global ops uses the global ops filters */ - if (ops->flags & FTRACE_OPS_FL_GLOBAL) - ops = &global_ops; - if (unlikely(ftrace_disabled)) return -ENODEV; @@ -3639,8 +3572,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, } EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** - * ftrace_set_filter - set a function to filter on in ftrace - * @ops - the ops to set the filter with + * ftrace_set_global_filter - set a function to filter on with global tracers * @buf - the string that holds the function filter text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. @@ -3655,8 +3587,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset) EXPORT_SYMBOL_GPL(ftrace_set_global_filter); /** - * ftrace_set_notrace - set a function to not trace in ftrace - * @ops - the ops to set the notrace filter with + * ftrace_set_global_notrace - set a function to not trace with global tracers * @buf - the string that holds the function notrace text. * @len - the length of the string. * @reset - non zero to reset all filters before applying this filter. 
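The ftrace_get_addr_new()/ftrace_get_addr_curr() helpers introduced above reduce the REGS trampoline handling to a pair of flag tests on the record. A tiny standalone sketch of that selection; the flag bits and trampoline addresses are placeholders, not the kernel's values:

#include <stdio.h>

#define FL_REGS     (1u << 0)	/* record wants the regs-saving trampoline */
#define FL_REGS_EN  (1u << 1)	/* record currently uses that trampoline */

#define ADDR_NORMAL 0x1000ul
#define ADDR_REGS   0x2000ul

struct rec { unsigned int flags; };

/* What the call site should be changed to. */
static unsigned long addr_new(const struct rec *r)
{
	return (r->flags & FL_REGS) ? ADDR_REGS : ADDR_NORMAL;
}

/* What the call site points at right now. */
static unsigned long addr_curr(const struct rec *r)
{
	return (r->flags & FL_REGS_EN) ? ADDR_REGS : ADDR_NORMAL;
}

int main(void)
{
	struct rec r = { .flags = FL_REGS };	/* wants regs, not yet switched */

	printf("modify call: 0x%lx -> 0x%lx\n", addr_curr(&r), addr_new(&r));
	return 0;
}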
@@ -4443,6 +4374,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) #endif /* CONFIG_DYNAMIC_FTRACE */ +__init void ftrace_init_global_array_ops(struct trace_array *tr) +{ + tr->ops = &global_ops; + tr->ops->private = tr; +} + +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) +{ + /* If we filter on pids, update to use the pid function */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + if (WARN_ON(tr->ops->func != ftrace_stub)) + printk("ftrace ops had %pS for function\n", + tr->ops->func); + /* Only the top level instance does pid tracing */ + if (!list_empty(&ftrace_pids)) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } + } + tr->ops->func = func; + tr->ops->private = tr; +} + +void ftrace_reset_array_ops(struct trace_array *tr) +{ + tr->ops->func = ftrace_stub; +} + static void ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) @@ -4501,9 +4460,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip, regs)) + if (ftrace_ops_test(op, ip, regs)) { + if (WARN_ON(!op->func)) { + function_trace_stop = 1; + printk("op=%p %pS\n", op, op); + goto out; + } op->func(ip, parent_ip, op, regs); + } } while_for_each_ftrace_op(op); +out: preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -4908,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int ftrace_graph_active; -static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { @@ -5054,13 +5019,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, return NOTIFY_DONE; } -/* Just a place holder for function graph */ -static struct ftrace_ops fgraph_ops __read_mostly = { - .func = ftrace_stub, - .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | - FTRACE_OPS_FL_RECURSION_SAFE, -}; - static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) { if (!ftrace_ops_test(&global_ops, trace->func, NULL)) @@ -5085,6 +5043,10 @@ static void update_function_graph_func(void) ftrace_graph_entry = ftrace_graph_entry_test; } +static struct notifier_block ftrace_suspend_notifier = { + .notifier_call = ftrace_suspend_notifier_call, +}; + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -5098,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, goto out; } - ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); ftrace_graph_active++; @@ -5120,7 +5081,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_entry = ftrace_graph_entry_test; update_function_graph_func(); - ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); + /* Function graph doesn't use the .func field of global_ops */ + global_ops.flags |= FTRACE_OPS_FL_STUB; + + ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -5138,7 +5102,8 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); + ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); + global_ops.flags &= ~FTRACE_OPS_FL_STUB; 
unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 737b0efa1a62..16f7038d1f4d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -275,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec, } EXPORT_SYMBOL_GPL(call_filter_check_discard); -cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) +static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { u64 ts; @@ -599,7 +599,7 @@ static int alloc_snapshot(struct trace_array *tr) return 0; } -void free_snapshot(struct trace_array *tr) +static void free_snapshot(struct trace_array *tr) { /* * We don't free the ring buffer. instead, resize it because @@ -963,27 +963,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } -/* - * ftrace_max_lock is used to protect the swapping of buffers - * when taking a max snapshot. The buffers themselves are - * protected by per_cpu spinlocks. But the action of the swap - * needs its own lock. - * - * This is defined as a arch_spinlock_t in order to help - * with performance when lockdep debugging is enabled. - * - * It is also used in other places outside the update_max_tr - * so it needs to be defined outside of the - * CONFIG_TRACER_MAX_TRACE. - */ -static arch_spinlock_t ftrace_max_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - unsigned long __read_mostly tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE -unsigned long __read_mostly tracing_max_latency; - /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, @@ -1000,7 +982,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_buf->cpu = cpu; max_buf->time_start = data->preempt_timestamp; - max_data->saved_latency = tracing_max_latency; + max_data->saved_latency = tr->max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; @@ -1048,14 +1030,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; } - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); buf = tr->trace_buffer.buffer; tr->trace_buffer.buffer = tr->max_buffer.buffer; tr->max_buffer.buffer = buf; __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); } /** @@ -1081,7 +1063,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; } - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); @@ -1099,11 +1081,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); } #endif /* CONFIG_TRACER_MAX_TRACE */ -static void default_wait_pipe(struct trace_iterator *iter) +static void wait_on_pipe(struct trace_iterator *iter) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) @@ -1220,8 +1202,6 @@ int register_tracer(struct tracer *type) else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; - if (!type->wait_pipe) - type->wait_pipe = default_wait_pipe; ret = run_tracer_selftest(type); if (ret < 0) @@ -1305,22 +1285,71 @@ void 
tracing_reset_all_online_cpus(void) } } -#define SAVED_CMDLINES 128 +#define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX -static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; -static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; -static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; -static int cmdline_idx; static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { + unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; + unsigned *map_cmdline_to_pid; + unsigned cmdline_num; + int cmdline_idx; + char *saved_cmdlines; +}; +static struct saved_cmdlines_buffer *savedcmd; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; -static void trace_init_cmdlines(void) +static inline char *get_saved_cmdlines(int idx) { - memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); - memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); - cmdline_idx = 0; + return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; +} + +static inline void set_cmdline(int idx, const char *cmdline) +{ + memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); +} + +static int allocate_cmdlines_buffer(unsigned int val, + struct saved_cmdlines_buffer *s) +{ + s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid), + GFP_KERNEL); + if (!s->map_cmdline_to_pid) + return -ENOMEM; + + s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL); + if (!s->saved_cmdlines) { + kfree(s->map_cmdline_to_pid); + return -ENOMEM; + } + + s->cmdline_idx = 0; + s->cmdline_num = val; + memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, + sizeof(s->map_pid_to_cmdline)); + memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_pid)); + + return 0; +} + +static int trace_create_savedcmd(void) +{ + int ret; + + savedcmd = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); + if (!savedcmd) + return -ENOMEM; + + ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); + if (ret < 0) { + kfree(savedcmd); + savedcmd = NULL; + return -ENOMEM; + } + + return 0; } int is_tracing_stopped(void) @@ -1353,7 +1382,7 @@ void tracing_start(void) } /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&global_trace.max_lock); buffer = global_trace.trace_buffer.buffer; if (buffer) @@ -1365,7 +1394,7 @@ void tracing_start(void) ring_buffer_record_enable(buffer); #endif - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&global_trace.max_lock); ftrace_start(); out: @@ -1420,7 +1449,7 @@ void tracing_stop(void) goto out; /* Prevent the buffers from switching */ - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&global_trace.max_lock); buffer = global_trace.trace_buffer.buffer; if (buffer) @@ -1432,7 +1461,7 @@ void tracing_stop(void) ring_buffer_record_disable(buffer); #endif - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&global_trace.max_lock); out: raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); @@ -1461,12 +1490,12 @@ static void tracing_stop_tr(struct trace_array *tr) void trace_stop_cmdline_recording(void); -static void trace_save_cmdline(struct task_struct *tsk) +static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) - return; + return 0; /* * It's not the end of the world if we don't get @@ -1475,11 +1504,11 @@ static void trace_save_cmdline(struct task_struct *tsk) * so if we miss here, then better luck next time. 
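The cmdline cache above moves from fixed-size static arrays to a dynamically allocated saved_cmdlines_buffer so its size can later be tuned through saved_cmdlines_size. A bare-bones userspace sketch of the allocation, with the structure trimmed down and TASK_COMM_LEN plus the default slot count taken as assumed values:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TASK_COMM_LEN            16
#define SAVED_CMDLINES_DEFAULT   128

struct saved_cmdlines_buffer {
	unsigned int *map_cmdline_to_pid;
	unsigned int cmdline_num;
	int cmdline_idx;
	char *saved_cmdlines;
};

/* Allocate slot arrays for 'num' cached command lines. */
static int alloc_cmdlines(struct saved_cmdlines_buffer *s, unsigned int num)
{
	s->map_cmdline_to_pid = malloc(num * sizeof(*s->map_cmdline_to_pid));
	if (!s->map_cmdline_to_pid)
		return -1;

	s->saved_cmdlines = malloc(num * TASK_COMM_LEN);
	if (!s->saved_cmdlines) {
		free(s->map_cmdline_to_pid);
		return -1;
	}

	/* 0xff bytes mark unused slots (NO_CMDLINE_MAP in the patch). */
	memset(s->map_cmdline_to_pid, 0xff,
	       num * sizeof(*s->map_cmdline_to_pid));
	s->cmdline_num = num;
	s->cmdline_idx = 0;
	return 0;
}

int main(void)
{
	struct saved_cmdlines_buffer s;

	if (alloc_cmdlines(&s, SAVED_CMDLINES_DEFAULT))
		return 1;
	printf("allocated %u cmdline slots\n", s.cmdline_num);
	free(s.map_cmdline_to_pid);
	free(s.saved_cmdlines);
	return 0;
}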
*/ if (!arch_spin_trylock(&trace_cmdline_lock)) - return; + return 0; - idx = map_pid_to_cmdline[tsk->pid]; + idx = savedcmd->map_pid_to_cmdline[tsk->pid]; if (idx == NO_CMDLINE_MAP) { - idx = (cmdline_idx + 1) % SAVED_CMDLINES; + idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; /* * Check whether the cmdline buffer at idx has a pid @@ -1487,22 +1516,24 @@ static void trace_save_cmdline(struct task_struct *tsk) * need to clear the map_pid_to_cmdline. Otherwise we * would read the new comm for the old pid. */ - pid = map_cmdline_to_pid[idx]; + pid = savedcmd->map_cmdline_to_pid[idx]; if (pid != NO_CMDLINE_MAP) - map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; + savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; - map_cmdline_to_pid[idx] = tsk->pid; - map_pid_to_cmdline[tsk->pid] = idx; + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; + savedcmd->map_pid_to_cmdline[tsk->pid] = idx; - cmdline_idx = idx; + savedcmd->cmdline_idx = idx; } - memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); + + return 1; } -void trace_find_cmdline(int pid, char comm[]) +static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; @@ -1521,13 +1552,19 @@ void trace_find_cmdline(int pid, char comm[]) return; } - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - map = map_pid_to_cmdline[pid]; + map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - strcpy(comm, saved_cmdlines[map]); + strcpy(comm, get_saved_cmdlines(map)); else strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + __trace_find_cmdline(pid, comm); arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); @@ -1541,9 +1578,8 @@ void tracing_record_cmdline(struct task_struct *tsk) if (!__this_cpu_read(trace_cmdline_save)) return; - __this_cpu_write(trace_cmdline_save, false); - - trace_save_cmdline(tsk); + if (trace_save_cmdline(tsk)) + __this_cpu_write(trace_cmdline_save, false); } void @@ -1746,7 +1782,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ barrier(); if (use_stack == 1) { - trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; + trace.entries = this_cpu_ptr(ftrace_stack.calls); trace.max_entries = FTRACE_STACK_MAX_ENTRIES; if (regs) @@ -1995,7 +2031,21 @@ void trace_printk_init_buffers(void) if (alloc_percpu_trace_buffer()) return; - pr_info("ftrace: Allocated trace_printk buffers\n"); + /* trace_printk() is for debug use only. Don't use it in production. */ + + pr_warning("\n**********************************************************\n"); + pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warning("** **\n"); + pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warning("** **\n"); + pr_warning("** This means that this is a DEBUG kernel and it is **\n"); + pr_warning("** unsafe for produciton use. **\n"); + pr_warning("** **\n"); + pr_warning("** If you see this message and you are not debugging **\n"); + pr_warning("** the kernel, report this immediately to your vendor! 
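The trace_save_cmdline() hunk above keeps two mappings inside the new saved_cmdlines_buffer, pid to slot and slot to pid, and clears the stale reverse entry before a slot is recycled so an old pid can never resolve to a new task's comm. A rough user-space model of that bookkeeping; the sizes and names here are made up for the demo, while the real buffer holds cmdline_num entries of TASK_COMM_LEN bytes:

#include <stdio.h>
#include <string.h>

#define PID_MAX		32768	/* stands in for PID_MAX_DEFAULT        */
#define NCMD		4	/* stands in for savedcmd->cmdline_num  */
#define COMM_LEN	16	/* TASK_COMM_LEN                        */
#define NO_MAP		0xffffffffu

static unsigned int pid_to_slot[PID_MAX + 1];
static unsigned int slot_to_pid[NCMD];
static char saved_comm[NCMD][COMM_LEN];
static int slot_idx;

static void save_cmdline(unsigned int pid, const char *comm)
{
	unsigned int idx = pid_to_slot[pid];

	if (idx == NO_MAP) {
		idx = (slot_idx + 1) % NCMD;
		/* Recycling a slot: drop the old pid's forward mapping
		 * so it cannot resolve to the new comm. */
		if (slot_to_pid[idx] != NO_MAP)
			pid_to_slot[slot_to_pid[idx]] = NO_MAP;
		slot_to_pid[idx] = pid;
		pid_to_slot[pid] = idx;
		slot_idx = idx;
	}
	strncpy(saved_comm[idx], comm, COMM_LEN - 1);
}

int main(void)
{
	memset(pid_to_slot, 0xff, sizeof(pid_to_slot));
	memset(slot_to_pid, 0xff, sizeof(slot_to_pid));

	save_cmdline(100, "bash");
	save_cmdline(200, "sshd");
	printf("pid 100 -> %s\n", saved_comm[pid_to_slot[100]]);
	return 0;
}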
**\n"); + pr_warning("** **\n"); + pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warning("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); @@ -3333,7 +3383,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); local_irq_disable(); - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are @@ -3350,7 +3400,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); } } - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&tr->max_lock); local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); @@ -3592,6 +3642,7 @@ static const char readme_msg[] = " trace_options\t\t- Set format or modify how tracing happens\n" "\t\t\t Disable an option by adding a suffix 'no' to the\n" "\t\t\t option name\n" + " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE "\n available_filter_functions - list of functions that can be filtered on\n" " set_ftrace_filter\t- echo function name in here to only trace these\n" @@ -3705,55 +3756,153 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) +{ + unsigned int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; + ptr++) { + if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) + continue; + + return ptr; + } + + return NULL; +} + +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + v = &savedcmd->map_cmdline_to_pid[0]; + while (l <= *pos) { + v = saved_cmdlines_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int saved_cmdlines_show(struct seq_file *m, void *v) +{ + char buf[TASK_COMM_LEN]; + unsigned int *pid = v; + + __trace_find_cmdline(*pid, buf); + seq_printf(m, "%d %s\n", *pid, buf); + return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { + .start = saved_cmdlines_start, + .next = saved_cmdlines_next, + .stop = saved_cmdlines_stop, + .show = saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_saved_cmdlines_seq_ops); +} + +static const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_saved_cmdlines_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static ssize_t -tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) +tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) { - char *buf_comm; - char *file_buf; - char *buf; - int len = 0; - int pid; - int i; + char buf[64]; + int r; + + arch_spin_lock(&trace_cmdline_lock); + r = sprintf(buf, "%u\n", savedcmd->cmdline_num); + arch_spin_unlock(&trace_cmdline_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void 
free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ + kfree(s->saved_cmdlines); + kfree(s->map_cmdline_to_pid); + kfree(s); +} + +static int tracing_resize_saved_cmdlines(unsigned int val) +{ + struct saved_cmdlines_buffer *s, *savedcmd_temp; - file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); - if (!file_buf) + s = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); + if (!s) return -ENOMEM; - buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); - if (!buf_comm) { - kfree(file_buf); + if (allocate_cmdlines_buffer(val, s) < 0) { + kfree(s); return -ENOMEM; } - buf = file_buf; + arch_spin_lock(&trace_cmdline_lock); + savedcmd_temp = savedcmd; + savedcmd = s; + arch_spin_unlock(&trace_cmdline_lock); + free_saved_cmdlines_buffer(savedcmd_temp); - for (i = 0; i < SAVED_CMDLINES; i++) { - int r; + return 0; +} - pid = map_cmdline_to_pid[i]; - if (pid == -1 || pid == NO_CMDLINE_MAP) - continue; +static ssize_t +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; - trace_find_cmdline(pid, buf_comm); - r = sprintf(buf, "%d %s\n", pid, buf_comm); - buf += r; - len += r; - } + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; - len = simple_read_from_buffer(ubuf, cnt, ppos, - file_buf, len); + /* must have at least 1 entry or less than PID_MAX_DEFAULT */ + if (!val || val > PID_MAX_DEFAULT) + return -EINVAL; - kfree(file_buf); - kfree(buf_comm); + ret = tracing_resize_saved_cmdlines((unsigned int)val); + if (ret < 0) + return ret; - return len; + *ppos += cnt; + + return cnt; } -static const struct file_operations tracing_saved_cmdlines_fops = { - .open = tracing_open_generic, - .read = tracing_saved_cmdlines_read, - .llseek = generic_file_llseek, +static const struct file_operations tracing_saved_cmdlines_size_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_size_read, + .write = tracing_saved_cmdlines_size_write, }; static ssize_t @@ -4225,25 +4374,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) return trace_poll(iter, filp, poll_table); } -/* - * This is a make-shift waitqueue. - * A tracer might use this callback on some rare cases: - * - * 1) the current tracer might hold the runqueue lock when it wakes up - * a reader, hence a deadlock (sched, function, and function graph tracers) - * 2) the function tracers, trace all functions, we don't want - * the overhead of calling wake_up and friends - * (and tracing them too) - * - * Anyway, this is really very primitive wakeup. - */ -void poll_wait_pipe(struct trace_iterator *iter) -{ - set_current_state(TASK_INTERRUPTIBLE); - /* sleep for 100 msecs, and try again. */ - schedule_timeout(HZ / 10); -} - /* Must be called with trace_types_lock mutex held. */ static int tracing_wait_pipe(struct file *filp) { @@ -4255,15 +4385,6 @@ static int tracing_wait_pipe(struct file *filp) return -EAGAIN; } - mutex_unlock(&iter->mutex); - - iter->trace->wait_pipe(iter); - - mutex_lock(&iter->mutex); - - if (signal_pending(current)) - return -EINTR; - /* * We block until we read something and tracing is disabled. 
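tracing_resize_saved_cmdlines() above never resizes in place: it builds a complete replacement buffer, swaps the savedcmd pointer while holding trace_cmdline_lock, and frees the old buffer only after dropping the lock. Writing a count into the new saved_cmdlines_size file, for example echo 1024 into it from the tracing debugfs directory, takes exactly that path. A generic sketch of the swap, with a pthread mutex standing in for the arch spinlock and resize_shared()/shared_buf as illustrative names:

#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

static pthread_mutex_t shared_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int *shared_buf;
static unsigned int shared_len;

/* Allocate the replacement first; only the pointer swap happens under
 * the lock, and the old buffer is freed after unlocking. */
static int resize_shared(unsigned int new_len)
{
	unsigned int *nbuf, *old;

	nbuf = calloc(new_len, sizeof(*nbuf));
	if (!nbuf)
		return -1;

	pthread_mutex_lock(&shared_lock);
	old = shared_buf;          /* readers take the same lock, so they  */
	shared_buf = nbuf;         /* never observe the buffer being freed */
	shared_len = new_len;
	pthread_mutex_unlock(&shared_lock);

	free(old);                 /* old contents are simply discarded */
	return 0;
}

int main(void)
{
	resize_shared(128);
	resize_shared(1024);
	printf("len=%u\n", shared_len);
	free(shared_buf);
	return 0;
}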
* We still block if tracing is disabled, but we have never @@ -4275,6 +4396,15 @@ static int tracing_wait_pipe(struct file *filp) */ if (!tracing_is_on() && iter->pos) break; + + mutex_unlock(&iter->mutex); + + wait_on_pipe(iter); + + mutex_lock(&iter->mutex); + + if (signal_pending(current)) + return -EINTR; } return 1; @@ -5197,7 +5327,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, goto out_unlock; } mutex_unlock(&trace_types_lock); - iter->trace->wait_pipe(iter); + wait_on_pipe(iter); mutex_lock(&trace_types_lock); if (signal_pending(current)) { size = -EINTR; @@ -5408,7 +5538,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, goto out; } mutex_unlock(&trace_types_lock); - iter->trace->wait_pipe(iter); + wait_on_pipe(iter); mutex_lock(&trace_types_lock); if (signal_pending(current)) { ret = -EINTR; @@ -6102,6 +6232,25 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) return 0; } +static void free_trace_buffers(struct trace_array *tr) +{ + if (!tr) + return; + + if (tr->trace_buffer.buffer) { + ring_buffer_free(tr->trace_buffer.buffer); + tr->trace_buffer.buffer = NULL; + free_percpu(tr->trace_buffer.data); + } + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->max_buffer.buffer) { + ring_buffer_free(tr->max_buffer.buffer); + tr->max_buffer.buffer = NULL; + } +#endif +} + static int new_instance_create(const char *name) { struct trace_array *tr; @@ -6131,6 +6280,8 @@ static int new_instance_create(const char *name) raw_spin_lock_init(&tr->start_lock); + tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + tr->current_trace = &nop_trace; INIT_LIST_HEAD(&tr->systems); @@ -6158,8 +6309,7 @@ static int new_instance_create(const char *name) return 0; out_free_tr: - if (tr->trace_buffer.buffer) - ring_buffer_free(tr->trace_buffer.buffer); + free_trace_buffers(tr); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -6199,8 +6349,7 @@ static int instance_delete(const char *name) event_trace_del_tracer(tr); ftrace_destroy_function_files(tr); debugfs_remove_recursive(tr->dir); - free_percpu(tr->trace_buffer.data); - ring_buffer_free(tr->trace_buffer.buffer); + free_trace_buffers(tr); kfree(tr->name); kfree(tr); @@ -6328,6 +6477,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); +#ifdef CONFIG_TRACER_MAX_TRACE + trace_create_file("tracing_max_latency", 0644, d_tracer, + &tr->max_latency, &tracing_max_lat_fops); +#endif + if (ftrace_create_function_files(tr, d_tracer)) WARN(1, "Could not allocate function filter files"); @@ -6353,11 +6507,6 @@ static __init int tracer_init_debugfs(void) init_tracer_debugfs(&global_trace, d_tracer); -#ifdef CONFIG_TRACER_MAX_TRACE - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, &tracing_max_lat_fops); -#endif - trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); @@ -6367,6 +6516,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("saved_cmdlines", 0444, d_tracer, NULL, &tracing_saved_cmdlines_fops); + trace_create_file("saved_cmdlines_size", 0644, d_tracer, + NULL, &tracing_saved_cmdlines_size_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); @@ -6603,18 +6755,19 @@ __init static int tracer_alloc_buffers(void) if (!temp_buffer) goto out_free_cpumask; + if (trace_create_savedcmd() < 0) + goto out_free_temp_buffer; + /* 
TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); - goto out_free_temp_buffer; + goto out_free_savedcmd; } if (global_trace.buffer_disabled) tracing_off(); - trace_init_cmdlines(); - if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) @@ -6629,6 +6782,10 @@ __init static int tracer_alloc_buffers(void) */ global_trace.current_trace = &nop_trace; + global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + + ftrace_init_global_array_ops(&global_trace); + register_tracer(&nop_trace); /* All seems OK, enable tracing */ @@ -6656,13 +6813,11 @@ __init static int tracer_alloc_buffers(void) return 0; +out_free_savedcmd: + free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: ring_buffer_free(temp_buffer); out_free_cpumask: - free_percpu(global_trace.trace_buffer.data); -#ifdef CONFIG_TRACER_MAX_TRACE - free_percpu(global_trace.max_buffer.data); -#endif free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2e29d7ba5a52..9e82551dd566 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -190,7 +190,22 @@ struct trace_array { */ struct trace_buffer max_buffer; bool allocated_snapshot; + unsigned long max_latency; #endif + /* + * max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a arch_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + * + * It is also used in other places outside the update_max_tr + * so it needs to be defined outside of the + * CONFIG_TRACER_MAX_TRACE. 
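tracer_alloc_buffers() above gains one more rung in its unwind ladder: the new out_free_savedcmd label sits between the ring-buffer failure check and out_free_temp_buffer so resources are released in reverse order of allocation. The shape of that idiom, reduced to a self-contained sketch where a, b and c are placeholder allocations, not kernel objects:

#include <stdlib.h>
#include <stdio.h>

static int setup(void)
{
	char *a, *b, *c;

	a = malloc(16);
	if (!a)
		return -1;

	b = malloc(16);
	if (!b)
		goto out_free_a;

	c = malloc(16);
	if (!c)
		goto out_free_b;

	/* success: in the real code the buffers live on; this demo exits
	 * immediately, so release them here as well */
	free(c);
	free(b);
	free(a);
	return 0;

out_free_b:
	free(b);
out_free_a:
	free(a);
	return -1;
}

int main(void)
{
	printf("setup: %d\n", setup());
	return 0;
}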
+ */ + arch_spinlock_t max_lock; int buffer_disabled; #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; @@ -237,6 +252,9 @@ static inline struct trace_array *top_trace_array(void) { struct trace_array *tr; + if (list_empty(ftrace_trace_arrays.prev)) + return NULL; + tr = list_entry(ftrace_trace_arrays.prev, typeof(*tr), list); WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); @@ -323,7 +341,6 @@ struct tracer_flags { * @stop: called when tracing is paused (echo 0 > tracing_enabled) * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened - * @wait_pipe: override how the user waits for traces on trace_pipe * @close: called when the trace file is released * @pipe_close: called when the trace_pipe file is released * @read: override the default read callback on trace_pipe @@ -342,7 +359,6 @@ struct tracer { void (*stop)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); - void (*wait_pipe)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); void (*pipe_close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, @@ -416,13 +432,7 @@ enum { TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, - /* GLOBAL_BITs must be greater than FTRACE_BITs */ - TRACE_GLOBAL_BIT, - TRACE_GLOBAL_NMI_BIT, - TRACE_GLOBAL_IRQ_BIT, - TRACE_GLOBAL_SIRQ_BIT, - - /* INTERNAL_BITs must be greater than GLOBAL_BITs */ + /* INTERNAL_BITs must be greater than FTRACE_BITs */ TRACE_INTERNAL_BIT, TRACE_INTERNAL_NMI_BIT, TRACE_INTERNAL_IRQ_BIT, @@ -449,9 +459,6 @@ enum { #define TRACE_FTRACE_START TRACE_FTRACE_BIT #define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) -#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT -#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) - #define TRACE_LIST_START TRACE_INTERNAL_BIT #define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) @@ -560,8 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); -void poll_wait_pipe(struct trace_iterator *iter); - void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, @@ -608,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE -extern unsigned long tracing_max_latency; - void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); @@ -724,6 +727,8 @@ extern unsigned long trace_flags; #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 +#define TRACE_GRAPH_PRINT_TAIL 0x80 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -823,6 +828,10 @@ extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent); void ftrace_destroy_function_files(struct trace_array *tr); +void ftrace_init_global_array_ops(struct trace_array *tr); +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); +void ftrace_reset_array_ops(struct trace_array *tr); +int using_ftrace_ops_list_func(void); #else static inline int ftrace_trace_task(struct task_struct *task) { @@ -836,6 +845,11 @@ ftrace_create_function_files(struct trace_array *tr, 
return 0; } static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +static inline __init void +ftrace_init_global_array_ops(struct trace_array *tr) { } +static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +/* ftace_func_t type is not defined, use macro instead of static inline */ +#define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c new file mode 100644 index 000000000000..40a14cbcf8e0 --- /dev/null +++ b/kernel/trace/trace_benchmark.c @@ -0,0 +1,198 @@ +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/trace_clock.h> + +#define CREATE_TRACE_POINTS +#include "trace_benchmark.h" + +static struct task_struct *bm_event_thread; + +static char bm_str[BENCHMARK_EVENT_STRLEN] = "START"; + +static u64 bm_total; +static u64 bm_totalsq; +static u64 bm_last; +static u64 bm_max; +static u64 bm_min; +static u64 bm_first; +static u64 bm_cnt; +static u64 bm_stddev; +static unsigned int bm_avg; +static unsigned int bm_std; + +/* + * This gets called in a loop recording the time it took to write + * the tracepoint. What it writes is the time statistics of the last + * tracepoint write. As there is nothing to write the first time + * it simply writes "START". As the first write is cold cache and + * the rest is hot, we save off that time in bm_first and it is + * reported as "first", which is shown in the second write to the + * tracepoint. The "first" field is writen within the statics from + * then on but never changes. + */ +static void trace_do_benchmark(void) +{ + u64 start; + u64 stop; + u64 delta; + u64 stddev; + u64 seed; + u64 last_seed; + unsigned int avg; + unsigned int std = 0; + + /* Only run if the tracepoint is actually active */ + if (!trace_benchmark_event_enabled()) + return; + + local_irq_disable(); + start = trace_clock_local(); + trace_benchmark_event(bm_str); + stop = trace_clock_local(); + local_irq_enable(); + + bm_cnt++; + + delta = stop - start; + + /* + * The first read is cold cached, keep it separate from the + * other calculations. + */ + if (bm_cnt == 1) { + bm_first = delta; + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "first=%llu [COLD CACHED]", bm_first); + return; + } + + bm_last = delta; + + if (delta > bm_max) + bm_max = delta; + if (!bm_min || delta < bm_min) + bm_min = delta; + + /* + * When bm_cnt is greater than UINT_MAX, it breaks the statistics + * accounting. Freeze the statistics when that happens. + * We should have enough data for the avg and stddev anyway. + */ + if (bm_cnt > UINT_MAX) { + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev); + return; + } + + bm_total += delta; + bm_totalsq += delta * delta; + + + if (bm_cnt > 1) { + /* + * Apply Welford's method to calculate standard deviation: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; + do_div(stddev, (u32)bm_cnt); + do_div(stddev, (u32)bm_cnt - 1); + } else + stddev = 0; + + delta = bm_total; + do_div(delta, bm_cnt); + avg = delta; + + if (stddev > 0) { + int i = 0; + /* + * stddev is the square of standard deviation but + * we want the actualy number. Use the average + * as our seed to find the std. 
+ * + * The next try is: + * x = (x + N/x) / 2 + * + * Where N is the squared number to find the square + * root of. + */ + seed = avg; + do { + last_seed = seed; + seed = stddev; + if (!last_seed) + break; + do_div(seed, last_seed); + seed += last_seed; + do_div(seed, 2); + } while (i++ < 10 && last_seed != seed); + + std = seed; + } + + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, avg, std, stddev); + + bm_std = std; + bm_avg = avg; + bm_stddev = stddev; +} + +static int benchmark_event_kthread(void *arg) +{ + /* sleep a bit to make sure the tracepoint gets activated */ + msleep(100); + + while (!kthread_should_stop()) { + + trace_do_benchmark(); + + /* + * We don't go to sleep, but let others + * run as well. + */ + cond_resched(); + } + + return 0; +} + +/* + * When the benchmark tracepoint is enabled, it calls this + * function and the thread that calls the tracepoint is created. + */ +void trace_benchmark_reg(void) +{ + bm_event_thread = kthread_run(benchmark_event_kthread, + NULL, "event_benchmark"); + WARN_ON(!bm_event_thread); +} + +/* + * When the benchmark tracepoint is disabled, it calls this + * function and the thread that calls the tracepoint is deleted + * and all the numbers are reset. + */ +void trace_benchmark_unreg(void) +{ + if (!bm_event_thread) + return; + + kthread_stop(bm_event_thread); + + strcpy(bm_str, "START"); + bm_total = 0; + bm_totalsq = 0; + bm_last = 0; + bm_max = 0; + bm_min = 0; + bm_cnt = 0; + /* These don't need to be reset but reset them anyway */ + bm_first = 0; + bm_std = 0; + bm_avg = 0; + bm_stddev = 0; +} diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h new file mode 100644 index 000000000000..3c1df1df4e29 --- /dev/null +++ b/kernel/trace/trace_benchmark.h @@ -0,0 +1,41 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM benchmark + +#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BENCHMARK_H + +#include <linux/tracepoint.h> + +extern void trace_benchmark_reg(void); +extern void trace_benchmark_unreg(void); + +#define BENCHMARK_EVENT_STRLEN 128 + +TRACE_EVENT_FN(benchmark_event, + + TP_PROTO(const char *str), + + TP_ARGS(str), + + TP_STRUCT__entry( + __array( char, str, BENCHMARK_EVENT_STRLEN ) + ), + + TP_fast_assign( + memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); + ), + + TP_printk("%s", __entry->str), + + trace_benchmark_reg, trace_benchmark_unreg +); + +#endif /* _TRACE_BENCHMARK_H */ + +#undef TRACE_INCLUDE_FILE +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
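trace_do_benchmark() above keeps only running sums and derives the variance as (n * sum(x_i^2) - (sum x_i)^2) / (n * (n - 1)), then pulls an integer square root out of it with the x = (x + N/x) / 2 iteration seeded by the average. The same arithmetic as a stand-alone user-space program; the sample values are invented, and the kernel only computes the variance once bm_cnt is greater than one:

#include <stdio.h>
#include <stdint.h>

static uint64_t isqrt_newton(uint64_t n, uint64_t seed)
{
	uint64_t x = seed ? seed : 1;
	int i;

	if (!n)
		return 0;
	for (i = 0; i < 10; i++) {
		uint64_t next = (x + n / x) / 2;
		if (next == x)
			break;
		x = next;
	}
	return x;
}

int main(void)
{
	uint64_t samples[] = { 120, 130, 110, 125, 140 };
	uint64_t n = 5, total = 0, totalsq = 0;
	uint64_t i, avg, var;

	for (i = 0; i < n; i++) {
		total += samples[i];
		totalsq += samples[i] * samples[i];
	}
	avg = total / n;
	var = (n * totalsq - total * total) / (n * (n - 1));
	printf("avg=%llu var=%llu std~=%llu\n",
	       (unsigned long long)avg, (unsigned long long)var,
	       (unsigned long long)isqrt_newton(var, avg));
	return 0;
}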
+#define TRACE_INCLUDE_FILE trace_benchmark + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3ddfd8f62c05..f99e0b3bca8c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -574,6 +574,9 @@ int trace_set_clr_event(const char *system, const char *event, int set) { struct trace_array *tr = top_trace_array(); + if (!tr) + return -ENODEV; + return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -2065,6 +2068,9 @@ event_enable_func(struct ftrace_hash *hash, bool enable; int ret; + if (!tr) + return -ENODEV; + /* hash funcs only work with set_ftrace_filter */ if (!enabled || !param) return -EINVAL; @@ -2396,6 +2402,9 @@ static __init int event_trace_enable(void) char *token; int ret; + if (!tr) + return -ENODEV; + for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { call = *iter; @@ -2442,6 +2451,8 @@ static __init int event_trace_init(void) int ret; tr = top_trace_array(); + if (!tr) + return -ENODEV; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -2535,6 +2546,8 @@ static __init void event_trace_self_tests(void) int ret; tr = top_trace_array(); + if (!tr) + return; pr_info("Running tests on trace events:\n"); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index ffd56351b521..57f0ec962d2c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -26,8 +26,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs); -static struct ftrace_ops trace_ops; -static struct ftrace_ops trace_stack_ops; static struct tracer_flags func_flags; /* Our option */ @@ -83,28 +81,24 @@ void ftrace_destroy_function_files(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { - struct ftrace_ops *ops; - - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { - /* There's only one global tr */ - if (!trace_ops.private) { - trace_ops.private = tr; - trace_stack_ops.private = tr; - } + ftrace_func_t func; - if (func_flags.val & TRACE_FUNC_OPT_STACK) - ops = &trace_stack_ops; - else - ops = &trace_ops; - tr->ops = ops; - } else if (!tr->ops) { - /* - * Instance trace_arrays get their ops allocated - * at instance creation. Unless it failed - * the allocation. - */ + /* + * Instance trace_arrays get their ops allocated + * at instance creation. Unless it failed + * the allocation. 
+ */ + if (!tr->ops) return -ENOMEM; - } + + /* Currently only the global instance can do stack tracing */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && + func_flags.val & TRACE_FUNC_OPT_STACK) + func = function_stack_trace_call; + else + func = function_trace_call; + + ftrace_init_array_ops(tr, func); tr->trace_buffer.cpu = get_cpu(); put_cpu(); @@ -118,6 +112,7 @@ static void function_trace_reset(struct trace_array *tr) { tracing_stop_function_trace(tr); tracing_stop_cmdline_record(); + ftrace_reset_array_ops(tr); } static void function_trace_start(struct trace_array *tr) @@ -199,18 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, local_irq_restore(flags); } -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - -static struct ftrace_ops trace_stack_ops __read_mostly = -{ - .func = function_stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, @@ -248,10 +231,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) unregister_ftrace_function(tr->ops); if (set) { - tr->ops = &trace_stack_ops; + tr->ops->func = function_stack_trace_call; register_ftrace_function(tr->ops); } else { - tr->ops = &trace_ops; + tr->ops->func = function_trace_call; register_ftrace_function(tr->ops); } @@ -269,7 +252,6 @@ static struct tracer function_trace __tracer_data = .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, - .wait_pipe = poll_wait_pipe, .flags = &func_flags, .set_flag = func_set_flag, .allow_instances = true, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index deff11200261..4de3e57f723c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -38,15 +38,6 @@ struct fgraph_data { #define TRACE_GRAPH_INDENT 2 -/* Flag options */ -#define TRACE_GRAPH_PRINT_OVERRUN 0x1 -#define TRACE_GRAPH_PRINT_CPU 0x2 -#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 -#define TRACE_GRAPH_PRINT_PROC 0x8 -#define TRACE_GRAPH_PRINT_DURATION 0x10 -#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 -#define TRACE_GRAPH_PRINT_IRQS 0x40 - static unsigned int max_depth; static struct tracer_opt trace_opts[] = { @@ -64,11 +55,13 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, /* Display interrupts */ { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, + /* Display function name after trailing } */ + { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { - /* Don't display overruns and proc by default */ + /* Don't display overruns, proc, or tail by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, .opts = trace_opts @@ -1176,9 +1169,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * If the return function does not have a matching entry, * then the entry was lost. Instead of just printing * the '}' and letting the user guess what function this - * belongs to, write out the function name. + * belongs to, write out the function name. Always do + * that if the funcgraph-tail option is enabled. 
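With the static trace_ops/trace_stack_ops pair gone, func_set_flag() above simply repoints tr->ops->func at function_trace_call or function_stack_trace_call after unregistering the ops. A minimal model of driving one ops structure through a swappable callback; demo_ops, the callbacks and main() are invented for illustration:

#include <stdio.h>

struct demo_ops {
	void (*func)(unsigned long ip, void *private);
	void *private;               /* plays the role of ops->private = tr */
};

static void plain_call(unsigned long ip, void *private)
{
	printf("func %#lx (instance %s)\n", ip, (const char *)private);
}

static void stack_call(unsigned long ip, void *private)
{
	printf("func %#lx (instance %s) + stack trace\n", ip,
	       (const char *)private);
}

int main(void)
{
	struct demo_ops ops = { plain_call, "global" };

	ops.func(0x1000, ops.private);

	/* what func_set_flag() does for TRACE_FUNC_OPT_STACK: unregister,
	 * flip the callback, register again */
	ops.func = stack_call;
	ops.func(0x1000, ops.private);
	return 0;
}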
*/ - if (func_match) { + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { ret = trace_seq_puts(s, "}\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1505,7 +1499,6 @@ static struct tracer graph_trace __tracer_data = { .pipe_open = graph_trace_open, .close = graph_trace_close, .pipe_close = graph_trace_close, - .wait_pipe = poll_wait_pipe, .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 8ff02cbb892f..9bb104f748d0 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -151,12 +151,6 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = irqsoff_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -176,7 +170,7 @@ irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) for_each_possible_cpu(cpu) per_cpu(tracing_cpu, cpu) = 0; - tracing_max_latency = 0; + tr->max_latency = 0; tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); return start_irqsoff_tracer(irqsoff_trace, set); @@ -303,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? */ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) return 0; } else { - if (delta <= tracing_max_latency) + if (delta <= tr->max_latency) return 0; } return 1; @@ -333,13 +327,13 @@ check_critical_timing(struct trace_array *tr, pc = preempt_count(); - if (!report_latency(delta)) + if (!report_latency(tr, delta)) goto out; raw_spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ - if (!report_latency(delta)) + if (!report_latency(tr, delta)) goto out_unlock; __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); @@ -352,7 +346,7 @@ check_critical_timing(struct trace_array *tr, data->critical_end = parent_ip; if (likely(!is_tracing_stopped())) { - tracing_max_latency = delta; + tr->max_latency = delta; update_max_tr_single(tr, current, cpu); } @@ -531,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static int register_irqsoff_function(int graph, int set) +static int register_irqsoff_function(struct trace_array *tr, int graph, int set) { int ret; @@ -543,7 +537,7 @@ static int register_irqsoff_function(int graph, int set) ret = register_ftrace_graph(&irqsoff_graph_return, &irqsoff_graph_entry); else - ret = register_ftrace_function(&trace_ops); + ret = register_ftrace_function(tr->ops); if (!ret) function_enabled = true; @@ -551,7 +545,7 @@ static int register_irqsoff_function(int graph, int set) return ret; } -static void unregister_irqsoff_function(int graph) +static void unregister_irqsoff_function(struct trace_array *tr, int graph) { if (!function_enabled) return; @@ -559,17 +553,17 @@ static void unregister_irqsoff_function(int graph) if (graph) unregister_ftrace_graph(); else - unregister_ftrace_function(&trace_ops); + unregister_ftrace_function(tr->ops); function_enabled = false; } -static void irqsoff_function_set(int set) +static void irqsoff_function_set(struct trace_array *tr, int set) { if (set) - register_irqsoff_function(is_graph(), 1); + register_irqsoff_function(tr, is_graph(), 1); else - 
unregister_irqsoff_function(is_graph()); + unregister_irqsoff_function(tr, is_graph()); } static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -577,7 +571,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) struct tracer *tracer = tr->current_trace; if (mask & TRACE_ITER_FUNCTION) - irqsoff_function_set(set); + irqsoff_function_set(tr, set); return trace_keep_overwrite(tracer, mask, set); } @@ -586,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph) { int ret; - ret = register_irqsoff_function(graph, 0); + ret = register_irqsoff_function(tr, graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -600,25 +594,37 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_irqsoff_function(graph); + unregister_irqsoff_function(tr, graph); } -static void __irqsoff_tracer_init(struct trace_array *tr) +static bool irqsoff_busy; + +static int __irqsoff_tracer_init(struct trace_array *tr) { + if (irqsoff_busy) + return -EBUSY; + save_flags = trace_flags; /* non overwrite screws up the latency tracers */ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); - tracing_max_latency = 0; + tr->max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); tracing_reset_online_cpus(&tr->trace_buffer); - if (start_irqsoff_tracer(tr, is_graph())) + ftrace_init_array_ops(tr, irqsoff_tracer_call); + + /* Only toplevel instance supports graph tracing */ + if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && + is_graph()))) printk(KERN_ERR "failed to start irqsoff tracer\n"); + + irqsoff_busy = true; + return 0; } static void irqsoff_tracer_reset(struct trace_array *tr) @@ -630,6 +636,9 @@ static void irqsoff_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); + + irqsoff_busy = false; } static void irqsoff_tracer_start(struct trace_array *tr) @@ -647,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer irqsoff_tracer __read_mostly = { @@ -668,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .allow_instances = true, .use_max_tr = true, }; # define register_irqsoff(trace) register_tracer(&trace) @@ -680,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer preemptoff_tracer __read_mostly = @@ -702,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .allow_instances = true, .use_max_tr = true, }; # define register_preemptoff(trace) register_tracer(&trace) @@ -716,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; - __irqsoff_tracer_init(tr); - return 0; + return __irqsoff_tracer_init(tr); } static struct tracer preemptirqsoff_tracer __read_mostly = @@ -738,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .allow_instances = true, .use_max_tr = true, }; 
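The latency tracer changes above consult the per-instance tr->max_latency: report_latency() accepts a new sample when it beats the recorded maximum, unless tracing_thresh is set, in which case anything at or above the threshold is reported regardless of the maximum. The decision in isolation, written as a plain function with invented names rather than the kernel signature:

#include <stdio.h>
#include <stdint.h>

/* thresh == 0 means "track the maximum", mirroring tracing_thresh. */
static int should_report(uint64_t thresh, uint64_t max_so_far, uint64_t delta)
{
	if (thresh)
		return delta >= thresh;
	return delta > max_so_far;
}

int main(void)
{
	printf("%d %d %d\n",
	       should_report(0, 100, 150),    /* new maximum -> 1 */
	       should_report(0, 100, 80),     /* below max   -> 0 */
	       should_report(50, 100, 60));   /* over thresh -> 1 */
	return 0;
}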
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 903ae28962be..ef2fba1f46b5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1377,6 +1377,9 @@ static __init int kprobe_trace_self_tests_init(void) struct trace_kprobe *tk; struct ftrace_event_file *file; + if (tracing_is_disabled()) + return -ENODEV; + target = kprobe_trace_selftest_target; pr_info("Testing kprobe tracing: "); diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 69a5cc94c01a..fcf0a9e48916 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -91,7 +91,6 @@ struct tracer nop_trace __read_mostly = .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, - .wait_pipe = poll_wait_pipe, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a436de18aa99..f3dad80c20b2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -126,6 +126,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) EXPORT_SYMBOL_GPL(trace_seq_printf); /** + * trace_seq_bitmask - put a list of longs as a bitmask print output + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * + * Writes a ASCII representation of a bitmask string into @s. + */ +int +trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (s->full || !len) + return 0; + + ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); + s->len += ret; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + +/** * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor * @fmt: printf format string @@ -399,6 +427,19 @@ EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); #endif const char * +ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size) +{ + const char *ret = p->buffer + p->len; + + trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); + +const char * ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) { int i; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e14da5e97a69..19bd8928ce94 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, atomic_dec(&data->disabled); preempt_enable_notrace(); } - -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = wakeup_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; #endif /* CONFIG_FUNCTION_TRACER */ -static int register_wakeup_function(int graph, int set) +static int register_wakeup_function(struct trace_array *tr, int graph, int set) { int ret; @@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set) ret = register_ftrace_graph(&wakeup_graph_return, &wakeup_graph_entry); else - ret = register_ftrace_function(&trace_ops); + ret = register_ftrace_function(tr->ops); if (!ret) function_enabled = true; @@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set) return ret; } -static void unregister_wakeup_function(int graph) 
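The new trace_seq_bitmask() above appends a bitmask rendering into whatever space is left in a trace_seq, and ftrace_print_bitmask_seq() hands the resulting string back to the event formatter. A rough user-space equivalent of that append-into-a-fixed-buffer pattern, with a simple hex rendering standing in for bitmap_scnprintf(); struct demo_seq and seq_bitmask are invented names:

#include <stdio.h>

#define SEQ_SIZE 64

struct demo_seq {
	char buf[SEQ_SIZE];
	int len;
};

/* Returns 1 if something was written, 0 if the buffer had no room,
 * mirroring the 0/1 convention of trace_seq_bitmask(). */
static int seq_bitmask(struct demo_seq *s, unsigned long mask)
{
	int room = (SEQ_SIZE - 1) - s->len;
	int ret;

	if (room <= 0)
		return 0;

	ret = snprintf(s->buf + s->len, room + 1, "%lx", mask);
	if (ret > room)
		ret = room;
	s->len += ret;
	return 1;
}

int main(void)
{
	struct demo_seq s = { { 0 }, 0 };

	seq_bitmask(&s, 0xf0fUL);
	printf("%s (len=%d)\n", s.buf, s.len);
	return 0;
}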
+static void unregister_wakeup_function(struct trace_array *tr, int graph) { if (!function_enabled) return; @@ -166,17 +160,17 @@ static void unregister_wakeup_function(int graph) if (graph) unregister_ftrace_graph(); else - unregister_ftrace_function(&trace_ops); + unregister_ftrace_function(tr->ops); function_enabled = false; } -static void wakeup_function_set(int set) +static void wakeup_function_set(struct trace_array *tr, int set) { if (set) - register_wakeup_function(is_graph(), 1); + register_wakeup_function(tr, is_graph(), 1); else - unregister_wakeup_function(is_graph()); + unregister_wakeup_function(tr, is_graph()); } static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -184,16 +178,16 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) struct tracer *tracer = tr->current_trace; if (mask & TRACE_ITER_FUNCTION) - wakeup_function_set(set); + wakeup_function_set(tr, set); return trace_keep_overwrite(tracer, mask, set); } -static int start_func_tracer(int graph) +static int start_func_tracer(struct trace_array *tr, int graph) { int ret; - ret = register_wakeup_function(graph, 0); + ret = register_wakeup_function(tr, graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -203,11 +197,11 @@ static int start_func_tracer(int graph) return ret; } -static void stop_func_tracer(int graph) +static void stop_func_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - unregister_wakeup_function(graph); + unregister_wakeup_function(tr, graph); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -221,12 +215,12 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!(is_graph() ^ set)) return 0; - stop_func_tracer(!set); + stop_func_tracer(tr, !set); wakeup_reset(wakeup_trace); - tracing_max_latency = 0; + tr->max_latency = 0; - return start_func_tracer(set); + return start_func_tracer(tr, set); } static int wakeup_graph_entry(struct ftrace_graph_ent *trace) @@ -350,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s) /* * Should this new latency be reported/recorded? 
*/ -static int report_latency(cycle_t delta) +static int report_latency(struct trace_array *tr, cycle_t delta) { if (tracing_thresh) { if (delta < tracing_thresh) return 0; } else { - if (delta <= tracing_max_latency) + if (delta <= tr->max_latency) return 0; } return 1; @@ -424,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore, T1 = ftrace_now(cpu); delta = T1-T0; - if (!report_latency(delta)) + if (!report_latency(wakeup_trace, delta)) goto out_unlock; if (likely(!is_tracing_stopped())) { - tracing_max_latency = delta; + wakeup_trace->max_latency = delta; update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); } @@ -587,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - if (start_func_tracer(is_graph())) + if (start_func_tracer(tr, is_graph())) printk(KERN_ERR "failed to start wakeup tracer\n"); return; @@ -600,13 +594,15 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - stop_func_tracer(is_graph()); + stop_func_tracer(tr, is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); } +static bool wakeup_busy; + static int __wakeup_tracer_init(struct trace_array *tr) { save_flags = trace_flags; @@ -615,14 +611,20 @@ static int __wakeup_tracer_init(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); - tracing_max_latency = 0; + tr->max_latency = 0; wakeup_trace = tr; + ftrace_init_array_ops(tr, wakeup_tracer_call); start_wakeup_tracer(tr); + + wakeup_busy = true; return 0; } static int wakeup_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + wakeup_dl = 0; wakeup_rt = 0; return __wakeup_tracer_init(tr); @@ -630,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr) static int wakeup_rt_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + wakeup_dl = 0; wakeup_rt = 1; return __wakeup_tracer_init(tr); @@ -637,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) static int wakeup_dl_tracer_init(struct trace_array *tr) { + if (wakeup_busy) + return -EBUSY; + wakeup_dl = 1; wakeup_rt = 0; return __wakeup_tracer_init(tr); @@ -653,6 +661,8 @@ static void wakeup_tracer_reset(struct trace_array *tr) set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); + wakeup_busy = false; } static void wakeup_tracer_start(struct trace_array *tr) @@ -684,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, + .allow_instances = true, .use_max_tr = true, }; @@ -694,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .wait_pipe = poll_wait_pipe, .print_max = true, .print_header = wakeup_print_header, .print_line = wakeup_print_line, @@ -706,6 +716,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, + .allow_instances = true, .use_max_tr = true, }; @@ -716,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .wait_pipe = poll_wait_pipe, .print_max = true, .print_header = 
wakeup_print_header, .print_line = wakeup_print_line, diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index e98fca60974f..5ef60499dc8e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) /* Don't allow flipping of max traces now */ local_irq_save(flags); - arch_spin_lock(&ftrace_max_lock); + arch_spin_lock(&buf->tr->max_lock); cnt = ring_buffer_entries(buf->buffer); @@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) break; } tracing_on(); - arch_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&buf->tr->max_lock); local_irq_restore(flags); if (count) @@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = { .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; -static struct ftrace_ops test_global = { - .func = trace_selftest_test_global_func, - .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, -}; - static void print_counts(void) { printk("(%d %d %d %d %d) ", @@ -185,7 +180,7 @@ static void reset_counts(void) trace_selftest_test_dyn_cnt = 0; } -static int trace_selftest_ops(int cnt) +static int trace_selftest_ops(struct trace_array *tr, int cnt) { int save_ftrace_enabled = ftrace_enabled; struct ftrace_ops *dyn_ops; @@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt) register_ftrace_function(&test_probe1); register_ftrace_function(&test_probe2); register_ftrace_function(&test_probe3); - register_ftrace_function(&test_global); + /* First time we are running with main function */ + if (cnt > 1) { + ftrace_init_array_ops(tr, trace_selftest_test_global_func); + register_ftrace_function(tr->ops); + } DYN_FTRACE_TEST_NAME(); @@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt) goto out; if (trace_selftest_test_probe3_cnt != 1) goto out; - if (trace_selftest_test_global_cnt == 0) - goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } DYN_FTRACE_TEST_NAME2(); @@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt) goto out_free; if (trace_selftest_test_probe3_cnt != 3) goto out_free; - if (trace_selftest_test_global_cnt == 0) - goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } if (trace_selftest_test_dyn_cnt == 0) goto out_free; @@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt) unregister_ftrace_function(&test_probe1); unregister_ftrace_function(&test_probe2); unregister_ftrace_function(&test_probe3); - unregister_ftrace_function(&test_global); + if (cnt > 1) + unregister_ftrace_function(tr->ops); + ftrace_reset_array_ops(tr); /* Make sure everything is off */ reset_counts(); @@ -315,9 +320,9 @@ static int trace_selftest_ops(int cnt) } /* Test dynamic code modification and ftrace filters */ -int trace_selftest_startup_dynamic_tracing(struct tracer *trace, - struct trace_array *tr, - int (*func)(void)) +static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) { int save_ftrace_enabled = ftrace_enabled; unsigned long count; @@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, } /* Test the ops with global tracing running */ - ret = trace_selftest_ops(1); + ret = trace_selftest_ops(tr, 1); trace->reset(tr); out: @@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* Test the ops with global tracing off */ if (!ret) - ret = trace_selftest_ops(2); + ret = trace_selftest_ops(tr, 2); 
return ret; } @@ -802,7 +807,7 @@ out: int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -814,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable interrupts for a bit */ local_irq_disable(); udelay(100); @@ -841,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) ret = -1; } - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -851,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -876,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable preemption for a bit */ preempt_disable(); udelay(100); @@ -903,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) ret = -1; } - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -913,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) int trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; unsigned long count; int ret; @@ -938,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; /* disable preemption and interrupts for a bit */ preempt_disable(); @@ -973,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* do the test by disabling interrupts first this time */ - tracing_max_latency = 0; + tr->max_latency = 0; tracing_start(); trace->start(tr); @@ -1004,7 +1009,7 @@ out: tracing_start(); out_no_start: trace->reset(tr); - tracing_max_latency = save_max; + tr->max_latency = save_max; return ret; } @@ -1057,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data) int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) { - unsigned long save_max = tracing_max_latency; + unsigned long save_max = tr->max_latency; struct task_struct *p; struct completion is_ready; unsigned long count; @@ -1083,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) } /* reset the max latency */ - tracing_max_latency = 0; + tr->max_latency = 0; while (p->on_rq) { /* @@ -1113,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) trace->reset(tr); tracing_start(); - tracing_max_latency = save_max; + tr->max_latency = save_max; /* kill the thread */ kthread_stop(p); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 21b320e5d163..8a4e5cb66a4c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -51,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; +static inline void print_max_stack(void) +{ + long i; + int size; + + pr_emerg(" Depth Size Location (%d entries)\n" + " 
----- ---- --------\n", + max_stack_trace.nr_entries - 1); + + for (i = 0; i < max_stack_trace.nr_entries; i++) { + if (stack_dump_trace[i] == ULONG_MAX) + break; + if (i+1 == max_stack_trace.nr_entries || + stack_dump_trace[i+1] == ULONG_MAX) + size = stack_dump_index[i]; + else + size = stack_dump_index[i] - stack_dump_index[i+1]; + + pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i], + size, (void *)stack_dump_trace[i]); + } +} + static inline void check_stack(unsigned long ip, unsigned long *stack) { - unsigned long this_size, flags; - unsigned long *p, *top, *start; + unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; int frame_size = ACCESS_ONCE(tracer_frame); int i; @@ -85,8 +107,12 @@ check_stack(unsigned long ip, unsigned long *stack) max_stack_size = this_size; - max_stack_trace.nr_entries = 0; - max_stack_trace.skip = 3; + max_stack_trace.nr_entries = 0; + + if (using_ftrace_ops_list_func()) + max_stack_trace.skip = 4; + else + max_stack_trace.skip = 3; save_stack_trace(&max_stack_trace); @@ -145,8 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - BUG_ON(current != &init_task && - *(end_of_stack(current)) != STACK_END_MAGIC); + if ((current != &init_task && + *(end_of_stack(current)) != STACK_END_MAGIC)) { + print_max_stack(); + BUG(); + } + out: arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 6620e5837ce2..33cbd8c203f8 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, * tracepoint_probe_register - Connect a probe to a tracepoint * @tp: tracepoint * @probe: probe handler + * @data: tracepoint data * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for @@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register); * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @tp: tracepoint * @probe: probe function pointer + * @data: tracepoint data * * Returns 0 if ok, error value on error. */ diff --git a/kernel/user.c b/kernel/user.c index 294fc6a94168..4efa39350e44 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock); struct user_struct root_user = { .__count = ATOMIC_INIT(1), .processes = ATOMIC_INIT(1), - .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index bf71b4b2d632..fcc02560fd6b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -286,7 +286,7 @@ EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in - * @uid: group identifier + * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. 
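print_max_stack() above turns the cumulative depths recorded in stack_dump_index[] into per-frame sizes: each frame's size is its remaining depth minus the depth left at the next frame, and the final frame simply reports its own depth. Recomputing that from a captured table in plain C; the sample addresses and depths are fabricated:

#include <stdio.h>

int main(void)
{
	/* remaining stack depth at each traced function, deepest first */
	unsigned long addr[] = { 0xffffffff8110a0c0UL, 0xffffffff81109f10UL,
				 0xffffffff81002b40UL };
	int depth[] = { 464, 336, 48 };
	int n = 3, i, size;

	printf(" Depth    Size   Location\n");
	for (i = 0; i < n; i++) {
		size = (i + 1 == n) ? depth[i] : depth[i] - depth[i + 1];
		printf("%3d) %8d %5d 0x%lx\n", i, depth[i], size, addr[i]);
	}
	return 0;
}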
@@ -482,7 +482,8 @@ static int projid_m_show(struct seq_file *seq, void *v) return 0; } -static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) +static void *m_start(struct seq_file *seq, loff_t *ppos, + struct uid_gid_map *map) { struct uid_gid_extent *extent = NULL; loff_t pos = *ppos; @@ -546,7 +547,8 @@ struct seq_operations proc_projid_seq_operations = { .show = projid_m_show, }; -static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) +static bool mappings_overlap(struct uid_gid_map *new_map, + struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; @@ -653,7 +655,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = -EINVAL; pos = kbuf; new_map.nr_extents = 0; - for (;pos; pos = next_line) { + for (; pos; pos = next_line) { extent = &new_map.extent[new_map.nr_extents]; /* Find the end of line and ensure I don't look past it */ @@ -687,13 +689,16 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Verify we have been given valid starting values */ if ((extent->first == (u32) -1) || - (extent->lower_first == (u32) -1 )) + (extent->lower_first == (u32) -1)) goto out; - /* Verify count is not zero and does not cause the extent to wrap */ + /* Verify count is not zero and does not cause the + * extent to wrap + */ if ((extent->first + extent->count) <= extent->first) goto out; - if ((extent->lower_first + extent->count) <= extent->lower_first) + if ((extent->lower_first + extent->count) <= + extent->lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ @@ -751,7 +756,8 @@ out: return ret; } -ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_uid_map_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; @@ -767,7 +773,8 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz &ns->uid_map, &ns->parent->uid_map); } -ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_gid_map_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; @@ -783,7 +790,8 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz &ns->gid_map, &ns->parent->gid_map); } -ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +ssize_t proc_projid_map_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; @@ -800,7 +808,7 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t &ns->projid_map, &ns->parent->projid_map); } -static bool new_idmap_permitted(const struct file *file, +static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { @@ -811,8 +819,7 @@ static bool new_idmap_permitted(const struct file *file, kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, file->f_cred->fsuid)) return true; - } - else if (cap_setid == CAP_SETGID) { + } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (gid_eq(gid, file->f_cred->fsgid)) return true; diff --git a/kernel/utsname_sysctl.c 
b/kernel/utsname_sysctl.c index 4f69f9a5e221..c8eac43267e9 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -17,7 +17,7 @@ #ifdef CONFIG_PROC_SYSCTL -static void *get_uts(ctl_table *table, int write) +static void *get_uts(struct ctl_table *table, int write) { char *which = table->data; struct uts_namespace *uts_ns; @@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write) return which; } -static void put_uts(ctl_table *table, int write, void *which) +static void put_uts(struct ctl_table *table, int write, void *which) { if (!write) up_read(&uts_sem); @@ -44,14 +44,14 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, +static int proc_do_uts_string(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,buffer,lenp, ppos); + r = proc_dostring(&uts_table, write, buffer, lenp, ppos); put_uts(table, write, uts_table.data); if (write) @@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void) return 0; } -__initcall(utsname_sysctl_init); +device_initcall(utsname_sysctl_init); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a4bab46cd38e..6203d2900877 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -65,15 +65,12 @@ enum { * be executing on any CPU. The pool behaves as an unbound one. * * Note that DISASSOCIATED should be flipped only while holding - * manager_mutex to avoid changing binding state while - * create_worker() is in progress. + * attach_mutex to avoid changing binding state while + * worker_attach_to_pool() is in progress. */ - POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ - POOL_FREEZING = 1 << 3, /* freeze in progress */ /* worker flags */ - WORKER_STARTED = 1 << 0, /* started */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ @@ -124,8 +121,7 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * MG: pool->manager_mutex and pool->lock protected. Writes require both - * locks. Reads can happen under either lock. + * A: pool->attach_mutex protected. * * PL: wq_pool_mutex protected. 
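proc_do_uts_string(), which the hunk above touches only for the struct ctl_table conversion and spacing, works by copying the table descriptor, pointing the copy's ->data at the namespace-local buffer returned by get_uts(), and handing the copy to the generic proc_dostring() routine. A rough userspace sketch of that shadow-table pattern follows; struct tbl, get_uts_buf() and do_string() are invented stand-ins, not kernel APIs.

#include <stdio.h>
#include <string.h>

struct tbl {
	const char *name;
	char *data;		/* backing buffer the generic handler reads/writes */
	size_t maxlen;
};

static char hostname_buf[64] = "default-host";

static char *get_uts_buf(void)	/* stand-in for get_uts() */
{
	return hostname_buf;
}

/* stand-in for proc_dostring(): knows nothing about namespaces */
static void do_string(struct tbl *t, const char *newval)
{
	if (newval)
		snprintf(t->data, t->maxlen, "%s", newval);
	printf("%s = %s\n", t->name, t->data);
}

int main(void)
{
	struct tbl table = { .name = "kernel.hostname", .maxlen = sizeof(hostname_buf) };
	struct tbl shadow = table;	/* copy the descriptor, as the kernel code does */

	shadow.data = get_uts_buf();	/* repoint at the per-namespace buffer */
	do_string(&shadow, "edited-host");
	return 0;
}

The point of the copy, in the kernel as in the sketch, is that the generic string handler never needs to know about UTS namespaces; only the thin wrapper does.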
* @@ -163,8 +159,11 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ - struct mutex manager_mutex; /* manager exclusion */ - struct idr worker_idr; /* MG: worker IDs and iteration */ + struct mutex attach_mutex; /* attach/detach exclusion */ + struct list_head workers; /* A: attached workers */ + struct completion *detach_completion; /* all workers detached */ + + struct ida worker_ida; /* worker IDs for task name */ struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */ @@ -340,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, lockdep_is_held(&wq->mutex), \ "sched RCU or wq->mutex should be held") -#ifdef CONFIG_LOCKDEP -#define assert_manager_or_pool_lock(pool) \ - WARN_ONCE(debug_locks && \ - !lockdep_is_held(&(pool)->manager_mutex) && \ - !lockdep_is_held(&(pool)->lock), \ - "pool->manager_mutex or ->lock should be held") -#else -#define assert_manager_or_pool_lock(pool) do { } while (0) -#endif - #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -375,17 +364,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, /** * for_each_pool_worker - iterate through all workers of a worker_pool * @worker: iteration cursor - * @wi: integer used for iteration * @pool: worker_pool to iterate workers of * - * This must be called with either @pool->manager_mutex or ->lock held. + * This must be called with @pool->attach_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ -#define for_each_pool_worker(worker, wi, pool) \ - idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ - if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ +#define for_each_pool_worker(worker, pool) \ + list_for_each_entry((worker), &(pool)->workers, node) \ + if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \ else /** @@ -763,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool) return need_more_worker(pool) && !may_start_working(pool); } -/* Do I need to be the manager? */ -static bool need_to_manage_workers(struct worker_pool *pool) -{ - return need_to_create_worker(pool) || - (pool->flags & POOL_MANAGE_WORKERS); -} - /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { @@ -791,8 +772,8 @@ static bool too_many_workers(struct worker_pool *pool) * Wake up functions. */ -/* Return the first worker. Safe with preemption disabled */ -static struct worker *first_worker(struct worker_pool *pool) +/* Return the first idle worker. Safe with preemption disabled */ +static struct worker *first_idle_worker(struct worker_pool *pool) { if (unlikely(list_empty(&pool->idle_list))) return NULL; @@ -811,7 +792,7 @@ static struct worker *first_worker(struct worker_pool *pool) */ static void wake_up_worker(struct worker_pool *pool) { - struct worker *worker = first_worker(pool); + struct worker *worker = first_idle_worker(pool); if (likely(worker)) wake_up_process(worker->task); @@ -885,7 +866,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) */ if (atomic_dec_and_test(&pool->nr_running) && !list_empty(&pool->worklist)) - to_wakeup = first_worker(pool); + to_wakeup = first_idle_worker(pool); return to_wakeup ? 
to_wakeup->task : NULL; } @@ -1621,70 +1602,6 @@ static void worker_leave_idle(struct worker *worker) list_del_init(&worker->entry); } -/** - * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it - * @pool: target worker_pool - * - * Bind %current to the cpu of @pool if it is associated and lock @pool. - * - * Works which are scheduled while the cpu is online must at least be - * scheduled to a worker which is bound to the cpu so that if they are - * flushed from cpu callbacks while cpu is going down, they are - * guaranteed to execute on the cpu. - * - * This function is to be used by unbound workers and rescuers to bind - * themselves to the target cpu and may race with cpu going down or - * coming online. kthread_bind() can't be used because it may put the - * worker to already dead cpu and set_cpus_allowed_ptr() can't be used - * verbatim as it's best effort and blocking and pool may be - * [dis]associated in the meantime. - * - * This function tries set_cpus_allowed() and locks pool and verifies the - * binding against %POOL_DISASSOCIATED which is set during - * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker - * enters idle state or fetches works without dropping lock, it can - * guarantee the scheduling requirement described in the first paragraph. - * - * CONTEXT: - * Might sleep. Called without any lock but returns with pool->lock - * held. - * - * Return: - * %true if the associated pool is online (@worker is successfully - * bound), %false if offline. - */ -static bool worker_maybe_bind_and_lock(struct worker_pool *pool) -__acquires(&pool->lock) -{ - while (true) { - /* - * The following call may fail, succeed or succeed - * without actually migrating the task to the cpu if - * it races with cpu hotunplug operation. Verify - * against POOL_DISASSOCIATED. - */ - if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(current, pool->attrs->cpumask); - - spin_lock_irq(&pool->lock); - if (pool->flags & POOL_DISASSOCIATED) - return false; - if (task_cpu(current) == pool->cpu && - cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask)) - return true; - spin_unlock_irq(&pool->lock); - - /* - * We've raced with CPU hot[un]plug. Give it a breather - * and retry migration. cond_resched() is required here; - * otherwise, we might deadlock against cpu_stop trying to - * bring down the CPU on non-preemptive kernel. - */ - cpu_relax(); - cond_resched(); - } -} - static struct worker *alloc_worker(void) { struct worker *worker; @@ -1693,6 +1610,7 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); + INIT_LIST_HEAD(&worker->node); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1700,12 +1618,68 @@ static struct worker *alloc_worker(void) } /** + * worker_attach_to_pool() - attach a worker to a pool + * @worker: worker to be attached + * @pool: the target pool + * + * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and + * cpu-binding of @worker are kept coordinated with the pool across + * cpu-[un]hotplugs. + */ +static void worker_attach_to_pool(struct worker *worker, + struct worker_pool *pool) +{ + mutex_lock(&pool->attach_mutex); + + /* + * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any + * online CPUs. It'll be re-applied when any of the CPUs come up. 
+ */ + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + + /* + * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains + * stable across this function. See the comments above the + * flag definition for details. + */ + if (pool->flags & POOL_DISASSOCIATED) + worker->flags |= WORKER_UNBOUND; + + list_add_tail(&worker->node, &pool->workers); + + mutex_unlock(&pool->attach_mutex); +} + +/** + * worker_detach_from_pool() - detach a worker from its pool + * @worker: worker which is attached to its pool + * @pool: the pool @worker is attached to + * + * Undo the attaching which had been done in worker_attach_to_pool(). The + * caller worker shouldn't access to the pool after detached except it has + * other reference to the pool. + */ +static void worker_detach_from_pool(struct worker *worker, + struct worker_pool *pool) +{ + struct completion *detach_completion = NULL; + + mutex_lock(&pool->attach_mutex); + list_del(&worker->node); + if (list_empty(&pool->workers)) + detach_completion = pool->detach_completion; + mutex_unlock(&pool->attach_mutex); + + if (detach_completion) + complete(detach_completion); +} + +/** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to * - * Create a new worker which is bound to @pool. The returned worker - * can be started by calling start_worker() or destroyed using - * destroy_worker(). + * Create a new worker which is attached to @pool. The new worker must be + * started by start_worker(). * * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. @@ -1719,19 +1693,8 @@ static struct worker *create_worker(struct worker_pool *pool) int id = -1; char id_buf[16]; - lockdep_assert_held(&pool->manager_mutex); - - /* - * ID is needed to determine kthread name. Allocate ID first - * without installing the pointer. - */ - idr_preload(GFP_KERNEL); - spin_lock_irq(&pool->lock); - - id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); - - spin_unlock_irq(&pool->lock); - idr_preload_end(); + /* ID is needed to determine kthread name */ + id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); if (id < 0) goto fail; @@ -1758,33 +1721,14 @@ static struct worker *create_worker(struct worker_pool *pool) /* prevent userland from meddling with cpumask of workqueue workers */ worker->task->flags |= PF_NO_SETAFFINITY; - /* - * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any - * online CPUs. It'll be re-applied when any of the CPUs come up. - */ - set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - - /* - * The caller is responsible for ensuring %POOL_DISASSOCIATED - * remains stable across this function. See the comments above the - * flag definition for details. 
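worker_attach_to_pool() and worker_detach_from_pool(), added above, replace the idr-based bookkeeping with a plain list protected by attach_mutex, and whichever worker empties the list completes pool->detach_completion so the pool teardown path can finish. The pthreads sketch below illustrates that last-one-out-signals protocol; all names are invented, and a condition variable with a predicate loop stands in for a kernel completion.

#include <pthread.h>
#include <stdio.h>

#define NR_WORKERS	4

static pthread_mutex_t attach_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t detach_done = PTHREAD_COND_INITIALIZER;
static int nr_attached = NR_WORKERS;	/* stand-in for pool->workers */
static int waiting_for_detach;		/* stand-in for pool->detach_completion */

static void *worker(void *arg)
{
	/* "worker_detach_from_pool": drop off the list under attach_mutex */
	pthread_mutex_lock(&attach_mutex);
	if (--nr_attached == 0 && waiting_for_detach)
		pthread_cond_signal(&detach_done);	/* complete(detach_completion) */
	pthread_mutex_unlock(&attach_mutex);
	return NULL;
}

int main(void)
{
	pthread_t tids[NR_WORKERS];

	pthread_mutex_lock(&attach_mutex);
	waiting_for_detach = 1;		/* pool->detach_completion = &detach_completion */
	pthread_mutex_unlock(&attach_mutex);

	for (int i = 0; i < NR_WORKERS; i++)
		pthread_create(&tids[i], NULL, worker, NULL);

	/* "wait_for_completion": sleep until the last worker has detached */
	pthread_mutex_lock(&attach_mutex);
	while (nr_attached)
		pthread_cond_wait(&detach_done, &attach_mutex);
	pthread_mutex_unlock(&attach_mutex);

	for (int i = 0; i < NR_WORKERS; i++)
		pthread_join(tids[i], NULL);
	puts("all workers detached");
	return 0;
}

Build with -pthread. The predicate loop around the wait is what makes a signal that fires before the waiter blocks harmless, much as a completion's done count does.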
- */ - if (pool->flags & POOL_DISASSOCIATED) - worker->flags |= WORKER_UNBOUND; - - /* successful, commit the pointer to idr */ - spin_lock_irq(&pool->lock); - idr_replace(&pool->worker_idr, worker, worker->id); - spin_unlock_irq(&pool->lock); + /* successful, attach the worker to the pool */ + worker_attach_to_pool(worker, pool); return worker; fail: - if (id >= 0) { - spin_lock_irq(&pool->lock); - idr_remove(&pool->worker_idr, id); - spin_unlock_irq(&pool->lock); - } + if (id >= 0) + ida_simple_remove(&pool->worker_ida, id); kfree(worker); return NULL; } @@ -1800,7 +1744,6 @@ fail: */ static void start_worker(struct worker *worker) { - worker->flags |= WORKER_STARTED; worker->pool->nr_workers++; worker_enter_idle(worker); wake_up_process(worker->task); @@ -1818,8 +1761,6 @@ static int create_and_start_worker(struct worker_pool *pool) { struct worker *worker; - mutex_lock(&pool->manager_mutex); - worker = create_worker(pool); if (worker) { spin_lock_irq(&pool->lock); @@ -1827,8 +1768,6 @@ static int create_and_start_worker(struct worker_pool *pool) spin_unlock_irq(&pool->lock); } - mutex_unlock(&pool->manager_mutex); - return worker ? 0 : -ENOMEM; } @@ -1836,46 +1775,30 @@ static int create_and_start_worker(struct worker_pool *pool) * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed * - * Destroy @worker and adjust @pool stats accordingly. + * Destroy @worker and adjust @pool stats accordingly. The worker should + * be idle. * * CONTEXT: - * spin_lock_irq(pool->lock) which is released and regrabbed. + * spin_lock_irq(pool->lock). */ static void destroy_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; - lockdep_assert_held(&pool->manager_mutex); lockdep_assert_held(&pool->lock); /* sanity check frenzy */ if (WARN_ON(worker->current_work) || - WARN_ON(!list_empty(&worker->scheduled))) + WARN_ON(!list_empty(&worker->scheduled)) || + WARN_ON(!(worker->flags & WORKER_IDLE))) return; - if (worker->flags & WORKER_STARTED) - pool->nr_workers--; - if (worker->flags & WORKER_IDLE) - pool->nr_idle--; - - /* - * Once WORKER_DIE is set, the kworker may destroy itself at any - * point. Pin to ensure the task stays until we're done with it. 
- */ - get_task_struct(worker->task); + pool->nr_workers--; + pool->nr_idle--; list_del_init(&worker->entry); worker->flags |= WORKER_DIE; - - idr_remove(&pool->worker_idr, worker->id); - - spin_unlock_irq(&pool->lock); - - kthread_stop(worker->task); - put_task_struct(worker->task); - kfree(worker); - - spin_lock_irq(&pool->lock); + wake_up_process(worker->task); } static void idle_worker_timeout(unsigned long __pool) @@ -1884,7 +1807,7 @@ static void idle_worker_timeout(unsigned long __pool) spin_lock_irq(&pool->lock); - if (too_many_workers(pool)) { + while (too_many_workers(pool)) { struct worker *worker; unsigned long expires; @@ -1892,13 +1815,12 @@ static void idle_worker_timeout(unsigned long __pool) worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; - if (time_before(jiffies, expires)) + if (time_before(jiffies, expires)) { mod_timer(&pool->idle_timer, expires); - else { - /* it's been idle for too long, wake up manager */ - pool->flags |= POOL_MANAGE_WORKERS; - wake_up_worker(pool); + break; } + + destroy_worker(worker); } spin_unlock_irq(&pool->lock); @@ -2017,44 +1939,6 @@ restart: } /** - * maybe_destroy_worker - destroy workers which have been idle for a while - * @pool: pool to destroy workers for - * - * Destroy @pool workers which have been idle for longer than - * IDLE_WORKER_TIMEOUT. - * - * LOCKING: - * spin_lock_irq(pool->lock) which may be released and regrabbed - * multiple times. Called only from manager. - * - * Return: - * %false if no action was taken and pool->lock stayed locked, %true - * otherwise. - */ -static bool maybe_destroy_workers(struct worker_pool *pool) -{ - bool ret = false; - - while (too_many_workers(pool)) { - struct worker *worker; - unsigned long expires; - - worker = list_entry(pool->idle_list.prev, struct worker, entry); - expires = worker->last_active + IDLE_WORKER_TIMEOUT; - - if (time_before(jiffies, expires)) { - mod_timer(&pool->idle_timer, expires); - break; - } - - destroy_worker(worker); - ret = true; - } - - return ret; -} - -/** * manage_workers - manage worker pool * @worker: self * @@ -2083,8 +1967,6 @@ static bool manage_workers(struct worker *worker) bool ret = false; /* - * Managership is governed by two mutexes - manager_arb and - * manager_mutex. manager_arb handles arbitration of manager role. * Anyone who successfully grabs manager_arb wins the arbitration * and becomes the manager. mutex_trylock() on pool->manager_arb * failure while holding pool->lock reliably indicates that someone @@ -2093,40 +1975,12 @@ static bool manage_workers(struct worker *worker) * grabbing manager_arb is responsible for actually performing * manager duties. If manager_arb is grabbed and released without * actual management, the pool may stall indefinitely. - * - * manager_mutex is used for exclusion of actual management - * operations. The holder of manager_mutex can be sure that none - * of management operations, including creation and destruction of - * workers, won't take place until the mutex is released. Because - * manager_mutex doesn't interfere with manager role arbitration, - * it is guaranteed that the pool's management, while may be - * delayed, won't be disturbed by someone else grabbing - * manager_mutex. */ if (!mutex_trylock(&pool->manager_arb)) return ret; - /* - * With manager arbitration won, manager_mutex would be free in - * most cases. trylock first without dropping @pool->lock. 
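With maybe_destroy_workers() gone, the reworked idle_worker_timeout() above reaps surplus workers itself: it keeps destroying the oldest idle worker while the pool has too many and that worker's idle period has expired, and otherwise re-arms the timer for the moment the oldest one would expire. A toy model of that loop follows; the jiffies value, timestamps and the nr_min threshold are invented, and the real code uses too_many_workers() rather than a fixed minimum.

#include <stdio.h>

#define IDLE_TIMEOUT	300	/* pretend jiffies */
#define NR_IDLE		4

int main(void)
{
	unsigned long jiffies = 1000;
	unsigned long last_active[NR_IDLE] = { 100, 400, 850, 990 };	/* oldest first */
	int nr_idle = NR_IDLE, nr_min = 1;

	while (nr_idle > nr_min) {
		int oldest = NR_IDLE - nr_idle;
		unsigned long expires = last_active[oldest] + IDLE_TIMEOUT;

		if (jiffies < expires) {	/* time_before(jiffies, expires) */
			printf("re-arm idle timer for %lu\n", expires);
			break;
		}
		printf("destroy idle worker (last active %lu)\n", last_active[oldest]);
		nr_idle--;
	}
	return 0;
}

Breaking out as soon as the oldest survivor has not yet expired is what lets one timer cover the whole idle list.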
- */ - if (unlikely(!mutex_trylock(&pool->manager_mutex))) { - spin_unlock_irq(&pool->lock); - mutex_lock(&pool->manager_mutex); - spin_lock_irq(&pool->lock); - ret = true; - } - - pool->flags &= ~POOL_MANAGE_WORKERS; - - /* - * Destroy and then create so that may_start_working() is true - * on return. - */ - ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); - mutex_unlock(&pool->manager_mutex); mutex_unlock(&pool->manager_arb); return ret; } @@ -2314,6 +2168,11 @@ woke_up: spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); worker->task->flags &= ~PF_WQ_WORKER; + + set_task_comm(worker->task, "kworker/dying"); + ida_simple_remove(&pool->worker_ida, worker->id); + worker_detach_from_pool(worker, pool); + kfree(worker); return 0; } @@ -2361,9 +2220,6 @@ recheck: worker_set_flags(worker, WORKER_PREP, false); sleep: - if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) - goto recheck; - /* * pool->lock is held and there's no work to process and no need to * manage, sleep. Workers are woken up only while holding @@ -2440,8 +2296,9 @@ repeat: spin_unlock_irq(&wq_mayday_lock); - /* migrate to the target cpu if possible */ - worker_maybe_bind_and_lock(pool); + worker_attach_to_pool(rescuer, pool); + + spin_lock_irq(&pool->lock); rescuer->pool = pool; /* @@ -2454,6 +2311,11 @@ repeat: move_linked_works(work, scheduled, &n); process_scheduled_works(rescuer); + spin_unlock_irq(&pool->lock); + + worker_detach_from_pool(rescuer, pool); + + spin_lock_irq(&pool->lock); /* * Put the reference grabbed by send_mayday(). @pool won't @@ -3550,9 +3412,10 @@ static int init_worker_pool(struct worker_pool *pool) (unsigned long)pool); mutex_init(&pool->manager_arb); - mutex_init(&pool->manager_mutex); - idr_init(&pool->worker_idr); + mutex_init(&pool->attach_mutex); + INIT_LIST_HEAD(&pool->workers); + ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; @@ -3567,7 +3430,7 @@ static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); - idr_destroy(&pool->worker_idr); + ida_destroy(&pool->worker_ida); free_workqueue_attrs(pool->attrs); kfree(pool); } @@ -3585,6 +3448,7 @@ static void rcu_free_pool(struct rcu_head *rcu) */ static void put_unbound_pool(struct worker_pool *pool) { + DECLARE_COMPLETION_ONSTACK(detach_completion); struct worker *worker; lockdep_assert_held(&wq_pool_mutex); @@ -3605,18 +3469,24 @@ static void put_unbound_pool(struct worker_pool *pool) /* * Become the manager and destroy all workers. Grabbing * manager_arb prevents @pool's workers from blocking on - * manager_mutex. + * attach_mutex. 
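After the hunk above, manage_workers() is arbitration plus creation only: the worker that wins mutex_trylock() on manager_arb performs management, and a failed trylock reliably means somebody else is handling it. Below is a small pthreads sketch of that trylock-based election; the worker IDs and printout are invented, and in the real code the winner goes on to call maybe_create_worker() rather than print anything.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t manager_arb = PTHREAD_MUTEX_INITIALIZER;

static int manage_workers(long id)
{
	if (pthread_mutex_trylock(&manager_arb) != 0)
		return 0;		/* someone else won the arbitration */

	printf("worker %ld: acting as manager\n", id);
	/* ... create workers here ... */
	pthread_mutex_unlock(&manager_arb);
	return 1;
}

static void *worker(void *arg)
{
	manage_workers((long)arg);
	return NULL;
}

int main(void)
{
	pthread_t t[3];

	for (long i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}

Because the toy manager drops the lock right away, later callers may also win; in the pool that is harmless, since the winner only creates workers when maybe_create_worker() decides more are needed.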
*/ mutex_lock(&pool->manager_arb); - mutex_lock(&pool->manager_mutex); - spin_lock_irq(&pool->lock); - while ((worker = first_worker(pool))) + spin_lock_irq(&pool->lock); + while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); - spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->manager_mutex); + + mutex_lock(&pool->attach_mutex); + if (!list_empty(&pool->workers)) + pool->detach_completion = &detach_completion; + mutex_unlock(&pool->attach_mutex); + + if (pool->detach_completion) + wait_for_completion(pool->detach_completion); + mutex_unlock(&pool->manager_arb); /* shut down the timers */ @@ -3662,9 +3532,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) if (!pool || init_worker_pool(pool) < 0) goto fail; - if (workqueue_freezing) - pool->flags |= POOL_FREEZING; - lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); @@ -3771,7 +3638,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) spin_lock_irq(&pwq->pool->lock); - if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { + /* + * During [un]freezing, the caller is responsible for ensuring that + * this function is called at least once after @workqueue_freezing + * is updated and visible. + */ + if (!freezable || !workqueue_freezing) { pwq->max_active = wq->saved_max_active; while (!list_empty(&pwq->delayed_works) && @@ -4103,17 +3975,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, * Let's determine what needs to be done. If the target cpumask is * different from wq's, we need to compare it to @pwq's and create * a new one if they don't match. If the target cpumask equals - * wq's, the default pwq should be used. If @pwq is already the - * default one, nothing to do; otherwise, install the default one. + * wq's, the default pwq should be used. */ if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) goto out_unlock; } else { - if (pwq == wq->dfl_pwq) - goto out_unlock; - else - goto use_dfl_pwq; + goto use_dfl_pwq; } mutex_unlock(&wq->mutex); @@ -4121,8 +3989,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { - pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", - wq->name); + pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", + wq->name); mutex_lock(&wq->mutex); goto use_dfl_pwq; } @@ -4599,28 +4467,27 @@ static void wq_unbind_fn(struct work_struct *work) int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; - int wi; for_each_cpu_worker_pool(pool, cpu) { WARN_ON_ONCE(cpu != smp_processor_id()); - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); spin_lock_irq(&pool->lock); /* - * We've blocked all manager operations. Make all workers + * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * except for the ones which are still executing works from * before the last CPU down must be on the cpu. After * this, they may become diasporas. 
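With POOL_FREEZING deleted above, pwq_adjust_max_active() decides from the single workqueue_freezing global: a freezable pwq gets wq->saved_max_active back (with delayed work promoted up to that limit) when not freezing, and is otherwise held at zero. A toy model of just that gate follows; the function name, values and labels are invented, and only the !freezable || !workqueue_freezing test mirrors the hunk.

#include <stdbool.h>
#include <stdio.h>

static bool workqueue_freezing;		/* the one global flag that remains */

static int effective_max_active(bool freezable, int saved_max_active)
{
	if (!freezable || !workqueue_freezing)
		return saved_max_active;
	return 0;			/* frozen: nothing new may start */
}

int main(void)
{
	printf("running, freezable:      %d\n", effective_max_active(true, 16));
	workqueue_freezing = true;
	printf("freezing, freezable:     %d\n", effective_max_active(true, 16));
	printf("freezing, not freezable: %d\n", effective_max_active(false, 16));
	return 0;
}

The comment added in the hunk carries the real subtlety: callers must ensure the function runs at least once after workqueue_freezing changes and becomes visible, since there is no longer a per-pool flag to resynchronize against.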
*/ - for_each_pool_worker(worker, wi, pool) + for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); /* * Call schedule() so that we cross rq->lock and thus can @@ -4660,9 +4527,8 @@ static void wq_unbind_fn(struct work_struct *work) static void rebind_workers(struct worker_pool *pool) { struct worker *worker; - int wi; - lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->attach_mutex); /* * Restore CPU affinity of all workers. As all idle workers should @@ -4671,13 +4537,13 @@ static void rebind_workers(struct worker_pool *pool) * of all workers first and then clear UNBOUND. As we're called * from CPU_ONLINE, the following shouldn't fail. */ - for_each_pool_worker(worker, wi, pool) + for_each_pool_worker(worker, pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); - for_each_pool_worker(worker, wi, pool) { + for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; /* @@ -4729,9 +4595,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) { static cpumask_t cpumask; struct worker *worker; - int wi; - lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->attach_mutex); /* is @cpu allowed for @pool? */ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) @@ -4743,7 +4608,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) return; /* as we're called from CPU_ONLINE, the following shouldn't fail */ - for_each_pool_worker(worker, wi, pool) + for_each_pool_worker(worker, pool) WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); } @@ -4776,7 +4641,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, mutex_lock(&wq_pool_mutex); for_each_pool(pool, pi) { - mutex_lock(&pool->manager_mutex); + mutex_lock(&pool->attach_mutex); if (pool->cpu == cpu) { spin_lock_irq(&pool->lock); @@ -4788,7 +4653,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, restore_unbound_workers_cpumask(pool, cpu); } - mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->attach_mutex); } /* update NUMA affinity of unbound workqueues */ @@ -4887,24 +4752,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu); */ void freeze_workqueues_begin(void) { - struct worker_pool *pool; struct workqueue_struct *wq; struct pool_workqueue *pwq; - int pi; mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; - /* set FREEZING */ - for_each_pool(pool, pi) { - spin_lock_irq(&pool->lock); - WARN_ON_ONCE(pool->flags & POOL_FREEZING); - pool->flags |= POOL_FREEZING; - spin_unlock_irq(&pool->lock); - } - list_for_each_entry(wq, &workqueues, list) { mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) @@ -4974,21 +4829,13 @@ void thaw_workqueues(void) { struct workqueue_struct *wq; struct pool_workqueue *pwq; - struct worker_pool *pool; - int pi; mutex_lock(&wq_pool_mutex); if (!workqueue_freezing) goto out_unlock; - /* clear FREEZING */ - for_each_pool(pool, pi) { - spin_lock_irq(&pool->lock); - WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); - pool->flags &= ~POOL_FREEZING; - spin_unlock_irq(&pool->lock); - } + workqueue_freezing = false; /* restore max_active and repopulate worklist */ list_for_each_entry(wq, &workqueues, list) { @@ -4998,7 +4845,6 @@ void thaw_workqueues(void) mutex_unlock(&wq->mutex); } - workqueue_freezing = false; out_unlock: 
mutex_unlock(&wq_pool_mutex); } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 7e2204db0b1a..45215870ac6c 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -37,6 +37,8 @@ struct worker { struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ /* L: for rescuers */ + struct list_head node; /* A: anchored at pool->workers */ + /* A: runs through worker->node */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */
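The workqueue_internal.h hunk that closes this series gives each worker a list node anchored at pool->workers, and walking the attached workers (for_each_pool_worker() above) is only legal while attach_mutex is held. A self-contained sketch of that ownership rule using a hand-rolled singly linked list; struct pool, attach() and the kworker/%ld label are invented stand-ins for illustration.

#include <pthread.h>
#include <stdio.h>

struct worker {
	long id;
	struct worker *next;		/* stand-in for struct list_head node */
};

struct pool {
	pthread_mutex_t attach_mutex;
	struct worker *workers;		/* A: attached workers */
};

static void attach(struct pool *p, struct worker *w)
{
	pthread_mutex_lock(&p->attach_mutex);
	w->next = p->workers;		/* list_add_tail() equivalent, ordering aside */
	p->workers = w;
	pthread_mutex_unlock(&p->attach_mutex);
}

int main(void)
{
	struct pool pool = { .attach_mutex = PTHREAD_MUTEX_INITIALIZER };
	struct worker w0 = { .id = 0 }, w1 = { .id = 1 };

	attach(&pool, &w0);
	attach(&pool, &w1);

	/* "for_each_pool_worker": only while attach_mutex is held */
	pthread_mutex_lock(&pool.attach_mutex);
	for (struct worker *w = pool.workers; w; w = w->next)
		printf("kworker/%ld attached\n", w->id);
	pthread_mutex_unlock(&pool.attach_mutex);
	return 0;
}

Compared with the old worker_idr, the list needs no ID allocation just to iterate, which is why worker_ida above only has to hand out task names.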