Diffstat (limited to 'kernel')
97 files changed, 3586 insertions, 2068 deletions
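The cgroup hunks below widen every controller bitmask from u16 to u32 and raise the BUILD_BUG_ON() bound in cgroup_init() from 16 to 32, so more than 16 subsystems fit in one mask. A minimal standalone sketch of that bitmask pattern (illustrative only, not part of the patch; the demo_* names are hypothetical):

#include <stdint.h>
#include <stdio.h>

#define DEMO_SUBSYS_COUNT 20                  /* hypothetical: more than 16 controllers */

static uint32_t demo_enabled_mask;            /* was effectively a u16 before the change */

static void demo_enable(int ssid)
{
	/* set the bit for subsystem ID ssid */
	demo_enabled_mask |= (uint32_t)1 << ssid;
}

int main(void)
{
	int ssid;

	demo_enable(3);
	demo_enable(17);                      /* bit 17 only fits once the mask is 32 bits wide */

	for (ssid = 0; ssid < DEMO_SUBSYS_COUNT; ssid++)
		if (demo_enabled_mask & ((uint32_t)1 << ssid))
			printf("controller %d enabled\n", ssid);
	return 0;
}

A 16-bit mask can only represent subsystem IDs 0-15; the wider type keeps bits 16-31 addressable, which is what the new BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32) check reflects.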
diff --git a/kernel/audit.c b/kernel/audit.c index 39c4f26c484d..592d927e70f9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -32,6 +32,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/file.h> +#include <linux/hex.h> #include <linux/init.h> #include <linux/types.h> #include <linux/atomic.h> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index dc906dfdff94..5ab6bace7d0d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -25,6 +25,7 @@ #include <linux/prandom.h> #include <linux/bpf.h> #include <linux/btf.h> +#include <linux/hex.h> #include <linux/objtool.h> #include <linux/overflow.h> #include <linux/rbtree_latch.h> @@ -716,8 +717,8 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr) return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } -int __bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char *sym) +int bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) { struct bpf_ksym *ksym; int ret = 0; diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index 2fdfa828e3d3..e4e338cdb437 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -695,7 +695,6 @@ __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) int ret; BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock)); - BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock)); preempt_disable(); ret = res_spin_lock((rqspinlock_t *)lock); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 683c332dbafb..dd89bf809772 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -9,6 +9,7 @@ #include <linux/bpf_verifier.h> #include <linux/bsearch.h> #include <linux/btf.h> +#include <linux/hex.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/sched/signal.h> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 22051b4f1ccb..3bfe37693d68 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -52,7 +52,7 @@ struct cgroup_fs_context { bool cpuset_clone_children; bool none; /* User explicitly requested empty subsystem */ bool all_ss; /* Seen 'all' option */ - u16 subsys_mask; /* Selected subsystems */ + u32 subsys_mask; /* Selected subsystems */ char *name; /* Hierarchy name */ char *release_agent; /* Path for release notifications */ }; @@ -146,7 +146,7 @@ struct cgroup_mgctx { struct cgroup_taskset tset; /* subsystems affected by migration */ - u16 ss_mask; + u32 ss_mask; }; #define CGROUP_TASKSET_INIT(tset) \ @@ -235,8 +235,8 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup_favor_dynmods(struct cgroup_root *root, bool favor); void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_fs_context *ctx); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask); +int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask); int cgroup_do_get_tree(struct fs_context *fc); int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index a9e029b570c8..724950c4b690 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -28,7 +28,7 @@ #define CGROUP_PIDLIST_DESTROY_DELAY HZ /* Controllers blocked by the commandline in v1 */ -static u16 cgroup_no_v1_mask; +static u32 cgroup_no_v1_mask; /* 
disable named v1 mounts */ static bool cgroup_no_v1_named; @@ -1037,13 +1037,13 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) static int check_cgroupfs_options(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); - u16 mask = U16_MAX; - u16 enabled = 0; + u32 mask = U32_MAX; + u32 enabled = 0; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); + mask = ~((u32)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && @@ -1095,7 +1095,7 @@ int cgroup1_reconfigure(struct fs_context *fc) struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); int ret = 0; - u16 added_mask, removed_mask; + u32 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); @@ -1343,7 +1343,7 @@ static int __init cgroup_no_v1(char *str) continue; if (!strcmp(token, "all")) { - cgroup_no_v1_mask = U16_MAX; + cgroup_no_v1_mask = U32_MAX; continue; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 5f0d33b04910..8af4351536cf 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -203,13 +203,13 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ -static u16 cgrp_dfl_inhibit_ss_mask; +static u32 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ -static u16 cgrp_dfl_implicit_ss_mask; +static u32 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ -static u16 cgrp_dfl_threaded_ss_mask; +static u32 cgrp_dfl_threaded_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); @@ -231,10 +231,10 @@ static u64 css_serial_nr_next = 1; * These bitmasks identify subsystems with specific features to avoid * having to do iterative checks repeatedly. 
*/ -static u16 have_fork_callback __read_mostly; -static u16 have_exit_callback __read_mostly; -static u16 have_release_callback __read_mostly; -static u16 have_canfork_callback __read_mostly; +static u32 have_fork_callback __read_mostly; +static u32 have_exit_callback __read_mostly; +static u32 have_release_callback __read_mostly; +static u32 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); @@ -472,13 +472,13 @@ static bool cgroup_is_valid_domain(struct cgroup *cgrp) } /* subsystems visibly enabled on a cgroup */ -static u16 cgroup_control(struct cgroup *cgrp) +static u32 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); - u16 root_ss_mask = cgrp->root->subsys_mask; + u32 root_ss_mask = cgrp->root->subsys_mask; if (parent) { - u16 ss_mask = parent->subtree_control; + u32 ss_mask = parent->subtree_control; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) @@ -493,12 +493,12 @@ static u16 cgroup_control(struct cgroup *cgrp) } /* subsystems enabled on a cgroup */ -static u16 cgroup_ss_mask(struct cgroup *cgrp) +static u32 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); if (parent) { - u16 ss_mask = parent->subtree_ss_mask; + u32 ss_mask = parent->subtree_ss_mask; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) @@ -1633,9 +1633,9 @@ static umode_t cgroup_file_mode(const struct cftype *cft) * This function calculates which subsystems need to be enabled if * @subtree_control is to be applied while restricted to @this_ss_mask. */ -static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) +static u32 cgroup_calc_subtree_ss_mask(u32 subtree_control, u32 this_ss_mask) { - u16 cur_ss_mask = subtree_control; + u32 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; @@ -1644,7 +1644,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { - u16 new_ss_mask = cur_ss_mask; + u32 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; @@ -1848,12 +1848,12 @@ err: return ret; } -int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) +int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, ret; - u16 dfl_disable_ss_mask = 0; + u32 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); @@ -2149,7 +2149,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -3131,7 +3131,7 @@ void cgroup_procs_write_finish(struct task_struct *task, put_task_struct(task); } -static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, u32 ss_mask) { struct cgroup_subsys *ss; bool printed = false; @@ -3496,9 +3496,9 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) cgroup_apply_control_disable(cgrp); } -static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) +static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u32 enable) { - u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; 
+ u32 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* if nothing is getting enabled, nothing to worry about */ if (!enable) @@ -3541,7 +3541,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - u16 enable = 0, disable = 0; + u32 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -4945,7 +4945,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css) rcu_read_lock(); css_for_each_child(child, css) { - if (child->flags & CSS_ONLINE) { + if (css_is_online(child)) { ret = true; break; } @@ -5750,7 +5750,7 @@ static void offline_css(struct cgroup_subsys_state *css) lockdep_assert_held(&cgroup_mutex); - if (!(css->flags & CSS_ONLINE)) + if (!css_is_online(css)) return; if (ss->css_offline) @@ -6347,7 +6347,7 @@ int __init cgroup_init(void) struct cgroup_subsys *ss; int ssid; - BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 32); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 01976c8e7d49..fd7d19842ded 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -9,6 +9,7 @@ #include <linux/cpuset.h> #include <linux/spinlock.h> #include <linux/union_find.h> +#include <linux/sched/isolation.h> /* See "Frequency meter" comments, below. */ @@ -144,17 +145,12 @@ struct cpuset { */ nodemask_t old_mems_allowed; - struct fmeter fmeter; /* memory_pressure filter */ - /* * Tasks are being attached to this cpuset. Used to prevent * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). */ int attach_in_progress; - /* for custom sched domain */ - int relax_domain_level; - /* partition root state */ int partition_root_state; @@ -179,10 +175,19 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; +#ifdef CONFIG_CPUSETS_V1 + struct fmeter fmeter; /* memory_pressure filter */ + + /* for custom sched domain */ + int relax_domain_level; + /* Used to merge intersecting subsets for generate_sched_domains */ struct uf_node node; +#endif }; +extern struct cpuset top_cpuset; + static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) { return css ? container_of(css, struct cpuset, css) : NULL; @@ -240,6 +245,30 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } +/* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? 
+ */ +static inline int cpusets_overlap(struct cpuset *a, struct cpuset *b) +{ + return cpumask_intersects(a->effective_cpus, b->effective_cpus); +} + +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key.key) + 1; +} + +static inline bool cpuset_is_populated(struct cpuset *cs) +{ + lockdep_assert_cpuset_lock_held(); + + /* Cpusets in the process of attaching should be considered as populated */ + return cgroup_is_populated(cs->css.cgroup) || + cs->attach_in_progress; +} + /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child @@ -285,7 +314,6 @@ void cpuset_full_unlock(void); */ #ifdef CONFIG_CPUSETS_V1 extern struct cftype cpuset1_files[]; -void fmeter_init(struct fmeter *fmp); void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk); void cpuset1_update_tasks_flags(struct cpuset *cs); @@ -293,8 +321,13 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2); +void cpuset1_init(struct cpuset *cs); +void cpuset1_online_css(struct cgroup_subsys_state *css); +int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes); + #else -static inline void fmeter_init(struct fmeter *fmp) {} static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) {} static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {} @@ -303,6 +336,13 @@ static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, bool cpus_updated, bool mems_updated) {} static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } +static inline bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, + struct cpuset *cs2) { return false; } +static inline void cpuset1_init(struct cpuset *cs) {} +static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {} +static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) { return 0; }; + #endif /* CONFIG_CPUSETS_V1 */ #endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 12e76774c75b..7a23b9e8778f 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -62,7 +62,7 @@ struct cpuset_remove_tasks_struct { #define FM_SCALE 1000 /* faux fixed point scale */ /* Initialize a frequency meter */ -void fmeter_init(struct fmeter *fmp) +static void fmeter_init(struct fmeter *fmp) { fmp->cnt = 0; fmp->val = 0; @@ -368,11 +368,44 @@ int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) if (par && !is_cpuset_subset(trial, par)) goto out; + /* + * Cpusets with tasks - existing or newly being attached - can't + * be changed to have empty cpus_allowed or mems_allowed. 
+ */ + ret = -ENOSPC; + if (cpuset_is_populated(cur)) { + if (!cpumask_empty(cur->cpus_allowed) && + cpumask_empty(trial->cpus_allowed)) + goto out; + if (!nodes_empty(cur->mems_allowed) && + nodes_empty(trial->mems_allowed)) + goto out; + } + ret = 0; out: return ret; } +/* + * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts + * to legacy (v1) + * @cs1: first cpuset to check + * @cs2: second cpuset to check + * + * Returns: true if CPU exclusivity conflict exists, false otherwise + * + * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect. + */ +bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +{ + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) + return cpumask_intersects(cs1->cpus_allowed, + cs2->cpus_allowed); + + return false; +} + #ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() @@ -499,6 +532,242 @@ out_unlock: return retval; } +void cpuset1_init(struct cpuset *cs) +{ + fmeter_init(&cs->fmeter); + cs->relax_domain_level = -1; +} + +void cpuset1_online_css(struct cgroup_subsys_state *css) +{ + struct cpuset *tmp_cs; + struct cgroup_subsys_state *pos_css; + struct cpuset *cs = css_cs(css); + struct cpuset *parent = parent_cs(cs); + + lockdep_assert_cpus_held(); + lockdep_assert_cpuset_lock_held(); + + if (is_spread_page(parent)) + set_bit(CS_SPREAD_PAGE, &cs->flags); + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); + + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) + return; + + /* + * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is + * set. This flag handling is implemented in cgroup core for + * historical reasons - the flag may be specified during mount. + * + * Currently, if any sibling cpusets have exclusive cpus or mem, we + * refuse to clone the configuration - thereby refusing the task to + * be entered, and as a result refusing the sys_unshare() or + * clone() which initiated it. If this becomes a problem for some + * users who wish to allow that scenario, then this could be + * changed to grant parent->cpus_allowed-sibling_cpus_exclusive + * (and likewise for mems) to the new cgroup. 
+ */ + rcu_read_lock(); + cpuset_for_each_child(tmp_cs, pos_css, parent) { + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + + cpuset_callback_lock_irq(); + cs->mems_allowed = parent->mems_allowed; + cs->effective_mems = parent->mems_allowed; + cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); + cpuset_callback_unlock_irq(); +} + +static void +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) +{ + if (dattr->relax_domain_level < c->relax_domain_level) + dattr->relax_domain_level = c->relax_domain_level; +} + +static void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs) +{ + struct cpuset *cp; + struct cgroup_subsys_state *pos_css; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + /* skip the whole subtree if @cp doesn't have any CPU */ + if (cpumask_empty(cp->cpus_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + + if (is_sched_load_balance(cp)) + update_domain_attr(dattr, cp); + } + rcu_read_unlock(); +} + +/* + * cpuset1_generate_sched_domains() + * + * Finding the best partition (set of domains): + * The double nested loops below over i, j scan over the load + * balanced cpusets (using the array of cpuset pointers in csa[]) + * looking for pairs of cpusets that have overlapping cpus_allowed + * and merging them using a union-find algorithm. + * + * The union of the cpus_allowed masks from the set of all cpusets + * having the same root then form the one element of the partition + * (one sched domain) to be passed to partition_sched_domains(). + */ +int cpuset1_generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) +{ + struct cpuset *cp; /* top-down scan of cpusets */ + struct cpuset **csa; /* array of all cpuset ptrs */ + int csn; /* how many cpuset ptrs in csa so far */ + int i, j; /* indices for partition finding loops */ + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ + struct sched_domain_attr *dattr; /* attributes for custom domains */ + int ndoms = 0; /* number of sched domains in result */ + int nslot; /* next empty doms[] struct cpumask slot */ + struct cgroup_subsys_state *pos_css; + int nslot_update; + + lockdep_assert_cpuset_lock_held(); + + doms = NULL; + dattr = NULL; + csa = NULL; + + /* Special case for the 99% of systems with one, full, sched domain */ + if (is_sched_load_balance(&top_cpuset)) { + ndoms = 1; + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); + if (dattr) { + *dattr = SD_ATTR_INIT; + update_domain_attr_tree(dattr, &top_cpuset); + } + cpumask_and(doms[0], top_cpuset.effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + + goto done; + } + + csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); + if (!csa) + goto done; + csn = 0; + + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { + if (cp == &top_cpuset) + continue; + + /* + * Continue traversing beyond @cp iff @cp has some CPUs and + * isn't load balancing. The former is obvious. The + * latter: All child cpusets contain a subset of the + * parent's cpus, so just skip them, and then we call + * update_domain_attr_tree() to calc relax_domain_level of + * the corresponding sched domain. 
+ */ + if (!cpumask_empty(cp->cpus_allowed) && + !(is_sched_load_balance(cp) && + cpumask_intersects(cp->cpus_allowed, + housekeeping_cpumask(HK_TYPE_DOMAIN)))) + continue; + + if (is_sched_load_balance(cp) && + !cpumask_empty(cp->effective_cpus)) + csa[csn++] = cp; + + /* skip @cp's subtree */ + pos_css = css_rightmost_descendant(pos_css); + continue; + } + rcu_read_unlock(); + + for (i = 0; i < csn; i++) + uf_node_init(&csa[i]->node); + + /* Merge overlapping cpusets */ + for (i = 0; i < csn; i++) { + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) + uf_union(&csa[i]->node, &csa[j]->node); + } + } + + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; + } + + /* + * Now we know how many domains to create. + * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. + */ + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + /* + * The rest of the code, including the scheduler, can deal with + * dattr==NULL case. No need to abort if alloc fails. + */ + dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), + GFP_KERNEL); + + for (nslot = 0, i = 0; i < csn; i++) { + nslot_update = 0; + for (j = i; j < csn; j++) { + if (uf_find(&csa[j]->node) == &csa[i]->node) { + struct cpumask *dp = doms[nslot]; + + if (i == j) { + nslot_update = 1; + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + } + cpumask_or(dp, dp, csa[j]->effective_cpus); + cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); + if (dattr) + update_domain_attr_tree(dattr + nslot, csa[j]); + } + } + if (nslot_update) + nslot++; + } + BUG_ON(nslot != ndoms); + +done: + kfree(csa); + + /* + * Fallback to the default domain if kmalloc() failed. + * See comments in partition_sched_domains(). + */ + if (doms == NULL) + ndoms = 1; + + *domains = doms; + *attributes = dattr; + return ndoms; +} + /* * for the common functions, 'private' gives the type of file */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 01a553caee56..832179236529 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -119,6 +119,17 @@ static bool force_sd_rebuild; * For simplicity, a local partition can be created under a local or remote * partition but a remote partition cannot have any partition root in its * ancestor chain except the cgroup root. + * + * A valid partition can be formed by setting exclusive_cpus or cpus_allowed + * if exclusive_cpus is not set. In the case of partition with empty + * exclusive_cpus, all the conflicting exclusive CPUs specified in the + * following cpumasks of sibling cpusets will be removed from its + * cpus_allowed in determining its effective_xcpus. + * - effective_xcpus + * - exclusive_cpus + * + * The "cpuset.cpus.exclusive" control file should be used for setting up + * partition if the users want to get as many CPUs as possible. */ #define PRS_MEMBER 0 #define PRS_ROOT 1 @@ -201,12 +212,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) * If cpu_online_mask is used while a hotunplug operation is happening in * parallel, we may leave an offline CPU in cpu_allowed or some other masks. 
*/ -static struct cpuset top_cpuset = { +struct cpuset top_cpuset = { .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, - .relax_domain_level = -1, - .remote_partition = false, }; /* @@ -261,6 +270,11 @@ void cpuset_unlock(void) mutex_unlock(&cpuset_mutex); } +void lockdep_assert_cpuset_lock_held(void) +{ + lockdep_assert_held(&cpuset_mutex); +} + /** * cpuset_full_lock - Acquire full protection for cpuset modification * @@ -319,7 +333,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) */ static inline void dec_attach_in_progress_locked(struct cpuset *cs) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); cs->attach_in_progress--; if (!cs->attach_in_progress) @@ -353,15 +367,6 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } -static inline bool cpuset_is_populated(struct cpuset *cs) -{ - lockdep_assert_held(&cpuset_mutex); - - /* Cpusets in the process of attaching should be considered as populated */ - return cgroup_is_populated(cs->css.cgroup) || - cs->attach_in_progress; -} - /** * partition_is_populated - check if partition has tasks * @cs: partition root to be checked @@ -453,9 +458,8 @@ static void guarantee_active_cpus(struct task_struct *tsk, */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { - while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) + while (!nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY])) cs = parent_cs(cs); - nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } /** @@ -603,36 +607,32 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) /** * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts - * @cs1: first cpuset to check - * @cs2: second cpuset to check + * @trial: the trial cpuset to be checked + * @sibling: a sibling cpuset to be checked against + * @xcpus_changed: set if exclusive_cpus has been set * * Returns: true if CPU exclusivity conflict exists, false otherwise * * Conflict detection rules: - * 1. If either cpuset is CPU exclusive, they must be mutually exclusive - * 2. exclusive_cpus masks cannot intersect between cpusets - * 3. The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs + * o cgroup v1 + * See cpuset1_cpus_excl_conflict() + * o cgroup v2 + * - The exclusive_cpus values cannot overlap. + * - New exclusive_cpus cannot be a superset of a sibling's cpus_allowed. 
*/ -static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) +static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling, + bool xcpus_changed) { - /* If either cpuset is exclusive, check if they are mutually exclusive */ - if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) - return !cpusets_are_exclusive(cs1, cs2); - - /* Exclusive_cpus cannot intersect */ - if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) - return true; - - /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ - if (!cpumask_empty(cs1->cpus_allowed) && - cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) - return true; + if (!cpuset_v2()) + return cpuset1_cpus_excl_conflict(trial, sibling); - if (!cpumask_empty(cs2->cpus_allowed) && - cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) + /* The cpus_allowed of a sibling cpuset cannot be a subset of the new exclusive_cpus */ + if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) && + cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus)) return true; - return false; + /* Exclusive_cpus cannot intersect */ + return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus); } static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) @@ -666,6 +666,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; + bool xcpus_changed; int ret = 0; rcu_read_lock(); @@ -682,20 +683,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); /* - * Cpusets with tasks - existing or newly being attached - can't - * be changed to have empty cpus_allowed or mems_allowed. - */ - ret = -ENOSPC; - if (cpuset_is_populated(cur)) { - if (!cpumask_empty(cur->cpus_allowed) && - cpumask_empty(trial->cpus_allowed)) - goto out; - if (!nodes_empty(cur->mems_allowed) && - nodes_empty(trial->mems_allowed)) - goto out; - } - - /* * We can't shrink if we won't have enough room for SCHED_DEADLINE * tasks. This check is not done when scheduling is disabled as the * users should know what they are doing. @@ -722,10 +709,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * overlap. exclusive_cpus cannot overlap with each other if set. */ ret = -EINVAL; + xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus); cpuset_for_each_child(c, css, par) { if (c == cur) continue; - if (cpus_excl_conflict(trial, c)) + if (cpus_excl_conflict(trial, c, xcpus_changed)) goto out; if (mems_excl_conflict(trial, c)) goto out; @@ -738,49 +726,6 @@ out: } #ifdef CONFIG_SMP -/* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? 
- */ -static int cpusets_overlap(struct cpuset *a, struct cpuset *b) -{ - return cpumask_intersects(a->effective_cpus, b->effective_cpus); -} - -static void -update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) -{ - if (dattr->relax_domain_level < c->relax_domain_level) - dattr->relax_domain_level = c->relax_domain_level; - return; -} - -static void update_domain_attr_tree(struct sched_domain_attr *dattr, - struct cpuset *root_cs) -{ - struct cpuset *cp; - struct cgroup_subsys_state *pos_css; - - rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - /* skip the whole subtree if @cp doesn't have any CPU */ - if (cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - - if (is_sched_load_balance(cp)) - update_domain_attr(dattr, cp); - } - rcu_read_unlock(); -} - -/* Must be called with cpuset_mutex held. */ -static inline int nr_cpusets(void) -{ - /* jump label reference count + the top-level cpuset */ - return static_key_count(&cpusets_enabled_key.key) + 1; -} /* * generate_sched_domains() @@ -820,103 +765,46 @@ static inline int nr_cpusets(void) * convenient format, that can be easily compared to the prior * value to determine what partition elements (sched domains) * were changed (added or removed.) - * - * Finding the best partition (set of domains): - * The double nested loops below over i, j scan over the load - * balanced cpusets (using the array of cpuset pointers in csa[]) - * looking for pairs of cpusets that have overlapping cpus_allowed - * and merging them using a union-find algorithm. - * - * The union of the cpus_allowed masks from the set of all cpusets - * having the same root then form the one element of the partition - * (one sched domain) to be passed to partition_sched_domains(). - * */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ - int csn; /* how many cpuset ptrs in csa so far */ int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. 
sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ - int nslot; /* next empty doms[] struct cpumask slot */ struct cgroup_subsys_state *pos_css; - bool root_load_balance = is_sched_load_balance(&top_cpuset); - bool cgrpv2 = cpuset_v2(); - int nslot_update; + + if (!cpuset_v2()) + return cpuset1_generate_sched_domains(domains, attributes); doms = NULL; dattr = NULL; csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ - if (root_load_balance && cpumask_empty(subpartitions_cpus)) { -single_root_domain: + if (cpumask_empty(subpartitions_cpus)) { ndoms = 1; - doms = alloc_sched_domains(ndoms); - if (!doms) - goto done; - - dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); - if (dattr) { - *dattr = SD_ATTR_INIT; - update_domain_attr_tree(dattr, &top_cpuset); - } - cpumask_and(doms[0], top_cpuset.effective_cpus, - housekeeping_cpumask(HK_TYPE_DOMAIN)); - - goto done; + /* !csa will be checked and can be correctly handled */ + goto generate_doms; } csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); if (!csa) goto done; - csn = 0; + /* Find how many partitions and cache them to csa[] */ rcu_read_lock(); - if (root_load_balance) - csa[csn++] = &top_cpuset; cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { - if (cp == &top_cpuset) - continue; - - if (cgrpv2) - goto v2; - - /* - * v1: - * Continue traversing beyond @cp iff @cp has some CPUs and - * isn't load balancing. The former is obvious. The - * latter: All child cpusets contain a subset of the - * parent's cpus, so just skip them, and then we call - * update_domain_attr_tree() to calc relax_domain_level of - * the corresponding sched domain. - */ - if (!cpumask_empty(cp->cpus_allowed) && - !(is_sched_load_balance(cp) && - cpumask_intersects(cp->cpus_allowed, - housekeeping_cpumask(HK_TYPE_DOMAIN)))) - continue; - - if (is_sched_load_balance(cp) && - !cpumask_empty(cp->effective_cpus)) - csa[csn++] = cp; - - /* skip @cp's subtree */ - pos_css = css_rightmost_descendant(pos_css); - continue; - -v2: /* * Only valid partition roots that are not isolated and with - * non-empty effective_cpus will be saved into csn[]. + * non-empty effective_cpus will be saved into csa[]. */ if ((cp->partition_root_state == PRS_ROOT) && !cpumask_empty(cp->effective_cpus)) - csa[csn++] = cp; + csa[ndoms++] = cp; /* * Skip @cp's subtree if not a partition root and has no @@ -927,40 +815,18 @@ v2: } rcu_read_unlock(); - /* - * If there are only isolated partitions underneath the cgroup root, - * we can optimize out unneeded sched domains scanning. - */ - if (root_load_balance && (csn == 1)) - goto single_root_domain; - - for (i = 0; i < csn; i++) - uf_node_init(&csa[i]->node); - - /* Merge overlapping cpusets */ - for (i = 0; i < csn; i++) { - for (j = i + 1; j < csn; j++) { - if (cpusets_overlap(csa[i], csa[j])) { + for (i = 0; i < ndoms; i++) { + for (j = i + 1; j < ndoms; j++) { + if (cpusets_overlap(csa[i], csa[j])) /* * Cgroup v2 shouldn't pass down overlapping * partition root cpusets. */ - WARN_ON_ONCE(cgrpv2); - uf_union(&csa[i]->node, &csa[j]->node); - } + WARN_ON_ONCE(1); } } - /* Count the total number of domains */ - for (i = 0; i < csn; i++) { - if (uf_find(&csa[i]->node) == &csa[i]->node) - ndoms++; - } - - /* - * Now we know how many domains to create. - * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 
- */ +generate_doms: doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -977,45 +843,19 @@ v2: * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a * subset of HK_TYPE_DOMAIN housekeeping CPUs. */ - if (cgrpv2) { - for (i = 0; i < ndoms; i++) { - /* - * The top cpuset may contain some boot time isolated - * CPUs that need to be excluded from the sched domain. - */ - if (csa[i] == &top_cpuset) - cpumask_and(doms[i], csa[i]->effective_cpus, - housekeeping_cpumask(HK_TYPE_DOMAIN)); - else - cpumask_copy(doms[i], csa[i]->effective_cpus); - if (dattr) - dattr[i] = SD_ATTR_INIT; - } - goto done; - } - - for (nslot = 0, i = 0; i < csn; i++) { - nslot_update = 0; - for (j = i; j < csn; j++) { - if (uf_find(&csa[j]->node) == &csa[i]->node) { - struct cpumask *dp = doms[nslot]; - - if (i == j) { - nslot_update = 1; - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - } - cpumask_or(dp, dp, csa[j]->effective_cpus); - cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); - if (dattr) - update_domain_attr_tree(dattr + nslot, csa[j]); - } - } - if (nslot_update) - nslot++; + for (i = 0; i < ndoms; i++) { + /* + * The top cpuset may contain some boot time isolated + * CPUs that need to be excluded from the sched domain. + */ + if (!csa || csa[i] == &top_cpuset) + cpumask_and(doms[i], top_cpuset.effective_cpus, + housekeeping_cpumask(HK_TYPE_DOMAIN)); + else + cpumask_copy(doms[i], csa[i]->effective_cpus); + if (dattr) + dattr[i] = SD_ATTR_INIT; } - BUG_ON(nslot != ndoms); done: kfree(csa); @@ -1055,7 +895,7 @@ void dl_rebuild_rd_accounting(void) int cpu; u64 cookie = ++dl_cookie; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); @@ -1100,53 +940,33 @@ void dl_rebuild_rd_accounting(void) */ void rebuild_sched_domains_locked(void) { - struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; cpumask_var_t *doms; - struct cpuset *cs; int ndoms; + int i; lockdep_assert_cpus_held(); - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); force_sd_rebuild = false; - /* - * If we have raced with CPU hotplug, return early to avoid - * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_handle_hotplug() will rebuild sched domains. - * - * With no CPUs in any subpartitions, top_cpuset's effective CPUs - * should be the same as the active CPUs, so checking only top_cpuset - * is enough to detect racing CPU offlines. - */ - if (cpumask_empty(subpartitions_cpus) && - !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - return; + /* Generate domain masks and attrs */ + ndoms = generate_sched_domains(&doms, &attr); /* - * With subpartition CPUs, however, the effective CPUs of a partition - * root should be only a subset of the active CPUs. Since a CPU in any - * partition root could be offlined, all must be checked. - */ - if (!cpumask_empty(subpartitions_cpus)) { - rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (!is_partition_valid(cs)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } - if (!cpumask_subset(cs->effective_cpus, - cpu_active_mask)) { - rcu_read_unlock(); - return; - } - } - rcu_read_unlock(); + * cpuset_hotplug_workfn is invoked synchronously now, thus this + * function should not race with CPU hotplug. And the effective CPUs + * must not include any offline CPUs. 
Passing an offline CPU in the + * doms to partition_sched_domains() will trigger a kernel panic. + * + * We perform a final check here: if the doms contains any + * offline CPUs, a warning is emitted and we return directly to + * prevent the panic. + */ + for (i = 0; i < ndoms; ++i) { + if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) + return; } - /* Generate domain masks and attrs */ - ndoms = generate_sched_domains(&doms, &attr); - /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); } @@ -1501,23 +1321,29 @@ static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, int retval = 0; if (cpumask_empty(excpus)) - return retval; + return 0; /* - * Exclude exclusive CPUs from siblings + * Remove exclusive CPUs from siblings */ rcu_read_lock(); cpuset_for_each_child(sibling, css, parent) { + struct cpumask *sibling_xcpus; + if (sibling == cs) continue; - if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { - cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); - retval++; - continue; - } - if (cpumask_intersects(excpus, sibling->effective_xcpus)) { - cpumask_andnot(excpus, excpus, sibling->effective_xcpus); + /* + * If exclusive_cpus is defined, effective_xcpus will always + * be a subset. Otherwise, effective_xcpus will only be set + * in a valid partition root. + */ + sibling_xcpus = cpumask_empty(sibling->exclusive_cpus) + ? sibling->effective_xcpus + : sibling->exclusive_cpus; + + if (cpumask_intersects(excpus, sibling_xcpus)) { + cpumask_andnot(excpus, excpus, sibling_xcpus); retval++; } } @@ -1806,7 +1632,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int parent_prs = parent->partition_root_state; bool nocpu; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ /* @@ -2315,17 +2141,13 @@ get_css: spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; - if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) - compute_excpus(cp, cp->effective_xcpus); - /* - * Make sure effective_xcpus is properly set for a valid - * partition root. + * Need to compute effective_xcpus if either exclusive_cpus + * is non-empty or it is a valid partition root. */ - if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) - cpumask_and(cp->effective_xcpus, - cp->cpus_allowed, parent->effective_xcpus); - else if (new_prs < 0) + if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus)) + compute_excpus(cp, cp->effective_xcpus); + if (new_prs <= 0) reset_partition_data(cp); spin_unlock_irq(&callback_lock); @@ -2378,7 +2200,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); /* * Check all its siblings and call update_cpumasks_hier() @@ -2387,27 +2209,20 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus - * directly. + * directly. It should not impact valid partition. * * The update_cpumasks_hier() function may sleep. So we have to * release the RCU read lock before calling it. 
*/ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { - if (sibling == cs) + if (sibling == cs || is_partition_valid(sibling)) continue; - if (!is_partition_valid(sibling)) { - compute_effective_cpumask(tmp->new_cpus, sibling, - parent); - if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) - continue; - } else if (is_remote_partition(sibling)) { - /* - * Change in a sibling cpuset won't affect a remote - * partition root. - */ + + compute_effective_cpumask(tmp->new_cpus, sibling, + parent); + if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) continue; - } if (!css_tryget_online(&sibling->css)) continue; @@ -2463,43 +2278,6 @@ static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *tri return PERR_NONE; } -static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, - struct tmpmasks *tmp) -{ - int retval; - struct cpuset *parent = parent_cs(cs); - - retval = validate_change(cs, trialcs); - - if ((retval == -EINVAL) && cpuset_v2()) { - struct cgroup_subsys_state *css; - struct cpuset *cp; - - /* - * The -EINVAL error code indicates that partition sibling - * CPU exclusivity rule has been violated. We still allow - * the cpumask change to proceed while invalidating the - * partition. However, any conflicting sibling partitions - * have to be marked as invalid too. - */ - trialcs->prs_err = PERR_NOTEXCL; - rcu_read_lock(); - cpuset_for_each_child(cp, css, parent) { - struct cpumask *xcpus = user_xcpus(trialcs); - - if (is_partition_valid(cp) && - cpumask_intersects(xcpus, cp->effective_xcpus)) { - rcu_read_unlock(); - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); - rcu_read_lock(); - } - } - rcu_read_unlock(); - retval = 0; - } - return retval; -} - /** * partition_cpus_change - Handle partition state changes due to CPU mask updates * @cs: The target cpuset being modified @@ -2559,15 +2337,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; - if (alloc_tmpmasks(&tmp)) - return -ENOMEM; - compute_trialcs_excpus(trialcs, cs); trialcs->prs_err = PERR_NONE; - retval = cpus_allowed_validate_change(cs, trialcs, &tmp); + retval = validate_change(cs, trialcs); if (retval < 0) - goto out_free; + return retval; + + if (alloc_tmpmasks(&tmp)) + return -ENOMEM; /* * Check all the descendants in update_cpumasks_hier() if @@ -2590,7 +2368,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ if (cs->partition_root_state) update_partition_sd_lb(cs, old_prs); -out_free: + free_tmpmasks(&tmp); return retval; } @@ -2843,13 +2621,13 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); - nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); + bool has_mems = nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); /* * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (is_in_v2_mode() && nodes_empty(*new_mems)) + if (is_in_v2_mode() && !has_mems) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. 
*/ @@ -3249,7 +3027,7 @@ static nodemask_t cpuset_attach_nodemask_to; static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); if (cs != &top_cpuset) guarantee_active_cpus(task, cpus_attach); @@ -3605,8 +3383,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) return ERR_PTR(-ENOMEM); __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - fmeter_init(&cs->fmeter); - cs->relax_domain_level = -1; + cpuset1_init(cs); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cpuset_v2()) @@ -3619,17 +3396,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); - struct cpuset *tmp_cs; - struct cgroup_subsys_state *pos_css; if (!parent) return 0; cpuset_full_lock(); - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); /* * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated */ @@ -3644,39 +3415,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->effective_mems; } spin_unlock_irq(&callback_lock); + cpuset1_online_css(css); - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) - goto out_unlock; - - /* - * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is - * set. This flag handling is implemented in cgroup core for - * historical reasons - the flag may be specified during mount. - * - * Currently, if any sibling cpusets have exclusive cpus or mem, we - * refuse to clone the configuration - thereby refusing the task to - * be entered, and as a result refusing the sys_unshare() or - * clone() which initiated it. If this becomes a problem for some - * users who wish to allow that scenario, then this could be - * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. 
- */ - rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_css, parent) { - if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { - rcu_read_unlock(); - goto out_unlock; - } - } - rcu_read_unlock(); - - spin_lock_irq(&callback_lock); - cs->mems_allowed = parent->mems_allowed; - cs->effective_mems = parent->mems_allowed; - cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); - cpumask_copy(cs->effective_cpus, parent->cpus_allowed); - spin_unlock_irq(&callback_lock); -out_unlock: cpuset_full_unlock(); return 0; } @@ -3876,7 +3616,7 @@ int __init cpuset_init(void) cpumask_setall(top_cpuset.exclusive_cpus); nodes_setall(top_cpuset.effective_mems); - fmeter_init(&top_cpuset.fmeter); + cpuset1_init(&top_cpuset); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); @@ -4210,7 +3950,7 @@ static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask */ void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) { - lockdep_assert_held(&cpuset_mutex); + lockdep_assert_cpuset_lock_held(); __cpuset_cpus_allowed_locked(tsk, pmask); } diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 81ea38dd6f9d..a5490097fe52 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -230,7 +230,7 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) } static void cgroup_masks_read_one(struct seq_file *seq, const char *name, - u16 mask) + u32 mask) { struct cgroup_subsys *ss; int ssid; diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 9f6ab7dabf67..774702591d26 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -84,7 +84,7 @@ CONFIG_SLUB_DEBUG_ON=y # Debug Oops, Lockups and Hangs # CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0 -# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0 CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DETECT_HUNG_TASK=y CONFIG_PANIC_ON_OOPS=y diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 99dac1aa972a..3952b3e102e0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -44,9 +44,15 @@ note_buf_t __percpu *crash_notes; int kimage_crash_copy_vmcoreinfo(struct kimage *image) { - struct page *vmcoreinfo_page; + struct page *vmcoreinfo_base; + struct page *vmcoreinfo_pages[DIV_ROUND_UP(VMCOREINFO_BYTES, PAGE_SIZE)]; + unsigned int order, nr_pages; + int i; void *safecopy; + nr_pages = DIV_ROUND_UP(VMCOREINFO_BYTES, PAGE_SIZE); + order = get_order(VMCOREINFO_BYTES); + if (!IS_ENABLED(CONFIG_CRASH_DUMP)) return 0; if (image->type != KEXEC_TYPE_CRASH) @@ -61,12 +67,15 @@ int kimage_crash_copy_vmcoreinfo(struct kimage *image) * happens to generate vmcoreinfo note, hereby we rely on * vmap for this purpose. 
*/ - vmcoreinfo_page = kimage_alloc_control_pages(image, 0); - if (!vmcoreinfo_page) { + vmcoreinfo_base = kimage_alloc_control_pages(image, order); + if (!vmcoreinfo_base) { pr_warn("Could not allocate vmcoreinfo buffer\n"); return -ENOMEM; } - safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + for (i = 0; i < nr_pages; i++) + vmcoreinfo_pages[i] = vmcoreinfo_base + i; + + safecopy = vmap(vmcoreinfo_pages, nr_pages, VM_MAP, PAGE_KERNEL); if (!safecopy) { pr_warn("Could not vmap vmcoreinfo buffer\n"); return -ENOMEM; diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c index 401423ba477d..37129243054d 100644 --- a/kernel/crash_dump_dm_crypt.c +++ b/kernel/crash_dump_dm_crypt.c @@ -143,6 +143,7 @@ static int read_key_from_user_keying(struct dm_crypt_key *dm_key) { const struct user_key_payload *ukp; struct key *key; + int ret = 0; kexec_dprintk("Requesting logon key %s", dm_key->key_desc); key = request_key(&key_type_logon, dm_key->key_desc, NULL); @@ -152,20 +153,28 @@ static int read_key_from_user_keying(struct dm_crypt_key *dm_key) return PTR_ERR(key); } + down_read(&key->sem); ukp = user_key_payload_locked(key); - if (!ukp) - return -EKEYREVOKED; + if (!ukp) { + ret = -EKEYREVOKED; + goto out; + } if (ukp->datalen > KEY_SIZE_MAX) { pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX); - return -EINVAL; + ret = -EINVAL; + goto out; } memcpy(dm_key->data, ukp->data, ukp->datalen); dm_key->key_size = ukp->datalen; kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size, dm_key->key_desc, dm_key->data); - return 0; + +out: + up_read(&key->sem); + key_put(key); + return ret; } struct config_key { @@ -223,7 +232,7 @@ static void config_key_release(struct config_item *item) key_count--; } -static struct configfs_item_operations config_key_item_ops = { +static const struct configfs_item_operations config_key_item_ops = { .release = config_key_release, }; @@ -298,7 +307,7 @@ static struct configfs_attribute *config_keys_attrs[] = { * Note that, since no extra work is required on ->drop_item(), * no ->drop_item() is provided. */ -static struct configfs_group_operations config_keys_group_ops = { +static const struct configfs_group_operations config_keys_group_ops = { .make_item = config_keys_make_item, }; diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 22fe969c5d2e..f586afd76c80 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -27,6 +27,7 @@ #include <linux/kernel.h> #include <linux/sched/signal.h> +#include <linux/hex.h> #include <linux/kgdb.h> #include <linux/kdb.h> #include <linux/serial_core.h> diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 30e7912ebb0d..2e55c493c98b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -18,6 +18,8 @@ do { \ d->type##_delay_max = tsk->delays->type##_delay_max; \ d->type##_delay_min = tsk->delays->type##_delay_min; \ + d->type##_delay_max_ts.tv_sec = tsk->delays->type##_delay_max_ts.tv_sec; \ + d->type##_delay_max_ts.tv_nsec = tsk->delays->type##_delay_max_ts.tv_nsec; \ tmp = d->type##_delay_total + tsk->delays->type##_delay; \ d->type##_delay_total = (tmp < d->type##_delay_total) ? 
0 : tmp; \ d->type##_count += tsk->delays->type##_count; \ @@ -104,7 +106,8 @@ void __delayacct_tsk_init(struct task_struct *tsk) * Finish delay accounting for a statistic using its timestamps (@start), * accumulator (@total) and @count */ -static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min) +static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, + u64 *max, u64 *min, struct timespec64 *ts) { s64 ns = local_clock() - *start; unsigned long flags; @@ -113,8 +116,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou raw_spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - if (ns > *max) + if (ns > *max) { *max = ns; + ktime_get_real_ts64(ts); + } if (*min == 0 || ns < *min) *min = ns; raw_spin_unlock_irqrestore(lock, flags); @@ -137,7 +142,8 @@ void __delayacct_blkio_end(struct task_struct *p) &p->delays->blkio_delay, &p->delays->blkio_count, &p->delays->blkio_delay_max, - &p->delays->blkio_delay_min); + &p->delays->blkio_delay_min, + &p->delays->blkio_delay_max_ts); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -170,6 +176,8 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_delay_max = tsk->sched_info.max_run_delay; d->cpu_delay_min = tsk->sched_info.min_run_delay; + d->cpu_delay_max_ts.tv_sec = tsk->sched_info.max_run_delay_ts.tv_sec; + d->cpu_delay_max_ts.tv_nsec = tsk->sched_info.max_run_delay_ts.tv_nsec; tmp = (s64)d->cpu_delay_total + t2; d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; tmp = (s64)d->cpu_run_virtual_total + t3; @@ -217,7 +225,8 @@ void __delayacct_freepages_end(void) ¤t->delays->freepages_delay, ¤t->delays->freepages_count, ¤t->delays->freepages_delay_max, - ¤t->delays->freepages_delay_min); + ¤t->delays->freepages_delay_min, + ¤t->delays->freepages_delay_max_ts); } void __delayacct_thrashing_start(bool *in_thrashing) @@ -241,7 +250,8 @@ void __delayacct_thrashing_end(bool *in_thrashing) ¤t->delays->thrashing_delay, ¤t->delays->thrashing_count, ¤t->delays->thrashing_delay_max, - ¤t->delays->thrashing_delay_min); + ¤t->delays->thrashing_delay_min, + ¤t->delays->thrashing_delay_max_ts); } void __delayacct_swapin_start(void) @@ -256,7 +266,8 @@ void __delayacct_swapin_end(void) ¤t->delays->swapin_delay, ¤t->delays->swapin_count, ¤t->delays->swapin_delay_max, - ¤t->delays->swapin_delay_min); + ¤t->delays->swapin_delay_min, + ¤t->delays->swapin_delay_max_ts); } void __delayacct_compact_start(void) @@ -271,7 +282,8 @@ void __delayacct_compact_end(void) ¤t->delays->compact_delay, ¤t->delays->compact_count, ¤t->delays->compact_delay_max, - ¤t->delays->compact_delay_min); + ¤t->delays->compact_delay_min, + ¤t->delays->compact_delay_max_ts); } void __delayacct_wpcopy_start(void) @@ -286,7 +298,8 @@ void __delayacct_wpcopy_end(void) ¤t->delays->wpcopy_delay, ¤t->delays->wpcopy_count, ¤t->delays->wpcopy_delay_max, - ¤t->delays->wpcopy_delay_min); + ¤t->delays->wpcopy_delay_min, + ¤t->delays->wpcopy_delay_max_ts); } void __delayacct_irq(struct task_struct *task, u32 delta) @@ -296,8 +309,10 @@ void __delayacct_irq(struct task_struct *task, u32 delta) raw_spin_lock_irqsave(&task->delays->lock, flags); task->delays->irq_delay += delta; task->delays->irq_count++; - if (delta > task->delays->irq_delay_max) + if (delta > task->delays->irq_delay_max) { task->delays->irq_delay_max = delta; + ktime_get_real_ts64(&task->delays->irq_delay_max_ts); + } if (delta && (!task->delays->irq_delay_min || delta < 
task->delays->irq_delay_min)) task->delays->irq_delay_min = delta; raw_spin_unlock_irqrestore(&task->delays->lock, flags); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 31cfdb6b4bc3..159900736f25 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -47,12 +47,6 @@ config ARCH_HAS_DMA_SET_MASK config ARCH_HAS_DMA_WRITE_COMBINE bool -# -# Select if the architectures provides the arch_dma_mark_clean hook -# -config ARCH_HAS_DMA_MARK_CLEAN - bool - config DMA_DECLARE_COHERENT bool diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 138ede653de4..43d6a996d7a7 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -63,6 +63,7 @@ enum map_err_types { * @sg_mapped_ents: 'mapped_ents' from dma_map_sg * @paddr: physical start address of the mapping * @map_err_type: track whether dma_mapping_error() was checked + * @is_cache_clean: driver promises not to write to buffer while mapped * @stack_len: number of backtrace entries in @stack_entries * @stack_entries: stack of backtrace history */ @@ -76,7 +77,8 @@ struct dma_debug_entry { int sg_call_ents; int sg_mapped_ents; phys_addr_t paddr; - enum map_err_types map_err_type; + enum map_err_types map_err_type; + bool is_cache_clean; #ifdef CONFIG_STACKTRACE unsigned int stack_len; unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; @@ -472,12 +474,15 @@ static int active_cacheline_dec_overlap(phys_addr_t cln) return active_cacheline_set_overlap(cln, --overlap); } -static int active_cacheline_insert(struct dma_debug_entry *entry) +static int active_cacheline_insert(struct dma_debug_entry *entry, + bool *overlap_cache_clean) { phys_addr_t cln = to_cacheline_number(entry); unsigned long flags; int rc; + *overlap_cache_clean = false; + /* If the device is not writing memory then we don't have any * concerns about the cpu consuming stale data. This mitigates * legitimate usages of overlapping mappings. @@ -487,8 +492,16 @@ static int active_cacheline_insert(struct dma_debug_entry *entry) spin_lock_irqsave(&radix_lock, flags); rc = radix_tree_insert(&dma_active_cacheline, cln, entry); - if (rc == -EEXIST) + if (rc == -EEXIST) { + struct dma_debug_entry *existing; + active_cacheline_inc_overlap(cln); + existing = radix_tree_lookup(&dma_active_cacheline, cln); + /* A lookup failure here after we got -EEXIST is unexpected. 
*/ + WARN_ON(!existing); + if (existing) + *overlap_cache_clean = existing->is_cache_clean; + } spin_unlock_irqrestore(&radix_lock, flags); return rc; @@ -583,19 +596,24 @@ DEFINE_SHOW_ATTRIBUTE(dump); */ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) { + bool overlap_cache_clean; struct hash_bucket *bucket; unsigned long flags; int rc; + entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN); + bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); put_hash_bucket(bucket, flags); - rc = active_cacheline_insert(entry); + rc = active_cacheline_insert(entry, &overlap_cache_clean); if (rc == -ENOMEM) { pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; - } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + } else if (rc == -EEXIST && + !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + !(entry->is_cache_clean && overlap_cache_clean) && !(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && is_swiotlb_active(entry->dev))) { err_printk(entry->dev, entry, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 50c3fe2a1d55..c9fa983990cd 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -425,9 +425,6 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu(paddr, sg->length, dir); swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir); - - if (dir == DMA_FROM_DEVICE) - arch_dma_mark_clean(paddr, sg->length); } if (!dev_is_dma_coherent(dev)) diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index da2fadf45bcd..f476c63b668c 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -75,9 +75,6 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, } swiotlb_sync_single_for_cpu(dev, paddr, size, dir); - - if (dir == DMA_FROM_DEVICE) - arch_dma_mark_clean(paddr, size); } static inline dma_addr_t dma_direct_map_phys(struct device *dev, diff --git a/kernel/fork.c b/kernel/fork.c index 9c5effbdbdc1..e832da9d15a4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1357,7 +1357,7 @@ struct file *get_task_exe_file(struct task_struct *task) * @task: The task. * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning - * this kernel workthread has transiently adopted a user mm with use_mm, + * this kernel workthread has transiently adopted a user mm with kthread_use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() * after use. Typically used by /proc and ptrace. @@ -2069,7 +2069,7 @@ __latent_entropy struct task_struct *copy_process( p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* - * Clear TID on mm_release()? + * TID is cleared in mm_release() when the task exits */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? 
args->child_tid : NULL; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4c01e9d5ccc7..c2258b133939 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1913,6 +1913,7 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) irq_domain_free_irq_data(virq, nr_irqs); irq_free_descs(virq, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs); static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq) { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 049e296f586c..aec2f06858af 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -151,8 +151,10 @@ static unsigned int get_symbol_offset(unsigned long pos) unsigned long kallsyms_sym_address(int idx) { - /* values are unsigned offsets */ - return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; + /* non-relocatable 32-bit kernels just embed the value directly */ + if (!IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_RELOCATABLE)) + return (u32)kallsyms_offsets[idx]; + return (unsigned long)offset_to_ptr(kallsyms_offsets + idx); } static unsigned int get_symbol_seq(int index) @@ -345,7 +347,7 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, return 1; } return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) || - !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); + !!bpf_address_lookup(addr, symbolsize, offset, namebuf); } static int kallsyms_lookup_buildid(unsigned long addr, @@ -355,8 +357,21 @@ static int kallsyms_lookup_buildid(unsigned long addr, { int ret; - namebuf[KSYM_NAME_LEN - 1] = 0; + /* + * kallsyms_lookus() returns pointer to namebuf on success and + * NULL on error. But some callers ignore the return value. + * Instead they expect @namebuf filled either with valid + * or empty string. + */ namebuf[0] = 0; + /* + * Initialize the module-related return values. They are not set + * when the symbol is in vmlinux or it is a bpf address. 
+ */ + if (modname) + *modname = NULL; + if (modbuildid) + *modbuildid = NULL; if (is_ksym_addr(addr)) { unsigned long pos; @@ -365,10 +380,6 @@ static int kallsyms_lookup_buildid(unsigned long addr, /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), namebuf, KSYM_NAME_LEN); - if (modname) - *modname = NULL; - if (modbuildid) - *modbuildid = NULL; return strlen(namebuf); } @@ -377,12 +388,11 @@ static int kallsyms_lookup_buildid(unsigned long addr, ret = module_address_lookup(addr, symbolsize, offset, modname, modbuildid, namebuf); if (!ret) - ret = bpf_address_lookup(addr, symbolsize, - offset, modname, namebuf); + ret = bpf_address_lookup(addr, symbolsize, offset, namebuf); if (!ret) - ret = ftrace_mod_address_lookup(addr, symbolsize, - offset, modname, namebuf); + ret = ftrace_mod_address_lookup(addr, symbolsize, offset, + modname, modbuildid, namebuf); return ret; } @@ -426,6 +436,37 @@ int lookup_symbol_name(unsigned long addr, char *symname) return lookup_module_symbol_name(addr, symname); } +#ifdef CONFIG_STACKTRACE_BUILD_ID + +static int append_buildid(char *buffer, const char *modname, + const unsigned char *buildid) +{ + if (!modname) + return 0; + + if (!buildid) { + pr_warn_once("Undefined buildid for the module %s\n", modname); + return 0; + } + + /* build ID should match length of sprintf */ +#ifdef CONFIG_MODULES + static_assert(sizeof(typeof_member(struct module, build_id)) == 20); +#endif + + return sprintf(buffer, " %20phN", buildid); +} + +#else /* CONFIG_STACKTRACE_BUILD_ID */ + +static int append_buildid(char *buffer, const char *modname, + const unsigned char *buildid) +{ + return 0; +} + +#endif /* CONFIG_STACKTRACE_BUILD_ID */ + /* Look up a kernel symbol and return it in a text buffer. */ static int __sprint_symbol(char *buffer, unsigned long address, int symbol_offset, int add_offset, int add_buildid) @@ -435,6 +476,9 @@ static int __sprint_symbol(char *buffer, unsigned long address, unsigned long offset, size; int len; + /* Prevent module removal until modname and modbuildid are printed */ + guard(rcu)(); + address += symbol_offset; len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, buffer); @@ -448,15 +492,8 @@ static int __sprint_symbol(char *buffer, unsigned long address, if (modname) { len += sprintf(buffer + len, " [%s", modname); -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) - if (add_buildid && buildid) { - /* build ID should match length of sprintf */ -#if IS_ENABLED(CONFIG_MODULES) - static_assert(sizeof(typeof_member(struct module, build_id)) == 20); -#endif - len += sprintf(buffer + len, " %20phN", buildid); - } -#endif + if (add_buildid) + len += append_buildid(buffer + len, modname, buildid); len += sprintf(buffer + len, "]"); } diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h index 9633782f8250..81a867dbe57d 100644 --- a/kernel/kallsyms_internal.h +++ b/kernel/kallsyms_internal.h @@ -8,7 +8,6 @@ extern const int kallsyms_offsets[]; extern const u8 kallsyms_names[]; extern const unsigned int kallsyms_num_syms; -extern const unsigned long kallsyms_relative_base; extern const char kallsyms_token_table[]; extern const u16 kallsyms_token_index[]; diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 219d22857c98..8ef8167be745 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -176,7 +176,7 @@ static bool __report_matches(const struct expect_report *r) /* Title */ cur = expect[0]; - end = &expect[0][sizeof(expect[0]) - 1]; + end = ARRAY_END(expect[0]); cur += 
scnprintf(cur, end - cur, "BUG: KCSAN: %s in ", is_assert ? "assert: race" : "data-race"); if (r->access[1].fn) { @@ -200,7 +200,7 @@ static bool __report_matches(const struct expect_report *r) /* Access 1 */ cur = expect[1]; - end = &expect[1][sizeof(expect[1]) - 1]; + end = ARRAY_END(expect[1]); if (!r->access[1].fn) cur += scnprintf(cur, end - cur, "race at unknown origin, with "); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index eb62a9794242..2bfbb2d144e6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -883,6 +883,60 @@ out_free_sha_regions: #ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY /* + * kexec_purgatory_find_symbol - find a symbol in the purgatory + * @pi: Purgatory to search in. + * @name: Name of the symbol. + * + * Return: pointer to symbol in read-only symtab on success, NULL on error. + */ +static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + const Elf_Shdr *sechdrs; + const Elf_Ehdr *ehdr; + const Elf_Sym *syms; + const char *strtab; + int i, k; + + if (!pi->ehdr) + return NULL; + + ehdr = pi->ehdr; + sechdrs = (void *)ehdr + ehdr->e_shoff; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (void *)ehdr + sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} +/* * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory. * @pi: Purgatory to be loaded. * @kbuf: Buffer to setup. @@ -960,6 +1014,10 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, unsigned long offset; size_t sechdrs_size; Elf_Shdr *sechdrs; + const Elf_Sym *entry_sym; + u16 entry_shndx = 0; + unsigned long entry_off = 0; + bool start_fixed = false; int i; /* @@ -977,6 +1035,12 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, bss_addr = kbuf->mem + kbuf->bufsz; kbuf->image->start = pi->ehdr->e_entry; + entry_sym = kexec_purgatory_find_symbol(pi, "purgatory_start"); + if (entry_sym) { + entry_shndx = entry_sym->st_shndx; + entry_off = entry_sym->st_value; + } + for (i = 0; i < pi->ehdr->e_shnum; i++) { unsigned long align; void *src, *dst; @@ -994,6 +1058,13 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, offset = ALIGN(offset, align); + if (!start_fixed && entry_sym && i == entry_shndx && + (sechdrs[i].sh_flags & SHF_EXECINSTR) && + entry_off < sechdrs[i].sh_size) { + kbuf->image->start = kbuf->mem + offset + entry_off; + start_fixed = true; + } + /* * Check if the segment contains the entry point, if so, * calculate the value of image->start based on it. @@ -1004,13 +1075,14 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, * is not set to the initial value, and warn the user so they * have a chance to fix their purgatory's linker script. 
*/ - if (sechdrs[i].sh_flags & SHF_EXECINSTR && + if (!start_fixed && sechdrs[i].sh_flags & SHF_EXECINSTR && pi->ehdr->e_entry >= sechdrs[i].sh_addr && pi->ehdr->e_entry < (sechdrs[i].sh_addr + sechdrs[i].sh_size) && - !WARN_ON(kbuf->image->start != pi->ehdr->e_entry)) { + kbuf->image->start == pi->ehdr->e_entry) { kbuf->image->start -= sechdrs[i].sh_addr; kbuf->image->start += kbuf->mem + offset; + start_fixed = true; } src = (void *)pi->ehdr + sechdrs[i].sh_offset; @@ -1128,61 +1200,6 @@ out_free_kbuf: return ret; } -/* - * kexec_purgatory_find_symbol - find a symbol in the purgatory - * @pi: Purgatory to search in. - * @name: Name of the symbol. - * - * Return: pointer to symbol in read-only symtab on success, NULL on error. - */ -static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, - const char *name) -{ - const Elf_Shdr *sechdrs; - const Elf_Ehdr *ehdr; - const Elf_Sym *syms; - const char *strtab; - int i, k; - - if (!pi->ehdr) - return NULL; - - ehdr = pi->ehdr; - sechdrs = (void *)ehdr + ehdr->e_shoff; - - for (i = 0; i < ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_SYMTAB) - continue; - - if (sechdrs[i].sh_link >= ehdr->e_shnum) - /* Invalid strtab section number */ - continue; - strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; - syms = (void *)ehdr + sechdrs[i].sh_offset; - - /* Go through symbols for a match */ - for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { - if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) - continue; - - if (strcmp(strtab + syms[k].st_name, name) != 0) - continue; - - if (syms[k].st_shndx == SHN_UNDEF || - syms[k].st_shndx >= ehdr->e_shnum) { - pr_debug("Symbol: %s has bad section index %d.\n", - name, syms[k].st_shndx); - return NULL; - } - - /* Found the symbol we are looking for */ - return &syms[k]; - } - } - - return NULL; -} - void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) { struct purgatory_info *pi = &image->purgatory_info; diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig index d2aeaf13c3ac..1a8513f16ef7 100644 --- a/kernel/liveupdate/Kconfig +++ b/kernel/liveupdate/Kconfig @@ -54,7 +54,6 @@ config KEXEC_HANDOVER_ENABLE_DEFAULT config LIVEUPDATE bool "Live Update Orchestrator" depends on KEXEC_HANDOVER - depends on SHMEM help Enable the Live Update Orchestrator. Live Update is a mechanism, typically based on kexec, that allows the kernel to be updated @@ -73,4 +72,20 @@ config LIVEUPDATE If unsure, say N. +config LIVEUPDATE_MEMFD + bool "Live update support for memfd" + depends on LIVEUPDATE + depends on MEMFD_CREATE + depends on SHMEM + default LIVEUPDATE + help + Enable live update support for memfd regions. This allows preserving + memfd-backed memory across kernel live updates. + + This can be used to back VM memory with memfds, allowing the guest + memory to persist, or for other user workloads needing to preserve + pages. + + If unsure, say N. 
+ endmenu diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile index 7cad2eece32d..d2f779cbe279 100644 --- a/kernel/liveupdate/Makefile +++ b/kernel/liveupdate/Makefile @@ -3,6 +3,7 @@ luo-y := \ luo_core.o \ luo_file.o \ + luo_flb.o \ luo_session.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 90d411a59f76..fb3a7b67676e 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -15,6 +15,7 @@ #include <linux/count_zeros.h> #include <linux/kexec.h> #include <linux/kexec_handover.h> +#include <linux/kho/abi/kexec_handover.h> #include <linux/libfdt.h> #include <linux/list.h> #include <linux/memblock.h> @@ -24,7 +25,6 @@ #include <asm/early_ioremap.h> -#include "kexec_handover_internal.h" /* * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. @@ -33,10 +33,7 @@ #include "../kexec_internal.h" #include "kexec_handover_internal.h" -#define KHO_FDT_COMPATIBLE "kho-v1" -#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" -#define PROP_SUB_FDT "fdt" - +/* The magic token for preserved pages */ #define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */ /* @@ -219,10 +216,32 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, return 0; } +/* For physically contiguous 0-order pages. */ +static void kho_init_pages(struct page *page, unsigned long nr_pages) +{ + for (unsigned long i = 0; i < nr_pages; i++) + set_page_count(page + i, 1); +} + +static void kho_init_folio(struct page *page, unsigned int order) +{ + unsigned long nr_pages = (1 << order); + + /* Head page gets refcount of 1. */ + set_page_count(page, 1); + + /* For higher order folios, tail pages get a page count of zero. */ + for (unsigned long i = 1; i < nr_pages; i++) + set_page_count(page + i, 0); + + if (order > 0) + prep_compound_page(page, order); +} + static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) { struct page *page = pfn_to_online_page(PHYS_PFN(phys)); - unsigned int nr_pages, ref_cnt; + unsigned long nr_pages; union kho_page_info info; if (!page) @@ -240,20 +259,11 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) /* Clear private to make sure later restores on this page error out. */ page->private = 0; - /* Head page gets refcount of 1. */ - set_page_count(page, 1); - /* - * For higher order folios, tail pages get a page count of zero. - * For physically contiguous order-0 pages every pages gets a page - * count of 1 - */ - ref_cnt = is_folio ? 0 : 1; - for (unsigned int i = 1; i < nr_pages; i++) - set_page_count(page + i, ref_cnt); - - if (is_folio && info.order) - prep_compound_page(page, info.order); + if (is_folio) + kho_init_folio(page, info.order); + else + kho_init_pages(page, nr_pages); /* Always mark headpage's codetag as empty to avoid accounting mismatch */ clear_page_tag_ref(page); @@ -289,9 +299,9 @@ EXPORT_SYMBOL_GPL(kho_restore_folio); * Restore a contiguous list of order 0 pages that was preserved with * kho_preserve_pages(). * - * Return: 0 on success, error code on failure + * Return: the first page on success, NULL on failure. 
*/ -struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages) +struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages) { const unsigned long start_pfn = PHYS_PFN(phys); const unsigned long end_pfn = start_pfn + nr_pages; @@ -386,7 +396,7 @@ static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk) void *ptr; u64 phys; - ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL); + ptr = fdt_getprop_w(kho_out.fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, NULL); /* Check and discard previous memory map */ phys = get_unaligned((u64 *)ptr); @@ -474,7 +484,7 @@ static phys_addr_t __init kho_get_mem_map_phys(const void *fdt) const void *mem_ptr; int len; - mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); + mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len); if (!mem_ptr || len != sizeof(u64)) { pr_err("failed to get preserved memory bitmaps\n"); return 0; @@ -645,11 +655,13 @@ static void __init kho_reserve_scratch(void) scratch_size_update(); /* FIXME: deal with node hot-plug/remove */ - kho_scratch_cnt = num_online_nodes() + 2; + kho_scratch_cnt = nodes_weight(node_states[N_MEMORY]) + 2; size = kho_scratch_cnt * sizeof(*kho_scratch); kho_scratch = memblock_alloc(size, PAGE_SIZE); - if (!kho_scratch) + if (!kho_scratch) { + pr_err("Failed to reserve scratch array\n"); goto err_disable_kho; + } /* * reserve scratch area in low memory for lowmem allocations in the @@ -658,8 +670,10 @@ static void __init kho_reserve_scratch(void) size = scratch_size_lowmem; addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0, ARCH_LOW_ADDRESS_LIMIT); - if (!addr) + if (!addr) { + pr_err("Failed to reserve lowmem scratch buffer\n"); goto err_free_scratch_desc; + } kho_scratch[i].addr = addr; kho_scratch[i].size = size; @@ -668,20 +682,28 @@ static void __init kho_reserve_scratch(void) /* reserve large contiguous area for allocations without nid */ size = scratch_size_global; addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES); - if (!addr) + if (!addr) { + pr_err("Failed to reserve global scratch buffer\n"); goto err_free_scratch_areas; + } kho_scratch[i].addr = addr; kho_scratch[i].size = size; i++; - for_each_online_node(nid) { + /* + * Loop over nodes that have both memory and are online. Skip + * memoryless nodes, as we can not allocate scratch areas there. 
+ */ + for_each_node_state(nid, N_MEMORY) { size = scratch_size_node(nid); addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES, 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid, true); - if (!addr) + if (!addr) { + pr_err("Failed to reserve nid %d scratch buffer\n", nid); goto err_free_scratch_areas; + } kho_scratch[i].addr = addr; kho_scratch[i].size = size; @@ -735,7 +757,8 @@ int kho_add_subtree(const char *name, void *fdt) goto out_pack; } - err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys)); + err = fdt_setprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, + &phys, sizeof(phys)); if (err < 0) goto out_pack; @@ -766,7 +789,7 @@ void kho_remove_subtree(void *fdt) const u64 *val; int len; - val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len); + val = fdt_getprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, &len); if (!val || len != sizeof(phys_addr_t)) continue; @@ -831,7 +854,7 @@ EXPORT_SYMBOL_GPL(kho_unpreserve_folio); * * Return: 0 on success, error code on failure */ -int kho_preserve_pages(struct page *page, unsigned int nr_pages) +int kho_preserve_pages(struct page *page, unsigned long nr_pages) { struct kho_mem_track *track = &kho_out.track; const unsigned long start_pfn = page_to_pfn(page); @@ -875,7 +898,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_pages); * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger * preserved blocks is not supported. */ -void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +void kho_unpreserve_pages(struct page *page, unsigned long nr_pages) { struct kho_mem_track *track = &kho_out.track; const unsigned long start_pfn = page_to_pfn(page); @@ -885,21 +908,6 @@ void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) } EXPORT_SYMBOL_GPL(kho_unpreserve_pages); -struct kho_vmalloc_hdr { - DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); -}; - -#define KHO_VMALLOC_SIZE \ - ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \ - sizeof(phys_addr_t)) - -struct kho_vmalloc_chunk { - struct kho_vmalloc_hdr hdr; - phys_addr_t phys[KHO_VMALLOC_SIZE]; -}; - -static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE); - /* vmalloc flags KHO supports */ #define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP) @@ -1315,7 +1323,7 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) if (offset < 0) return -ENOENT; - val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len); + val = fdt_getprop(fdt, offset, KHO_FDT_SUB_TREE_PROP_NAME, &len); if (!val || len != sizeof(*val)) return -EINVAL; @@ -1335,7 +1343,7 @@ static __init int kho_out_fdt_setup(void) err |= fdt_finish_reservemap(root); err |= fdt_begin_node(root, ""); err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); - err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map, + err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map, sizeof(empty_mem_map)); err |= fdt_end_node(root); err |= fdt_finish(root); @@ -1451,46 +1459,40 @@ void __init kho_memory_init(void) void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len) { + unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch); struct kho_scratch *scratch = NULL; phys_addr_t mem_map_phys; void *fdt = NULL; - int err = 0; - unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch); + int err; /* Validate the input FDT */ fdt = early_memremap(fdt_phys, fdt_len); if (!fdt) { pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys); - err = -EFAULT; - goto out; + goto err_report; } err = 
fdt_check_header(fdt); if (err) { pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n", fdt_phys, err); - err = -EINVAL; - goto out; + goto err_unmap_fdt; } err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE); if (err) { pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n", fdt_phys, KHO_FDT_COMPATIBLE, err); - err = -EINVAL; - goto out; + goto err_unmap_fdt; } mem_map_phys = kho_get_mem_map_phys(fdt); - if (!mem_map_phys) { - err = -ENOENT; - goto out; - } + if (!mem_map_phys) + goto err_unmap_fdt; scratch = early_memremap(scratch_phys, scratch_len); if (!scratch) { pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n", scratch_phys, scratch_len); - err = -EFAULT; - goto out; + goto err_unmap_fdt; } /* @@ -1507,7 +1509,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, if (WARN_ON(err)) { pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe", &area->addr, &size, ERR_PTR(err)); - goto out; + goto err_unmap_scratch; } pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); } @@ -1529,13 +1531,14 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, kho_scratch_cnt = scratch_cnt; pr_info("found kexec handover data.\n"); -out: - if (fdt) - early_memunmap(fdt, fdt_len); - if (scratch) - early_memunmap(scratch, scratch_len); - if (err) - pr_warn("disabling KHO revival: %d\n", err); + return; + +err_unmap_scratch: + early_memunmap(scratch, scratch_len); +err_unmap_fdt: + early_memunmap(fdt, fdt_len); +err_report: + pr_warn("disabling KHO revival\n"); } /* Helper functions for kexec_file_load */ diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index 944663d99dd9..dda7bb57d421 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -35,8 +35,7 @@ * iommu, interrupts, vfio, participating filesystems, and memory management. * * LUO uses Kexec Handover to transfer memory state from the current kernel to - * the next kernel. For more details see - * Documentation/core-api/kho/concepts.rst. + * the next kernel. For more details see Documentation/core-api/kho/index.rst. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -128,7 +127,9 @@ static int __init luo_early_startup(void) if (err) return err; - return 0; + err = luo_flb_setup_incoming(luo_global.fdt_in); + + return err; } static int __init liveupdate_early_init(void) @@ -165,6 +166,7 @@ static int __init luo_fdt_setup(void) err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE); err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln)); err |= luo_session_setup_outgoing(fdt_out); + err |= luo_flb_setup_outgoing(fdt_out); err |= fdt_end_node(fdt_out); err |= fdt_finish(fdt_out); if (err) @@ -226,6 +228,8 @@ int liveupdate_reboot(void) if (err) return err; + luo_flb_serialize(); + err = kho_finalize(); if (err) { pr_err("kho_finalize failed %d\n", err); diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index 9f7283379ebc..4c7df52a6507 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -104,6 +104,7 @@ #include <linux/io.h> #include <linux/kexec_handover.h> #include <linux/kho/abi/luo.h> +#include <linux/list_private.h> #include <linux/liveupdate.h> #include <linux/module.h> #include <linux/sizes.h> @@ -273,7 +274,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) goto err_fput; err = -ENOENT; - luo_list_for_each_private(fh, &luo_file_handler_list, list) { + list_private_for_each_entry(fh, &luo_file_handler_list, list) { if (fh->ops->can_preserve(fh, file)) { err = 0; break; @@ -284,10 +285,14 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) if (err) goto err_free_files_mem; + err = luo_flb_file_preserve(fh); + if (err) + goto err_free_files_mem; + luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL); if (!luo_file) { err = -ENOMEM; - goto err_free_files_mem; + goto err_flb_unpreserve; } luo_file->file = file; @@ -311,6 +316,8 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) err_kfree: kfree(luo_file); +err_flb_unpreserve: + luo_flb_file_unpreserve(fh); err_free_files_mem: luo_free_files_mem(file_set); err_fput: @@ -352,6 +359,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set) args.serialized_data = luo_file->serialized_data; args.private_data = luo_file->private_data; luo_file->fh->ops->unpreserve(&args); + luo_flb_file_unpreserve(luo_file->fh); list_del(&luo_file->list); file_set->count--; @@ -627,6 +635,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set, args.retrieved = luo_file->retrieved; luo_file->fh->ops->finish(&args); + luo_flb_file_finish(luo_file->fh); } /** @@ -758,7 +767,7 @@ int luo_file_deserialize(struct luo_file_set *file_set, bool handler_found = false; struct luo_file *luo_file; - luo_list_for_each_private(fh, &luo_file_handler_list, list) { + list_private_for_each_entry(fh, &luo_file_handler_list, list) { if (!strcmp(fh->compatible, file_ser[i].compatible)) { handler_found = true; break; @@ -833,7 +842,7 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) return -EBUSY; /* Check for duplicate compatible strings */ - luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) { + list_private_for_each_entry(fh_iter, &luo_file_handler_list, list) { if (!strcmp(fh_iter->compatible, fh->compatible)) { pr_err("File handler registration failed: Compatible string '%s' already registered.\n", fh->compatible); @@ -848,10 +857,13 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) goto err_resume; } + INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, flb_list)); 
INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list)); list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list); luo_session_resume(); + liveupdate_test_register(fh); + return 0; err_resume: @@ -868,23 +880,38 @@ err_resume: * * It ensures safe removal by checking that: * No live update session is currently in progress. + * No FLB registered with this file handler. * * If the unregistration fails, the internal test state is reverted. * * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live - * update is in progress, can't quiesce live update. + * update is in progress, can't quiesce live update or FLB is registred with + * this file handler. */ int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) { + int err = -EBUSY; + if (!liveupdate_enabled()) return -EOPNOTSUPP; + liveupdate_test_unregister(fh); + if (!luo_session_quiesce()) - return -EBUSY; + goto err_register; + + if (!list_empty(&ACCESS_PRIVATE(fh, flb_list))) + goto err_resume; list_del(&ACCESS_PRIVATE(fh, list)); module_put(fh->ops->owner); luo_session_resume(); return 0; + +err_resume: + luo_session_resume(); +err_register: + liveupdate_test_register(fh); + return err; } diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c new file mode 100644 index 000000000000..4c437de5c0b0 --- /dev/null +++ b/kernel/liveupdate/luo_flb.c @@ -0,0 +1,654 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +/** + * DOC: LUO File Lifecycle Bound Global Data + * + * File-Lifecycle-Bound (FLB) objects provide a mechanism for managing global + * state that is shared across multiple live-updatable files. The lifecycle of + * this shared state is tied to the preservation of the files that depend on it. + * + * An FLB represents a global resource, such as the IOMMU core state, that is + * required by multiple file descriptors (e.g., all VFIO fds). + * + * The preservation of the FLB's state is triggered when the *first* file + * depending on it is preserved. The cleanup of this state (unpreserve or + * finish) is triggered when the *last* file depending on it is unpreserved or + * finished. + * + * Handler Dependency: A file handler declares its dependency on one or more + * FLBs by registering them via liveupdate_register_flb(). + * + * Callback Model: Each FLB is defined by a set of operations + * (&struct liveupdate_flb_ops) that LUO invokes at key points: + * + * - .preserve(): Called for the first file. Saves global state. + * - .unpreserve(): Called for the last file (if aborted pre-reboot). + * - .retrieve(): Called on-demand in the new kernel to restore the state. + * - .finish(): Called for the last file in the new kernel for cleanup. + * + * This reference-counted approach ensures that shared state is saved exactly + * once and restored exactly once, regardless of how many files depend on it, + * and that its lifecycle is correctly managed across the kexec transition. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cleanup.h> +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/io.h> +#include <linux/kexec_handover.h> +#include <linux/kho/abi/luo.h> +#include <linux/libfdt.h> +#include <linux/list_private.h> +#include <linux/liveupdate.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/unaligned.h> +#include "luo_internal.h" + +#define LUO_FLB_PGCNT 1ul +#define LUO_FLB_MAX (((LUO_FLB_PGCNT << PAGE_SHIFT) - \ + sizeof(struct luo_flb_header_ser)) / sizeof(struct luo_flb_ser)) + +struct luo_flb_header { + struct luo_flb_header_ser *header_ser; + struct luo_flb_ser *ser; + bool active; +}; + +struct luo_flb_global { + struct luo_flb_header incoming; + struct luo_flb_header outgoing; + struct list_head list; + long count; +}; + +static struct luo_flb_global luo_flb_global = { + .list = LIST_HEAD_INIT(luo_flb_global.list), +}; + +/* + * struct luo_flb_link - Links an FLB definition to a file handler's internal + * list of dependencies. + * @flb: A pointer to the registered &struct liveupdate_flb definition. + * @list: The list_head for linking. + */ +struct luo_flb_link { + struct liveupdate_flb *flb; + struct list_head list; +}; + +/* luo_flb_get_private - Access private field, and if needed initialize it. */ +static struct luo_flb_private *luo_flb_get_private(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = &ACCESS_PRIVATE(flb, private); + + if (!private->initialized) { + mutex_init(&private->incoming.lock); + mutex_init(&private->outgoing.lock); + INIT_LIST_HEAD(&private->list); + private->users = 0; + private->initialized = true; + } + + return private; +} + +static int luo_flb_file_preserve_one(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + + scoped_guard(mutex, &private->outgoing.lock) { + if (!private->outgoing.count) { + struct liveupdate_flb_op_args args = {0}; + int err; + + args.flb = flb; + err = flb->ops->preserve(&args); + if (err) + return err; + private->outgoing.data = args.data; + private->outgoing.obj = args.obj; + } + private->outgoing.count++; + } + + return 0; +} + +static void luo_flb_file_unpreserve_one(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + + scoped_guard(mutex, &private->outgoing.lock) { + private->outgoing.count--; + if (!private->outgoing.count) { + struct liveupdate_flb_op_args args = {0}; + + args.flb = flb; + args.data = private->outgoing.data; + args.obj = private->outgoing.obj; + + if (flb->ops->unpreserve) + flb->ops->unpreserve(&args); + + private->outgoing.data = 0; + private->outgoing.obj = NULL; + } + } +} + +static int luo_flb_retrieve_one(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + struct luo_flb_header *fh = &luo_flb_global.incoming; + struct liveupdate_flb_op_args args = {0}; + bool found = false; + int err; + + guard(mutex)(&private->incoming.lock); + + if (private->incoming.finished) + return -ENODATA; + + if (private->incoming.retrieved) + return 0; + + if (!fh->active) + return -ENODATA; + + for (int i = 0; i < fh->header_ser->count; i++) { + if (!strcmp(fh->ser[i].name, flb->compatible)) { + private->incoming.data = fh->ser[i].data; + private->incoming.count = fh->ser[i].count; + found = true; + break; + } + } + + if (!found) + return -ENOENT; + + args.flb = flb; + args.data = private->incoming.data; + + err = flb->ops->retrieve(&args); + if (err) + return 
err; + + private->incoming.obj = args.obj; + private->incoming.retrieved = true; + + return 0; +} + +static void luo_flb_file_finish_one(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + u64 count; + + scoped_guard(mutex, &private->incoming.lock) + count = --private->incoming.count; + + if (!count) { + struct liveupdate_flb_op_args args = {0}; + + if (!private->incoming.retrieved) { + int err = luo_flb_retrieve_one(flb); + + if (WARN_ON(err)) + return; + } + + scoped_guard(mutex, &private->incoming.lock) { + args.flb = flb; + args.obj = private->incoming.obj; + flb->ops->finish(&args); + + private->incoming.data = 0; + private->incoming.obj = NULL; + private->incoming.finished = true; + } + } +} + +/** + * luo_flb_file_preserve - Notifies FLBs that a file is about to be preserved. + * @fh: The file handler for the preserved file. + * + * This function iterates through all FLBs associated with the given file + * handler. It increments the reference count for each FLB. If the count becomes + * 1, it triggers the FLB's .preserve() callback to save the global state. + * + * This operation is atomic. If any FLB's .preserve() op fails, it will roll + * back by calling .unpreserve() on any FLBs that were successfully preserved + * during this call. + * + * Context: Called from luo_preserve_file() + * Return: 0 on success, or a negative errno on failure. + */ +int luo_flb_file_preserve(struct liveupdate_file_handler *fh) +{ + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + int err = 0; + + list_for_each_entry(iter, flb_list, list) { + err = luo_flb_file_preserve_one(iter->flb); + if (err) + goto exit_err; + } + + return 0; + +exit_err: + list_for_each_entry_continue_reverse(iter, flb_list, list) + luo_flb_file_unpreserve_one(iter->flb); + + return err; +} + +/** + * luo_flb_file_unpreserve - Notifies FLBs that a dependent file was unpreserved. + * @fh: The file handler for the unpreserved file. + * + * This function iterates through all FLBs associated with the given file + * handler, in reverse order of registration. It decrements the reference count + * for each FLB. If the count becomes 0, it triggers the FLB's .unpreserve() + * callback to clean up the global state. + * + * Context: Called when a preserved file is being cleaned up before reboot + * (e.g., from luo_file_unpreserve_files()). + */ +void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh) +{ + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + + list_for_each_entry_reverse(iter, flb_list, list) + luo_flb_file_unpreserve_one(iter->flb); +} + +/** + * luo_flb_file_finish - Notifies FLBs that a dependent file has been finished. + * @fh: The file handler for the finished file. + * + * This function iterates through all FLBs associated with the given file + * handler, in reverse order of registration. It decrements the incoming + * reference count for each FLB. If the count becomes 0, it triggers the FLB's + * .finish() callback for final cleanup in the new kernel. + * + * Context: Called from luo_file_finish() for each file being finished. + */ +void luo_flb_file_finish(struct liveupdate_file_handler *fh) +{ + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + + list_for_each_entry_reverse(iter, flb_list, list) + luo_flb_file_finish_one(iter->flb); +} + +/** + * liveupdate_register_flb - Associate an FLB with a file handler and register it globally. 
+ * @fh: The file handler that will now depend on the FLB. + * @flb: The File-Lifecycle-Bound object to associate. + * + * Establishes a dependency, informing the LUO core that whenever a file of + * type @fh is preserved, the state of @flb must also be managed. + * + * On the first registration of a given @flb object, it is added to a global + * registry. This function checks for duplicate registrations, both for a + * specific handler and globally, and ensures the total number of unique + * FLBs does not exceed the system limit. + * + * Context: Typically called from a subsystem's module init function after + * both the handler and the FLB have been defined and initialized. + * Return: 0 on success. Returns a negative errno on failure: + * -EINVAL if arguments are NULL or not initialized. + * -ENOMEM on memory allocation failure. + * -EEXIST if this FLB is already registered with this handler. + * -ENOSPC if the maximum number of global FLBs has been reached. + * -EOPNOTSUPP if live update is disabled or not configured. + */ +int liveupdate_register_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *link __free(kfree) = NULL; + struct liveupdate_flb *gflb; + struct luo_flb_link *iter; + int err; + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + if (WARN_ON(!flb->ops->preserve || !flb->ops->unpreserve || + !flb->ops->retrieve || !flb->ops->finish)) { + return -EINVAL; + } + + /* + * File handler must already be registered, as it initializes the + * flb_list + */ + if (WARN_ON(list_empty(&ACCESS_PRIVATE(fh, list)))) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + + /* + * Ensure the system is quiescent (no active sessions). + * This acts as a global lock for registration: no other thread can + * be in this section, and no sessions can be creating/using FDs. + */ + if (!luo_session_quiesce()) + return -EBUSY; + + /* Check that this FLB is not already linked to this file handler */ + err = -EEXIST; + list_for_each_entry(iter, flb_list, list) { + if (iter->flb == flb) + goto err_resume; + } + + /* + * If this FLB is not linked to global list it's the first time the FLB + * is registered + */ + if (!private->users) { + if (WARN_ON(!list_empty(&private->list))) { + err = -EINVAL; + goto err_resume; + } + + if (luo_flb_global.count == LUO_FLB_MAX) { + err = -ENOSPC; + goto err_resume; + } + + /* Check that compatible string is unique in global list */ + list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { + if (!strcmp(gflb->compatible, flb->compatible)) + goto err_resume; + } + + if (!try_module_get(flb->ops->owner)) { + err = -EAGAIN; + goto err_resume; + } + + list_add_tail(&private->list, &luo_flb_global.list); + luo_flb_global.count++; + } + + /* Finally, link the FLB to the file handler */ + private->users++; + link->flb = flb; + list_add_tail(&no_free_ptr(link)->list, flb_list); + luo_session_resume(); + + return 0; + +err_resume: + luo_session_resume(); + return err; +} + +/** + * liveupdate_unregister_flb - Remove an FLB dependency from a file handler. + * @fh: The file handler that is currently depending on the FLB. + * @flb: The File-Lifecycle-Bound object to remove. + * + * Removes the association between the specified file handler and the FLB + * previously established by liveupdate_register_flb(). 
+ * + * This function manages the global lifecycle of the FLB. It decrements the + * FLB's usage count. If this was the last file handler referencing this FLB, + * the FLB is removed from the global registry and the reference to its + * owner module (acquired during registration) is released. + * + * Context: This function ensures the session is quiesced (no active FDs + * being created) during the update. It is typically called from a + * subsystem's module exit function. + * Return: 0 on success. + * -EOPNOTSUPP if live update is disabled. + * -EBUSY if the live update session is active and cannot be quiesced. + * -ENOENT if the FLB was not found in the file handler's list. + */ +int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + int err = -ENOENT; + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + /* + * Ensure the system is quiescent (no active sessions). + * This acts as a global lock for unregistration. + */ + if (!luo_session_quiesce()) + return -EBUSY; + + /* Find and remove the link from the file handler's list */ + list_for_each_entry(iter, flb_list, list) { + if (iter->flb == flb) { + list_del(&iter->list); + kfree(iter); + err = 0; + break; + } + } + + if (err) + goto err_resume; + + private->users--; + /* + * If this is the last file-handler with which we are registred, remove + * from the global list, and relese module reference. + */ + if (!private->users) { + list_del_init(&private->list); + luo_flb_global.count--; + module_put(flb->ops->owner); + } + + luo_session_resume(); + + return 0; + +err_resume: + luo_session_resume(); + return err; +} + +/** + * liveupdate_flb_get_incoming - Retrieve the incoming FLB object. + * @flb: The FLB definition. + * @objp: Output parameter; will be populated with the live shared object. + * + * Returns a pointer to its shared live object for the incoming (post-reboot) + * path. + * + * If this is the first time the object is requested in the new kernel, this + * function will trigger the FLB's .retrieve() callback to reconstruct the + * object from its preserved state. Subsequent calls will return the same + * cached object. + * + * Return: 0 on success, or a negative errno on failure. -ENODATA means no + * incoming FLB data, -ENOENT means specific flb not found in the incoming + * data, and -EOPNOTSUPP when live update is disabled or not configured. + */ +int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + if (!private->incoming.obj) { + int err = luo_flb_retrieve_one(flb); + + if (err) + return err; + } + + guard(mutex)(&private->incoming.lock); + *objp = private->incoming.obj; + + return 0; +} + +/** + * liveupdate_flb_get_outgoing - Retrieve the outgoing FLB object. + * @flb: The FLB definition. + * @objp: Output parameter; will be populated with the live shared object. + * + * Returns a pointer to its shared live object for the outgoing (pre-reboot) + * path. + * + * This function assumes the object has already been created by the FLB's + * .preserve() callback, which is triggered when the first dependent file + * is preserved. + * + * Return: 0 on success, or a negative errno on failure. 
+ */ +int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + guard(mutex)(&private->outgoing.lock); + *objp = private->outgoing.obj; + + return 0; +} + +int __init luo_flb_setup_outgoing(void *fdt_out) +{ + struct luo_flb_header_ser *header_ser; + u64 header_ser_pa; + int err; + + header_ser = kho_alloc_preserve(LUO_FLB_PGCNT << PAGE_SHIFT); + if (IS_ERR(header_ser)) + return PTR_ERR(header_ser); + + header_ser_pa = virt_to_phys(header_ser); + + err = fdt_begin_node(fdt_out, LUO_FDT_FLB_NODE_NAME); + err |= fdt_property_string(fdt_out, "compatible", + LUO_FDT_FLB_COMPATIBLE); + err |= fdt_property(fdt_out, LUO_FDT_FLB_HEADER, &header_ser_pa, + sizeof(header_ser_pa)); + err |= fdt_end_node(fdt_out); + + if (err) + goto err_unpreserve; + + header_ser->pgcnt = LUO_FLB_PGCNT; + luo_flb_global.outgoing.header_ser = header_ser; + luo_flb_global.outgoing.ser = (void *)(header_ser + 1); + luo_flb_global.outgoing.active = true; + + return 0; + +err_unpreserve: + kho_unpreserve_free(header_ser); + + return err; +} + +int __init luo_flb_setup_incoming(void *fdt_in) +{ + struct luo_flb_header_ser *header_ser; + int err, header_size, offset; + const void *ptr; + u64 header_ser_pa; + + offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_FLB_NODE_NAME); + if (offset < 0) { + pr_err("Unable to get FLB node [%s]\n", LUO_FDT_FLB_NODE_NAME); + + return -ENOENT; + } + + err = fdt_node_check_compatible(fdt_in, offset, + LUO_FDT_FLB_COMPATIBLE); + if (err) { + pr_err("FLB node is incompatible with '%s' [%d]\n", + LUO_FDT_FLB_COMPATIBLE, err); + + return -EINVAL; + } + + header_size = 0; + ptr = fdt_getprop(fdt_in, offset, LUO_FDT_FLB_HEADER, &header_size); + if (!ptr || header_size != sizeof(u64)) { + pr_err("Unable to get FLB header property '%s' [%d]\n", + LUO_FDT_FLB_HEADER, header_size); + + return -EINVAL; + } + + header_ser_pa = get_unaligned((u64 *)ptr); + header_ser = phys_to_virt(header_ser_pa); + + luo_flb_global.incoming.header_ser = header_ser; + luo_flb_global.incoming.ser = (void *)(header_ser + 1); + luo_flb_global.incoming.active = true; + + return 0; +} + +/** + * luo_flb_serialize - Serializes all active FLB objects for KHO. + * + * This function is called from the reboot path. It iterates through all + * registered File-Lifecycle-Bound (FLB) objects. For each FLB that has been + * preserved (i.e., its reference count is greater than zero), it writes its + * metadata into the memory region designated for Kexec Handover. + * + * The serialized data includes the FLB's compatibility string, its opaque + * data handle, and the final reference count. This allows the new kernel to + * find the appropriate handler and reconstruct the FLB's state. + * + * Context: Called from liveupdate_reboot() just before kho_finalize(). 
+ */ +void luo_flb_serialize(void) +{ + struct luo_flb_header *fh = &luo_flb_global.outgoing; + struct liveupdate_flb *gflb; + int i = 0; + + list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { + struct luo_flb_private *private = luo_flb_get_private(gflb); + + if (private->outgoing.count > 0) { + strscpy(fh->ser[i].name, gflb->compatible, + sizeof(fh->ser[i].name)); + fh->ser[i].data = private->outgoing.data; + fh->ser[i].count = private->outgoing.count; + i++; + } + } + + fh->header_ser->count = i; +} diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h index c8973b543d1d..8083d8739b09 100644 --- a/kernel/liveupdate/luo_internal.h +++ b/kernel/liveupdate/luo_internal.h @@ -40,13 +40,6 @@ static inline int luo_ucmd_respond(struct luo_ucmd *ucmd, */ #define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__) -/* Mimics list_for_each_entry() but for private list head entries */ -#define luo_list_for_each_private(pos, head, member) \ - for (struct list_head *__iter = (head)->next; \ - __iter != (head) && \ - ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \ - __iter = __iter->next) - /** * struct luo_file_set - A set of files that belong to the same sessions. * @files_list: An ordered list of files associated with this session, it is @@ -107,4 +100,19 @@ int luo_file_deserialize(struct luo_file_set *file_set, void luo_file_set_init(struct luo_file_set *file_set); void luo_file_set_destroy(struct luo_file_set *file_set); +int luo_flb_file_preserve(struct liveupdate_file_handler *fh); +void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh); +void luo_flb_file_finish(struct liveupdate_file_handler *fh); +int __init luo_flb_setup_outgoing(void *fdt); +int __init luo_flb_setup_incoming(void *fdt); +void luo_flb_serialize(void); + +#ifdef CONFIG_LIVEUPDATE_TEST +void liveupdate_test_register(struct liveupdate_file_handler *fh); +void liveupdate_test_unregister(struct liveupdate_file_handler *fh); +#else +static inline void liveupdate_test_register(struct liveupdate_file_handler *fh) { } +static inline void liveupdate_test_unregister(struct liveupdate_file_handler *fh) { } +#endif + #endif /* _LINUX_LUO_INTERNAL_H */ diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 00a60796327c..0fc11e45df9b 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -334,13 +334,8 @@ int module_address_lookup(unsigned long addr, if (mod) { if (modname) *modname = mod->name; - if (modbuildid) { -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) - *modbuildid = mod->build_id; -#else - *modbuildid = NULL; -#endif - } + if (modbuildid) + *modbuildid = module_buildid(mod); sym = find_kallsyms_symbol(mod, addr, size, offset); diff --git a/kernel/panic.c b/kernel/panic.c index 0c20fcaae98a..c78600212b6c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -42,6 +42,7 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +#define PANIC_MSG_BUFSZ 1024 #ifdef CONFIG_SMP /* @@ -74,6 +75,8 @@ EXPORT_SYMBOL_GPL(panic_timeout); unsigned long panic_print; +static int panic_force_cpu = -1; + ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); @@ -300,6 +303,150 @@ void __weak crash_smp_send_stop(void) } atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +atomic_t panic_redirect_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); + +#if defined(CONFIG_SMP) && defined(CONFIG_CRASH_DUMP) +static char *panic_force_buf; + +static int __init panic_force_cpu_setup(char *str) +{ + int cpu; + + if (!str) + return 
-EINVAL; + + if (kstrtoint(str, 0, &cpu) || cpu < 0 || cpu >= nr_cpu_ids) { + pr_warn("panic_force_cpu: invalid value '%s'\n", str); + return -EINVAL; + } + + panic_force_cpu = cpu; + return 0; +} +early_param("panic_force_cpu", panic_force_cpu_setup); + +static int __init panic_force_cpu_late_init(void) +{ + if (panic_force_cpu < 0) + return 0; + + panic_force_buf = kmalloc(PANIC_MSG_BUFSZ, GFP_KERNEL); + + return 0; +} +late_initcall(panic_force_cpu_late_init); + +static void do_panic_on_target_cpu(void *info) +{ + panic("%s", (char *)info); +} + +/** + * panic_smp_redirect_cpu - Redirect panic to target CPU + * @target_cpu: CPU that should handle the panic + * @msg: formatted panic message + * + * Default implementation uses IPI. Architectures with NMI support + * can override this for more reliable delivery. + * + * Return: 0 on success, negative errno on failure + */ +int __weak panic_smp_redirect_cpu(int target_cpu, void *msg) +{ + static call_single_data_t panic_csd; + + panic_csd.func = do_panic_on_target_cpu; + panic_csd.info = msg; + + return smp_call_function_single_async(target_cpu, &panic_csd); +} + +/** + * panic_try_force_cpu - Redirect panic to a specific CPU for crash kernel + * @fmt: panic message format string + * @args: arguments for format string + * + * Some platforms require panic handling to occur on a specific CPU + * for the crash kernel to function correctly. This function redirects + * panic handling to the CPU specified via the panic_force_cpu= boot parameter. + * + * Returns false if panic should proceed on current CPU. + * Returns true if panic was redirected. + */ +__printf(1, 0) +static bool panic_try_force_cpu(const char *fmt, va_list args) +{ + int this_cpu = raw_smp_processor_id(); + int old_cpu = PANIC_CPU_INVALID; + const char *msg; + + /* Feature not enabled via boot parameter */ + if (panic_force_cpu < 0) + return false; + + /* Already on target CPU - proceed normally */ + if (this_cpu == panic_force_cpu) + return false; + + /* Target CPU is offline, can't redirect */ + if (!cpu_online(panic_force_cpu)) { + pr_warn("panic: target CPU %d is offline, continuing on CPU %d\n", + panic_force_cpu, this_cpu); + return false; + } + + /* Another panic already in progress */ + if (panic_in_progress()) + return false; + + /* + * Only one CPU can do the redirect. Use atomic cmpxchg to ensure + * we don't race with another CPU also trying to redirect. + */ + if (!atomic_try_cmpxchg(&panic_redirect_cpu, &old_cpu, this_cpu)) + return false; + + /* + * Use dynamically allocated buffer if available, otherwise + * fall back to static message for early boot panics or allocation failure. 
+ */ + if (panic_force_buf) { + vsnprintf(panic_force_buf, PANIC_MSG_BUFSZ, fmt, args); + msg = panic_force_buf; + } else { + msg = "Redirected panic (buffer unavailable)"; + } + + console_verbose(); + bust_spinlocks(1); + + pr_emerg("panic: Redirecting from CPU %d to CPU %d for crash kernel.\n", + this_cpu, panic_force_cpu); + + /* Dump original CPU before redirecting */ + if (!test_taint(TAINT_DIE) && + oops_in_progress <= 1 && + IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { + dump_stack(); + } + + if (panic_smp_redirect_cpu(panic_force_cpu, (void *)msg) != 0) { + atomic_set(&panic_redirect_cpu, PANIC_CPU_INVALID); + pr_warn("panic: failed to redirect to CPU %d, continuing on CPU %d\n", + panic_force_cpu, this_cpu); + return false; + } + + /* IPI/NMI sent, this CPU should stop */ + return true; +} +#else +__printf(1, 0) +static inline bool panic_try_force_cpu(const char *fmt, va_list args) +{ + return false; +} +#endif /* CONFIG_SMP && CONFIG_CRASH_DUMP */ bool panic_try_start(void) { @@ -428,7 +575,7 @@ static void panic_other_cpus_shutdown(bool crash_kexec) */ void vpanic(const char *fmt, va_list args) { - static char buf[1024]; + static char buf[PANIC_MSG_BUFSZ]; long i, i_next = 0, len; int state = 0; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; @@ -452,6 +599,15 @@ void vpanic(const char *fmt, va_list args) local_irq_disable(); preempt_disable_notrace(); + /* Redirect panic to target CPU if configured via panic_force_cpu=. */ + if (panic_try_force_cpu(fmt, args)) { + /* + * Mark ourselves offline so panic_other_cpus_shutdown() won't wait + * for us on architectures that check num_online_cpus(). + */ + set_cpu_online(smp_processor_id(), false); + panic_smp_self_stop(); + } /* * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want @@ -484,7 +640,11 @@ void vpanic(const char *fmt, va_list args) /* * Avoid nested stack-dumping if a panic occurs during oops processing */ - if (test_taint(TAINT_DIE) || oops_in_progress > 1) { + if (atomic_read(&panic_redirect_cpu) != PANIC_CPU_INVALID && + panic_force_cpu == raw_smp_processor_id()) { + pr_emerg("panic: Redirected from CPU %d, skipping stack dump.\n", + atomic_read(&panic_redirect_cpu)); + } else if (test_taint(TAINT_DIE) || oops_in_progress > 1) { panic_this_cpu_backtrace_printed = true; } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { dump_stack(); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7e462957c9bf..c4eb284b8e72 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -174,10 +174,10 @@ sector_t alloc_swapdev_block(int swap) * Allocate a swap page and register that it has been allocated, so that * it can be freed in case of an error. 
*/ - offset = swp_offset(get_swap_page_of_type(swap)); + offset = swp_offset(swap_alloc_hibernation_slot(swap)); if (offset) { if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); + swap_free_hibernation_slot(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } @@ -186,6 +186,7 @@ sector_t alloc_swapdev_block(int swap) void free_all_swap_pages(int swap) { + unsigned long offset; struct rb_node *node; /* @@ -197,8 +198,9 @@ void free_all_swap_pages(int swap) ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); - swap_free_nr(swp_entry(swap, ext->start), - ext->end - ext->start + 1); + + for (offset = ext->start; offset <= ext->end; offset++) + swap_free_hibernation_slot(swp_entry(swap, offset)); kfree(ext); } diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 5f5f626f4279..5fdea5682756 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -281,12 +281,20 @@ struct printk_buffers { * nothing to output and this record should be skipped. * @seq: The sequence number of the record used for @pbufs->outbuf. * @dropped: The number of dropped records from reading @seq. + * @cpu: CPU on which the message was generated. + * @pid: PID of the task that generated the message + * @comm: Name of the task that generated the message. */ struct printk_message { struct printk_buffers *pbufs; unsigned int outbuf_len; u64 seq; unsigned long dropped; +#ifdef CONFIG_PRINTK_EXECUTION_CTX + int cpu; + pid_t pid; + char comm[TASK_COMM_LEN]; +#endif }; bool printk_get_next_message(struct printk_message *pmsg, u64 seq, diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 32fc12e53675..d558b18505cd 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -946,6 +946,20 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) } EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf); +#ifdef CONFIG_PRINTK_EXECUTION_CTX +static void wctxt_load_execution_ctx(struct nbcon_write_context *wctxt, + struct printk_message *pmsg) +{ + wctxt->cpu = pmsg->cpu; + wctxt->pid = pmsg->pid; + memcpy(wctxt->comm, pmsg->comm, sizeof(wctxt->comm)); + static_assert(sizeof(wctxt->comm) == sizeof(pmsg->comm)); +} +#else +static void wctxt_load_execution_ctx(struct nbcon_write_context *wctxt, + struct printk_message *pmsg) {} +#endif + /** * nbcon_emit_next_record - Emit a record in the acquired context * @wctxt: The write context that will be handed to the write function @@ -1048,6 +1062,8 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_a /* Initialize the write context for driver callbacks. */ nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len); + wctxt_load_execution_ctx(wctxt, &pmsg); + if (use_atomic) con->write_atomic(con, wctxt); else @@ -1758,9 +1774,12 @@ bool nbcon_alloc(struct console *con) /* Synchronize the kthread start. */ lockdep_assert_console_list_lock_held(); - /* The write_thread() callback is mandatory. */ - if (WARN_ON(!con->write_thread)) + /* Check for mandatory nbcon callbacks. 
*/ + if (WARN_ON(!con->write_thread || + !con->device_lock || + !con->device_unlock)) { return false; + } rcuwait_init(&con->rcuwait); init_irq_work(&con->irq_work, nbcon_irq_work); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 37d16ef27f13..a181394604d1 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2133,12 +2133,40 @@ static inline void printk_delay(int level) } } +#define CALLER_ID_MASK 0x80000000 + static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : - 0x80000000 + smp_processor_id(); + CALLER_ID_MASK + smp_processor_id(); +} + +#ifdef CONFIG_PRINTK_EXECUTION_CTX +/* Store the opposite info than caller_id. */ +static u32 printk_caller_id2(void) +{ + return !in_task() ? task_pid_nr(current) : + CALLER_ID_MASK + smp_processor_id(); +} + +static pid_t printk_info_get_pid(const struct printk_info *info) +{ + u32 caller_id = info->caller_id; + u32 caller_id2 = info->caller_id2; + + return caller_id & CALLER_ID_MASK ? caller_id2 : caller_id; } +static int printk_info_get_cpu(const struct printk_info *info) +{ + u32 caller_id = info->caller_id; + u32 caller_id2 = info->caller_id2; + + return ((caller_id & CALLER_ID_MASK ? + caller_id : caller_id2) & ~CALLER_ID_MASK); +} +#endif + /** * printk_parse_prefix - Parse level and control flags. * @@ -2215,6 +2243,28 @@ static u16 printk_sprint(char *text, u16 size, int facility, return text_len; } +#ifdef CONFIG_PRINTK_EXECUTION_CTX +static void printk_store_execution_ctx(struct printk_info *info) +{ + info->caller_id2 = printk_caller_id2(); + get_task_comm(info->comm, current); +} + +static void pmsg_load_execution_ctx(struct printk_message *pmsg, + const struct printk_info *info) +{ + pmsg->cpu = printk_info_get_cpu(info); + pmsg->pid = printk_info_get_pid(info); + memcpy(pmsg->comm, info->comm, sizeof(pmsg->comm)); + static_assert(sizeof(pmsg->comm) == sizeof(info->comm)); +} +#else +static void printk_store_execution_ctx(struct printk_info *info) {} + +static void pmsg_load_execution_ctx(struct printk_message *pmsg, + const struct printk_info *info) {} +#endif + __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, @@ -2322,6 +2372,7 @@ int vprintk_store(int facility, int level, r.info->caller_id = caller_id; if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); + printk_store_execution_ctx(r.info); /* A message without a trailing newline can be continued. */ if (!(flags & LOG_NEWLINE)) @@ -3004,6 +3055,7 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, pmsg->seq = r.info->seq; pmsg->dropped = r.info->seq - seq; force_con = r.info->flags & LOG_FORCE_CON; + pmsg_load_execution_ctx(pmsg, r.info); /* * Skip records that are not forced to be printed on consoles and that @@ -3364,22 +3416,6 @@ void console_unlock(void) } EXPORT_SYMBOL(console_unlock); -/** - * console_conditional_schedule - yield the CPU if required - * - * If the console code is currently allowed to sleep, and - * if this CPU should yield the CPU to another task, do - * so here. - * - * Must be called within console_lock();. 
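[Note] The CONFIG_PRINTK_EXECUTION_CTX plumbing above recovers both the pid and the cpu of a message's originator from two 32-bit fields: caller_id keeps the existing encoding (pid in task context, CALLER_ID_MASK + cpu otherwise) and the new caller_id2 stores the complementary value, with the mask bit telling the decoder which field is which. A userspace sketch of the encode/decode, taken directly from the printk.c hunks above:

/* Encode/decode of caller_id and caller_id2 as added in printk.c. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CALLER_ID_MASK 0x80000000u

static uint32_t encode_caller_id(bool in_task, uint32_t pid, uint32_t cpu)
{
	return in_task ? pid : CALLER_ID_MASK + cpu;
}

/* caller_id2 stores "the opposite" of caller_id. */
static uint32_t encode_caller_id2(bool in_task, uint32_t pid, uint32_t cpu)
{
	return !in_task ? pid : CALLER_ID_MASK + cpu;
}

static uint32_t get_pid(uint32_t id, uint32_t id2)
{
	return id & CALLER_ID_MASK ? id2 : id;
}

static uint32_t get_cpu(uint32_t id, uint32_t id2)
{
	return (id & CALLER_ID_MASK ? id : id2) & ~CALLER_ID_MASK;
}

int main(void)
{
	/* Message emitted from task 1234 running on CPU 3. */
	uint32_t id  = encode_caller_id(true, 1234, 3);
	uint32_t id2 = encode_caller_id2(true, 1234, 3);

	assert(get_pid(id, id2) == 1234);
	assert(get_cpu(id, id2) == 3);

	/* Same record emitted from interrupt context on CPU 3. */
	id  = encode_caller_id(false, 1234, 3);
	id2 = encode_caller_id2(false, 1234, 3);

	assert(get_pid(id, id2) == 1234);
	assert(get_cpu(id, id2) == 3);

	printf("pid=%u cpu=%u\n", get_pid(id, id2), get_cpu(id, id2));
	return 0;
}

Both orderings decode to the same (pid, cpu) pair, which is what lets pmsg_load_execution_ctx() and wctxt_load_execution_ctx() hand a complete execution context, together with the recorded comm, to the nbcon write callbacks.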
- */ -void __sched console_conditional_schedule(void) -{ - if (console_may_schedule) - cond_resched(); -} -EXPORT_SYMBOL(console_conditional_schedule); - void console_unblank(void) { bool found_unblank = false; diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 4ef81349d9fb..1651b53ece34 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -23,6 +23,11 @@ struct printk_info { u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ u32 caller_id; /* thread id or processor id */ +#ifdef CONFIG_PRINTK_EXECUTION_CTX + u32 caller_id2; /* caller_id complement */ + /* name of the task that generated the message */ + char comm[TASK_COMM_LEN]; +#endif struct dev_printk_info dev_info; }; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c469c708fdd6..66ba6a2f83d3 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -789,7 +789,8 @@ void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) struct srcu_data *sdp; /* NMI-unsafe use in NMI is a bad sign, as is multi-bit read_flavor values. */ - WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi()); + WARN_ON_ONCE(read_flavor != SRCU_READ_FLAVOR_NMI && + read_flavor != SRCU_READ_FLAVOR_FAST && in_nmi()); WARN_ON_ONCE(read_flavor & (read_flavor - 1)); sdp = raw_cpu_ptr(ssp->sda); diff --git a/kernel/resource.c b/kernel/resource.c index e4e9bac12e6e..31341bdd7707 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -48,6 +48,14 @@ struct resource iomem_resource = { }; EXPORT_SYMBOL(iomem_resource); +struct resource soft_reserve_resource = { + .name = "Soft Reserved", + .start = 0, + .end = -1, + .desc = IORES_DESC_SOFT_RESERVED, + .flags = IORESOURCE_MEM, +}; + static DEFINE_RWLOCK(resource_lock); /* @@ -82,7 +90,7 @@ static struct resource *next_resource(struct resource *p, bool skip_children, #ifdef CONFIG_PROC_FS -enum { MAX_IORES_LEVEL = 5 }; +enum { MAX_IORES_LEVEL = 8 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) @@ -321,13 +329,14 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long } /** - * find_next_iomem_res - Finds the lowest iomem resource that covers part of - * [@start..@end]. + * find_next_res - Finds the lowest resource that covers part of + * [@start..@end]. * * If a resource is found, returns 0 and @*res is overwritten with the part * of the resource that's within [@start..@end]; if none is found, returns * -ENODEV. Returns -EINVAL for invalid parameters. * + * @parent: resource tree root to search * @start: start address of the resource searched for * @end: end address of same resource * @flags: flags which the resource must have @@ -337,9 +346,9 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long * The caller must specify @start, @end, @flags, and @desc * (which may be IORES_DESC_NONE). 
*/ -static int find_next_iomem_res(resource_size_t start, resource_size_t end, - unsigned long flags, unsigned long desc, - struct resource *res) +static int find_next_res(struct resource *parent, resource_size_t start, + resource_size_t end, unsigned long flags, + unsigned long desc, struct resource *res) { /* Skip children until we find a top level range that matches */ bool skip_children = true; @@ -353,7 +362,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for_each_resource(&iomem_resource, p, skip_children) { + for_each_resource(parent, p, skip_children) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -390,16 +399,23 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, return p ? 0 : -ENODEV; } -static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, - unsigned long flags, unsigned long desc, - void *arg, - int (*func)(struct resource *, void *)) +static int find_next_iomem_res(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + struct resource *res) +{ + return find_next_res(&iomem_resource, start, end, flags, desc, res); +} + +static int walk_res_desc(struct resource *parent, resource_size_t start, + resource_size_t end, unsigned long flags, + unsigned long desc, void *arg, + int (*func)(struct resource *, void *)) { struct resource res; int ret = -EINVAL; while (start < end && - !find_next_iomem_res(start, end, flags, desc, &res)) { + !find_next_res(parent, start, end, flags, desc, &res)) { ret = (*func)(&res, arg); if (ret) break; @@ -410,6 +426,15 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, return ret; } +static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + void *arg, + int (*func)(struct resource *, void *)) +{ + return walk_res_desc(&iomem_resource, start, end, flags, desc, arg, func); +} + + /** * walk_iomem_res_desc - Walks through iomem resources and calls func() * with matching resource ranges. @@ -435,6 +460,18 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, EXPORT_SYMBOL_GPL(walk_iomem_res_desc); /* + * In support of device drivers claiming Soft Reserved resources, walk the Soft + * Reserved resource deferral tree. + */ +int walk_soft_reserve_res(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + return walk_res_desc(&soft_reserve_resource, start, end, IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED, arg, func); +} +EXPORT_SYMBOL_GPL(walk_soft_reserve_res); + +/* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. * Now, this function is only for System RAM, it deals with full ranges and @@ -656,6 +693,18 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags, } EXPORT_SYMBOL_GPL(region_intersects); +/* + * Check if the provided range is registered in the Soft Reserved resource + * deferral tree for driver consideration. 
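[Note] The refactor above turns find_next_iomem_res() and __walk_iomem_res_desc() into thin wrappers around find_next_res()/walk_res_desc(), which take the tree root as a parameter, so the new walk_soft_reserve_res() reuses the same loop against soft_reserve_resource. The following self-contained sketch of that loop uses a sorted array in place of the kernel's resource tree; names and layout here are illustrative only.

/* Generic "find next matching range, clipped to the window" walker. */
#include <stddef.h>
#include <stdio.h>

struct res { unsigned long start, end; };

struct root {
	const char *name;
	const struct res *entries;
	size_t nr;
};

/* Find the lowest entry overlapping [start..end], clipped to that window. */
static int find_next_res(const struct root *parent, unsigned long start,
			 unsigned long end, struct res *out)
{
	for (size_t i = 0; i < parent->nr; i++) {
		const struct res *p = &parent->entries[i];

		if (p->start > end)
			break;
		if (p->end < start)
			continue;
		out->start = p->start > start ? p->start : start;
		out->end = p->end < end ? p->end : end;
		return 0;
	}
	return -1;
}

static int walk_res(const struct root *parent, unsigned long start,
		    unsigned long end,
		    int (*func)(const struct res *, void *), void *arg)
{
	struct res res;
	int ret = -1;

	while (start < end && !find_next_res(parent, start, end, &res)) {
		ret = func(&res, arg);
		if (ret)
			break;
		start = res.end + 1;
	}
	return ret;
}

static int print_range(const struct res *r, void *arg)
{
	printf("%s: [%#lx-%#lx]\n", (const char *)arg, r->start, r->end);
	return 0;
}

int main(void)
{
	static const struct res iomem[] = { { 0x1000, 0x1fff }, { 0x4000, 0x7fff } };
	static const struct res soft[]  = { { 0x4000, 0x5fff } };
	struct root iomem_root = { "iomem", iomem, sizeof(iomem) / sizeof(iomem[0]) };
	struct root soft_root  = { "soft-reserved", soft, sizeof(soft) / sizeof(soft[0]) };

	/* Same loop, different root: mirrors walk_iomem_res_desc() vs
	 * walk_soft_reserve_res() in the hunks above. */
	walk_res(&iomem_root, 0, ~0UL, print_range, "iomem");
	walk_res(&soft_root, 0, ~0UL, print_range, "soft");
	return 0;
}

The walk advances start to res.end + 1 after each callback, exactly as walk_res_desc() does, so a single pass visits each matching range once.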
+ */ +int region_intersects_soft_reserve(resource_size_t start, size_t size) +{ + guard(read_lock)(&resource_lock); + return __region_intersects(&soft_reserve_resource, start, size, + IORESOURCE_MEM, IORES_DESC_SOFT_RESERVED); +} +EXPORT_SYMBOL_GPL(region_intersects_soft_reserve); + void __weak arch_remove_reservations(struct resource *avail) { } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7c8b769c0d0d..759777694c78 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10788,10 +10788,9 @@ void sched_mm_cid_exit(struct task_struct *t) return; /* * Mode change. The task has the CID unset - * already. The CPU CID is still valid and - * does not have MM_CID_TRANSIT set as the - * mode change has just taken effect under - * mm::mm_cid::lock. Drop it. + * already and dealt with an eventually set + * TRANSIT bit. If the CID is owned by the CPU + * then drop it. */ mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e6bf73456176..c18e81e8ef51 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3520,8 +3520,8 @@ static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) * operations inside scheduler locks. */ dsq->id = SCX_DSQ_INVALID; - llist_add(&dsq->free_node, &dsqs_to_free); - irq_work_queue(&free_dsq_irq_work); + if (llist_add(&dsq->free_node, &dsqs_to_free)) + irq_work_queue(&free_dsq_irq_work); out_unlock_dsq: raw_spin_unlock_irqrestore(&dsq->lock, flags); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e51bfa3586fa..b82fb70a9d54 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3813,8 +3813,10 @@ static __always_inline void mm_unset_cid_on_task(struct task_struct *t) static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp) { /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */ - pcp->cid = cpu_cid_to_cid(pcp->cid); - mm_drop_cid(mm, pcp->cid); + if (cid_on_cpu(pcp->cid)) { + pcp->cid = cpu_cid_to_cid(pcp->cid); + mm_drop_cid(mm, pcp->cid); + } } static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c903f1a42891..a612cf253c87 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -253,8 +253,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) delta = rq_clock(rq) - t->sched_info.last_queued; t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; - if (delta > t->sched_info.max_run_delay) + if (delta > t->sched_info.max_run_delay) { t->sched_info.max_run_delay = delta; + ktime_get_real_ts64(&t->sched_info.max_run_delay_ts); + } if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) t->sched_info.min_run_delay = delta; rq_sched_info_dequeue(rq, delta); @@ -278,8 +280,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; - if (delta > t->sched_info.max_run_delay) + if (delta > t->sched_info.max_run_delay) { t->sched_info.max_run_delay = delta; + ktime_get_real_ts64(&t->sched_info.max_run_delay_ts); + } if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) t->sched_info.min_run_delay = delta; diff --git a/kernel/sys.c b/kernel/sys.c index 35ea9d79a42e..c86eba9aa7e9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2388,6 +2388,21 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned 
long st return -EINVAL; } +int __weak arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status) +{ + return -EINVAL; +} + +int __weak arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + +int __weak arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) static int prctl_set_vma(unsigned long opt, unsigned long addr, @@ -2873,6 +2888,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = rseq_slice_extension_prctl(arg2, arg3); break; + case PR_GET_INDIR_BR_LP_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_get_indir_br_lp_status(me, (unsigned long __user *)arg2); + break; + case PR_SET_INDIR_BR_LP_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_set_indir_br_lp_status(me, arg2); + break; + case PR_LOCK_INDIR_BR_LP_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_lock_indir_br_lp_status(me, arg2); + break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d7042a09fe46..49de13cae428 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -136,6 +136,7 @@ config BUILDTIME_MCOUNT_SORT config TRACER_MAX_TRACE bool + select TRACER_SNAPSHOT config TRACE_CLOCK bool @@ -425,7 +426,6 @@ config IRQSOFF_TRACER select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP - select TRACER_SNAPSHOT select TRACER_SNAPSHOT_PER_CPU_SWAP help This option measures the time spent in irqs-off critical @@ -448,7 +448,6 @@ config PREEMPT_TRACER select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP - select TRACER_SNAPSHOT select TRACER_SNAPSHOT_PER_CPU_SWAP select TRACE_PREEMPT_TOGGLE help @@ -470,7 +469,6 @@ config SCHED_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE - select TRACER_SNAPSHOT help This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. @@ -620,7 +618,6 @@ config TRACE_SYSCALL_BUF_SIZE_DEFAULT config TRACER_SNAPSHOT bool "Create a snapshot trace buffer" - select TRACER_MAX_TRACE help Allow tracing users to take snapshot of the current buffer using the ftrace interface, e.g.: @@ -628,6 +625,9 @@ config TRACER_SNAPSHOT echo 1 > /sys/kernel/tracing/snapshot cat snapshot + Note, the latency tracers select this option. To disable it, + all the latency tracers need to be disabled. 
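[Note] Returning to the kernel/sys.c hunks above: the three new PR_*_INDIR_BR_LP_STATUS prctls follow the usual pattern of __weak arch hooks that default to -EINVAL until an architecture supplies real implementations, plus a check that the unused prctl arguments are zero. A userspace sketch of that pattern follows; the task_struct parameter is dropped for brevity, and gcc/clang are assumed since both honour __attribute__((weak)) in ordinary programs.

/* Weak-default hook plus spare-argument validation, as in kernel/sys.c. */
#include <errno.h>
#include <stdio.h>

/* Weak default: overridden at link time if an "arch" provides its own. */
__attribute__((weak))
int arch_set_indir_br_lp_status(unsigned long status)
{
	(void)status;
	printf("weak default hook called\n");
	return -EINVAL;
}

static int prctl_set_indir_br_lp(unsigned long arg2, unsigned long arg3,
				 unsigned long arg4, unsigned long arg5)
{
	/* Unused arguments must be zero, matching the prctl hunk above. */
	if (arg3 || arg4 || arg5)
		return -EINVAL;
	return arch_set_indir_br_lp_status(arg2);
}

int main(void)
{
	printf("no arch override: %d\n", prctl_set_indir_br_lp(1, 0, 0, 0));
	printf("stray argument:   %d\n", prctl_set_indir_br_lp(1, 7, 0, 0));
	return 0;
}

With no stronger definition linked in, the first call reaches the weak stub and reports -EINVAL just as the new hooks do; the second fails the spare-argument check before the hook is ever reached.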
+ config TRACER_SNAPSHOT_PER_CPU_SWAP bool "Allow snapshot to swap per CPU" depends on TRACER_SNAPSHOT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index fc5dcc888e13..04096c21d06b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o +obj-$(CONFIG_TRACING) += trace_pid.o obj-$(CONFIG_TRACING) += pid_list.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c4db5c2e7103..f2de9cf15d0e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1832,7 +1832,9 @@ static struct trace_event trace_blk_event = { .funcs = &trace_blk_event_funcs, }; -static int __init init_blk_tracer(void) +static struct work_struct blktrace_works __initdata; + +static int __init __init_blk_tracer(void) { if (!register_trace_event(&trace_blk_event)) { pr_warn("Warning: could not register block events\n"); @@ -1852,6 +1854,25 @@ static int __init init_blk_tracer(void) return 0; } +static void __init blktrace_works_func(struct work_struct *work) +{ + __init_blk_tracer(); +} + +static int __init init_blk_tracer(void) +{ + int ret = 0; + + if (trace_init_wq) { + INIT_WORK(&blktrace_works, blktrace_works_func); + queue_work(trace_init_wq, &blktrace_works); + } else { + ret = __init_blk_tracer(); + } + + return ret; +} + device_initcall(init_blk_tracer); static int blk_trace_remove_queue(struct request_queue *q) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f7baeb8278ca..eadaef8592a3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2076,7 +2076,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; - cant_sleep(); + rcu_read_lock_dont_migrate(); if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; @@ -2085,13 +2085,12 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) run_ctx.bpf_cookie = link->cookie; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - rcu_read_lock(); (void) bpf_prog_run(prog, args); - rcu_read_unlock(); bpf_reset_run_ctx(old_run_ctx); out: bpf_prog_put_recursion_context(prog); + rcu_read_unlock_migrate(); } #define UNPACK(...) 
__VA_ARGS__ diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index cc48d16be43e..4df766c690f9 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1303,7 +1303,7 @@ static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *go static_call_update(fgraph_func, func); static_call_update(fgraph_retfunc, retfunc); if (enable_branch) - static_branch_disable(&fgraph_do_direct); + static_branch_enable(&fgraph_do_direct); } static void ftrace_graph_disable_direct(bool disable_branch) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9b10c633bdd..1ce17c8af409 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1147,6 +1147,7 @@ struct ftrace_page { }; #define ENTRY_SIZE sizeof(struct dyn_ftrace) +#define ENTRIES_PER_PAGE_GROUP(order) ((PAGE_SIZE << (order)) / ENTRY_SIZE) static struct ftrace_page *ftrace_pages_start; static struct ftrace_page *ftrace_pages; @@ -3873,7 +3874,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count, *num_pages += 1 << order; ftrace_number_of_groups++; - cnt = (PAGE_SIZE << order) / ENTRY_SIZE; + cnt = ENTRIES_PER_PAGE_GROUP(order); pg->order = order; if (cnt > count) @@ -7668,7 +7669,7 @@ static int ftrace_process_locs(struct module *mod, long skip; /* Count the number of entries unused and compare it to skipped. */ - pg_remaining = (PAGE_SIZE << pg->order) / ENTRY_SIZE - pg->index; + pg_remaining = ENTRIES_PER_PAGE_GROUP(pg->order) - pg->index; if (!WARN(skipped < pg_remaining, "Extra allocated pages for ftrace")) { @@ -7676,7 +7677,7 @@ static int ftrace_process_locs(struct module *mod, for (pg = pg_unuse; pg && skip > 0; pg = pg->next) { remaining += 1 << pg->order; - skip -= (PAGE_SIZE << pg->order) / ENTRY_SIZE; + skip -= ENTRIES_PER_PAGE_GROUP(pg->order); } pages -= remaining; @@ -8112,7 +8113,8 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym) + unsigned long *off, char **modname, + const unsigned char **modbuildid, char *sym) { struct ftrace_mod_map *mod_map; int ret = 0; @@ -8124,6 +8126,8 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, if (ret) { if (modname) *modname = mod_map->mod->name; + if (modbuildid) + *modbuildid = module_buildid(mod_map->mod); break; } } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 630221b00838..d33103408955 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ +#include <linux/sched/isolation.h> #include <linux/trace_recursion.h> #include <linux/trace_events.h> #include <linux/ring_buffer.h> @@ -4013,19 +4014,36 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer) rb_end_commit(cpu_buffer); } +static bool +rb_irq_work_queue(struct rb_irq_work *irq_work) +{ + int cpu; + + /* irq_work_queue_on() is not NMI-safe */ + if (unlikely(in_nmi())) + return irq_work_queue(&irq_work->work); + + /* + * If CPU isolation is not active, cpu is always the current + * CPU, and the following is equivallent to irq_work_queue(). 
+ */ + cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE); + return irq_work_queue_on(&irq_work->work, cpu); +} + static __always_inline void rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&buffer->irq_work.work); + rb_irq_work_queue(&buffer->irq_work); } if (cpu_buffer->irq_work.waiters_pending) { cpu_buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); + rb_irq_work_queue(&cpu_buffer->irq_work); } if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched)) @@ -4045,7 +4063,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->irq_work.wakeup_full = true; cpu_buffer->irq_work.full_waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); + rb_irq_work_queue(&cpu_buffer->irq_work); } #ifdef CONFIG_RING_BUFFER_RECORD_RECURSION diff --git a/kernel/trace/rv/monitors/nrp/nrp.c b/kernel/trace/rv/monitors/nrp/nrp.c index 5a83b7171432..4b5646a70094 100644 --- a/kernel/trace/rv/monitors/nrp/nrp.c +++ b/kernel/trace/rv/monitors/nrp/nrp.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "nrp" @@ -15,17 +14,16 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_TASK #include "nrp.h" - -static struct rv_monitor rv_nrp; -DECLARE_DA_MON_PER_TASK(nrp, unsigned char); +#include <rv/da_monitor.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/trace/irq_vectors.h> static void handle_vector_irq_entry(void *data, int vector) { - da_handle_event_nrp(current, irq_entry_nrp); + da_handle_event(current, irq_entry_nrp); } static void attach_vector_irq(void) @@ -60,7 +58,7 @@ static void detach_vector_irq(void) { } static void handle_irq_entry(void *data, int irq, struct irqaction *action) { - da_handle_event_nrp(current, irq_entry_nrp); + da_handle_event(current, irq_entry_nrp); } static void handle_sched_need_resched(void *data, struct task_struct *tsk, @@ -72,22 +70,22 @@ static void handle_sched_need_resched(void *data, struct task_struct *tsk, * which may not mirror the system state but makes the monitor simpler, */ if (tif == TIF_NEED_RESCHED) - da_handle_start_event_nrp(tsk, sched_need_resched_nrp); + da_handle_start_event(tsk, sched_need_resched_nrp); } static void handle_schedule_entry(void *data, bool preempt) { if (preempt) - da_handle_event_nrp(current, schedule_entry_preempt_nrp); + da_handle_event(current, schedule_entry_preempt_nrp); else - da_handle_event_nrp(current, schedule_entry_nrp); + da_handle_event(current, schedule_entry_nrp); } static int enable_nrp(void) { int retval; - retval = da_monitor_init_nrp(); + retval = da_monitor_init(); if (retval) return retval; @@ -101,33 +99,33 @@ static int enable_nrp(void) static void disable_nrp(void) { - rv_nrp.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("nrp", irq_handler_entry, handle_irq_entry); rv_detach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched); rv_detach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry); detach_vector_irq(); - da_monitor_destroy_nrp(); + da_monitor_destroy(); } -static struct rv_monitor rv_nrp = { +static struct rv_monitor rv_this = { .name = "nrp", 
.description = "need resched preempts.", .enable = enable_nrp, .disable = disable_nrp, - .reset = da_monitor_reset_all_nrp, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_nrp(void) { - return rv_register_monitor(&rv_nrp, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_nrp(void) { - rv_unregister_monitor(&rv_nrp); + rv_unregister_monitor(&rv_this); } module_init(register_nrp); diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h index c9f12207cbf6..3270d4c0139f 100644 --- a/kernel/trace/rv/monitors/nrp/nrp.h +++ b/kernel/trace/rv/monitors/nrp/nrp.h @@ -5,22 +5,24 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME nrp + enum states_nrp { - preempt_irq_nrp = 0, + preempt_irq_nrp, any_thread_running_nrp, nested_preempt_nrp, rescheduling_nrp, - state_max_nrp + state_max_nrp, }; #define INVALID_STATE state_max_nrp enum events_nrp { - irq_entry_nrp = 0, + irq_entry_nrp, sched_need_resched_nrp, schedule_entry_nrp, schedule_entry_preempt_nrp, - event_max_nrp + event_max_nrp, }; struct automaton_nrp { @@ -36,38 +38,38 @@ static const struct automaton_nrp automaton_nrp = { "preempt_irq", "any_thread_running", "nested_preempt", - "rescheduling" + "rescheduling", }, .event_names = { "irq_entry", "sched_need_resched", "schedule_entry", - "schedule_entry_preempt" + "schedule_entry_preempt", }, .function = { { preempt_irq_nrp, preempt_irq_nrp, nested_preempt_nrp, - nested_preempt_nrp + nested_preempt_nrp, }, { any_thread_running_nrp, rescheduling_nrp, any_thread_running_nrp, - INVALID_STATE + INVALID_STATE, }, { nested_preempt_nrp, preempt_irq_nrp, any_thread_running_nrp, - any_thread_running_nrp + any_thread_running_nrp, }, { preempt_irq_nrp, rescheduling_nrp, any_thread_running_nrp, - any_thread_running_nrp + any_thread_running_nrp, }, }, .initial_state = preempt_irq_nrp, diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c index 50d64e7fb8c4..25a40e90fa40 100644 --- a/kernel/trace/rv/monitors/opid/opid.c +++ b/kernel/trace/rv/monitors/opid/opid.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "opid" @@ -16,17 +15,16 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "opid.h" - -static struct rv_monitor rv_opid; -DECLARE_DA_MON_PER_CPU(opid, unsigned char); +#include <rv/da_monitor.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/trace/irq_vectors.h> static void handle_vector_irq_entry(void *data, int vector) { - da_handle_event_opid(irq_entry_opid); + da_handle_event(irq_entry_opid); } static void attach_vector_irq(void) @@ -61,52 +59,52 @@ static void detach_vector_irq(void) { } static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_opid(irq_disable_opid); + da_handle_event(irq_disable_opid); } static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_opid(irq_enable_opid); + da_handle_event(irq_enable_opid); } static void handle_irq_entry(void *data, int irq, struct irqaction *action) { - da_handle_event_opid(irq_entry_opid); + da_handle_event(irq_entry_opid); } static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_opid(preempt_disable_opid); + da_handle_event(preempt_disable_opid); } static void handle_preempt_enable(void *data, 
unsigned long ip, unsigned long parent_ip) { - da_handle_event_opid(preempt_enable_opid); + da_handle_event(preempt_enable_opid); } static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif) { /* The monitor's intitial state is not in_irq */ if (this_cpu_read(hardirq_context)) - da_handle_event_opid(sched_need_resched_opid); + da_handle_event(sched_need_resched_opid); else - da_handle_start_event_opid(sched_need_resched_opid); + da_handle_start_event(sched_need_resched_opid); } static void handle_sched_waking(void *data, struct task_struct *p) { /* The monitor's intitial state is not in_irq */ if (this_cpu_read(hardirq_context)) - da_handle_event_opid(sched_waking_opid); + da_handle_event(sched_waking_opid); else - da_handle_start_event_opid(sched_waking_opid); + da_handle_start_event(sched_waking_opid); } static int enable_opid(void) { int retval; - retval = da_monitor_init_opid(); + retval = da_monitor_init(); if (retval) return retval; @@ -124,7 +122,7 @@ static int enable_opid(void) static void disable_opid(void) { - rv_opid.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("opid", irq_disable, handle_irq_disable); rv_detach_trace_probe("opid", irq_enable, handle_irq_enable); @@ -135,29 +133,29 @@ static void disable_opid(void) rv_detach_trace_probe("opid", sched_waking, handle_sched_waking); detach_vector_irq(); - da_monitor_destroy_opid(); + da_monitor_destroy(); } /* * This is the monitor register section. */ -static struct rv_monitor rv_opid = { +static struct rv_monitor rv_this = { .name = "opid", .description = "operations with preemption and irq disabled.", .enable = enable_opid, .disable = disable_opid, - .reset = da_monitor_reset_all_opid, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_opid(void) { - return rv_register_monitor(&rv_opid, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_opid(void) { - rv_unregister_monitor(&rv_opid); + rv_unregister_monitor(&rv_this); } module_init(register_opid); diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h index b4b8c2ff7f64..092992514970 100644 --- a/kernel/trace/rv/monitors/opid/opid.h +++ b/kernel/trace/rv/monitors/opid/opid.h @@ -5,26 +5,28 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME opid + enum states_opid { - disabled_opid = 0, + disabled_opid, enabled_opid, in_irq_opid, irq_disabled_opid, preempt_disabled_opid, - state_max_opid + state_max_opid, }; #define INVALID_STATE state_max_opid enum events_opid { - irq_disable_opid = 0, + irq_disable_opid, irq_enable_opid, irq_entry_opid, preempt_disable_opid, preempt_enable_opid, sched_need_resched_opid, sched_waking_opid, - event_max_opid + event_max_opid, }; struct automaton_opid { @@ -41,7 +43,7 @@ static const struct automaton_opid automaton_opid = { "enabled", "in_irq", "irq_disabled", - "preempt_disabled" + "preempt_disabled", }, .event_names = { "irq_disable", @@ -50,7 +52,7 @@ static const struct automaton_opid automaton_opid = { "preempt_disable", "preempt_enable", "sched_need_resched", - "sched_waking" + "sched_waking", }, .function = { { @@ -60,7 +62,7 @@ static const struct automaton_opid automaton_opid = { INVALID_STATE, irq_disabled_opid, disabled_opid, - disabled_opid + disabled_opid, }, { irq_disabled_opid, @@ -69,7 +71,7 @@ static const struct automaton_opid automaton_opid = { preempt_disabled_opid, enabled_opid, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, @@ 
-78,7 +80,7 @@ static const struct automaton_opid automaton_opid = { INVALID_STATE, INVALID_STATE, in_irq_opid, - in_irq_opid + in_irq_opid, }, { INVALID_STATE, @@ -87,7 +89,7 @@ static const struct automaton_opid automaton_opid = { disabled_opid, INVALID_STATE, irq_disabled_opid, - INVALID_STATE + INVALID_STATE, }, { disabled_opid, @@ -96,7 +98,7 @@ static const struct automaton_opid automaton_opid = { INVALID_STATE, enabled_opid, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, }, .initial_state = disabled_opid, diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c index fd75fc927d65..17f271231c99 100644 --- a/kernel/trace/rv/monitors/rtapp/rtapp.c +++ b/kernel/trace/rv/monitors/rtapp/rtapp.c @@ -8,8 +8,6 @@ #include "rtapp.h" -struct rv_monitor rv_rtapp; - struct rv_monitor rv_rtapp = { .name = "rtapp", .description = "Collection of monitors for detecting problems with real-time applications", diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c index d04db4b543f9..dd9d96fc6e21 100644 --- a/kernel/trace/rv/monitors/sched/sched.c +++ b/kernel/trace/rv/monitors/sched/sched.c @@ -8,8 +8,6 @@ #include "sched.h" -struct rv_monitor rv_sched; - struct rv_monitor rv_sched = { .name = "sched", .description = "container for several scheduler monitor specifications.", diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c index 04c36405e2e3..5a3bd5e16e62 100644 --- a/kernel/trace/rv/monitors/sco/sco.c +++ b/kernel/trace/rv/monitors/sco/sco.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "sco" @@ -14,31 +13,30 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "sco.h" - -static struct rv_monitor rv_sco; -DECLARE_DA_MON_PER_CPU(sco, unsigned char); +#include <rv/da_monitor.h> static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) { - da_handle_start_event_sco(sched_set_state_sco); + da_handle_start_event(sched_set_state_sco); } static void handle_schedule_entry(void *data, bool preempt) { - da_handle_event_sco(schedule_entry_sco); + da_handle_event(schedule_entry_sco); } static void handle_schedule_exit(void *data, bool is_switch) { - da_handle_start_event_sco(schedule_exit_sco); + da_handle_start_event(schedule_exit_sco); } static int enable_sco(void) { int retval; - retval = da_monitor_init_sco(); + retval = da_monitor_init(); if (retval) return retval; @@ -51,32 +49,32 @@ static int enable_sco(void) static void disable_sco(void) { - rv_sco.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state); rv_detach_trace_probe("sco", sched_entry_tp, handle_schedule_entry); rv_detach_trace_probe("sco", sched_exit_tp, handle_schedule_exit); - da_monitor_destroy_sco(); + da_monitor_destroy(); } -static struct rv_monitor rv_sco = { +static struct rv_monitor rv_this = { .name = "sco", .description = "scheduling context operations.", .enable = enable_sco, .disable = disable_sco, - .reset = da_monitor_reset_all_sco, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_sco(void) { - return rv_register_monitor(&rv_sco, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_sco(void) { - rv_unregister_monitor(&rv_sco); + rv_unregister_monitor(&rv_this); } module_init(register_sco); diff --git 
a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h index 7a4c1f2d5ca1..bac3beb51e72 100644 --- a/kernel/trace/rv/monitors/sco/sco.h +++ b/kernel/trace/rv/monitors/sco/sco.h @@ -5,19 +5,21 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME sco + enum states_sco { - thread_context_sco = 0, + thread_context_sco, scheduling_context_sco, - state_max_sco + state_max_sco, }; #define INVALID_STATE state_max_sco enum events_sco { - sched_set_state_sco = 0, + sched_set_state_sco, schedule_entry_sco, schedule_exit_sco, - event_max_sco + event_max_sco, }; struct automaton_sco { @@ -31,12 +33,12 @@ struct automaton_sco { static const struct automaton_sco automaton_sco = { .state_names = { "thread_context", - "scheduling_context" + "scheduling_context", }, .event_names = { "sched_set_state", "schedule_entry", - "schedule_exit" + "schedule_exit", }, .function = { { thread_context_sco, scheduling_context_sco, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c index 1e351ba52fee..83b48627dc9f 100644 --- a/kernel/trace/rv/monitors/scpd/scpd.c +++ b/kernel/trace/rv/monitors/scpd/scpd.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "scpd" @@ -15,36 +14,35 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "scpd.h" - -static struct rv_monitor rv_scpd; -DECLARE_DA_MON_PER_CPU(scpd, unsigned char); +#include <rv/da_monitor.h> static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_scpd(preempt_disable_scpd); + da_handle_event(preempt_disable_scpd); } static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_start_event_scpd(preempt_enable_scpd); + da_handle_start_event(preempt_enable_scpd); } static void handle_schedule_entry(void *data, bool preempt) { - da_handle_event_scpd(schedule_entry_scpd); + da_handle_event(schedule_entry_scpd); } static void handle_schedule_exit(void *data, bool is_switch) { - da_handle_event_scpd(schedule_exit_scpd); + da_handle_event(schedule_exit_scpd); } static int enable_scpd(void) { int retval; - retval = da_monitor_init_scpd(); + retval = da_monitor_init(); if (retval) return retval; @@ -58,33 +56,33 @@ static int enable_scpd(void) static void disable_scpd(void) { - rv_scpd.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("scpd", preempt_disable, handle_preempt_disable); rv_detach_trace_probe("scpd", preempt_enable, handle_preempt_enable); rv_detach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry); rv_detach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit); - da_monitor_destroy_scpd(); + da_monitor_destroy(); } -static struct rv_monitor rv_scpd = { +static struct rv_monitor rv_this = { .name = "scpd", .description = "schedule called with preemption disabled.", .enable = enable_scpd, .disable = disable_scpd, - .reset = da_monitor_reset_all_scpd, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_scpd(void) { - return rv_register_monitor(&rv_scpd, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_scpd(void) { - rv_unregister_monitor(&rv_scpd); + rv_unregister_monitor(&rv_this); } module_init(register_scpd); diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h index 295f735a5811..d6329da2671b 100644 --- 
a/kernel/trace/rv/monitors/scpd/scpd.h +++ b/kernel/trace/rv/monitors/scpd/scpd.h @@ -5,20 +5,22 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME scpd + enum states_scpd { - cant_sched_scpd = 0, + cant_sched_scpd, can_sched_scpd, - state_max_scpd + state_max_scpd, }; #define INVALID_STATE state_max_scpd enum events_scpd { - preempt_disable_scpd = 0, + preempt_disable_scpd, preempt_enable_scpd, schedule_entry_scpd, schedule_exit_scpd, - event_max_scpd + event_max_scpd, }; struct automaton_scpd { @@ -32,13 +34,13 @@ struct automaton_scpd { static const struct automaton_scpd automaton_scpd = { .state_names = { "cant_sched", - "can_sched" + "can_sched", }, .event_names = { "preempt_disable", "preempt_enable", "schedule_entry", - "schedule_exit" + "schedule_exit", }, .function = { { can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c index 558950f524a5..b80b73795dec 100644 --- a/kernel/trace/rv/monitors/snep/snep.c +++ b/kernel/trace/rv/monitors/snep/snep.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "snep" @@ -15,36 +14,35 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "snep.h" - -static struct rv_monitor rv_snep; -DECLARE_DA_MON_PER_CPU(snep, unsigned char); +#include <rv/da_monitor.h> static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_start_event_snep(preempt_disable_snep); + da_handle_start_event(preempt_disable_snep); } static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_start_event_snep(preempt_enable_snep); + da_handle_start_event(preempt_enable_snep); } static void handle_schedule_entry(void *data, bool preempt) { - da_handle_event_snep(schedule_entry_snep); + da_handle_event(schedule_entry_snep); } static void handle_schedule_exit(void *data, bool is_switch) { - da_handle_start_event_snep(schedule_exit_snep); + da_handle_start_event(schedule_exit_snep); } static int enable_snep(void) { int retval; - retval = da_monitor_init_snep(); + retval = da_monitor_init(); if (retval) return retval; @@ -58,33 +56,33 @@ static int enable_snep(void) static void disable_snep(void) { - rv_snep.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("snep", preempt_disable, handle_preempt_disable); rv_detach_trace_probe("snep", preempt_enable, handle_preempt_enable); rv_detach_trace_probe("snep", sched_entry_tp, handle_schedule_entry); rv_detach_trace_probe("snep", sched_exit_tp, handle_schedule_exit); - da_monitor_destroy_snep(); + da_monitor_destroy(); } -static struct rv_monitor rv_snep = { +static struct rv_monitor rv_this = { .name = "snep", .description = "schedule does not enable preempt.", .enable = enable_snep, .disable = disable_snep, - .reset = da_monitor_reset_all_snep, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_snep(void) { - return rv_register_monitor(&rv_snep, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_snep(void) { - rv_unregister_monitor(&rv_snep); + rv_unregister_monitor(&rv_this); } module_init(register_snep); diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h index 4cd9abb77b7b..357520a5b3d1 100644 --- a/kernel/trace/rv/monitors/snep/snep.h +++ 
b/kernel/trace/rv/monitors/snep/snep.h @@ -5,20 +5,22 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME snep + enum states_snep { - non_scheduling_context_snep = 0, + non_scheduling_context_snep, scheduling_contex_snep, - state_max_snep + state_max_snep, }; #define INVALID_STATE state_max_snep enum events_snep { - preempt_disable_snep = 0, + preempt_disable_snep, preempt_enable_snep, schedule_entry_snep, schedule_exit_snep, - event_max_snep + event_max_snep, }; struct automaton_snep { @@ -32,26 +34,26 @@ struct automaton_snep { static const struct automaton_snep automaton_snep = { .state_names = { "non_scheduling_context", - "scheduling_contex" + "scheduling_contex", }, .event_names = { "preempt_disable", "preempt_enable", "schedule_entry", - "schedule_exit" + "schedule_exit", }, .function = { { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, INVALID_STATE, INVALID_STATE, - non_scheduling_context_snep + non_scheduling_context_snep, }, }, .initial_state = non_scheduling_context_snep, diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c index 540e686e699f..f168b1a4b12c 100644 --- a/kernel/trace/rv/monitors/snroc/snroc.c +++ b/kernel/trace/rv/monitors/snroc/snroc.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "snroc" @@ -14,14 +13,13 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_TASK #include "snroc.h" - -static struct rv_monitor rv_snroc; -DECLARE_DA_MON_PER_TASK(snroc, unsigned char); +#include <rv/da_monitor.h> static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) { - da_handle_event_snroc(tsk, sched_set_state_snroc); + da_handle_event(tsk, sched_set_state_snroc); } static void handle_sched_switch(void *data, bool preempt, @@ -29,15 +27,15 @@ static void handle_sched_switch(void *data, bool preempt, struct task_struct *next, unsigned int prev_state) { - da_handle_start_event_snroc(prev, sched_switch_out_snroc); - da_handle_event_snroc(next, sched_switch_in_snroc); + da_handle_start_event(prev, sched_switch_out_snroc); + da_handle_event(next, sched_switch_in_snroc); } static int enable_snroc(void) { int retval; - retval = da_monitor_init_snroc(); + retval = da_monitor_init(); if (retval) return retval; @@ -49,31 +47,31 @@ static int enable_snroc(void) static void disable_snroc(void) { - rv_snroc.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state); rv_detach_trace_probe("snroc", sched_switch, handle_sched_switch); - da_monitor_destroy_snroc(); + da_monitor_destroy(); } -static struct rv_monitor rv_snroc = { +static struct rv_monitor rv_this = { .name = "snroc", .description = "set non runnable on its own context.", .enable = enable_snroc, .disable = disable_snroc, - .reset = da_monitor_reset_all_snroc, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_snroc(void) { - return rv_register_monitor(&rv_snroc, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_snroc(void) { - rv_unregister_monitor(&rv_snroc); + rv_unregister_monitor(&rv_this); } module_init(register_snroc); diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h index c3650a2b1b10..88b7328ad31a 100644 --- a/kernel/trace/rv/monitors/snroc/snroc.h 
+++ b/kernel/trace/rv/monitors/snroc/snroc.h @@ -5,19 +5,21 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME snroc + enum states_snroc { - other_context_snroc = 0, + other_context_snroc, own_context_snroc, - state_max_snroc + state_max_snroc, }; #define INVALID_STATE state_max_snroc enum events_snroc { - sched_set_state_snroc = 0, + sched_set_state_snroc, sched_switch_in_snroc, sched_switch_out_snroc, - event_max_snroc + event_max_snroc, }; struct automaton_snroc { @@ -31,12 +33,12 @@ struct automaton_snroc { static const struct automaton_snroc automaton_snroc = { .state_names = { "other_context", - "own_context" + "own_context", }, .event_names = { "sched_set_state", "sched_switch_in", - "sched_switch_out" + "sched_switch_out", }, .function = { { INVALID_STATE, own_context_snroc, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/sssw/sssw.c b/kernel/trace/rv/monitors/sssw/sssw.c index 84b8d890d9d4..a91321c890cd 100644 --- a/kernel/trace/rv/monitors/sssw/sssw.c +++ b/kernel/trace/rv/monitors/sssw/sssw.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "sssw" @@ -15,17 +14,16 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_TASK #include "sssw.h" - -static struct rv_monitor rv_sssw; -DECLARE_DA_MON_PER_TASK(sssw, unsigned char); +#include <rv/da_monitor.h> static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) { if (state == TASK_RUNNING) - da_handle_start_event_sssw(tsk, sched_set_state_runnable_sssw); + da_handle_start_event(tsk, sched_set_state_runnable_sssw); else - da_handle_event_sssw(tsk, sched_set_state_sleepable_sssw); + da_handle_event(tsk, sched_set_state_sleepable_sssw); } static void handle_sched_switch(void *data, bool preempt, @@ -34,15 +32,15 @@ static void handle_sched_switch(void *data, bool preempt, unsigned int prev_state) { if (preempt) - da_handle_event_sssw(prev, sched_switch_preempt_sssw); + da_handle_event(prev, sched_switch_preempt_sssw); else if (prev_state == TASK_RUNNING) - da_handle_event_sssw(prev, sched_switch_yield_sssw); + da_handle_event(prev, sched_switch_yield_sssw); else if (prev_state == TASK_RTLOCK_WAIT) /* special case of sleeping task with racy conditions */ - da_handle_event_sssw(prev, sched_switch_blocking_sssw); + da_handle_event(prev, sched_switch_blocking_sssw); else - da_handle_event_sssw(prev, sched_switch_suspend_sssw); - da_handle_event_sssw(next, sched_switch_in_sssw); + da_handle_event(prev, sched_switch_suspend_sssw); + da_handle_event(next, sched_switch_in_sssw); } static void handle_sched_wakeup(void *data, struct task_struct *p) @@ -51,21 +49,21 @@ static void handle_sched_wakeup(void *data, struct task_struct *p) * Wakeup can also lead to signal_wakeup although the system is * actually runnable. The monitor can safely start with this event. 
*/ - da_handle_start_event_sssw(p, sched_wakeup_sssw); + da_handle_start_event(p, sched_wakeup_sssw); } static void handle_signal_deliver(void *data, int sig, struct kernel_siginfo *info, struct k_sigaction *ka) { - da_handle_event_sssw(current, signal_deliver_sssw); + da_handle_event(current, signal_deliver_sssw); } static int enable_sssw(void) { int retval; - retval = da_monitor_init_sssw(); + retval = da_monitor_init(); if (retval) return retval; @@ -79,33 +77,33 @@ static int enable_sssw(void) static void disable_sssw(void) { - rv_sssw.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state); rv_detach_trace_probe("sssw", sched_switch, handle_sched_switch); rv_detach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup); rv_detach_trace_probe("sssw", signal_deliver, handle_signal_deliver); - da_monitor_destroy_sssw(); + da_monitor_destroy(); } -static struct rv_monitor rv_sssw = { +static struct rv_monitor rv_this = { .name = "sssw", .description = "set state sleep and wakeup.", .enable = enable_sssw, .disable = disable_sssw, - .reset = da_monitor_reset_all_sssw, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_sssw(void) { - return rv_register_monitor(&rv_sssw, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_sssw(void) { - rv_unregister_monitor(&rv_sssw); + rv_unregister_monitor(&rv_this); } module_init(register_sssw); diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h index 243d54050c94..1a4b806061c3 100644 --- a/kernel/trace/rv/monitors/sssw/sssw.h +++ b/kernel/trace/rv/monitors/sssw/sssw.h @@ -5,18 +5,20 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME sssw + enum states_sssw { - runnable_sssw = 0, + runnable_sssw, signal_wakeup_sssw, sleepable_sssw, sleeping_sssw, - state_max_sssw + state_max_sssw, }; #define INVALID_STATE state_max_sssw enum events_sssw { - sched_set_state_runnable_sssw = 0, + sched_set_state_runnable_sssw, sched_set_state_sleepable_sssw, sched_switch_blocking_sssw, sched_switch_in_sssw, @@ -25,7 +27,7 @@ enum events_sssw { sched_switch_yield_sssw, sched_wakeup_sssw, signal_deliver_sssw, - event_max_sssw + event_max_sssw, }; struct automaton_sssw { @@ -41,7 +43,7 @@ static const struct automaton_sssw automaton_sssw = { "runnable", "signal_wakeup", "sleepable", - "sleeping" + "sleeping", }, .event_names = { "sched_set_state_runnable", @@ -52,7 +54,7 @@ static const struct automaton_sssw automaton_sssw = { "sched_switch_suspend", "sched_switch_yield", "sched_wakeup", - "signal_deliver" + "signal_deliver", }, .function = { { @@ -64,7 +66,7 @@ static const struct automaton_sssw automaton_sssw = { INVALID_STATE, runnable_sssw, runnable_sssw, - runnable_sssw + runnable_sssw, }, { INVALID_STATE, @@ -75,7 +77,7 @@ static const struct automaton_sssw automaton_sssw = { INVALID_STATE, signal_wakeup_sssw, signal_wakeup_sssw, - runnable_sssw + runnable_sssw, }, { runnable_sssw, @@ -86,7 +88,7 @@ static const struct automaton_sssw automaton_sssw = { sleeping_sssw, signal_wakeup_sssw, runnable_sssw, - sleepable_sssw + sleepable_sssw, }, { INVALID_STATE, @@ -97,7 +99,7 @@ static const struct automaton_sssw automaton_sssw = { INVALID_STATE, INVALID_STATE, runnable_sssw, - INVALID_STATE + INVALID_STATE, }, }, .initial_state = runnable_sssw, diff --git a/kernel/trace/rv/monitors/sts/sts.c b/kernel/trace/rv/monitors/sts/sts.c index c4a9cd67c1d2..ce031cbf202a 100644 --- 
a/kernel/trace/rv/monitors/sts/sts.c +++ b/kernel/trace/rv/monitors/sts/sts.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "sts" @@ -16,17 +15,16 @@ #include <rv_trace.h> #include <monitors/sched/sched.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "sts.h" - -static struct rv_monitor rv_sts; -DECLARE_DA_MON_PER_CPU(sts, unsigned char); +#include <rv/da_monitor.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/trace/irq_vectors.h> static void handle_vector_irq_entry(void *data, int vector) { - da_handle_event_sts(irq_entry_sts); + da_handle_event(irq_entry_sts); } static void attach_vector_irq(void) @@ -61,17 +59,17 @@ static void detach_vector_irq(void) { } static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_sts(irq_disable_sts); + da_handle_event(irq_disable_sts); } static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_event_sts(irq_enable_sts); + da_handle_event(irq_enable_sts); } static void handle_irq_entry(void *data, int irq, struct irqaction *action) { - da_handle_event_sts(irq_entry_sts); + da_handle_event(irq_entry_sts); } static void handle_sched_switch(void *data, bool preempt, @@ -79,24 +77,24 @@ static void handle_sched_switch(void *data, bool preempt, struct task_struct *next, unsigned int prev_state) { - da_handle_event_sts(sched_switch_sts); + da_handle_event(sched_switch_sts); } static void handle_schedule_entry(void *data, bool preempt) { - da_handle_event_sts(schedule_entry_sts); + da_handle_event(schedule_entry_sts); } static void handle_schedule_exit(void *data, bool is_switch) { - da_handle_start_event_sts(schedule_exit_sts); + da_handle_start_event(schedule_exit_sts); } static int enable_sts(void) { int retval; - retval = da_monitor_init_sts(); + retval = da_monitor_init(); if (retval) return retval; @@ -113,7 +111,7 @@ static int enable_sts(void) static void disable_sts(void) { - rv_sts.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("sts", irq_disable, handle_irq_disable); rv_detach_trace_probe("sts", irq_enable, handle_irq_enable); @@ -123,29 +121,29 @@ static void disable_sts(void) rv_detach_trace_probe("sts", sched_exit_tp, handle_schedule_exit); detach_vector_irq(); - da_monitor_destroy_sts(); + da_monitor_destroy(); } /* * This is the monitor register section. 
*/ -static struct rv_monitor rv_sts = { +static struct rv_monitor rv_this = { .name = "sts", .description = "schedule implies task switch.", .enable = enable_sts, .disable = disable_sts, - .reset = da_monitor_reset_all_sts, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_sts(void) { - return rv_register_monitor(&rv_sts, &rv_sched); + return rv_register_monitor(&rv_this, &rv_sched); } static void __exit unregister_sts(void) { - rv_unregister_monitor(&rv_sts); + rv_unregister_monitor(&rv_this); } module_init(register_sts); diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h index 3368b6599a00..6f7b2d9d72e6 100644 --- a/kernel/trace/rv/monitors/sts/sts.h +++ b/kernel/trace/rv/monitors/sts/sts.h @@ -5,27 +5,29 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME sts + enum states_sts { - can_sched_sts = 0, + can_sched_sts, cant_sched_sts, disable_to_switch_sts, enable_to_exit_sts, in_irq_sts, scheduling_sts, switching_sts, - state_max_sts + state_max_sts, }; #define INVALID_STATE state_max_sts enum events_sts { - irq_disable_sts = 0, + irq_disable_sts, irq_enable_sts, irq_entry_sts, sched_switch_sts, schedule_entry_sts, schedule_exit_sts, - event_max_sts + event_max_sts, }; struct automaton_sts { @@ -44,7 +46,7 @@ static const struct automaton_sts automaton_sts = { "enable_to_exit", "in_irq", "scheduling", - "switching" + "switching", }, .event_names = { "irq_disable", @@ -52,7 +54,7 @@ static const struct automaton_sts automaton_sts = { "irq_entry", "sched_switch", "schedule_entry", - "schedule_exit" + "schedule_exit", }, .function = { { @@ -61,7 +63,7 @@ static const struct automaton_sts automaton_sts = { INVALID_STATE, INVALID_STATE, scheduling_sts, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, @@ -69,7 +71,7 @@ static const struct automaton_sts automaton_sts = { cant_sched_sts, INVALID_STATE, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, @@ -77,7 +79,7 @@ static const struct automaton_sts automaton_sts = { in_irq_sts, switching_sts, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { enable_to_exit_sts, @@ -85,7 +87,7 @@ static const struct automaton_sts automaton_sts = { enable_to_exit_sts, INVALID_STATE, INVALID_STATE, - can_sched_sts + can_sched_sts, }, { INVALID_STATE, @@ -93,7 +95,7 @@ static const struct automaton_sts automaton_sts = { in_irq_sts, INVALID_STATE, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { disable_to_switch_sts, @@ -101,7 +103,7 @@ static const struct automaton_sts automaton_sts = { INVALID_STATE, INVALID_STATE, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, { INVALID_STATE, @@ -109,7 +111,7 @@ static const struct automaton_sts automaton_sts = { INVALID_STATE, INVALID_STATE, INVALID_STATE, - INVALID_STATE + INVALID_STATE, }, }, .initial_state = can_sched_sts, diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index 4b4e99615a11..22d77ec42463 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "wip" @@ -14,31 +13,30 @@ #include <trace/events/sched.h> #include <trace/events/preemptirq.h> +#define RV_MON_TYPE RV_MON_PER_CPU #include "wip.h" - -static struct rv_monitor rv_wip; -DECLARE_DA_MON_PER_CPU(wip, unsigned char); +#include <rv/da_monitor.h> static void handle_preempt_disable(void *data, unsigned long ip, unsigned long 
parent_ip) { - da_handle_event_wip(preempt_disable_wip); + da_handle_event(preempt_disable_wip); } static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) { - da_handle_start_event_wip(preempt_enable_wip); + da_handle_start_event(preempt_enable_wip); } static void handle_sched_waking(void *data, struct task_struct *task) { - da_handle_event_wip(sched_waking_wip); + da_handle_event(sched_waking_wip); } static int enable_wip(void) { int retval; - retval = da_monitor_init_wip(); + retval = da_monitor_init(); if (retval) return retval; @@ -51,32 +49,32 @@ static int enable_wip(void) static void disable_wip(void) { - rv_wip.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable); rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable); rv_detach_trace_probe("wip", sched_waking, handle_sched_waking); - da_monitor_destroy_wip(); + da_monitor_destroy(); } -static struct rv_monitor rv_wip = { +static struct rv_monitor rv_this = { .name = "wip", .description = "wakeup in preemptive per-cpu testing monitor.", .enable = enable_wip, .disable = disable_wip, - .reset = da_monitor_reset_all_wip, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_wip(void) { - return rv_register_monitor(&rv_wip, NULL); + return rv_register_monitor(&rv_this, NULL); } static void __exit unregister_wip(void) { - rv_unregister_monitor(&rv_wip); + rv_unregister_monitor(&rv_this); } module_init(register_wip); diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index c7193748bf36..b4c3eea94c86 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -5,19 +5,21 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME wip + enum states_wip { - preemptive_wip = 0, + preemptive_wip, non_preemptive_wip, - state_max_wip + state_max_wip, }; #define INVALID_STATE state_max_wip enum events_wip { - preempt_disable_wip = 0, + preempt_disable_wip, preempt_enable_wip, sched_waking_wip, - event_max_wip + event_max_wip, }; struct automaton_wip { @@ -31,12 +33,12 @@ struct automaton_wip { static const struct automaton_wip automaton_wip = { .state_names = { "preemptive", - "non_preemptive" + "non_preemptive", }, .event_names = { "preempt_disable", "preempt_enable", - "sched_waking" + "sched_waking", }, .function = { { non_preemptive_wip, INVALID_STATE, INVALID_STATE }, diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 4145bea2729e..579e7e217ee0 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -6,40 +6,38 @@ #include <linux/init.h> #include <linux/rv.h> #include <rv/instrumentation.h> -#include <rv/da_monitor.h> #define MODULE_NAME "wwnr" #include <rv_trace.h> #include <trace/events/sched.h> +#define RV_MON_TYPE RV_MON_PER_TASK #include "wwnr.h" - -static struct rv_monitor rv_wwnr; -DECLARE_DA_MON_PER_TASK(wwnr, unsigned char); +#include <rv/da_monitor.h> static void handle_switch(void *data, bool preempt, struct task_struct *p, struct task_struct *n, unsigned int prev_state) { /* start monitoring only after the first suspension */ if (prev_state == TASK_INTERRUPTIBLE) - da_handle_start_event_wwnr(p, switch_out_wwnr); + da_handle_start_event(p, switch_out_wwnr); else - da_handle_event_wwnr(p, switch_out_wwnr); + da_handle_event(p, switch_out_wwnr); - da_handle_event_wwnr(n, switch_in_wwnr); + da_handle_event(n, switch_in_wwnr); } static void 
handle_wakeup(void *data, struct task_struct *p) { - da_handle_event_wwnr(p, wakeup_wwnr); + da_handle_event(p, wakeup_wwnr); } static int enable_wwnr(void) { int retval; - retval = da_monitor_init_wwnr(); + retval = da_monitor_init(); if (retval) return retval; @@ -51,31 +49,31 @@ static int enable_wwnr(void) static void disable_wwnr(void) { - rv_wwnr.enabled = 0; + rv_this.enabled = 0; rv_detach_trace_probe("wwnr", sched_switch, handle_switch); rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup); - da_monitor_destroy_wwnr(); + da_monitor_destroy(); } -static struct rv_monitor rv_wwnr = { +static struct rv_monitor rv_this = { .name = "wwnr", .description = "wakeup while not running per-task testing model.", .enable = enable_wwnr, .disable = disable_wwnr, - .reset = da_monitor_reset_all_wwnr, + .reset = da_monitor_reset_all, .enabled = 0, }; static int __init register_wwnr(void) { - return rv_register_monitor(&rv_wwnr, NULL); + return rv_register_monitor(&rv_this, NULL); } static void __exit unregister_wwnr(void) { - rv_unregister_monitor(&rv_wwnr); + rv_unregister_monitor(&rv_this); } module_init(register_wwnr); diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index 0a59d23edf61..a28006512c9b 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -5,19 +5,21 @@ * Documentation/trace/rv/deterministic_automata.rst */ +#define MONITOR_NAME wwnr + enum states_wwnr { - not_running_wwnr = 0, + not_running_wwnr, running_wwnr, - state_max_wwnr + state_max_wwnr, }; #define INVALID_STATE state_max_wwnr enum events_wwnr { - switch_in_wwnr = 0, + switch_in_wwnr, switch_out_wwnr, wakeup_wwnr, - event_max_wwnr + event_max_wwnr, }; struct automaton_wwnr { @@ -31,12 +33,12 @@ struct automaton_wwnr { static const struct automaton_wwnr automaton_wwnr = { .state_names = { "not_running", - "running" + "running", }, .event_names = { "switch_in", "switch_out", - "wakeup" + "wakeup", }, .function = { { running_wwnr, INVALID_STATE, not_running_wwnr }, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8bd4ec08fb36..2f6fbf9e7caf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -67,7 +67,7 @@ * insertions into the ring-buffer such as trace_printk could occurred * at the same time, giving false positive or negative results. */ -static bool __read_mostly tracing_selftest_running; +bool __read_mostly tracing_selftest_running; /* * If boot-time tracing including tracers/events via kernel cmdline @@ -83,7 +83,6 @@ void __init disable_tracing_selftest(const char *reason) } } #else -#define tracing_selftest_running 0 #define tracing_selftest_disabled 0 #endif @@ -114,7 +113,7 @@ DEFINE_PER_CPU(bool, trace_taskinfo_save); * of the tracer is successful. But that is the only place that sets * this back to zero. */ -static int tracing_disabled = 1; +int tracing_disabled = 1; cpumask_var_t __read_mostly tracing_buffer_mask; @@ -535,22 +534,11 @@ static struct trace_array global_trace = { .trace_flags = TRACE_DEFAULT_FLAGS, }; -static struct trace_array *printk_trace = &global_trace; +struct trace_array *printk_trace = &global_trace; /* List of trace_arrays interested in the top level trace_marker */ static LIST_HEAD(marker_copies); -static __always_inline bool printk_binsafe(struct trace_array *tr) -{ - /* - * The binary format of traceprintk can cause a crash if used - * by a buffer from another boot. 
Force the use of the - * non binary version of trace_printk if the trace_printk - * buffer is a boot mapped ring buffer. - */ - return !(tr->flags & TRACE_ARRAY_FL_BOOT); -} - static void update_printk_trace(struct trace_array *tr) { if (printk_trace == tr) @@ -649,248 +637,6 @@ int tracing_check_open_get_tr(struct trace_array *tr) return 0; } -/** - * trace_find_filtered_pid - check if a pid exists in a filtered_pid list - * @filtered_pids: The list of pids to check - * @search_pid: The PID to find in @filtered_pids - * - * Returns true if @search_pid is found in @filtered_pids, and false otherwise. - */ -bool -trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) -{ - return trace_pid_list_is_set(filtered_pids, search_pid); -} - -/** - * trace_ignore_this_task - should a task be ignored for tracing - * @filtered_pids: The list of pids to check - * @filtered_no_pids: The list of pids not to be traced - * @task: The task that should be ignored if not filtered - * - * Checks if @task should be traced or not from @filtered_pids. - * Returns true if @task should *NOT* be traced. - * Returns false if @task should be traced. - */ -bool -trace_ignore_this_task(struct trace_pid_list *filtered_pids, - struct trace_pid_list *filtered_no_pids, - struct task_struct *task) -{ - /* - * If filtered_no_pids is not empty, and the task's pid is listed - * in filtered_no_pids, then return true. - * Otherwise, if filtered_pids is empty, that means we can - * trace all tasks. If it has content, then only trace pids - * within filtered_pids. - */ - - return (filtered_pids && - !trace_find_filtered_pid(filtered_pids, task->pid)) || - (filtered_no_pids && - trace_find_filtered_pid(filtered_no_pids, task->pid)); -} - -/** - * trace_filter_add_remove_task - Add or remove a task from a pid_list - * @pid_list: The list to modify - * @self: The current task for fork or NULL for exit - * @task: The task to add or remove - * - * If adding a task, if @self is defined, the task is only added if @self - * is also included in @pid_list. This happens on fork and tasks should - * only be added when the parent is listed. If @self is NULL, then the - * @task pid will be removed from the list, which would happen on exit - * of a task. - */ -void trace_filter_add_remove_task(struct trace_pid_list *pid_list, - struct task_struct *self, - struct task_struct *task) -{ - if (!pid_list) - return; - - /* For forks, we only add if the forking task is listed */ - if (self) { - if (!trace_find_filtered_pid(pid_list, self->pid)) - return; - } - - /* "self" is set for forks, and NULL for exits */ - if (self) - trace_pid_list_set(pid_list, task->pid); - else - trace_pid_list_clear(pid_list, task->pid); -} - -/** - * trace_pid_next - Used for seq_file to get to the next pid of a pid_list - * @pid_list: The pid list to show - * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) - * @pos: The position of the file - * - * This is used by the seq_file "next" operation to iterate the pids - * listed in a trace_pid_list structure. - * - * Returns the pid+1 as we want to display pid of zero, but NULL would - * stop the iteration. 
- */ -void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) -{ - long pid = (unsigned long)v; - unsigned int next; - - (*pos)++; - - /* pid already is +1 of the actual previous bit */ - if (trace_pid_list_next(pid_list, pid, &next) < 0) - return NULL; - - pid = next; - - /* Return pid + 1 to allow zero to be represented */ - return (void *)(pid + 1); -} - -/** - * trace_pid_start - Used for seq_file to start reading pid lists - * @pid_list: The pid list to show - * @pos: The position of the file - * - * This is used by seq_file "start" operation to start the iteration - * of listing pids. - * - * Returns the pid+1 as we want to display pid of zero, but NULL would - * stop the iteration. - */ -void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) -{ - unsigned long pid; - unsigned int first; - loff_t l = 0; - - if (trace_pid_list_first(pid_list, &first) < 0) - return NULL; - - pid = first; - - /* Return pid + 1 so that zero can be the exit value */ - for (pid++; pid && l < *pos; - pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) - ; - return (void *)pid; -} - -/** - * trace_pid_show - show the current pid in seq_file processing - * @m: The seq_file structure to write into - * @v: A void pointer of the pid (+1) value to display - * - * Can be directly used by seq_file operations to display the current - * pid value. - */ -int trace_pid_show(struct seq_file *m, void *v) -{ - unsigned long pid = (unsigned long)v - 1; - - seq_printf(m, "%lu\n", pid); - return 0; -} - -/* 128 should be much more than enough */ -#define PID_BUF_SIZE 127 - -int trace_pid_write(struct trace_pid_list *filtered_pids, - struct trace_pid_list **new_pid_list, - const char __user *ubuf, size_t cnt) -{ - struct trace_pid_list *pid_list; - struct trace_parser parser; - unsigned long val; - int nr_pids = 0; - ssize_t read = 0; - ssize_t ret; - loff_t pos; - pid_t pid; - - if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) - return -ENOMEM; - - /* - * Always recreate a new array. The write is an all or nothing - * operation. Always create a new array when adding new pids by - * the user. If the operation fails, then the current list is - * not modified. 
- */ - pid_list = trace_pid_list_alloc(); - if (!pid_list) { - trace_parser_put(&parser); - return -ENOMEM; - } - - if (filtered_pids) { - /* copy the current bits to the new max */ - ret = trace_pid_list_first(filtered_pids, &pid); - while (!ret) { - ret = trace_pid_list_set(pid_list, pid); - if (ret < 0) - goto out; - - ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); - nr_pids++; - } - } - - ret = 0; - while (cnt > 0) { - - pos = 0; - - ret = trace_get_user(&parser, ubuf, cnt, &pos); - if (ret < 0) - break; - - read += ret; - ubuf += ret; - cnt -= ret; - - if (!trace_parser_loaded(&parser)) - break; - - ret = -EINVAL; - if (kstrtoul(parser.buffer, 0, &val)) - break; - - pid = (pid_t)val; - - if (trace_pid_list_set(pid_list, pid) < 0) { - ret = -1; - break; - } - nr_pids++; - - trace_parser_clear(&parser); - ret = 0; - } - out: - trace_parser_put(&parser); - - if (ret < 0) { - trace_pid_list_free(pid_list); - return ret; - } - - if (!nr_pids) { - /* Cleared the list of pids */ - trace_pid_list_free(pid_list); - pid_list = NULL; - } - - *new_pid_list = pid_list; - - return read; -} - static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) { u64 ts; @@ -1033,56 +779,6 @@ static inline void trace_access_lock_init(void) #endif -#ifdef CONFIG_STACKTRACE -static void __ftrace_trace_stack(struct trace_array *tr, - struct trace_buffer *buffer, - unsigned int trace_ctx, - int skip, struct pt_regs *regs); -static inline void ftrace_trace_stack(struct trace_array *tr, - struct trace_buffer *buffer, - unsigned int trace_ctx, - int skip, struct pt_regs *regs); - -#else -static inline void __ftrace_trace_stack(struct trace_array *tr, - struct trace_buffer *buffer, - unsigned int trace_ctx, - int skip, struct pt_regs *regs) -{ -} -static inline void ftrace_trace_stack(struct trace_array *tr, - struct trace_buffer *buffer, - unsigned long trace_ctx, - int skip, struct pt_regs *regs) -{ -} - -#endif - -static __always_inline void -trace_event_setup(struct ring_buffer_event *event, - int type, unsigned int trace_ctx) -{ - struct trace_entry *ent = ring_buffer_event_data(event); - - tracing_generic_entry_update(ent, type, trace_ctx); -} - -static __always_inline struct ring_buffer_event * -__trace_buffer_lock_reserve(struct trace_buffer *buffer, - int type, - unsigned long len, - unsigned int trace_ctx) -{ - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(buffer, len); - if (event != NULL) - trace_event_setup(event, type, trace_ctx); - - return event; -} - void tracer_tracing_on(struct trace_array *tr) { if (tr->array_buffer.buffer) @@ -1110,130 +806,10 @@ void tracing_on(void) } EXPORT_SYMBOL_GPL(tracing_on); - -static __always_inline void -__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) -{ - __this_cpu_write(trace_taskinfo_save, true); - - /* If this is the temp buffer, we need to commit fully */ - if (this_cpu_read(trace_buffered_event) == event) { - /* Length is in event->array[0] */ - ring_buffer_write(buffer, event->array[0], &event->array[1]); - /* Release the temp buffer */ - this_cpu_dec(trace_buffered_event_cnt); - /* ring_buffer_unlock_commit() enables preemption */ - preempt_enable_notrace(); - } else - ring_buffer_unlock_commit(buffer); -} - -int __trace_array_puts(struct trace_array *tr, unsigned long ip, - const char *str, int size) -{ - struct ring_buffer_event *event; - struct trace_buffer *buffer; - struct print_entry *entry; - unsigned int trace_ctx; - int alloc; - - if (!(tr->trace_flags & TRACE_ITER(PRINTK))) - 
return 0; - - if (unlikely(tracing_selftest_running && tr == &global_trace)) - return 0; - - if (unlikely(tracing_disabled)) - return 0; - - alloc = sizeof(*entry) + size + 2; /* possible \n added */ - - trace_ctx = tracing_gen_ctx(); - buffer = tr->array_buffer.buffer; - guard(ring_buffer_nest)(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, - trace_ctx); - if (!event) - return 0; - - entry = ring_buffer_event_data(event); - entry->ip = ip; - - memcpy(&entry->buf, str, size); - - /* Add a newline if necessary */ - if (entry->buf[size - 1] != '\n') { - entry->buf[size] = '\n'; - entry->buf[size + 1] = '\0'; - } else - entry->buf[size] = '\0'; - - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - return size; -} -EXPORT_SYMBOL_GPL(__trace_array_puts); - -/** - * __trace_puts - write a constant string into the trace buffer. - * @ip: The address of the caller - * @str: The constant string to write - * @size: The size of the string. - */ -int __trace_puts(unsigned long ip, const char *str, int size) -{ - return __trace_array_puts(printk_trace, ip, str, size); -} -EXPORT_SYMBOL_GPL(__trace_puts); - -/** - * __trace_bputs - write the pointer to a constant string into trace buffer - * @ip: The address of the caller - * @str: The constant string to write to the buffer to - */ -int __trace_bputs(unsigned long ip, const char *str) -{ - struct trace_array *tr = READ_ONCE(printk_trace); - struct ring_buffer_event *event; - struct trace_buffer *buffer; - struct bputs_entry *entry; - unsigned int trace_ctx; - int size = sizeof(struct bputs_entry); - - if (!printk_binsafe(tr)) - return __trace_puts(ip, str, strlen(str)); - - if (!(tr->trace_flags & TRACE_ITER(PRINTK))) - return 0; - - if (unlikely(tracing_selftest_running || tracing_disabled)) - return 0; - - trace_ctx = tracing_gen_ctx(); - buffer = tr->array_buffer.buffer; - - guard(ring_buffer_nest)(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, - trace_ctx); - if (!event) - return 0; - - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->str = str; - - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - - return 1; -} -EXPORT_SYMBOL_GPL(__trace_bputs); - #ifdef CONFIG_TRACER_SNAPSHOT static void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data) { - struct tracer *tracer = tr->current_trace; unsigned long flags; if (in_nmi()) { @@ -1249,15 +825,15 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr, return; } - /* Note, snapshot can not be used when the tracer uses it */ - if (tracer->use_max_tr) { - trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); + if (tr->mapped) { + trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); return; } - if (tr->mapped) { - trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); + /* Note, snapshot can not be used when the tracer uses it */ + if (tracer_uses_snapshot(tr->current_trace)) { + trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); return; } @@ -1357,12 +933,12 @@ int tracing_alloc_snapshot_instance(struct trace_array *tr) /* Make the snapshot buffer have the same order as main buffer */ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); - ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); + ret = 
ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order); if (ret < 0) return ret; /* allocate spare buffer */ - ret = resize_buffer_duplicate_size(&tr->max_buffer, + ret = resize_buffer_duplicate_size(&tr->snapshot_buffer, &tr->array_buffer, RING_BUFFER_ALL_CPUS); if (ret < 0) return ret; @@ -1380,10 +956,10 @@ static void free_snapshot(struct trace_array *tr) * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. */ - ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0); - ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); - set_buffer_entries(&tr->max_buffer, 1); - tracing_reset_online_cpus(&tr->max_buffer); + ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, 0); + ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); + set_buffer_entries(&tr->snapshot_buffer, 1); + tracing_reset_online_cpus(&tr->snapshot_buffer); tr->allocated_snapshot = false; } @@ -1499,7 +1075,7 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, guard(mutex)(&trace_types_lock); - if (tr->current_trace->use_max_tr) + if (tracer_uses_snapshot(tr->current_trace)) return -EBUSY; /* @@ -1666,9 +1242,18 @@ EXPORT_SYMBOL_GPL(tracing_off); void disable_trace_on_warning(void) { if (__disable_trace_on_warning) { + struct trace_array *tr = READ_ONCE(printk_trace); + trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_, "Disabling tracing due to warning\n"); tracing_off(); + + /* Disable trace_printk() buffer too */ + if (tr != &global_trace) { + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "Disabling tracing due to warning\n"); + tracer_tracing_off(tr); + } } } @@ -1903,10 +1488,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) unsigned long __read_mostly tracing_thresh; #ifdef CONFIG_TRACER_MAX_TRACE -static const struct file_operations tracing_max_lat_fops; - #ifdef LATENCY_FS_NOTIFY - static struct workqueue_struct *fsnotify_wq; static void latency_fsnotify_workfn(struct work_struct *work) @@ -1923,17 +1505,6 @@ static void latency_fsnotify_workfn_irq(struct irq_work *iwork) queue_work(fsnotify_wq, &tr->fsnotify_work); } -static void trace_create_maxlat_file(struct trace_array *tr, - struct dentry *d_tracer) -{ - INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); - init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); - tr->d_max_latency = trace_create_file("tracing_max_latency", - TRACE_MODE_WRITE, - d_tracer, tr, - &tracing_max_lat_fops); -} - __init static int latency_fsnotify_init(void) { fsnotify_wq = alloc_workqueue("tr_max_lat_wq", @@ -1958,14 +1529,22 @@ void latency_fsnotify(struct trace_array *tr) */ irq_work_queue(&tr->fsnotify_irqwork); } +#endif /* !LATENCY_FS_NOTIFY */ -#else /* !LATENCY_FS_NOTIFY */ - -#define trace_create_maxlat_file(tr, d_tracer) \ - trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ - d_tracer, tr, &tracing_max_lat_fops) +static const struct file_operations tracing_max_lat_fops; +static void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) +{ +#ifdef LATENCY_FS_NOTIFY + INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); + init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); #endif + tr->d_max_latency = trace_create_file("tracing_max_latency", + TRACE_MODE_WRITE, + d_tracer, tr, + &tracing_max_lat_fops); +} /* * Copy the new maximum trace into the separate maximum-trace @@ -1976,8 +1555,8 @@ static void __update_max_tr(struct trace_array *tr, 
struct task_struct *tsk, int cpu) { struct array_buffer *trace_buf = &tr->array_buffer; - struct array_buffer *max_buf = &tr->max_buffer; struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); + struct array_buffer *max_buf = &tr->snapshot_buffer; struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); max_buf->cpu = cpu; @@ -2006,7 +1585,14 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(tsk); latency_fsnotify(tr); } +#else +static inline void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) { } +static inline void __update_max_tr(struct trace_array *tr, + struct task_struct *tsk, int cpu) { } +#endif /* CONFIG_TRACER_MAX_TRACE */ +#ifdef CONFIG_TRACER_SNAPSHOT /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer @@ -2036,17 +1622,16 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, /* Inherit the recordable setting from array_buffer */ if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) - ring_buffer_record_on(tr->max_buffer.buffer); + ring_buffer_record_on(tr->snapshot_buffer.buffer); else - ring_buffer_record_off(tr->max_buffer.buffer); + ring_buffer_record_off(tr->snapshot_buffer.buffer); -#ifdef CONFIG_TRACER_SNAPSHOT if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { arch_spin_unlock(&tr->max_lock); return; } -#endif - swap(tr->array_buffer.buffer, tr->max_buffer.buffer); + + swap(tr->array_buffer.buffer, tr->snapshot_buffer.buffer); __update_max_tr(tr, tsk, cpu); @@ -2081,7 +1666,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&tr->max_lock); - ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu); + ret = ring_buffer_swap_cpu(tr->snapshot_buffer.buffer, tr->array_buffer.buffer, cpu); if (ret == -EBUSY) { /* @@ -2091,7 +1676,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * and flag that it failed. * Another reason is resize is in progress. */ - trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, + trace_array_printk_buf(tr->snapshot_buffer.buffer, _THIS_IP_, "Failed to swap buffers due to commit or resize in progress\n"); } @@ -2100,8 +1685,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); } - -#endif /* CONFIG_TRACER_MAX_TRACE */ +#endif /* CONFIG_TRACER_SNAPSHOT */ struct pipe_wait { struct trace_iterator *iter; @@ -2134,13 +1718,13 @@ static int wait_on_pipe(struct trace_iterator *iter, int full) ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full, wait_pipe_cond, &pwait); -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT /* * Make sure this is still the snapshot buffer, as if a snapshot were * to happen, this would now be the main buffer. */ if (iter->snapshot) - iter->array_buffer = &iter->tr->max_buffer; + iter->array_buffer = &iter->tr->snapshot_buffer; #endif return ret; } @@ -2205,10 +1789,10 @@ static int run_tracer_selftest(struct tracer *type) tr->current_trace_flags = type->flags ? 
: type->default_flags; #ifdef CONFIG_TRACER_MAX_TRACE - if (type->use_max_tr) { + if (tracer_uses_snapshot(type)) { /* If we expanded the buffers, make sure the max is expanded too */ if (tr->ring_buffer_expanded) - ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, + ring_buffer_resize(tr->snapshot_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); tr->allocated_snapshot = true; } @@ -2230,12 +1814,12 @@ static int run_tracer_selftest(struct tracer *type) tracing_reset_online_cpus(&tr->array_buffer); #ifdef CONFIG_TRACER_MAX_TRACE - if (type->use_max_tr) { + if (tracer_uses_snapshot(type)) { tr->allocated_snapshot = false; /* Shrink the max buffer again */ if (tr->ring_buffer_expanded) - ring_buffer_resize(tr->max_buffer.buffer, 1, + ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); } #endif @@ -2477,8 +2061,8 @@ void tracing_reset_all_online_cpus_unlocked(void) continue; tr->clear_trace = false; tracing_reset_online_cpus(&tr->array_buffer); -#ifdef CONFIG_TRACER_MAX_TRACE - tracing_reset_online_cpus(&tr->max_buffer); +#ifdef CONFIG_TRACER_SNAPSHOT + tracing_reset_online_cpus(&tr->snapshot_buffer); #endif } } @@ -2517,8 +2101,8 @@ static void tracing_start_tr(struct trace_array *tr) if (buffer) ring_buffer_record_enable(buffer); -#ifdef CONFIG_TRACER_MAX_TRACE - buffer = tr->max_buffer.buffer; +#ifdef CONFIG_TRACER_SNAPSHOT + buffer = tr->snapshot_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #endif @@ -2553,8 +2137,8 @@ static void tracing_stop_tr(struct trace_array *tr) if (buffer) ring_buffer_record_disable(buffer); -#ifdef CONFIG_TRACER_MAX_TRACE - buffer = tr->max_buffer.buffer; +#ifdef CONFIG_TRACER_SNAPSHOT + buffer = tr->snapshot_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #endif @@ -3002,10 +2586,10 @@ struct ftrace_stacks { static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); -static void __ftrace_trace_stack(struct trace_array *tr, - struct trace_buffer *buffer, - unsigned int trace_ctx, - int skip, struct pt_regs *regs) +void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs) { struct ring_buffer_event *event; unsigned int size, nr_entries; @@ -3088,17 +2672,6 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace_clear_recursion(bit); } -static inline void ftrace_trace_stack(struct trace_array *tr, - struct trace_buffer *buffer, - unsigned int trace_ctx, - int skip, struct pt_regs *regs) -{ - if (!(tr->trace_flags & TRACE_ITER(STACKTRACE))) - return; - - __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); -} - void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip) { @@ -3233,324 +2806,6 @@ void trace_last_func_repeats(struct trace_array *tr, __buffer_unlock_commit(buffer, event); } -/* created for use with alloc_percpu */ -struct trace_buffer_struct { - int nesting; - char buffer[4][TRACE_BUF_SIZE]; -}; - -static struct trace_buffer_struct __percpu *trace_percpu_buffer; - -/* - * This allows for lockless recording. If we're nested too deeply, then - * this returns NULL. 
- */ -static char *get_trace_buf(void) -{ - struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - - if (!trace_percpu_buffer || buffer->nesting >= 4) - return NULL; - - buffer->nesting++; - - /* Interrupts must see nesting incremented before we use the buffer */ - barrier(); - return &buffer->buffer[buffer->nesting - 1][0]; -} - -static void put_trace_buf(void) -{ - /* Don't let the decrement of nesting leak before this */ - barrier(); - this_cpu_dec(trace_percpu_buffer->nesting); -} - -static int alloc_percpu_trace_buffer(void) -{ - struct trace_buffer_struct __percpu *buffers; - - if (trace_percpu_buffer) - return 0; - - buffers = alloc_percpu(struct trace_buffer_struct); - if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) - return -ENOMEM; - - trace_percpu_buffer = buffers; - return 0; -} - -static int buffers_allocated; - -void trace_printk_init_buffers(void) -{ - if (buffers_allocated) - return; - - if (alloc_percpu_trace_buffer()) - return; - - /* trace_printk() is for debug use only. Don't use it in production. */ - - pr_warn("\n"); - pr_warn("**********************************************************\n"); - pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warn("** **\n"); - pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); - pr_warn("** **\n"); - pr_warn("** This means that this is a DEBUG kernel and it is **\n"); - pr_warn("** unsafe for production use. **\n"); - pr_warn("** **\n"); - pr_warn("** If you see this message and you are not debugging **\n"); - pr_warn("** the kernel, report this immediately to your vendor! **\n"); - pr_warn("** **\n"); - pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warn("**********************************************************\n"); - - /* Expand the buffers to set size */ - tracing_update_buffers(&global_trace); - - buffers_allocated = 1; - - /* - * trace_printk_init_buffers() can be called by modules. - * If that happens, then we need to start cmdline recording - * directly here. If the global_trace.buffer is already - * allocated here, then this was called by module code. 
- */ - if (global_trace.array_buffer.buffer) - tracing_start_cmdline_record(); -} -EXPORT_SYMBOL_GPL(trace_printk_init_buffers); - -void trace_printk_start_comm(void) -{ - /* Start tracing comms if trace printk is set */ - if (!buffers_allocated) - return; - tracing_start_cmdline_record(); -} - -static void trace_printk_start_stop_comm(int enabled) -{ - if (!buffers_allocated) - return; - - if (enabled) - tracing_start_cmdline_record(); - else - tracing_stop_cmdline_record(); -} - -/** - * trace_vbprintk - write binary msg to tracing buffer - * @ip: The address of the caller - * @fmt: The string format to write to the buffer - * @args: Arguments for @fmt - */ -int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) -{ - struct ring_buffer_event *event; - struct trace_buffer *buffer; - struct trace_array *tr = READ_ONCE(printk_trace); - struct bprint_entry *entry; - unsigned int trace_ctx; - char *tbuffer; - int len = 0, size; - - if (!printk_binsafe(tr)) - return trace_vprintk(ip, fmt, args); - - if (unlikely(tracing_selftest_running || tracing_disabled)) - return 0; - - /* Don't pollute graph traces with trace_vprintk internals */ - pause_graph_tracing(); - - trace_ctx = tracing_gen_ctx(); - guard(preempt_notrace)(); - - tbuffer = get_trace_buf(); - if (!tbuffer) { - len = 0; - goto out_nobuffer; - } - - len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); - - if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) - goto out_put; - - size = sizeof(*entry) + sizeof(u32) * len; - buffer = tr->array_buffer.buffer; - scoped_guard(ring_buffer_nest, buffer) { - event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - trace_ctx); - if (!event) - goto out_put; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, tbuffer, sizeof(u32) * len); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); - } -out_put: - put_trace_buf(); - -out_nobuffer: - unpause_graph_tracing(); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vbprintk); - -static __printf(3, 0) -int __trace_array_vprintk(struct trace_buffer *buffer, - unsigned long ip, const char *fmt, va_list args) -{ - struct ring_buffer_event *event; - int len = 0, size; - struct print_entry *entry; - unsigned int trace_ctx; - char *tbuffer; - - if (tracing_disabled) - return 0; - - /* Don't pollute graph traces with trace_vprintk internals */ - pause_graph_tracing(); - - trace_ctx = tracing_gen_ctx(); - guard(preempt_notrace)(); - - - tbuffer = get_trace_buf(); - if (!tbuffer) { - len = 0; - goto out_nobuffer; - } - - len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); - - size = sizeof(*entry) + len + 1; - scoped_guard(ring_buffer_nest, buffer) { - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - - memcpy(&entry->buf, tbuffer, len + 1); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); - } -out: - put_trace_buf(); - -out_nobuffer: - unpause_graph_tracing(); - - return len; -} - -int trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args) -{ - if (tracing_selftest_running && tr == &global_trace) - return 0; - - return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); -} - -/** - * trace_array_printk - Print a message to a specific instance - * @tr: The instance trace_array descriptor - * @ip: The instruction 
pointer that this is called from. - * @fmt: The format to print (printf format) - * - * If a subsystem sets up its own instance, they have the right to - * printk strings into their tracing instance buffer using this - * function. Note, this function will not write into the top level - * buffer (use trace_printk() for that), as writing into the top level - * buffer should only have events that can be individually disabled. - * trace_printk() is only used for debugging a kernel, and should not - * be ever incorporated in normal use. - * - * trace_array_printk() can be used, as it will not add noise to the - * top level tracing buffer. - * - * Note, trace_array_init_printk() must be called on @tr before this - * can be used. - */ -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!tr) - return -ENOENT; - - /* This is only allowed for created instances */ - if (tr == &global_trace) - return 0; - - if (!(tr->trace_flags & TRACE_ITER(PRINTK))) - return 0; - - va_start(ap, fmt); - ret = trace_array_vprintk(tr, ip, fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(trace_array_printk); - -/** - * trace_array_init_printk - Initialize buffers for trace_array_printk() - * @tr: The trace array to initialize the buffers for - * - * As trace_array_printk() only writes into instances, they are OK to - * have in the kernel (unlike trace_printk()). This needs to be called - * before trace_array_printk() can be used on a trace_array. - */ -int trace_array_init_printk(struct trace_array *tr) -{ - if (!tr) - return -ENOENT; - - /* This is only allowed for created instances */ - if (tr == &global_trace) - return -EINVAL; - - return alloc_percpu_trace_buffer(); -} -EXPORT_SYMBOL_GPL(trace_array_init_printk); - -int trace_array_printk_buf(struct trace_buffer *buffer, - unsigned long ip, const char *fmt, ...) 
-{ - int ret; - va_list ap; - - if (!(printk_trace->trace_flags & TRACE_ITER(PRINTK))) - return 0; - - va_start(ap, fmt); - ret = __trace_array_vprintk(buffer, ip, fmt, ap); - va_end(ap); - return ret; -} - -int trace_vprintk(unsigned long ip, const char *fmt, va_list args) -{ - return trace_array_vprintk(printk_trace, ip, fmt, args); -} -EXPORT_SYMBOL_GPL(trace_vprintk); - static void trace_iterator_increment(struct trace_iterator *iter) { struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); @@ -3987,10 +3242,8 @@ static void *s_start(struct seq_file *m, loff_t *pos) } mutex_unlock(&trace_types_lock); -#ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->trace->use_max_tr) + if (iter->snapshot && tracer_uses_snapshot(iter->trace)) return ERR_PTR(-EBUSY); -#endif if (*pos != iter->pos) { iter->ent = NULL; @@ -4029,10 +3282,8 @@ static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; -#ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->trace->use_max_tr) + if (iter->snapshot && tracer_uses_snapshot(iter->trace)) return; -#endif trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); @@ -4286,7 +3537,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) /* ftrace and system call events are still OK */ if ((event->type > __TRACE_LAST_TYPE) && !is_syscall_event(event)) - return print_event_fields(iter, event); + return print_event_fields(iter, event); } return event->funcs->trace(iter, sym_flags, event); } @@ -4509,7 +3760,7 @@ static void test_ftrace_alive(struct seq_file *m) "# MAY BE MISSING FUNCTION EVENTS\n"); } -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT static void show_snapshot_main_help(struct seq_file *m) { seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" @@ -4687,10 +3938,10 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->tr = tr; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT /* Currently only the top directory has a snapshot */ if (tr->current_trace->print_max || snapshot) - iter->array_buffer = &tr->max_buffer; + iter->array_buffer = &tr->snapshot_buffer; else #endif iter->array_buffer = &tr->array_buffer; @@ -4759,11 +4010,6 @@ int tracing_open_generic(struct inode *inode, struct file *filp) return 0; } -bool tracing_is_disabled(void) -{ - return (tracing_disabled) ? true: false; -} - /* * Open and update trace_array ref count. * Must have the current trace_array passed to it. 
@@ -4881,6 +4127,8 @@ static int tracing_single_release_tr(struct inode *inode, struct file *file) return single_release(inode, file); } +static bool update_last_data_if_empty(struct trace_array *tr); + static int tracing_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -4898,13 +4146,15 @@ static int tracing_open(struct inode *inode, struct file *file) #ifdef CONFIG_TRACER_MAX_TRACE if (tr->current_trace->print_max) - trace_buf = &tr->max_buffer; + trace_buf = &tr->snapshot_buffer; #endif if (cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(trace_buf); else tracing_reset_cpu(trace_buf, cpu); + + update_last_data_if_empty(tr); } if (file->f_mode & FMODE_READ) { @@ -4929,11 +4179,9 @@ static int tracing_open(struct inode *inode, struct file *file) static bool trace_ok_for_array(struct tracer *t, struct trace_array *tr) { -#ifdef CONFIG_TRACER_SNAPSHOT /* arrays with mapped buffer range do not have snapshots */ - if (tr->range_addr_start && t->use_max_tr) + if (tr->range_addr_start && tracer_uses_snapshot(t)) return false; -#endif return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; } @@ -5110,15 +4358,15 @@ int tracing_set_cpumask(struct trace_array *tr, if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); -#ifdef CONFIG_TRACER_MAX_TRACE - ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_SNAPSHOT + ring_buffer_record_disable_cpu(tr->snapshot_buffer.buffer, cpu); #endif } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); -#ifdef CONFIG_TRACER_MAX_TRACE - ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_SNAPSHOT + ring_buffer_record_enable_cpu(tr->snapshot_buffer.buffer, cpu); #endif } } @@ -5327,8 +4575,8 @@ int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled) case TRACE_ITER(OVERWRITE): ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled); -#ifdef CONFIG_TRACER_MAX_TRACE - ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); +#ifdef CONFIG_TRACER_SNAPSHOT + ring_buffer_change_overwrite(tr->snapshot_buffer.buffer, enabled); #endif break; @@ -5971,6 +5219,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, int tracer_init(struct tracer *t, struct trace_array *tr) { tracing_reset_online_cpus(&tr->array_buffer); + update_last_data_if_empty(tr); return t->init(tr); } @@ -5991,7 +5240,7 @@ static void update_buffer_entries(struct array_buffer *buf, int cpu) } } -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT /* resize @tr's buffer to the size of @size_tr's entries */ static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, struct array_buffer *size_buf, int cpu_id) @@ -6017,7 +5266,7 @@ static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, return ret; } -#endif /* CONFIG_TRACER_MAX_TRACE */ +#endif /* CONFIG_TRACER_SNAPSHOT */ static int __tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu) @@ -6042,11 +5291,11 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, if (ret < 0) goto out_start; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT if (!tr->allocated_snapshot) goto out; - ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); + ret = ring_buffer_resize(tr->snapshot_buffer.buffer, size, cpu); if (ret 
< 0) { int r = resize_buffer_duplicate_size(&tr->array_buffer, &tr->array_buffer, cpu); @@ -6071,10 +5320,10 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, goto out_start; } - update_buffer_entries(&tr->max_buffer, cpu); + update_buffer_entries(&tr->snapshot_buffer, cpu); out: -#endif /* CONFIG_TRACER_MAX_TRACE */ +#endif /* CONFIG_TRACER_SNAPSHOT */ update_buffer_entries(&tr->array_buffer, cpu); out_start: @@ -6265,6 +5514,9 @@ int tracing_update_buffers(struct trace_array *tr) { int ret = 0; + if (!tr) + tr = &global_trace; + guard(mutex)(&trace_types_lock); update_last_data(tr); @@ -6299,9 +5551,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) { struct tracer *trace = NULL; struct tracers *t; -#ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; -#endif int ret; guard(mutex)(&trace_types_lock); @@ -6329,7 +5579,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) return 0; #ifdef CONFIG_TRACER_SNAPSHOT - if (trace->use_max_tr) { + if (tracer_uses_snapshot(trace)) { local_irq_disable(); arch_spin_lock(&tr->max_lock); ret = tr->cond_snapshot ? -EBUSY : 0; @@ -6361,14 +5611,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); -#ifdef CONFIG_TRACER_MAX_TRACE - had_max_tr = tr->current_trace->use_max_tr; + had_max_tr = tracer_uses_snapshot(tr->current_trace); /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; tr->current_trace_flags = nop_trace.flags; - if (had_max_tr && !trace->use_max_tr) { + if (had_max_tr && !tracer_uses_snapshot(trace)) { /* * We need to make sure that the update_max_tr sees that * current_trace changed to nop_trace to keep it from @@ -6381,24 +5630,19 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) tracing_disarm_snapshot(tr); } - if (!had_max_tr && trace->use_max_tr) { + if (!had_max_tr && tracer_uses_snapshot(trace)) { ret = tracing_arm_snapshot_locked(tr); if (ret) return ret; } -#else - tr->current_trace = &nop_trace; -#endif tr->current_trace_flags = t->flags ? 
: t->tracer->flags; if (trace->init) { ret = tracer_init(trace, tr); if (ret) { -#ifdef CONFIG_TRACER_MAX_TRACE - if (trace->use_max_tr) + if (tracer_uses_snapshot(trace)) tracing_disarm_snapshot(tr); -#endif tr->current_trace_flags = nop_trace.flags; return ret; } @@ -7603,7 +6847,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, unsigned long ip; char *buf; - if (tracing_disabled) + if (unlikely(tracing_disabled)) return -EINVAL; if (!(tr->trace_flags & TRACE_ITER(MARKERS))) @@ -7683,7 +6927,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, ssize_t written = -ENODEV; char *buf; - if (tracing_disabled) + if (unlikely(tracing_disabled)) return -EINVAL; if (!(tr->trace_flags & TRACE_ITER(MARKERS))) @@ -7784,11 +7028,12 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) */ tracing_reset_online_cpus(&tr->array_buffer); -#ifdef CONFIG_TRACER_MAX_TRACE - if (tr->max_buffer.buffer) - ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); - tracing_reset_online_cpus(&tr->max_buffer); +#ifdef CONFIG_TRACER_SNAPSHOT + if (tr->snapshot_buffer.buffer) + ring_buffer_set_clock(tr->snapshot_buffer.buffer, trace_clocks[i].func); + tracing_reset_online_cpus(&tr->snapshot_buffer); #endif + update_last_data_if_empty(tr); if (tr->scratch && !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) { struct trace_scratch *tscratch = tr->scratch; @@ -7881,26 +7126,6 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve return ring_buffer_event_time_stamp(buffer, rbe); } -/* - * Set or disable using the per CPU trace_buffer_event when possible. - */ -int tracing_set_filter_buffering(struct trace_array *tr, bool set) -{ - guard(mutex)(&trace_types_lock); - - if (set && tr->no_filter_buffering_ref++) - return 0; - - if (!set) { - if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) - return -EINVAL; - - --tr->no_filter_buffering_ref; - } - - return 0; -} - struct ftrace_buffer_info { struct trace_iterator iter; void *spare; @@ -7939,7 +7164,7 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) ret = 0; iter->tr = tr; - iter->array_buffer = &tr->max_buffer; + iter->array_buffer = &tr->snapshot_buffer; iter->cpu_file = tracing_get_cpu(inode); m->private = iter; file->private_data = m; @@ -7976,7 +7201,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, guard(mutex)(&trace_types_lock); - if (tr->current_trace->use_max_tr) + if (tracer_uses_snapshot(tr->current_trace)) return -EBUSY; local_irq_disable(); @@ -8002,7 +7227,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; #endif if (tr->allocated_snapshot) - ret = resize_buffer_duplicate_size(&tr->max_buffer, + ret = resize_buffer_duplicate_size(&tr->snapshot_buffer, &tr->array_buffer, iter->cpu_file); ret = tracing_arm_snapshot_locked(tr); @@ -8023,9 +7248,9 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, default: if (tr->allocated_snapshot) { if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->max_buffer); + tracing_reset_online_cpus(&tr->snapshot_buffer); else - tracing_reset_cpu(&tr->max_buffer, iter->cpu_file); + tracing_reset_cpu(&tr->snapshot_buffer, iter->cpu_file); } break; } @@ -8075,13 +7300,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp) info = filp->private_data; - if (info->iter.trace->use_max_tr) { + if (tracer_uses_snapshot(info->iter.trace)) { tracing_buffers_release(inode, filp); 
return -EBUSY; } info->iter.snapshot = true; - info->iter.array_buffer = &info->iter.tr->max_buffer; + info->iter.array_buffer = &info->iter.tr->snapshot_buffer; return ret; } @@ -8631,10 +7856,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (!count) return 0; -#ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->tr->current_trace->use_max_tr) + if (iter->snapshot && tracer_uses_snapshot(iter->tr->current_trace)) return -EBUSY; -#endif page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); @@ -8818,10 +8041,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int entries, i; ssize_t ret = 0; -#ifdef CONFIG_TRACER_MAX_TRACE - if (iter->snapshot && iter->tr->current_trace->use_max_tr) + if (iter->snapshot && tracer_uses_snapshot(iter->tr->current_trace)) return -EBUSY; -#endif page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); if (*ppos & (page_size - 1)) @@ -8955,7 +8176,7 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned return 0; } -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT static int get_snapshot_map(struct trace_array *tr) { int err = 0; @@ -9398,7 +8619,7 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_stats_fops); - trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu, + trace_create_cpu_file("buffer_size_kb", TRACE_MODE_WRITE, d_cpu, tr, cpu, &tracing_entries_fops); if (tr->range_addr_start) @@ -9959,12 +9180,12 @@ buffer_subbuf_size_write(struct file *filp, const char __user *ubuf, if (ret) goto out; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT if (!tr->allocated_snapshot) goto out_max; - ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); + ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order); if (ret) { /* Put back the old order */ cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order); @@ -10180,12 +9401,12 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) if (ret) return ret; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT /* Fix mapped buffer trace arrays do not have snapshot buffers */ if (tr->range_addr_start) return 0; - ret = allocate_trace_buffer(tr, &tr->max_buffer, + ret = allocate_trace_buffer(tr, &tr->snapshot_buffer, allocate_snapshot ? 
size : 1); if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { free_trace_buffer(&tr->array_buffer); @@ -10207,8 +9428,8 @@ static void free_trace_buffers(struct trace_array *tr) free_trace_buffer(&tr->array_buffer); kfree(tr->module_delta); -#ifdef CONFIG_TRACER_MAX_TRACE - free_trace_buffer(&tr->max_buffer); +#ifdef CONFIG_TRACER_SNAPSHOT + free_trace_buffer(&tr->snapshot_buffer); #endif } @@ -10349,7 +9570,7 @@ trace_array_create_systems(const char *name, const char *systems, tr->syscall_buf_sz = global_trace.syscall_buf_sz; tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT spin_lock_init(&tr->snapshot_trigger_lock); #endif tr->current_trace = &nop_trace; @@ -10674,9 +9895,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); -#ifdef CONFIG_TRACER_MAX_TRACE trace_create_maxlat_file(tr, d_tracer); -#endif if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); @@ -10775,7 +9994,7 @@ int tracing_init_dentry(void) extern struct trace_eval_map *__start_ftrace_eval_maps[]; extern struct trace_eval_map *__stop_ftrace_eval_maps[]; -static struct workqueue_struct *eval_map_wq __initdata; +struct workqueue_struct *trace_init_wq __initdata; static struct work_struct eval_map_work __initdata; static struct work_struct tracerfs_init_work __initdata; @@ -10791,15 +10010,15 @@ static int __init trace_eval_init(void) { INIT_WORK(&eval_map_work, eval_map_work_func); - eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0); - if (!eval_map_wq) { - pr_err("Unable to allocate eval_map_wq\n"); + trace_init_wq = alloc_workqueue("trace_init_wq", WQ_UNBOUND, 0); + if (!trace_init_wq) { + pr_err("Unable to allocate trace_init_wq\n"); /* Do work here */ eval_map_work_func(&eval_map_work); return -ENOMEM; } - queue_work(eval_map_wq, &eval_map_work); + queue_work(trace_init_wq, &eval_map_work); return 0; } @@ -10808,8 +10027,8 @@ subsys_initcall(trace_eval_init); static int __init trace_eval_sync(void) { /* Make sure the eval map updates are finished */ - if (eval_map_wq) - destroy_workqueue(eval_map_wq); + if (trace_init_wq) + destroy_workqueue(trace_init_wq); return 0; } @@ -10970,9 +10189,9 @@ static __init int tracer_init_tracefs(void) if (ret) return 0; - if (eval_map_wq) { + if (trace_init_wq) { INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func); - queue_work(eval_map_wq, &tracerfs_init_work); + queue_work(trace_init_wq, &tracerfs_init_work); } else { tracer_init_tracefs_work_func(NULL); } @@ -11305,7 +10524,7 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, return done; } -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT __init static bool tr_needs_alloc_snapshot(const char *name) { char *test; @@ -11495,7 +10714,7 @@ __init static void enable_instances(void) } } else { /* Only non mapped buffers have snapshot buffers */ - if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) + if (IS_ENABLED(CONFIG_TRACER_SNAPSHOT)) do_allocate_snapshot(name); } @@ -11622,7 +10841,7 @@ __init static int tracer_alloc_buffers(void) global_trace.current_trace_flags = nop_trace.flags; global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT spin_lock_init(&global_trace.snapshot_trigger_lock); #endif ftrace_init_global_array_ops(&global_trace); @@ -11690,7 +10909,7 @@ struct trace_array *trace_get_global_array(void) void __init 
ftrace_boot_snapshot(void) { -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT struct trace_array *tr; if (!snapshot_at_boot) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c11edec5d8f5..b8f3804586a0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -131,7 +131,7 @@ enum trace_type { #define FAULT_STRING "(fault)" -#define HIST_STACKTRACE_DEPTH 16 +#define HIST_STACKTRACE_DEPTH 31 #define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) #define HIST_STACKTRACE_SKIP 5 @@ -332,29 +332,33 @@ struct trace_array { struct list_head list; char *name; struct array_buffer array_buffer; -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT /* - * The max_buffer is used to snapshot the trace when a maximum + * The snapshot_buffer is used to snapshot the trace when a maximum * latency is reached, or when the user initiates a snapshot. * Some tracers will use this to store a maximum trace while * it continues examining live traces. * - * The buffers for the max_buffer are set up the same as the array_buffer - * When a snapshot is taken, the buffer of the max_buffer is swapped - * with the buffer of the array_buffer and the buffers are reset for - * the array_buffer so the tracing can continue. + * The buffers for the snapshot_buffer are set up the same as the + * array_buffer. When a snapshot is taken, the buffer of the + * snapshot_buffer is swapped with the buffer of the array_buffer + * and the buffers are reset for the array_buffer so the tracing can + * continue. */ - struct array_buffer max_buffer; + struct array_buffer snapshot_buffer; bool allocated_snapshot; spinlock_t snapshot_trigger_lock; unsigned int snapshot; +#ifdef CONFIG_TRACER_MAX_TRACE unsigned long max_latency; -#ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; +#ifdef CONFIG_FSNOTIFY struct work_struct fsnotify_work; struct irq_work fsnotify_irqwork; -#endif -#endif +#endif /* CONFIG_FSNOTIFY */ +#endif /* CONFIG_TRACER_MAX_TRACE */ +#endif /* CONFIG_TRACER_SNAPSHOT */ + /* The below is for memory mapped ring buffer */ unsigned int mapped; unsigned long range_addr_start; @@ -380,7 +384,7 @@ struct trace_array { * * It is also used in other places outside the update_max_tr * so it needs to be defined outside of the - * CONFIG_TRACER_MAX_TRACE. + * CONFIG_TRACER_SNAPSHOT. */ arch_spinlock_t max_lock; #ifdef CONFIG_FTRACE_SYSCALLS @@ -479,13 +483,14 @@ extern struct trace_array *trace_array_find(const char *instance); extern struct trace_array *trace_array_find_get(const char *instance); extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe); -extern int tracing_set_filter_buffering(struct trace_array *tr, bool set); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr); +extern struct trace_array *printk_trace; + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. @@ -661,6 +666,8 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) return iter->buffer_iter ? 
iter->buffer_iter[cpu] : NULL; } +extern int tracing_disabled; + int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void tracing_reset_online_cpus(struct array_buffer *buf); @@ -672,7 +679,6 @@ int tracing_release_generic_tr(struct inode *inode, struct file *file); int tracing_open_file_tr(struct inode *inode, struct file *filp); int tracing_release_file_tr(struct inode *inode, struct file *filp); int tracing_single_release_file_tr(struct inode *inode, struct file *filp); -bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); @@ -772,6 +778,7 @@ extern cpumask_var_t __read_mostly tracing_buffer_mask; extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; +extern struct workqueue_struct *trace_init_wq __initdata; /* PID filtering */ @@ -790,22 +797,22 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, struct trace_pid_list **new_pid_list, const char __user *ubuf, size_t cnt); -#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef CONFIG_TRACER_SNAPSHOT void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, void *cond_data); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); -#ifdef CONFIG_FSNOTIFY -#define LATENCY_FS_NOTIFY +#if defined(CONFIG_TRACER_MAX_TRACE) && defined(CONFIG_FSNOTIFY) +# define LATENCY_FS_NOTIFY #endif -#endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef LATENCY_FS_NOTIFY void latency_fsnotify(struct trace_array *tr); #else static inline void latency_fsnotify(struct trace_array *tr) { } #endif +#endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef CONFIG_STACKTRACE void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip); @@ -816,6 +823,18 @@ static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, } #endif /* CONFIG_STACKTRACE */ +#ifdef CONFIG_TRACER_MAX_TRACE +static inline bool tracer_uses_snapshot(struct tracer *tracer) +{ + return tracer->use_max_tr; +} +#else +static inline bool tracer_uses_snapshot(struct tracer *tracer) +{ + return false; +} +#endif + void trace_last_func_repeats(struct trace_array *tr, struct trace_func_repeats *last_info, unsigned int trace_ctx); @@ -865,6 +884,7 @@ extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); +extern bool __read_mostly tracing_selftest_running; /* * Tracer data references selftest functions that only occur * on boot up. These can be __init functions. Thus, when selftests @@ -877,6 +897,7 @@ static inline void __init disable_tracing_selftest(const char *reason) } /* Tracers are seldom changed. Optimize when selftests are disabled. 
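A note on the tracer_uses_snapshot() helpers above: they exist so call sites no longer need #ifdef CONFIG_TRACER_MAX_TRACE blocks. A minimal sketch of a caller (example_check() is a made-up name; the real call sites are the tracing_buffers_read()/tracing_buffers_splice_read() hunks earlier in this patch):

/* Made-up caller: when CONFIG_TRACER_MAX_TRACE is off, tracer_uses_snapshot()
 * is a constant false, so the compiler drops the whole branch and the old
 * #ifdef/#endif pair at the call site becomes unnecessary. */
static int example_check(struct trace_iterator *iter)
{
	if (iter->snapshot && tracer_uses_snapshot(iter->tr->current_trace))
		return -EBUSY;
	return 0;
}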
*/ #define __tracer_data __read_mostly +#define tracing_selftest_running 0 #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -1414,6 +1435,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(COPY_MARKER, "copy_trace_marker"), \ C(PAUSE_ON_TRACE, "pause-on-trace"), \ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ + C(BITMASK_LIST, "bitmask-list"), \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ @@ -1567,6 +1589,47 @@ char *trace_user_fault_read(struct trace_user_buf_info *tinfo, const char __user *ptr, size_t size, trace_user_buf_copy copy_func, void *data); +static __always_inline void +trace_event_setup(struct ring_buffer_event *event, + int type, unsigned int trace_ctx) +{ + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, type, trace_ctx); +} + +static __always_inline struct ring_buffer_event * +__trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, + unsigned int trace_ctx) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(buffer, len); + if (event != NULL) + trace_event_setup(event, type, trace_ctx); + + return event; +} + +static __always_inline void +__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_taskinfo_save, true); + + /* If this is the temp buffer, we need to commit fully */ + if (this_cpu_read(trace_buffered_event) == event) { + /* Length is in event->array[0] */ + ring_buffer_write(buffer, event->array[0], &event->array[1]); + /* Release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + /* ring_buffer_unlock_commit() enables preemption */ + preempt_enable_notrace(); + } else + ring_buffer_unlock_commit(buffer); +} + static inline void __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) @@ -2087,6 +2150,7 @@ extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); void trace_printk_start_comm(void); +void trace_printk_start_stop_comm(int enabled); int trace_keep_overwrite(struct tracer *tracer, u64 mask, int set); int set_tracer_flag(struct trace_array *tr, u64 mask, int enabled); @@ -2119,7 +2183,7 @@ extern void tracing_log_err(struct trace_array *tr, * about performance). The internal_trace_puts() is for such * a purpose. 
*/ -#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str) #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ @@ -2237,6 +2301,37 @@ static inline void sanitize_event_name(char *name) *name = '_'; } +#ifdef CONFIG_STACKTRACE +void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs); + +static __always_inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs) +{ + if (!(tr->trace_flags & TRACE_ITER(STACKTRACE))) + return; + + __ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs); +} +#else +static inline void __ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs) +{ +} +static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned long trace_ctx, + int skip, struct pt_regs *regs) +{ +} +#endif + /* * This is a generic way to read and write a u64 value from a file in tracefs. * diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 137b4d9bb116..61fe01dce7a6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -649,6 +649,22 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) } EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); +/** + * trace_event_buffer_reserve - reserve space on the ring buffer for an event + * @fbuffer: information about how to save the event + * @trace_file: the instance file descriptor for the event + * @len: The length of the event + * + * The @fbuffer has information about the ring buffer and data will + * be added to it to be used by the call to trace_event_buffer_commit(). + * The @trace_file is the desrciptor with information about the status + * of the given event for a specific trace_array instance. + * The @len is the length of data to save for the event. + * + * Returns a pointer to the data on the ring buffer or NULL if the + * event was not reserved (event was filtered, too big, or the buffer + * simply was disabled for write). + */ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len) @@ -1662,6 +1678,82 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +static int get_call_len(struct trace_event_call *call) +{ + int len; + + /* Get the length of "<system>:<event>" */ + len = strlen(call->class->system) + 1; + len += strlen(trace_event_name(call)); + + /* Set the index to 32 bytes to separate event from data */ + return len >= 32 ? 1 : 32 - len; +} + +/** + * t_show_filters - seq_file callback to display active event filters + * @m: The seq_file interface for formatted output + * @v: The current trace_event_file being iterated + * + * Identifies and prints active filters for the current event file in the + * iteration. If a filter is applied to the current event and, if so, + * prints the system name, event name, and the filter string. 
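A worked example of the padding arithmetic in get_call_len() above: for "sched:sched_switch" the combined "<system>:<event>" string is 18 characters (5 + 1 + 12), so get_call_len() returns 32 - 18 = 14 and the "%*.s" specifier in seq_printf() emits 14 spaces, aligning the filter (or trigger) text at a fixed byte offset of 32; a name of 32 characters or more falls back to a single space. Hypothetical show_event_filters output illustrating the alignment (event names are real, the filters are invented):

sched:sched_switch              prev_state == 1
kmem:kmalloc                    bytes_req > 256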
+ */ +static int t_show_filters(struct seq_file *m, void *v) +{ + struct trace_event_file *file = v; + struct trace_event_call *call = file->event_call; + struct event_filter *filter; + int len; + + guard(rcu)(); + filter = rcu_dereference(file->filter); + if (!filter || !filter->filter_string) + return 0; + + len = get_call_len(call); + + seq_printf(m, "%s:%s%*.s%s\n", call->class->system, + trace_event_name(call), len, "", filter->filter_string); + + return 0; +} + +/** + * t_show_triggers - seq_file callback to display active event triggers + * @m: The seq_file interface for formatted output + * @v: The current trace_event_file being iterated + * + * Iterates through the trigger list of the current event file and prints + * each active trigger's configuration using its associated print + * operation. + */ +static int t_show_triggers(struct seq_file *m, void *v) +{ + struct trace_event_file *file = v; + struct trace_event_call *call = file->event_call; + struct event_trigger_data *data; + int len; + + /* + * The event_mutex is held by t_start(), protecting the + * file->triggers list traversal. + */ + if (list_empty(&file->triggers)) + return 0; + + len = get_call_len(call); + + list_for_each_entry_rcu(data, &file->triggers, list) { + seq_printf(m, "%s:%s%*.s", call->class->system, + trace_event_name(call), len, ""); + + data->cmd_ops->print(m, data); + } + + return 0; +} + #ifdef CONFIG_MODULES static int s_show(struct seq_file *m, void *v) { @@ -2176,7 +2268,7 @@ static int subsystem_open(struct inode *inode, struct file *filp) struct event_subsystem *system = NULL; int ret; - if (tracing_is_disabled()) + if (unlikely(tracing_disabled)) return -ENODEV; /* Make sure the system still exists */ @@ -2489,6 +2581,8 @@ ftrace_event_npid_write(struct file *filp, const char __user *ubuf, static int ftrace_event_avail_open(struct inode *inode, struct file *file); static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_show_filters_open(struct inode *inode, struct file *file); +static int ftrace_event_show_triggers_open(struct inode *inode, struct file *file); static int ftrace_event_set_pid_open(struct inode *inode, struct file *file); static int ftrace_event_set_npid_open(struct inode *inode, struct file *file); static int ftrace_event_release(struct inode *inode, struct file *file); @@ -2507,6 +2601,20 @@ static const struct seq_operations show_set_event_seq_ops = { .stop = s_stop, }; +static const struct seq_operations show_show_event_filters_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show_filters, + .stop = t_stop, +}; + +static const struct seq_operations show_show_event_triggers_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show_triggers, + .stop = t_stop, +}; + static const struct seq_operations show_set_pid_seq_ops = { .start = p_start, .next = p_next, @@ -2536,6 +2644,20 @@ static const struct file_operations ftrace_set_event_fops = { .release = ftrace_event_release, }; +static const struct file_operations ftrace_show_event_filters_fops = { + .open = ftrace_event_show_filters_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ftrace_show_event_triggers_fops = { + .open = ftrace_event_show_triggers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static const struct file_operations ftrace_set_event_pid_fops = { .open = ftrace_event_set_pid_open, .read = seq_read, @@ -2680,6 +2802,34 @@ 
ftrace_event_set_open(struct inode *inode, struct file *file) return ret; } +/** + * ftrace_event_show_filters_open - open interface for set_event_filters + * @inode: The inode of the file + * @file: The file being opened + * + * Connects the set_event_filters file to the sequence operations + * required to iterate over and display active event filters. + */ +static int +ftrace_event_show_filters_open(struct inode *inode, struct file *file) +{ + return ftrace_event_open(inode, file, &show_show_event_filters_seq_ops); +} + +/** + * ftrace_event_show_triggers_open - open interface for show_event_triggers + * @inode: The inode of the file + * @file: The file being opened + * + * Connects the show_event_triggers file to the sequence operations + * required to iterate over and display active event triggers. + */ +static int +ftrace_event_show_triggers_open(struct inode *inode, struct file *file) +{ + return ftrace_event_open(inode, file, &show_show_event_triggers_seq_ops); +} + static int ftrace_event_set_pid_open(struct inode *inode, struct file *file) { @@ -3963,11 +4113,6 @@ void trace_put_event_file(struct trace_event_file *file) EXPORT_SYMBOL_GPL(trace_put_event_file); #ifdef CONFIG_DYNAMIC_FTRACE - -/* Avoid typos */ -#define ENABLE_EVENT_STR "enable_event" -#define DISABLE_EVENT_STR "disable_event" - struct event_probe_data { struct trace_event_file *file; unsigned long count; @@ -4400,6 +4545,12 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) if (!entry) return -ENOMEM; + trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr, + &ftrace_show_event_filters_fops); + + trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr, + &ftrace_show_event_triggers_fops); + nr_entries = ARRAY_SIZE(events_entries); e_events = eventfs_create_events_dir("events", parent, events_entries, diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 385af8405392..7001e34476ee 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1375,7 +1375,7 @@ static void free_filter_list_tasks(struct rcu_head *rhp) struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work); - queue_rcu_work(system_wq, &filter_list->rwork); + queue_rcu_work(system_dfl_wq, &filter_list->rwork); } /* diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c97bb2fda5c0..e6f449f53afc 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -105,38 +105,44 @@ enum field_op_id { FIELD_OP_MULT, }; +#define FIELD_FUNCS \ + C(NOP, "nop"), \ + C(VAR_REF, "var_ref"), \ + C(COUNTER, "counter"), \ + C(CONST, "const"), \ + C(LOG2, "log2"), \ + C(BUCKET, "bucket"), \ + C(TIMESTAMP, "timestamp"), \ + C(CPU, "cpu"), \ + C(COMM, "comm"), \ + C(STRING, "string"), \ + C(DYNSTRING, "dynstring"), \ + C(RELDYNSTRING, "reldynstring"), \ + C(PSTRING, "pstring"), \ + C(S64, "s64"), \ + C(U64, "u64"), \ + C(S32, "s32"), \ + C(U32, "u32"), \ + C(S16, "s16"), \ + C(U16, "u16"), \ + C(S8, "s8"), \ + C(U8, "u8"), \ + C(UMINUS, "uminus"), \ + C(MINUS, "minus"), \ + C(PLUS, "plus"), \ + C(DIV, "div"), \ + C(MULT, "mult"), \ + C(DIV_POWER2, "div_power2"), \ + C(DIV_NOT_POWER2, "div_not_power2"), \ + C(DIV_MULT_SHIFT, "div_mult_shift"), \ + C(EXECNAME, "execname"), \ + C(STACK, "stack"), + +#undef C +#define C(a, b) HIST_FIELD_FN_##a + enum hist_field_fn { - HIST_FIELD_FN_NOP, - HIST_FIELD_FN_VAR_REF, - 
HIST_FIELD_FN_COUNTER, - HIST_FIELD_FN_CONST, - HIST_FIELD_FN_LOG2, - HIST_FIELD_FN_BUCKET, - HIST_FIELD_FN_TIMESTAMP, - HIST_FIELD_FN_CPU, - HIST_FIELD_FN_COMM, - HIST_FIELD_FN_STRING, - HIST_FIELD_FN_DYNSTRING, - HIST_FIELD_FN_RELDYNSTRING, - HIST_FIELD_FN_PSTRING, - HIST_FIELD_FN_S64, - HIST_FIELD_FN_U64, - HIST_FIELD_FN_S32, - HIST_FIELD_FN_U32, - HIST_FIELD_FN_S16, - HIST_FIELD_FN_U16, - HIST_FIELD_FN_S8, - HIST_FIELD_FN_U8, - HIST_FIELD_FN_UMINUS, - HIST_FIELD_FN_MINUS, - HIST_FIELD_FN_PLUS, - HIST_FIELD_FN_DIV, - HIST_FIELD_FN_MULT, - HIST_FIELD_FN_DIV_POWER2, - HIST_FIELD_FN_DIV_NOT_POWER2, - HIST_FIELD_FN_DIV_MULT_SHIFT, - HIST_FIELD_FN_EXECNAME, - HIST_FIELD_FN_STACK, + FIELD_FUNCS }; /* @@ -3157,7 +3163,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, u64 var_val; /* Make sure stacktrace can fit in the string variable length */ - BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) >= STR_VAR_LEN_MAX); + BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) > STR_VAR_LEN_MAX); for (i = 0, j = field_var_str_start; i < n_field_vars; i++) { struct field_var *field_var = field_vars[i]; @@ -5854,6 +5860,12 @@ const struct file_operations event_hist_fops = { }; #ifdef CONFIG_HIST_TRIGGERS_DEBUG + +#undef C +#define C(a, b) b + +static const char * const field_funcs[] = { FIELD_FUNCS }; + static void hist_field_debug_show_flags(struct seq_file *m, unsigned long flags) { @@ -5918,6 +5930,7 @@ static int hist_field_debug_show(struct seq_file *m, seq_printf(m, " type: %s\n", field->type); seq_printf(m, " size: %u\n", field->size); seq_printf(m, " is_signed: %u\n", field->is_signed); + seq_printf(m, " function: hist_field_%s()\n", field_funcs[field->fn_num]); return 0; } @@ -6518,6 +6531,26 @@ static bool existing_hist_update_only(char *glob, return updated; } +/* + * Set or disable using the per CPU trace_buffer_event when possible. 
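The FIELD_FUNCS list above is a classic X-macro: the same list expands once with C(a, b) defined as HIST_FIELD_FN_##a to build the enum, and again in the CONFIG_HIST_TRIGGERS_DEBUG block with C(a, b) defined as b to build the field_funcs[] string table, so the enum values and their printed names can never drift apart. A self-contained sketch of the technique with made-up names:

#define COLOR_LIST \
	C(RED,   "red"),   \
	C(GREEN, "green"), \
	C(BLUE,  "blue")

#undef C
#define C(a, b) COLOR_##a
enum color { COLOR_LIST };		/* COLOR_RED, COLOR_GREEN, COLOR_BLUE */

#undef C
#define C(a, b) b
static const char * const color_names[] = { COLOR_LIST };
/* color_names[COLOR_GREEN] == "green" */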
+ */ +static int tracing_set_filter_buffering(struct trace_array *tr, bool set) +{ + guard(mutex)(&trace_types_lock); + + if (set && tr->no_filter_buffering_ref++) + return 0; + + if (!set) { + if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) + return -EINVAL; + + --tr->no_filter_buffering_ref; + } + + return 0; +} + static int hist_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) @@ -6907,11 +6940,9 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, out_unreg: event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); out_free: - event_trigger_reset_filter(cmd_ops, trigger_data); - remove_hist_vars(hist_data); - kfree(trigger_data); + trigger_data_free(trigger_data); destroy_hist_data(hist_data); goto out; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 45c187e77e21..ce42fbf16f4a 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -499,9 +499,9 @@ static unsigned int trace_stack(struct synth_trace_event *entry, return len; } -static notrace void trace_event_raw_event_synth(void *__data, - u64 *var_ref_vals, - unsigned int *var_ref_idx) +static void trace_event_raw_event_synth(void *__data, + u64 *var_ref_vals, + unsigned int *var_ref_idx) { unsigned int i, n_u64, val_idx, len, data_size = 0; struct trace_event_file *trace_file = __data; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 06b75bcfc7b8..7fa26327c9c7 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1347,18 +1347,13 @@ traceon_trigger(struct event_trigger_data *data, { struct trace_event_file *file = data->private_data; - if (file) { - if (tracer_tracing_is_on(file->tr)) - return; - - tracer_tracing_on(file->tr); + if (WARN_ON_ONCE(!file)) return; - } - if (tracing_is_on()) + if (tracer_tracing_is_on(file->tr)) return; - tracing_on(); + tracer_tracing_on(file->tr); } static bool @@ -1368,13 +1363,11 @@ traceon_count_func(struct event_trigger_data *data, { struct trace_event_file *file = data->private_data; - if (file) { - if (tracer_tracing_is_on(file->tr)) - return false; - } else { - if (tracing_is_on()) - return false; - } + if (WARN_ON_ONCE(!file)) + return false; + + if (tracer_tracing_is_on(file->tr)) + return false; if (!data->count) return false; @@ -1392,18 +1385,13 @@ traceoff_trigger(struct event_trigger_data *data, { struct trace_event_file *file = data->private_data; - if (file) { - if (!tracer_tracing_is_on(file->tr)) - return; - - tracer_tracing_off(file->tr); + if (WARN_ON_ONCE(!file)) return; - } - if (!tracing_is_on()) + if (!tracer_tracing_is_on(file->tr)) return; - tracing_off(); + tracer_tracing_off(file->tr); } static bool @@ -1413,13 +1401,11 @@ traceoff_count_func(struct event_trigger_data *data, { struct trace_event_file *file = data->private_data; - if (file) { - if (!tracer_tracing_is_on(file->tr)) - return false; - } else { - if (!tracing_is_on()) - return false; - } + if (WARN_ON_ONCE(!file)) + return false; + + if (!tracer_tracing_is_on(file->tr)) + return false; if (!data->count) return false; @@ -1481,10 +1467,10 @@ snapshot_trigger(struct event_trigger_data *data, { struct trace_event_file *file = data->private_data; - if (file) - tracing_snapshot_instance(file->tr); - else - tracing_snapshot(); + if (WARN_ON_ONCE(!file)) + return; + + tracing_snapshot_instance(file->tr); } static int @@ -1570,10 +1556,10 @@ stacktrace_trigger(struct event_trigger_data *data, { 
struct trace_event_file *file = data->private_data; - if (file) - __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP); - else - trace_dump_stack(STACK_SKIP); + if (WARN_ON_ONCE(!file)) + return; + + __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP); } static int diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 2f7b94e98317..3fe274b84f1c 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -102,9 +102,9 @@ struct hwlat_sample { /* keep the global state somewhere. */ static struct hwlat_data { - struct mutex lock; /* protect changes */ + struct mutex lock; /* protect changes */ - u64 count; /* total since reset */ + atomic64_t count; /* total since reset */ u64 sample_window; /* total sampling window (on+off) */ u64 sample_width; /* active sampling portion of window */ @@ -193,8 +193,7 @@ void trace_hwlat_callback(bool enter) * get_sample - sample the CPU TSC and look for likely hardware latencies * * Used to repeatedly capture the CPU TSC (or similar), looking for potential - * hardware-induced latency. Called with interrupts disabled and with - * hwlat_data.lock held. + * hardware-induced latency. Called with interrupts disabled. */ static int get_sample(void) { @@ -204,6 +203,7 @@ static int get_sample(void) time_type start, t1, t2, last_t2; s64 diff, outer_diff, total, last_total = 0; u64 sample = 0; + u64 sample_width = READ_ONCE(hwlat_data.sample_width); u64 thresh = tracing_thresh; u64 outer_sample = 0; int ret = -1; @@ -267,7 +267,7 @@ static int get_sample(void) if (diff > sample) sample = diff; /* only want highest value */ - } while (total <= hwlat_data.sample_width); + } while (total <= sample_width); barrier(); /* finish the above in the view for NMIs */ trace_hwlat_callback_enabled = false; @@ -285,8 +285,7 @@ static int get_sample(void) if (kdata->nmi_total_ts) do_div(kdata->nmi_total_ts, NSEC_PER_USEC); - hwlat_data.count++; - s.seqnum = hwlat_data.count; + s.seqnum = atomic64_inc_return(&hwlat_data.count); s.duration = sample; s.outer_duration = outer_sample; s.nmi_total_ts = kdata->nmi_total_ts; @@ -832,7 +831,7 @@ static int hwlat_tracer_init(struct trace_array *tr) hwlat_trace = tr; - hwlat_data.count = 0; + atomic64_set(&hwlat_data.count, 0); tr->max_latency = 0; save_tracing_thresh = tracing_thresh; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9953506370a5..061658518605 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -2048,6 +2048,10 @@ static __init int init_kprobe_trace(void) trace_create_file("kprobe_profile", TRACE_MODE_READ, NULL, NULL, &kprobe_profile_ops); + /* If no 'kprobe_event=' cmd is provided, return directly. 
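A note on the trace_hwlat.c hunks above: get_sample() no longer claims to run under hwlat_data.lock, so the shared counter becomes an atomic64_t. With a plain u64 and no lock, two sampling threads could both read the same value and hand out duplicate sequence numbers; atomic64_inc_return() makes the read-modify-write a single indivisible step, and reading sample_width once with READ_ONCE() gives each sampling pass one stable width for the whole loop.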
*/ + if (kprobe_boot_events_buf[0] == '\0') + return 0; + setup_boot_kprobe_events(); return 0; @@ -2079,7 +2083,7 @@ static __init int kprobe_trace_self_tests_init(void) struct trace_kprobe *tk; struct trace_event_file *file; - if (tracing_is_disabled()) + if (unlikely(tracing_disabled)) return -ENODEV; if (tracing_selftest_disabled) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index cc2d3306bb60..1996d7aba038 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -194,13 +194,37 @@ trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, EXPORT_SYMBOL(trace_print_symbols_seq_u64); #endif +/** + * trace_print_bitmask_seq - print a bitmask to a sequence buffer + * @iter: The trace iterator for the current event instance + * @bitmask_ptr: The pointer to the bitmask data + * @bitmask_size: The size of the bitmask in bytes + * + * Prints a bitmask into a sequence buffer as either a hex string or a + * human-readable range list, depending on the instance's "bitmask-list" + * trace option. The bitmask is formatted into the iterator's temporary + * scratchpad rather than the primary sequence buffer. This avoids + * duplication and pointer-collision issues when the returned string is + * processed by a "%s" specifier in a TP_printk() macro. + * + * Returns a pointer to the formatted string within the temporary buffer. + */ const char * -trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, +trace_print_bitmask_seq(struct trace_iterator *iter, void *bitmask_ptr, unsigned int bitmask_size) { - const char *ret = trace_seq_buffer_ptr(p); + struct trace_seq *p = &iter->tmp_seq; + const struct trace_array *tr = iter->tr; + const char *ret; + + trace_seq_init(p); + ret = trace_seq_buffer_ptr(p); + + if (tr->trace_flags & TRACE_ITER(BITMASK_LIST)) + trace_seq_bitmask_list(p, bitmask_ptr, bitmask_size * 8); + else + trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); - trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); trace_seq_putc(p, 0); return ret; diff --git a/kernel/trace/trace_pid.c b/kernel/trace/trace_pid.c new file mode 100644 index 000000000000..7127c8de4174 --- /dev/null +++ b/kernel/trace/trace_pid.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "trace.h" + +/** + * trace_find_filtered_pid - check if a pid exists in a filtered_pid list + * @filtered_pids: The list of pids to check + * @search_pid: The PID to find in @filtered_pids + * + * Returns true if @search_pid is found in @filtered_pids, and false otherwise. + */ +bool +trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) +{ + return trace_pid_list_is_set(filtered_pids, search_pid); +} + +/** + * trace_ignore_this_task - should a task be ignored for tracing + * @filtered_pids: The list of pids to check + * @filtered_no_pids: The list of pids not to be traced + * @task: The task that should be ignored if not filtered + * + * Checks if @task should be traced or not from @filtered_pids. + * Returns true if @task should *NOT* be traced. + * Returns false if @task should be traced. + */ +bool +trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct trace_pid_list *filtered_no_pids, + struct task_struct *task) +{ + /* + * If filtered_no_pids is not empty, and the task's pid is listed + * in filtered_no_pids, then return true. + * Otherwise, if filtered_pids is empty, that means we can + * trace all tasks. If it has content, then only trace pids + * within filtered_pids. 
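The new bitmask-list option toggles between the kernel's two bitmap conversions: trace_seq_bitmask() prints via "%*pb" (hex words) while the new trace_seq_bitmask_list() prints via "%*pbl" (a range list). A minimal sketch of the difference using a hypothetical 8-bit mask (demo function name is made up):

#include <linux/bitmap.h>
#include <linux/printk.h>

static void bitmask_format_demo(void)
{
	DECLARE_BITMAP(mask, 8);

	bitmap_zero(mask, 8);
	bitmap_set(mask, 0, 3);			/* bits 0-2 */
	bitmap_set(mask, 5, 1);			/* bit 5    */

	pr_info("hex:  %*pb\n", 8, mask);	/* "27"    - default output    */
	pr_info("list: %*pbl\n", 8, mask);	/* "0-2,5" - with bitmask-list */
}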
+ */ + + return (filtered_pids && + !trace_find_filtered_pid(filtered_pids, task->pid)) || + (filtered_no_pids && + trace_find_filtered_pid(filtered_no_pids, task->pid)); +} + +/** + * trace_filter_add_remove_task - Add or remove a task from a pid_list + * @pid_list: The list to modify + * @self: The current task for fork or NULL for exit + * @task: The task to add or remove + * + * If adding a task, if @self is defined, the task is only added if @self + * is also included in @pid_list. This happens on fork and tasks should + * only be added when the parent is listed. If @self is NULL, then the + * @task pid will be removed from the list, which would happen on exit + * of a task. + */ +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) +{ + if (!pid_list) + return; + + /* For forks, we only add if the forking task is listed */ + if (self) { + if (!trace_find_filtered_pid(pid_list, self->pid)) + return; + } + + /* "self" is set for forks, and NULL for exits */ + if (self) + trace_pid_list_set(pid_list, task->pid); + else + trace_pid_list_clear(pid_list, task->pid); +} + +/** + * trace_pid_next - Used for seq_file to get to the next pid of a pid_list + * @pid_list: The pid list to show + * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) + * @pos: The position of the file + * + * This is used by the seq_file "next" operation to iterate the pids + * listed in a trace_pid_list structure. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) +{ + long pid = (unsigned long)v; + unsigned int next; + + (*pos)++; + + /* pid already is +1 of the actual previous bit */ + if (trace_pid_list_next(pid_list, pid, &next) < 0) + return NULL; + + pid = next; + + /* Return pid + 1 to allow zero to be represented */ + return (void *)(pid + 1); +} + +/** + * trace_pid_start - Used for seq_file to start reading pid lists + * @pid_list: The pid list to show + * @pos: The position of the file + * + * This is used by seq_file "start" operation to start the iteration + * of listing pids. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) +{ + unsigned long pid; + unsigned int first; + loff_t l = 0; + + if (trace_pid_list_first(pid_list, &first) < 0) + return NULL; + + pid = first; + + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; + pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) + ; + return (void *)pid; +} + +/** + * trace_pid_show - show the current pid in seq_file processing + * @m: The seq_file structure to write into + * @v: A void pointer of the pid (+1) value to display + * + * Can be directly used by seq_file operations to display the current + * pid value. 
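The pid + 1 convention used by trace_pid_start()/trace_pid_next() above exists because seq_file treats a NULL cookie as end of iteration, which would make PID 0 impossible to report. A small sketch of the encode/decode pair (helper names are made up):

/* seq_file stops on NULL, so pid N travels as (void *)(N + 1). */
static inline void *pid_to_cookie(pid_t pid)
{
	return (void *)((unsigned long)pid + 1);
}

static inline pid_t cookie_to_pid(void *v)
{
	return (pid_t)((unsigned long)v - 1);	/* what trace_pid_show() does */
}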
+ */ +int trace_pid_show(struct seq_file *m, void *v) +{ + unsigned long pid = (unsigned long)v - 1; + + seq_printf(m, "%lu\n", pid); + return 0; +} + +/* 128 should be much more than enough */ +#define PID_BUF_SIZE 127 + +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt) +{ + struct trace_pid_list *pid_list; + struct trace_parser parser; + unsigned long val; + int nr_pids = 0; + ssize_t read = 0; + ssize_t ret; + loff_t pos; + pid_t pid; + + if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) + return -ENOMEM; + + /* + * Always recreate a new array. The write is an all or nothing + * operation. Always create a new array when adding new pids by + * the user. If the operation fails, then the current list is + * not modified. + */ + pid_list = trace_pid_list_alloc(); + if (!pid_list) { + trace_parser_put(&parser); + return -ENOMEM; + } + + if (filtered_pids) { + /* copy the current bits to the new max */ + ret = trace_pid_list_first(filtered_pids, &pid); + while (!ret) { + ret = trace_pid_list_set(pid_list, pid); + if (ret < 0) + goto out; + + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); + nr_pids++; + } + } + + ret = 0; + while (cnt > 0) { + + pos = 0; + + ret = trace_get_user(&parser, ubuf, cnt, &pos); + if (ret < 0) + break; + + read += ret; + ubuf += ret; + cnt -= ret; + + if (!trace_parser_loaded(&parser)) + break; + + ret = -EINVAL; + if (kstrtoul(parser.buffer, 0, &val)) + break; + + pid = (pid_t)val; + + if (trace_pid_list_set(pid_list, pid) < 0) { + ret = -1; + break; + } + nr_pids++; + + trace_parser_clear(&parser); + ret = 0; + } + out: + trace_parser_put(&parser); + + if (ret < 0) { + trace_pid_list_free(pid_list); + return ret; + } + + if (!nr_pids) { + /* Cleared the list of pids */ + trace_pid_list_free(pid_list); + pid_list = NULL; + } + + *new_pid_list = pid_list; + + return read; +} + diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 29f6e95439b6..6a29e4350b55 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -376,6 +376,436 @@ static const struct file_operations ftrace_formats_fops = { .release = seq_release, }; +static __always_inline bool printk_binsafe(struct trace_array *tr) +{ + /* + * The binary format of traceprintk can cause a crash if used + * by a buffer from another boot. Force the use of the + * non binary version of trace_printk if the trace_printk + * buffer is a boot mapped ring buffer. 
+ */ + return !(tr->flags & TRACE_ARRAY_FL_BOOT); +} + +int __trace_array_puts(struct trace_array *tr, unsigned long ip, + const char *str, int size) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct print_entry *entry; + unsigned int trace_ctx; + int alloc; + + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) + return 0; + + if (unlikely(tracing_selftest_running && + (tr->flags & TRACE_ARRAY_FL_GLOBAL))) + return 0; + + if (unlikely(tracing_disabled)) + return 0; + + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + + trace_ctx = tracing_gen_ctx(); + buffer = tr->array_buffer.buffer; + guard(ring_buffer_nest)(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + trace_ctx); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, str, size); + + /* Add a newline if necessary */ + if (entry->buf[size - 1] != '\n') { + entry->buf[size] = '\n'; + entry->buf[size + 1] = '\0'; + } else + entry->buf[size] = '\0'; + + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); + return size; +} +EXPORT_SYMBOL_GPL(__trace_array_puts); + +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + */ +int __trace_puts(unsigned long ip, const char *str) +{ + return __trace_array_puts(printk_trace, ip, str, strlen(str)); +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip: The address of the caller + * @str: The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ + struct trace_array *tr = READ_ONCE(printk_trace); + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct bputs_entry *entry; + unsigned int trace_ctx; + int size = sizeof(struct bputs_entry); + + if (!printk_binsafe(tr)) + return __trace_puts(ip, str); + + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) + return 0; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + trace_ctx = tracing_gen_ctx(); + buffer = tr->array_buffer.buffer; + + guard(ring_buffer_nest)(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + trace_ctx); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->str = str; + + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); + + return 1; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + +/* created for use with alloc_percpu */ +struct trace_buffer_struct { + int nesting; + char buffer[4][TRACE_BUF_SIZE]; +}; + +static struct trace_buffer_struct __percpu *trace_percpu_buffer; + +/* + * This allows for lockless recording. If we're nested too deeply, then + * this returns NULL. 
+ */ +static char *get_trace_buf(void) +{ + struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); + + if (!trace_percpu_buffer || buffer->nesting >= 4) + return NULL; + + buffer->nesting++; + + /* Interrupts must see nesting incremented before we use the buffer */ + barrier(); + return &buffer->buffer[buffer->nesting - 1][0]; +} + +static void put_trace_buf(void) +{ + /* Don't let the decrement of nesting leak before this */ + barrier(); + this_cpu_dec(trace_percpu_buffer->nesting); +} + +static int alloc_percpu_trace_buffer(void) +{ + struct trace_buffer_struct __percpu *buffers; + + if (trace_percpu_buffer) + return 0; + + buffers = alloc_percpu(struct trace_buffer_struct); + if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) + return -ENOMEM; + + trace_percpu_buffer = buffers; + return 0; +} + +static int buffers_allocated; + +void trace_printk_init_buffers(void) +{ + if (buffers_allocated) + return; + + if (alloc_percpu_trace_buffer()) + return; + + /* trace_printk() is for debug use only. Don't use it in production. */ + + pr_warn("\n"); + pr_warn("**********************************************************\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("** **\n"); + pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warn("** **\n"); + pr_warn("** This means that this is a DEBUG kernel and it is **\n"); + pr_warn("** unsafe for production use. **\n"); + pr_warn("** **\n"); + pr_warn("** If you see this message and you are not debugging **\n"); + pr_warn("** the kernel, report this immediately to your vendor! **\n"); + pr_warn("** **\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("**********************************************************\n"); + + /* Expand the buffers to set size */ + if (tracing_update_buffers(NULL) < 0) + pr_err("Failed to expand tracing buffers for trace_printk() calls\n"); + else + buffers_allocated = 1; + + /* + * trace_printk_init_buffers() can be called by modules. + * If that happens, then we need to start cmdline recording + * directly here. 
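On get_trace_buf()/put_trace_buf() above: the four per-CPU slots cover the contexts that can nest on a single CPU (task, softirq, hardirq, NMI), and the nesting counter plus barrier() ensure an interrupting context observes the bumped level and takes the next slot rather than reusing the one in flight. A simplified sketch of the intended pairing at a caller (condensed from trace_vbprintk() further down):

char *buf = get_trace_buf();	/* NULL if unallocated or already four levels deep */

if (buf) {
	int len = vscnprintf(buf, TRACE_BUF_SIZE, fmt, args);

	/* ... copy the len formatted bytes into the ring buffer ... */
	put_trace_buf();	/* release this nesting level */
}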
+ */ + if (system_state == SYSTEM_RUNNING) + tracing_start_cmdline_record(); +} +EXPORT_SYMBOL_GPL(trace_printk_init_buffers); + +void trace_printk_start_comm(void) +{ + /* Start tracing comms if trace printk is set */ + if (!buffers_allocated) + return; + tracing_start_cmdline_record(); +} + +void trace_printk_start_stop_comm(int enabled) +{ + if (!buffers_allocated) + return; + + if (enabled) + tracing_start_cmdline_record(); + else + tracing_stop_cmdline_record(); +} + +/** + * trace_vbprintk - write binary msg to tracing buffer + * @ip: The address of the caller + * @fmt: The string format to write to the buffer + * @args: Arguments for @fmt + */ +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct trace_array *tr = READ_ONCE(printk_trace); + struct bprint_entry *entry; + unsigned int trace_ctx; + char *tbuffer; + int len = 0, size; + + if (!printk_binsafe(tr)) + return trace_vprintk(ip, fmt, args); + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + trace_ctx = tracing_gen_ctx(); + guard(preempt_notrace)(); + + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; + goto out_nobuffer; + } + + len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); + + if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) + goto out_put; + + size = sizeof(*entry) + sizeof(u32) * len; + buffer = tr->array_buffer.buffer; + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + trace_ctx); + if (!event) + goto out_put; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; + + memcpy(entry->buf, tbuffer, sizeof(u32) * len); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + } +out_put: + put_trace_buf(); + +out_nobuffer: + unpause_graph_tracing(); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vbprintk); + +static __printf(3, 0) +int __trace_array_vprintk(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) +{ + struct ring_buffer_event *event; + int len = 0, size; + struct print_entry *entry; + unsigned int trace_ctx; + char *tbuffer; + + if (unlikely(tracing_disabled)) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + trace_ctx = tracing_gen_ctx(); + guard(preempt_notrace)(); + + + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; + goto out_nobuffer; + } + + len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); + + size = sizeof(*entry) + len + 1; + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, tbuffer, len + 1); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); + } +out: + put_trace_buf(); + +out_nobuffer: + unpause_graph_tracing(); + + return len; +} + +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) +{ + if (tracing_selftest_running && (tr->flags & TRACE_ARRAY_FL_GLOBAL)) + return 0; + + return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); +} + +/** + * trace_array_printk - Print a message to a specific instance + * @tr: The instance trace_array descriptor + * @ip: The 
instruction pointer that this is called from. + * @fmt: The format to print (printf format) + * + * If a subsystem sets up its own instance, they have the right to + * printk strings into their tracing instance buffer using this + * function. Note, this function will not write into the top level + * buffer (use trace_printk() for that), as writing into the top level + * buffer should only have events that can be individually disabled. + * trace_printk() is only used for debugging a kernel, and should not + * be ever incorporated in normal use. + * + * trace_array_printk() can be used, as it will not add noise to the + * top level tracing buffer. + * + * Note, trace_array_init_printk() must be called on @tr before this + * can be used. + */ +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!tr) + return -ENOENT; + + /* This is only allowed for created instances */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return 0; + + if (!(tr->trace_flags & TRACE_ITER(PRINTK))) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(trace_array_printk); + +/** + * trace_array_init_printk - Initialize buffers for trace_array_printk() + * @tr: The trace array to initialize the buffers for + * + * As trace_array_printk() only writes into instances, they are OK to + * have in the kernel (unlike trace_printk()). This needs to be called + * before trace_array_printk() can be used on a trace_array. + */ +int trace_array_init_printk(struct trace_array *tr) +{ + if (!tr) + return -ENOENT; + + /* This is only allowed for created instances */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return -EINVAL; + + return alloc_percpu_trace_buffer(); +} +EXPORT_SYMBOL_GPL(trace_array_init_printk); + +int trace_array_printk_buf(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, ...) 
+{ + int ret; + va_list ap; + + if (!(printk_trace->trace_flags & TRACE_ITER(PRINTK))) + return 0; + + va_start(ap, fmt); + ret = __trace_array_vprintk(buffer, ip, fmt, ap); + va_end(ap); + return ret; +} + +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +{ + return trace_array_vprintk(printk_trace, ip, fmt, args); +} +EXPORT_SYMBOL_GPL(trace_vprintk); + static __init int init_trace_printk_function_export(void) { int ret; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index d88c44f1dfa5..be53fe6fee6a 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1225,7 +1225,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) /* check both trace buffers */ ret = trace_test_buffer(&tr->array_buffer, NULL); if (!ret) - ret = trace_test_buffer(&tr->max_buffer, &count); + ret = trace_test_buffer(&tr->snapshot_buffer, &count); trace->reset(tr); tracing_start(); @@ -1287,7 +1287,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) /* check both trace buffers */ ret = trace_test_buffer(&tr->array_buffer, NULL); if (!ret) - ret = trace_test_buffer(&tr->max_buffer, &count); + ret = trace_test_buffer(&tr->snapshot_buffer, &count); trace->reset(tr); tracing_start(); @@ -1355,7 +1355,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * if (ret) goto out; - ret = trace_test_buffer(&tr->max_buffer, &count); + ret = trace_test_buffer(&tr->snapshot_buffer, &count); if (ret) goto out; @@ -1385,7 +1385,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * if (ret) goto out; - ret = trace_test_buffer(&tr->max_buffer, &count); + ret = trace_test_buffer(&tr->snapshot_buffer, &count); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -1513,7 +1513,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* check both trace buffers */ ret = trace_test_buffer(&tr->array_buffer, NULL); if (!ret) - ret = trace_test_buffer(&tr->max_buffer, &count); + ret = trace_test_buffer(&tr->snapshot_buffer, &count); trace->reset(tr); diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 32684ef4fb9d..85f6f10d107f 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -106,7 +106,7 @@ EXPORT_SYMBOL_GPL(trace_seq_printf); * Writes a ASCII representation of a bitmask string into @s. */ void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, - int nmaskbits) + int nmaskbits) { unsigned int save_len = s->seq.len; @@ -125,6 +125,33 @@ void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, EXPORT_SYMBOL_GPL(trace_seq_bitmask); /** + * trace_seq_bitmask_list - write a bitmask array in its list representation + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * Writes a list representation (e.g., 0-3,5-7) of a bitmask string into @s. 
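As the trace_array_printk() kernel-doc above says, the API is meant for subsystems that own their own tracing instance. A hedged usage sketch (the instance name and init function are made up, and this assumes the current two-argument trace_array_get_by_name(); trace_array_init_printk() must run before the first print):

#include <linux/trace.h>
#include <linux/instruction_pointer.h>

static struct trace_array *my_tr;	/* made-up example instance */

static int __init my_subsys_trace_init(void)
{
	my_tr = trace_array_get_by_name("my_subsys", NULL);
	if (!my_tr)
		return -ENOMEM;

	if (trace_array_init_printk(my_tr))	/* allocate per-CPU printk buffers */
		return -ENOMEM;

	trace_array_printk(my_tr, _THIS_IP_, "initialized, mode=%d\n", 3);
	return 0;
}

When the subsystem is done it should drop its reference with trace_array_put(), and trace_array_destroy() if it created the instance itself.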
+ */ +void trace_seq_bitmask_list(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + seq_buf_printf(&s->seq, "%*pbl", nmaskbits, maskp); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask_list); + +/** * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor * @fmt: printf format string diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 62719d2941c9..fd2ee879815c 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -34,9 +34,13 @@ enum tp_transition_sync { struct tp_transition_snapshot { unsigned long rcu; + unsigned long srcu_gp; bool ongoing; }; +DEFINE_SRCU_FAST(tracepoint_srcu); +EXPORT_SYMBOL_GPL(tracepoint_srcu); + /* Protected by tracepoints_mutex */ static struct tp_transition_snapshot tp_transition_snapshot[_NR_TP_TRANSITION_SYNC]; @@ -46,6 +50,7 @@ static void tp_rcu_get_state(enum tp_transition_sync sync) /* Keep the latest get_state snapshot. */ snapshot->rcu = get_state_synchronize_rcu(); + snapshot->srcu_gp = start_poll_synchronize_srcu(&tracepoint_srcu); snapshot->ongoing = true; } @@ -56,6 +61,8 @@ static void tp_rcu_cond_sync(enum tp_transition_sync sync) if (!snapshot->ongoing) return; cond_synchronize_rcu(snapshot->rcu); + if (!poll_state_synchronize_srcu(&tracepoint_srcu, snapshot->srcu_gp)) + synchronize_srcu(&tracepoint_srcu); snapshot->ongoing = false; } @@ -112,10 +119,13 @@ static inline void release_probes(struct tracepoint *tp, struct tracepoint_func struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); - if (tracepoint_is_faultable(tp)) - call_rcu_tasks_trace(&tp_probes->rcu, rcu_free_old_probes); - else - call_rcu(&tp_probes->rcu, rcu_free_old_probes); + if (tracepoint_is_faultable(tp)) { + call_rcu_tasks_trace(&tp_probes->rcu, + rcu_free_old_probes); + } else { + call_srcu(&tracepoint_srcu, &tp_probes->rcu, + rcu_free_old_probes); + } } } diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 6ea2f6363b90..5c153106e642 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -125,7 +125,7 @@ static void __acct_update_integrals(struct task_struct *tsk, { u64 time, delta; - if (!likely(tsk->mm)) + if (unlikely(!tsk->mm || (tsk->flags & PF_KTHREAD))) return; time = stime + utime; diff --git a/kernel/ucount.c b/kernel/ucount.c index 586af49fc03e..fc4a8f2d3096 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -47,7 +47,7 @@ static int set_permissions(struct ctl_table_header *head, int mode; /* Allow users with CAP_SYS_RESOURCE unrestrained access */ - if (ns_capable(user_ns, CAP_SYS_RESOURCE)) + if (ns_capable_noaudit(user_ns, CAP_SYS_RESOURCE)) mode = (table->mode & S_IRWXU) >> 6; else /* Allow all others at most read-only access */ diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index e2784038bbed..8d82913223a1 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -141,7 +141,9 @@ EXPORT_SYMBOL_GPL(hwerr_log_error_type); static int __init crash_save_vmcoreinfo_init(void) { - vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + int order; + order = get_order(VMCOREINFO_BYTES); + vmcoreinfo_data = (unsigned char *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); if (!vmcoreinfo_data) { pr_warn("Memory allocation for vmcoreinfo_data failed\n"); return -ENOMEM; @@ -150,7 +152,7 @@ static int __init crash_save_vmcoreinfo_init(void) vmcoreinfo_note = 
alloc_pages_exact(VMCOREINFO_NOTE_SIZE, GFP_KERNEL | __GFP_ZERO); if (!vmcoreinfo_note) { - free_page((unsigned long)vmcoreinfo_data); + free_pages((unsigned long)vmcoreinfo_data, order); vmcoreinfo_data = NULL; pr_warn("Memory allocation for vmcoreinfo_note failed\n"); return -ENOMEM; @@ -242,7 +244,6 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(kallsyms_token_table); VMCOREINFO_SYMBOL(kallsyms_token_index); VMCOREINFO_SYMBOL(kallsyms_offsets); - VMCOREINFO_SYMBOL(kallsyms_relative_base); #endif /* CONFIG_KALLSYMS */ arch_crash_save_vmcoreinfo(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 366122f4a0f8..7d675781bc91 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -363,7 +363,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = - IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC); + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC; static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; @@ -550,7 +550,7 @@ static bool need_counting_irqs(void) u8 util; int tail = __this_cpu_read(cpustat_tail); - tail = (tail + NUM_HARDIRQ_REPORT - 1) % NUM_HARDIRQ_REPORT; + tail = (tail + NUM_SAMPLE_PERIODS - 1) % NUM_SAMPLE_PERIODS; util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]); return util > HARDIRQ_PERCENT_THRESH; } @@ -774,8 +774,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); - int duration; int softlockup_all_cpu_backtrace; + int duration, thresh_count; unsigned long flags; if (!watchdog_enabled) @@ -879,7 +879,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT); - if (softlockup_panic) + thresh_count = duration / get_softlockup_thresh(); + + if (softlockup_panic && thresh_count >= softlockup_panic) panic("softlockup: hung tasks"); } @@ -1228,7 +1230,7 @@ static const struct ctl_table watchdog_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "softlockup_sys_info", diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index d3ca70e3c256..cf05775a96d3 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -118,18 +118,11 @@ static void watchdog_overflow_callback(struct perf_event *event, watchdog_hardlockup_check(smp_processor_id(), regs); } -static int hardlockup_detector_event_create(void) +static struct perf_event *hardlockup_detector_event_create(unsigned int cpu) { - unsigned int cpu; struct perf_event_attr *wd_attr; struct perf_event *evt; - /* - * Preemption is not disabled because memory will be allocated. - * Ensure CPU-locality by calling this in per-CPU kthread. 
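On the watchdog.c hunks above: softlockup_panic turns from a boolean into a threshold count (its sysctl maximum moves from SYSCTL_ONE to SYSCTL_INT_MAX), and the panic now fires only once thresh_count = duration / get_softlockup_thresh() reaches that count. Worked example with the default watchdog_thresh of 10, where get_softlockup_thresh() is 2 * 10 = 20 seconds: softlockup_panic = 1 keeps the old behaviour (panic on the first report, duration >= 20 s), while softlockup_panic = 3 waits until the CPU has been stuck for roughly 3 * 20 = 60 seconds, letting the earlier lockup reports be logged instead of panicking immediately.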
- */ - WARN_ON(!is_percpu_thread()); - cpu = raw_smp_processor_id(); wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); @@ -143,14 +136,7 @@ static int hardlockup_detector_event_create(void) watchdog_overflow_callback, NULL); } - if (IS_ERR(evt)) { - pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, - PTR_ERR(evt)); - return PTR_ERR(evt); - } - WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak"); - this_cpu_write(watchdog_ev, evt); - return 0; + return evt; } /** @@ -159,17 +145,26 @@ static int hardlockup_detector_event_create(void) */ void watchdog_hardlockup_enable(unsigned int cpu) { + struct perf_event *evt; + WARN_ON_ONCE(cpu != smp_processor_id()); - if (hardlockup_detector_event_create()) + evt = hardlockup_detector_event_create(cpu); + if (IS_ERR(evt)) { + pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); return; + } /* use original value for check */ if (!atomic_fetch_inc(&watchdog_cpus)) pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); + WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak"); + this_cpu_write(watchdog_ev, evt); + watchdog_init_timestamp(); - perf_event_enable(this_cpu_read(watchdog_ev)); + perf_event_enable(evt); } /** @@ -263,19 +258,30 @@ bool __weak __init arch_perf_nmi_is_available(void) */ int __init watchdog_hardlockup_probe(void) { + struct perf_event *evt; + unsigned int cpu; int ret; if (!arch_perf_nmi_is_available()) return -ENODEV; - ret = hardlockup_detector_event_create(); + if (!hw_nmi_get_sample_period(watchdog_thresh)) + return -EINVAL; - if (ret) { + /* + * Test hardware PMU availability by creating a temporary perf event. + * The event is released immediately. + */ + cpu = raw_smp_processor_id(); + evt = hardlockup_detector_event_create(cpu); + if (IS_ERR(evt)) { pr_info("Perf NMI watchdog permanently disabled\n"); + ret = PTR_ERR(evt); } else { - perf_event_release_kernel(this_cpu_read(watchdog_ev)); - this_cpu_write(watchdog_ev, NULL); + perf_event_release_kernel(evt); + ret = 0; } + return ret; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index eb5660013222..c515cff01828 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -117,6 +117,8 @@ enum wq_internal_consts { MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ + RESCUER_BATCH = 16, /* process items per turn */ + /* * Rescue workers are used only on emergencies and shared by * all cpus. Give MIN_NICE. 
@@ -286,6 +288,7 @@ struct pool_workqueue { struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ + struct work_struct mayday_cursor; /* L: cursor on pool->worklist */ u64 stats[PWQ_NR_STATS]; @@ -1120,6 +1123,12 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool, return NULL; } +static void mayday_cursor_func(struct work_struct *work) +{ + /* should not be processed, only for marking position */ + BUG(); +} + /** * move_linked_works - move linked works to a list * @work: start of series of works to be scheduled @@ -1182,6 +1191,16 @@ static bool assign_work(struct work_struct *work, struct worker *worker, lockdep_assert_held(&pool->lock); + /* The cursor work should not be processed */ + if (unlikely(work->func == mayday_cursor_func)) { + /* only worker_thread() can possibly take this branch */ + WARN_ON_ONCE(worker->rescue_wq); + if (nextp) + *nextp = list_next_entry(work, entry); + list_del_init(&work->entry); + return false; + } + /* * A single work shouldn't be executed concurrently by multiple workers. * __queue_work() ensures that @work doesn't jump to a different pool @@ -2976,9 +2995,8 @@ static void idle_cull_fn(struct work_struct *work) reap_dying_workers(&cull_list); } -static void send_mayday(struct work_struct *work) +static void send_mayday(struct pool_workqueue *pwq) { - struct pool_workqueue *pwq = get_work_pwq(work); struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq_mayday_lock); @@ -3016,7 +3034,7 @@ static void pool_mayday_timeout(struct timer_list *t) * rescuers. */ list_for_each_entry(work, &pool->worklist, entry) - send_mayday(work); + send_mayday(get_work_pwq(work)); } raw_spin_unlock(&wq_mayday_lock); @@ -3440,22 +3458,57 @@ sleep: static bool assign_rescuer_work(struct pool_workqueue *pwq, struct worker *rescuer) { struct worker_pool *pool = pwq->pool; + struct work_struct *cursor = &pwq->mayday_cursor; struct work_struct *work, *n; - /* need rescue? */ - if (!pwq->nr_active || !need_to_create_worker(pool)) + /* have work items to rescue? */ + if (!pwq->nr_active) return false; - /* - * Slurp in all works issued via this workqueue and - * process'em. - */ - list_for_each_entry_safe(work, n, &pool->worklist, entry) { - if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) + /* need rescue? */ + if (!need_to_create_worker(pool)) { + /* + * The pool has idle workers and doesn't need the rescuer, so it + * could simply return false here. + * + * However, the memory pressure might not be fully relieved. + * In PERCPU pool with concurrency enabled, having idle workers + * does not necessarily mean memory pressure is gone; it may + * simply mean regular workers have woken up, completed their + * work, and gone idle again due to concurrency limits. + * + * In this case, those working workers may later sleep again, + * the pool may run out of idle workers, and it will have to + * allocate new ones and wait for the timer to send mayday, + * causing unnecessary delay - especially if memory pressure + * was never resolved throughout. + * + * Do more work if memory pressure is still on to reduce + * relapse, using (pool->flags & POOL_MANAGER_ACTIVE), though + * not precisely, unless there are other PWQs needing help. 
+ */ + if (!(pool->flags & POOL_MANAGER_ACTIVE) || + !list_empty(&pwq->wq->maydays)) + return false; + } + + /* search from the start or cursor if available */ + if (list_empty(&cursor->entry)) + work = list_first_entry(&pool->worklist, struct work_struct, entry); + else + work = list_next_entry(cursor, entry); + + /* find the next work item to rescue */ + list_for_each_entry_safe_from(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) { pwq->stats[PWQ_STAT_RESCUED]++; + /* put the cursor for next search */ + list_move_tail(&cursor->entry, &n->entry); + return true; + } } - return !list_empty(&rescuer->scheduled); + return false; } /** @@ -3512,6 +3565,7 @@ repeat: struct pool_workqueue *pwq = list_first_entry(&wq->maydays, struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; + unsigned int count = 0; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -3524,31 +3578,27 @@ repeat: WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); - if (assign_rescuer_work(pwq, rescuer)) { + while (assign_rescuer_work(pwq, rescuer)) { process_scheduled_works(rescuer); /* - * The above execution of rescued work items could - * have created more to rescue through - * pwq_activate_first_inactive() or chained - * queueing. Let's put @pwq back on mayday list so - * that such back-to-back work items, which may be - * being used to relieve memory pressure, don't - * incur MAYDAY_INTERVAL delay inbetween. + * If the per-turn work item limit is reached and other + * PWQs are in mayday, requeue mayday for this PWQ and + * let the rescuer handle the other PWQs first. */ - if (pwq->nr_active && need_to_create_worker(pool)) { + if (++count > RESCUER_BATCH && !list_empty(&pwq->wq->maydays) && + pwq->nr_active && need_to_create_worker(pool)) { raw_spin_lock(&wq_mayday_lock); - /* - * Queue iff somebody else hasn't queued it already. - */ - if (list_empty(&pwq->mayday_node)) { - get_pwq(pwq); - list_add_tail(&pwq->mayday_node, &wq->maydays); - } + send_mayday(pwq); raw_spin_unlock(&wq_mayday_lock); + break; } } + /* The cursor can not be left behind without the rescuer watching it. */ + if (!list_empty(&pwq->mayday_cursor.entry) && list_empty(&pwq->mayday_node)) + list_del_init(&pwq->mayday_cursor.entry); + /* * Leave this pool. Notify regular workers; otherwise, we end up * with 0 concurrency and stalling the execution. @@ -5167,6 +5217,19 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); kthread_init_work(&pwq->release_work, pwq_release_workfn); + + /* + * Set the dummy cursor work with valid function and get_work_pwq(). + * + * The cursor work should only be in the pwq->pool->worklist, and + * should not be treated as a processable work item. + * + * WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE just make it less + * surprise for kernel debugging tools and reviewers. 
+ */ + INIT_WORK(&pwq->mayday_cursor, mayday_cursor_func); + atomic_long_set(&pwq->mayday_cursor.data, (unsigned long)pwq | + WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | WORK_STRUCT_INACTIVE); } /* sync @pwq with the current state of its associated wq and link it */ @@ -7508,9 +7571,13 @@ static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; -static unsigned int wq_panic_on_stall; +static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC; module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644); +static unsigned int wq_panic_on_stall_time; +module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644); +MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)"); + /* * Show workers that might prevent the processing of pending work items. * The only candidates are CPU-bound workers in the running state. @@ -7562,14 +7629,25 @@ static void show_cpu_pools_hogs(void) rcu_read_unlock(); } -static void panic_on_wq_watchdog(void) +/* + * It triggers a panic in two scenarios: when the total number of stalls + * exceeds a threshold, and when a stall lasts longer than + * wq_panic_on_stall_time + */ +static void panic_on_wq_watchdog(unsigned int stall_time_sec) { static unsigned int wq_stall; if (wq_panic_on_stall) { wq_stall++; - BUG_ON(wq_stall >= wq_panic_on_stall); + if (wq_stall >= wq_panic_on_stall) + panic("workqueue: %u stall(s) exceeded threshold %u\n", + wq_stall, wq_panic_on_stall); } + + if (wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time) + panic("workqueue: stall lasted %us, exceeding threshold %us\n", + stall_time_sec, wq_panic_on_stall_time); } static void wq_watchdog_reset_touched(void) @@ -7584,10 +7662,12 @@ static void wq_watchdog_reset_touched(void) static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + unsigned int max_stall_time = 0; bool lockup_detected = false; bool cpu_pool_stall = false; unsigned long now = jiffies; struct worker_pool *pool; + unsigned int stall_time; int pi; if (!thresh) @@ -7621,14 +7701,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) /* did we stall? */ if (time_after(now, ts + thresh)) { lockup_detected = true; + stall_time = jiffies_to_msecs(now - pool_ts) / 1000; + max_stall_time = max(max_stall_time, stall_time); if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) { pool->cpu_stall = true; cpu_pool_stall = true; } pr_emerg("BUG: workqueue lockup - pool"); pr_cont_pool_info(pool); - pr_cont(" stuck for %us!\n", - jiffies_to_msecs(now - pool_ts) / 1000); + pr_cont(" stuck for %us!\n", stall_time); } @@ -7641,7 +7722,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) show_cpu_pools_hogs(); if (lockup_detected) - panic_on_wq_watchdog(); + panic_on_wq_watchdog(max_stall_time); wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh); |
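/*
 * Illustrative sketch, not part of the patch: the list-cursor pattern that the
 * mayday_cursor changes above rely on.  A dummy node lives on the same list as
 * the real items; each call resumes scanning just after the cursor, handles at
 * most one matching item, then parks the cursor in front of the next candidate
 * so the following call continues where this one stopped.  'struct item',
 * 'item_matches()' and 'handle_item()' are hypothetical stand-ins for work
 * items, the get_work_pwq() == pwq test and assign_work().
 */
#include <linux/list.h>
#include <linux/types.h>

struct item {
	struct list_head entry;
};

static bool scan_one(struct list_head *head, struct list_head *cursor,
		     bool (*item_matches)(struct item *),
		     void (*handle_item)(struct item *))
{
	struct item *it, *n;

	/* Start from the list head, or just after the cursor if it is parked. */
	if (list_empty(cursor))
		it = list_first_entry(head, struct item, entry);
	else
		it = list_entry(cursor->next, struct item, entry);

	list_for_each_entry_safe_from(it, n, head, entry) {
		if (!item_matches(it))
			continue;
		handle_item(it);
		/* Park the cursor right before the next element for the next call. */
		list_move_tail(cursor, &n->entry);
		return true;
	}

	/* Nothing left for this caller; take the cursor back off the list. */
	list_del_init(cursor);
	return false;
}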
